In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Carregar o dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ["ID", "Diagnosis", "Mean_Radius", "Mean_Texture", "Mean_Perimeter", "Mean_Area", "Mean_Smoothness",
           "Mean_Compactness", "Mean_Concavity", "Mean_Concave_Points", "Mean_Symmetry", "Mean_Fractal_Dimension",
           "SE_Radius", "SE_Texture", "SE_Perimeter", "SE_Area", "SE_Smoothness", "SE_Compactness", "SE_Concavity",
           "SE_Concave_Points", "SE_Symmetry", "SE_Fractal_Dimension", "Worst_Radius", "Worst_Texture",
           "Worst_Perimeter", "Worst_Area", "Worst_Smoothness", "Worst_Compactness", "Worst_Concavity",
           "Worst_Concave_Points", "Worst_Symmetry", "Worst_Fractal_Dimension"]
data = pd.read_csv(url, names=columns)

# Separar features e variável alvo
X = data.drop(["ID", "Diagnosis"], axis=1)
y = data["Diagnosis"]

# Dividir o conjunto de dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Escalonar as features usando o StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Definir os algoritmos Ensemble
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
ada_boost = AdaBoostClassifier()

# Definir as combinações de hiperparâmetros para o GridSearchCV
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7]
}

param_grid_ada = {
    "learning_rate": [0.1, 0.05, 0.01, 0.5],
    "n_estimators": [50, 100, 200],
}

# Realizar o GridSearchCV para ajustar os hiperparâmetros do RandomForest
grid_search_rf = GridSearchCV(random_forest, param_grid_rf, scoring="accuracy", cv=5)
grid_search_rf.fit(X_train_scaled, y_train)
best_rf = grid_search_rf.best_estimator_

# Realizar o GridSearchCV para ajustar os hiperparâmetros do AdaBoost
grid_search_ada = GridSearchCV(ada_boost, param_grid_ada, scoring="accuracy", cv=5)
grid_search_ada.fit(X_train_scaled, y_train)
best_ada = grid_search_ada.best_estimator_

# Treinar o modelo RandomForest com a melhor combinação de parâmetros
best_rf.fit(X_train_scaled, y_train)

# Calcular as métricas de avaliação para RandomForest
y_pred_rf = best_rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, pos_label="M")
recall_rf = recall_score(y_test, y_pred_rf, pos_label="M")
f1_rf = f1_score(y_test, y_pred_rf, pos_label="M")

# Treinar o modelo AdaBoost com a melhor combinação de parâmetros
best_ada.fit(X_train_scaled, y_train)

# Calcular as métricas de avaliação para AdaBoost
y_pred_ada = best_ada.predict(X_test_scaled)
accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada, pos_label="M")
recall_ada = recall_score(y_test, y_pred_ada, pos_label="M")
f1_ada = f1_score(y_test, y_pred_ada, pos_label="M")

# Imprimir as métricas de avaliação
print("Random Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)

print("\nAdaBoost Metrics:")
print("Accuracy:", accuracy_ada)
print("Precision:", precision_ada)
print("Recall:", recall_ada)
print("F1-score:", f1_ada)

# Obtendo importância das features do melhor modelo RandomForest
feature_importances = best_rf.feature_importances_
sorted_importances_rf = sorted(zip(feature_importances, X.columns), reverse=True)

# Obtendo importância das features do melhor modelo entre os três usados
models = [decision_tree, best_rf, best_ada]
importances_all = []
for model in models:
    model.fit(X_train_scaled, y_train)
    importances_all.append(model.feature_importances_)
importances_mean = pd.DataFrame(importances_all, columns=X.columns).mean()
sorted_importances_all = sorted(zip(importances_mean, X.columns), reverse=True)

# Selecionar as 10 features mais importantes
top_10_features = [feature for importance, feature in sorted_importances_all[:10]]

# Obter os índices numéricos das colunas correspondentes às 10 features mais importantes
top_10_feature_indices = [X.columns.get_loc(feature) for feature in top_10_features]

# Treinar o melhor modelo RandomForest com as 10 features mais importantes
best_rf_top_features = RandomForestClassifier()
best_rf_top_features.fit(X_train_scaled[:, top_10_feature_indices], y_train)

# Calcular as métricas de avaliação para o melhor modelo RandomForest com as 10 features mais importantes
y_pred_rf_top_features = best_rf_top_features.predict(X_test_scaled[:, top_10_feature_indices])
accuracy_rf_top_features = accuracy_score(y_test, y_pred_rf_top_features)
precision_rf_top_features = precision_score(y_test, y_pred_rf_top_features, pos_label="M")
recall_rf_top_features = recall_score(y_test, y_pred_rf_top_features, pos_label="M")
f1_rf_top_features = f1_score(y_test, y_pred_rf_top_features, pos_label="M")

# Imprimir as métricas de avaliação para o melhor modelo RandomForest com as 10 features mais importantes
print("\nRandom Forest Metrics (Top 10 Features):")
print("Accuracy:", accuracy_rf_top_features)
print("Precision:", precision_rf_top_features)
print("Recall:", recall_rf_top_features)
print("F1-score:", f1_rf_top_features)

Random Forest Metrics:
Accuracy: 0.9649122807017544
Precision: 0.975609756097561
Recall: 0.9302325581395349
F1-score: 0.9523809523809524

AdaBoost Metrics:
Accuracy: 0.9649122807017544
Precision: 0.975609756097561
Recall: 0.9302325581395349
F1-score: 0.9523809523809524

Random Forest Metrics (Top 10 Features):
Accuracy: 0.956140350877193
Precision: 0.9523809523809523
Recall: 0.9302325581395349
F1-score: 0.9411764705882352
