In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Carregar os dados do conjunto de dados Breast Cancer
data = pd.read_csv('dataR2.csv')

# Separar os atributos de entrada (X) e as classes (y)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Normalizar os atributos
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [10]:
from sklearn.model_selection import StratifiedKFold

# Definir o número de folds
num_folds = 5

# Criar os folds estratificados
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Variáveis para armazenar as acurácias médias
decision_tree_accuracies = []
knn_accuracies = []

# Loop externo para a avaliação externa
for train_index, test_index in skf.split(X_normalized, y):
    X_train, X_test = X_normalized[train_index], X_normalized[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Definir os valores dos hiperparâmetros
decision_tree_params = {'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': range(10, 101, 10)}
knn_params = {'n_neighbors': [1, 3, 5, 11, 21, 31]}

# Loop interno para a escolha dos hiperparâmetros
decision_tree_grid = GridSearchCV(DecisionTreeClassifier(), decision_tree_params, cv=5)
decision_tree_grid.fit(X_train, y_train)
best_decision_tree = decision_tree_grid.best_estimator_

knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)
best_knn = knn_grid.best_estimator_

In [12]:
from sklearn.metrics import accuracy_score

# Avaliar a acurácia média usando o fold de teste
decision_tree_accuracy = accuracy_score(y_test, best_decision_tree.predict(X_test))
knn_accuracy = accuracy_score(y_test, best_knn.predict(X_test))

# Armazenar as acurácias médias
decision_tree_accuracies.append(decision_tree_accuracy)
knn_accuracies.append(knn_accuracy)


In [13]:
# Encontrar o conjunto de hiperparâmetros com a melhor acurácia média
best_decision_tree_index = decision_tree_accuracies.index(max(decision_tree_accuracies))
best_decision_tree_params = decision_tree_grid.cv_results_['params'][best_decision_tree_index]

best_knn_index = knn_accuracies.index(max(knn_accuracies))
best_knn_params = knn_grid.cv_results_['params'][best_knn_index]

In [14]:
# Treinar o modelo Decision Tree com o melhor conjunto de hiperparâmetros
best_decision_tree.fit(X_normalized, y)

# Treinar o modelo KNN com o melhor conjunto de hiperparâmetros
best_knn.fit(X_normalized, y)

In [15]:
from sklearn.metrics import accuracy_score

# Avaliar o desempenho dos modelos com os dados de teste
decision_tree_test_accuracy = accuracy_score(y_test, best_decision_tree.predict(X_test))
knn_test_accuracy = accuracy_score(y_test, best_knn.predict(X_test))

# Restante do código virá dentro do loop externo

# Calcular a acurácia média entre os a_i
decision_tree_avg_accuracy = sum(decision_tree_accuracies) / len(decision_tree_accuracies)
knn_avg_accuracy = sum(knn_accuracies) / len(knn_accuracies)

# Exibir as acurácias médias
print("Acurácia média (Decision Tree):", decision_tree_avg_accuracy)
print("Acurácia média (KNN):", knn_avg_accuracy)


Acurácia média (Decision Tree): 0.7391304347826086
Acurácia média (KNN): 0.782608695652174
