In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [71]:
breast_cancer = pd.read_csv("dataR2.csv")
breast_cancer.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [72]:
breast_cancer.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,1.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,1.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474283,6.881763,269.97825,1.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,2.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,2.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,2.0


In [73]:
X = breast_cancer.values[:,:9]
y = breast_cancer.values[:,9]

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify= y, random_state=42)

Acredito que seja preciso normalizar os dados, pois as escalas de alguns dados são muito diferentes e para o knn, variáveis com valores muito altos podem afetar o cálculo da distância e, no caso da árvore de decisão, elas podem ter mais chance de serem escolhidos como atributos de particionamento.

In [75]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_train)

In [76]:
accuracies = []
accuracies_mean = []

In [77]:
valores_k = [1,3,5,11,21,31]

In [78]:
kf = KFold(n_splits=5)
for k in valores_k:
  accuracies = []
  for (train_ind, val_ind) in kf.split(X_norm):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_norm[train_ind, :], y_train[train_ind])
    y_predict = neigh.predict(X_norm[val_ind, :])
    acc = accuracy_score(y_train[val_ind], y_predict)
    accuracies.append(acc)
  accuracies_mean.append(np.mean(accuracies))  

In [79]:
print(valores_k[accuracies_mean.index(max(accuracies_mean))], max(accuracies_mean))

3 0.7485380116959064


In [80]:
criterios = ["gini", "entropy", "log_loss"]
profundidade_maxima = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [81]:
kf = KFold(n_splits=5)
for crit in criterios:
  for depth in profundidade_maxima:
    accuracies = []
    for (train_ind, val_ind) in kf.split(X_norm):
      clf = DecisionTreeClassifier(random_state=42, criterion = crit, max_depth = depth)
      clf.fit(X_norm[train_ind, :], y_train[train_ind])
      y_pred = clf.predict(X_norm[val_ind, :])
      accuracy = accuracy_score(y_train[val_ind], y_pred)
      accuracies.append(accuracy)
    accuracies_mean.append(np.mean(accuracies))  

In [82]:
indice = 0
if accuracies_mean.index(max(accuracies_mean)) > 19:
  indice = 2
elif accuracies_mean.index(max(accuracies_mean)) > 9:
  indice = 1

In [83]:
print(criterios[indice], profundidade_maxima[accuracies_mean.index(max(accuracies_mean))%10], max(accuracies_mean))

gini 20 0.7485380116959064


Tivemos um empate e, portanto, vamos testar com os ambos os seguintes hiperparâmetros e algoritmos:

KNN com k = 3

Árvore de Decisão com critério gini e profundidade máxima de 20

In [84]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_norm, y_train)

In [85]:
X_test_norm = scaler.transform(X_test)
y_predicted = knn.predict(X_test_norm)
print(accuracy_score(y_predicted, y_test))

0.7083333333333334


In [86]:
dtc = DecisionTreeClassifier(random_state=42, criterion = "gini", max_depth = 20)
dtc.fit(X_norm, y_train)

In [87]:
y_p = dtc.predict(X_test_norm)
print(accuracy_score(y_p, y_test))

0.625


Como podemos perceber, os melhores hiperparâmetros e algoritmo foram: knn com k = 3, cuja acurácia final foi 0.7083333333333334