In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score, recall_score, roc_auc_score, precision_score
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the cleaned dataset, split it into train / validation / test, and check
# that the positive-class proportion is preserved by the stratified splits.
from pathlib import Path

# NOTE(review): the file name has no extension — confirm this is the exact name on Drive.
CAMINHO_DADOS = Path("/content/drive/MyDrive/MJV/DS School/Limpa_G4")

# The Drive mount cell sits further down in this notebook, so a top-to-bottom
# run reaches this read before Drive is mounted (the recorded output of this
# cell is a FileNotFoundError, possibly for that reason — confirm). Mount on
# demand so the cell works on a fresh kernel.
if not CAMINHO_DADOS.exists():
    try:
        from google.colab import drive
    except ImportError:
        pass  # not running on Colab: let read_csv report the missing file
    else:
        drive.mount('/content/drive')

# Drop the first column (presumably a saved index column — TODO confirm).
df = pd.read_csv(CAMINHO_DADOS).iloc[:, 1:]

# Features are every column but the last; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 15% of the data as the test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=6
)

# Split the remaining data into final training (80%) and validation (20%).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=6
)


def _proporcao_positivos(serie):
    """Return the fraction of non-null entries of `serie` that equal 1."""
    return serie.eq(1).sum() / serie.count()


# Proportion of the value 1 in the full data and in each split.
prop_real = _proporcao_positivos(df["Made Donation in March 2007"])
prop_treino = _proporcao_positivos(y_train)
prop_teste = _proporcao_positivos(y_test)
prop_validação = _proporcao_positivos(y_val)

# One-row summary used to check that the proportions match across datasets.
df_train_test_prop = pd.DataFrame(
    {
        "Proporção Real": [prop_real],
        "Proporção de Treinamento": [prop_treino],
        "Proporção de Teste": [prop_teste],
        "Proporção de Validação": [prop_validação],
    }
)
df_train_test_prop

FileNotFoundError: ignored

In [5]:
# Min-max normalisation: the scaler is fitted on the training split only and
# the same fitted scaling is then applied to training, validation and test.
scaler = MinMaxScaler().fit(X_train)

# Each split is replaced by its scaled version, in the order train, val, test.
X_train, X_val, X_test = (
    scaler.transform(parte) for parte in (X_train, X_val, X_test)
)

In [3]:
# Mount Google Drive at /content/drive (Colab only).
# The data-loading cell reads its CSV from a path under /content/drive, so the
# mount has to be in place before that read happens.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Names of the evaluation metrics of interest.
métricas = [
    "accuracy",
    "precision",
    "recall",
    "f1",
    "roc_auc",
]
# Index range over the metric list (shown as the cell output).
range(len(métricas))

range(0, 5)

In [7]:
# Disabled experiment: every line in this cell is commented out, so nothing
# here executes. It sketches a KNeighborsRegressor fit on the training split
# and a prediction on the validation split. The `y_predict` used by the metric
# cells is produced by the KNeighborsClassifier cell instead.
#from sklearn.neighbors import KNeighborsRegressor

#knn = KNeighborsRegressor(n_neighbors=1) #or 3 or 7

#knn.fit(X_train, y_train)

#y_predict = knn.predict(X_val)
#y_predict

In [12]:
# Validation-set scores for the predictions stored in `y_predict`.
# NOTE(review): `y_predict` is not assigned in this cell or in any active cell
# above it; in this notebook the assignment lives in the KNeighborsClassifier
# cell further down, so that cell must run first.
# NOTE(review): ROC AUC is computed here from `y_predict` itself; if those are
# hard class labels rather than scores, the value is not a true AUC — confirm.
# Built-in round() is used instead of the `.round` method because it works for
# both NumPy scalars and plain Python floats (the latter have no `.round`).
print('Accuracy:', round(accuracy_score(y_val, y_predict), 2))
print('F1:', round(f1_score(y_val, y_predict), 2))
print('Recall:', round(recall_score(y_val, y_predict), 2))

print('Roc', round(roc_auc_score(y_val, y_predict), 2))

Accuracy: 0.78
F1: 0.44
Recall: 0.37
Roc 0.64


In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN classifier with 7 neighbours (the author's note lists 3 and 7
# as candidate values).
knn = KNeighborsClassifier(n_neighbors=7)

# `fit` returns the estimator itself, so training and predicting on the
# validation split can be chained; `knn` still refers to the fitted model.
y_predict = knn.fit(X_train, y_train).predict(X_val)

In [10]:
%time
# Grid Search
métricas = ["accuracy", "precision", "recall", "f1_macro", "roc_auc"]

knn = KNeighborsClassifier(n_neighbors = 5)

# Separando os hiperparâmetros
hiperparametros = {"algorithm": ["auto", "ball_tree", "brute", "kd_tree"],
                   "leaf_size": [1,5, 10, 15, 25, 30, 35],
                   "p": [1,2],
                   "metric": ["minkowski", "euclidean"]}

#Realizando o GridSearch
resultados = {}

for i in range(len(métricas)):
  
  GS = GridSearchCV(knn, hiperparametros, scoring = métricas[i])

  GS.fit(X_train, y_train)

  resultados[i] = pd.DataFrame(GS.cv_results_)

resultados["accuracy"] = resultados.pop(0)
resultados["precision"] = resultados.pop(1)
resultados["recall"] = resultados.pop(2)
resultados["f1"] = resultados.pop(3)
resultados["roc_auc"] = resultados.pop(4)

melhores_modelos = {"accuracy": resultados["accuracy"].sort_values("mean_test_score", ascending = False).iloc[0,8],
                    "precision": resultados["precision"].sort_values("mean_test_score", ascending = False).iloc[0,8],
                    "recall":resultados["recall"].sort_values("mean_test_score", ascending = False).iloc[0,8],
                    "f1": resultados["f1"].sort_values("mean_test_score", ascending = False).iloc[0,8],
                    "roc_auc": resultados["roc_auc"].sort_values("mean_test_score", ascending = False).iloc[0,8]}
melhores_modelos

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


{'accuracy': {'algorithm': 'brute',
  'leaf_size': 1,
  'metric': 'minkowski',
  'p': 1},
 'f1': {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'minkowski', 'p': 1},
 'precision': {'algorithm': 'brute',
  'leaf_size': 1,
  'metric': 'minkowski',
  'p': 1},
 'recall': {'algorithm': 'brute',
  'leaf_size': 1,
  'metric': 'minkowski',
  'p': 1},
 'roc_auc': {'algorithm': 'ball_tree',
  'leaf_size': 10,
  'metric': 'minkowski',
  'p': 1}}

In [11]:
%time
# Treinando e testando o knn com os "melhores" hiperparâmetros

knn = KNeighborsClassifier(algorithm = "brute", 
              leaf_size = 10, 
              metric = "minkowski", 
              p = 2, n_neighbors=5).fit(X_train, y_train)

# Prevendo y
y_pred = knn.predict(X_val)

# Separando as métricas-

Acurácia = accuracy_score(y_val, y_pred).round(2)
Precisão =  precision_score(y_val, y_pred).round(2)
Recall = recall_score(y_val, y_pred).round(2)
F1 = f1_score(y_val, y_pred, average = "macro").round(2)
ROC = roc_auc_score(y_val, y_pred).round(2)

# Organizando as métricas em um DataFrame
resultados_val = pd.DataFrame({"Acurácia":Acurácia,
                                "Precisão": Precisão,
                                "Recall": Recall,
                                "F1": F1,
                                "ROC": ROC}, index = [0])

resultados_val

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


Unnamed: 0,Acurácia,Precisão,Recall,F1,ROC
0,0.82,0.68,0.43,0.71,0.69


In [None]:
# KBins default
# NOTE(review): presumably the scores obtained with KBinsDiscretizer at its
# default settings — the discretisation step is not shown in this notebook; confirm.
# Built-in round() is used instead of the `.round` method so the cell works
# whether the metric returns a NumPy scalar or a plain Python float.
for rotulo, calcular in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
    ('Roc', roc_auc_score),
):
    print(rotulo, round(calcular(y_val, y_predict), 2))

Accuracy: 0.76
Precision: 0.47
Recall: 0.27
F1: 0.34
Roc 0.59


In [None]:
# Validation-set scores for the predictions currently held in `y_predict`.
# Built-in round() replaces the `.round` method, which plain Python floats do
# not have.
pontuações = {
    'Accuracy:': accuracy_score,
    'Precision:': precision_score,
    'Recall:': recall_score,
    'F1:': f1_score,
    'Roc': roc_auc_score,
}
for rotulo, calcular in pontuações.items():
    print(rotulo, round(calcular(y_val, y_predict), 2))

Accuracy: 0.76
Precision: 0.47
Recall: 0.27
F1: 0.34
Roc 0.59


In [None]:
# Validation-set scores (accuracy, F1, recall, ROC) for `y_predict`.
# Built-in round() works for NumPy scalars and plain Python floats alike,
# whereas the `.round` method exists only on the former.
print('Accuracy:', round(accuracy_score(y_val, y_predict), 2))
print('F1:', round(f1_score(y_val, y_predict), 2))
print('Recall:', round(recall_score(y_val, y_predict), 2))
print('Roc', round(roc_auc_score(y_val, y_predict), 2))

Accuracy: 0.76
F1: 0.34
Recall: 0.27
Roc 0.59
