# K Nearest Neighbors (KNN)

In [1]:
#Importando librearias
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from Resources.mlTracker import *
from Python.Style.styles import  *
from mlflow.tracking import MlflowClient
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [2]:
#Desactivando wrnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
training = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet")
training_us = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet")
testing1 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing1.parquet")
testing2 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing2.parquet")

In [4]:
#To delete columns
total_columns = training.columns
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber']
training_cols = list(filter(lambda x: x not in tdc, total_columns))
#Seleccionando columnas
training = training[training_cols]
training_us = training_us[training_cols]
testing1 = testing1[training_cols]
testing2 = testing2[training_cols]

In [5]:
#Separando etiquetas
#Training
y = training['Communicating']
X = training.drop('Communicating', axis=1)
#Training undersampled
y_us = training_us['Communicating']
X_us = training_us.drop('Communicating', axis=1)
#Test1
y_testing_1 = testing1['Communicating']
X_testing_1 = testing1.drop('Communicating', axis=1)
#Test2
y_testing_2 = testing2['Communicating']
X_testing_2 = testing2.drop('Communicating', axis=1)

In [None]:
#Seteando experimento
experiment_name = "ML"
mlflow.set_experiment(experiment_name)

In [7]:
#Parámetros de búsqueda
param_grid = {
    'n_neighbors': [1,2,3] + list(range(4,20,2)),
    'weights': ['uniform', 'distance']
}
#Métricas de interés
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score, average='macro')}

In [8]:
# Configuración del modelo y GridSearchCV
knn = KNeighborsClassifier()
grid_search = RandomizedSearchCV(
                            estimator= knn,
                            param_distributions= param_grid,
                            n_iter= 10,
                            cv= 3,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= -1,
                            verbose= 2)

## Encontrando hiperparámetros para KNN con df Training completo

In [9]:
#Ajustando para training
with mlflow.start_run(run_name="KNN-Training"):
    knn1 = grid_search.fit(X, y)
    #Mejor modelo
    best_model = knn1.best_estimator_
    #Guardando resultados
    mlflow.log_params(knn1.best_params_)
    mlflow.log_metrics({"best_score":knn1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(knn1, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = knn1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, knn1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, knn1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, knn1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, knn1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ....................n_neighbors=1, weights=distance; total time=44.4min
[CV] END ....................n_neighbors=1, weights=distance; total time=44.5min
[CV] END ....................n_neighbors=4, weights=distance; total time=44.5min
[CV] END ....................n_neighbors=1, weights=distance; total time=44.5min
[CV] END ....................n_neighbors=4, weights=distance; total time=44.5min
[CV] END ....................n_neighbors=4, weights=distance; total time=44.5min
[CV] END .....................n_neighbors=6, weights=uniform; total time=44.5min
[CV] END .....................n_neighbors=6, weights=uniform; total time=44.6min
[CV] END .....................n_neighbors=6, weights=uniform; total time=15.7min
[CV] END .....................n_neighbors=4, weights=uniform; total time=15.7min
[CV] END ....................n_neighbors=10, weights=uniform; total time=15.7min
[CV] END ....................n_neighbors=10, wei

In [11]:
#Entrenando con k = 5
knn_temp = KNeighborsClassifier(n_neighbors=5)
knn_temp.fit(X, y)
#Calculando métricas
f1_test1 = f1_score(y_testing_1, knn_temp.predict(X_testing_1))
f1_test2 = f1_score(y_testing_2, knn_temp.predict(X_testing_2))
accuracy_test1 = accuracy_score(y_testing_1, knn_temp.predict(X_testing_1), average='macro')
accuracy_test2 = accuracy_score(y_testing_2, knn_temp.predict(X_testing_2), average='macro')
#Imprimiendo resultados
print("Resultaods con k = 5")
print("F1 Score Test1: ", f1_test1)
print("F1 Score Test2: ", f1_test2)
print("Accuracy Test1: ", accuracy_test1)
print("Accuracy Test2: ", accuracy_test2)


Resultaods con k = 5
F1 Score Test1:  0.988315122292647
F1 Score Test2:  0.9700495212941566
Accuracy Test1:  0.9774420568640331
Accuracy Test2:  0.9422268639987773


## Encontrando hiperparámetros para KNN con df Training balanceado

In [12]:
#Ajustando para training undersampled
with mlflow.start_run(run_name="KNN-Training-Undersampled"):
    knn2 = grid_search.fit(X_us, y_us)
    #Mejor modelo
    best_model = knn2.best_estimator_
    #Guardando resultados
    mlflow.log_params(knn2.best_params_)
    mlflow.log_metrics({"best_score":knn2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(knn2, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = knn2.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, knn2.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, knn2.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, knn2.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, knn2.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.0s
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.1s
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.1s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.1s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.2s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.2s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.2s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   6.9s
[CV] END .....................n_neighbors=6, weights=uniform; total time=   7.2s
[CV] END .....................n_neighbors=1, wei