# K Nearest Neighbors (KNN) - 3

In [1]:
#Importando librearias
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from Resources.mlTracker import *
from Python.Style.styles import  *
from mlflow.tracking import MlflowClient
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [2]:
#Desactivando wrnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
training = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet")
training_us = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet")
training_os = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/oversampled_shuffled.parquet")
testing1 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing1.parquet")
testing2 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing2.parquet")

In [4]:
#To delete columns
total_columns = training.columns
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber','Antenna']
training_cols = list(filter(lambda x: x not in tdc, total_columns))
#Grouping
group_train = training['serialNumber']
group_train_us = training_us['serialNumber']
group_train_os = training_os['serialNumber']
#Seleccionando columnas
training = training[training_cols]
training_us = training_us[training_cols]
training_os = training_os[training_cols]
testing1 = testing1[training_cols]
testing2 = testing2[training_cols]

In [5]:
#Separando etiquetas
#Training
y = training['Communicating']
X = training.drop('Communicating', axis=1)
#Training undersampled
y_us = training_us['Communicating']
X_us = training_us.drop('Communicating', axis=1)
#Training oversampled
y_os = training_os['Communicating']
X_os = training_os.drop('Communicating', axis=1)
#Test1
y_testing_1 = testing1['Communicating']
X_testing_1 = testing1.drop('Communicating', axis=1)
#Test2
y_testing_2 = testing2['Communicating']
X_testing_2 = testing2.drop('Communicating', axis=1)

In [None]:
#Seteando experimento
experiment_name = "ML-3"
mlflow.set_experiment(experiment_name)

In [7]:
#GroupKFold
group_kfold = GroupKFold(n_splits=3)
#Creando group_kfolds
group_kfolds = group_kfold.split(training, training['Communicating'], group_train)
group_kfolds_us = group_kfold.split(training_us, training_us['Communicating'], group_train_us)
group_kfolds_os = group_kfold.split(training_os, training_os['Communicating'], group_train_os)

In [8]:
#Parámetros de búsqueda
param_dist = {
    'n_neighbors': [1,2,3] + list(range(4,20,2)),
    'weights': ['uniform', 'distance']
}
#Métricas de interés
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score, average='weighted')}

In [9]:
# Configuración del modelo y GridSearchCV
knn = KNeighborsClassifier()
knn_randomized_search = RandomizedSearchCV(
                            estimator= knn,
                            param_distributions= param_dist,
                            n_iter= 10,
                            cv= group_kfold,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

## Encontrando hiperparámetros para KNN con df Training Undersampled

In [10]:
#Ajustando para training
with mlflow.start_run(run_name="KNN-Undersampled"):
    mlflow.log_param("model","KNN")
    mlflow.log_param("data","Undersampled")
    #Agregando información de param_dist
    mlflow.log_param("dist_K","1,2,3,4,6,8,10,12,14,16,18")
    mlflow.log_param("dist_weights",'uniform, distance')
    mlflow.log_param("n_iter",10)
    #Entrenando modelo
    knn1 = knn_randomized_search.fit(X_us, y_us, groups= group_train_us)
    #Mejor modelo
    best_model = knn1.best_estimator_
    #Guardando resultados
    mlflow.log_params(knn1.best_params_)
    mlflow.log_metrics({"best_score":knn1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(knn1, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = knn1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, knn1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, knn1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, knn1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, knn1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END n_neighbors=2, weights=distance; Accuracy: (test=0.638) F1: (test=0.623) total time=   4.1s
[CV 1/3] END n_neighbors=6, weights=distance; Accuracy: (test=0.660) F1: (test=0.652) total time=   4.1s
[CV 2/3] END n_neighbors=6, weights=distance; Accuracy: (test=0.663) F1: (test=0.653) total time=   4.1s
[CV 3/3] END n_neighbors=6, weights=distance; Accuracy: (test=0.660) F1: (test=0.650) total time=   4.2s
[CV 1/3] END n_neighbors=3, weights=distance; Accuracy: (test=0.654) F1: (test=0.644) total time=   3.4s
[CV 2/3] END n_neighbors=2, weights=distance; Accuracy: (test=0.661) F1: (test=0.647) total time=   3.6s
[CV 2/3] END n_neighbors=3, weights=distance; Accuracy: (test=0.659) F1: (test=0.649) total time=   3.6s
[CV 3/3] END n_neighbors=2, weights=distance; Accuracy: (test=0.636) F1: (test=0.618) total time=   3.8s
[CV 1/3] END n_neighbors=4, weights=distance; Accuracy: (test=0.653) F1: (test=0.643) total time=  

## Encontrando hiperparámetros para KNN con df Training Oversampled

In [11]:
# Configuración del modelo y GridSearchCV
knn = KNeighborsClassifier()
knn_randomized_search_2 = RandomizedSearchCV(
                            estimator= knn,
                            param_distributions= param_dist,
                            n_iter= 5,
                            cv= group_kfold,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

In [12]:
#Ajustando para training
with mlflow.start_run(run_name="KNN-Oversampled"):
    mlflow.log_param("model","KNN")
    mlflow.log_param("data","Oversampled")
    #Agregando información de param_dist
    mlflow.log_param("dist_K","1,2,3,4,6,8,10,12,14,16,18")
    mlflow.log_param("dist_weights",'uniform, distance')
    mlflow.log_param("n_iter",5)
    #Entrenando modelo
    knn2 = knn_randomized_search_2.fit(X_os, y_os, groups= group_train_os)
    #Mejor modelo
    best_model = knn2.best_estimator_
    #Guardando resultados
    mlflow.log_params(knn2.best_params_)
    mlflow.log_metrics({"best_score":knn2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(knn2, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = knn2.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, knn2.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, knn2.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, knn2.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, knn2.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END n_neighbors=6, weights=uniform; Accuracy: (test=0.624) F1: (test=0.580) total time=28.4min
[CV 1/3] END n_neighbors=18, weights=uniform; Accuracy: (test=0.651) F1: (test=0.627) total time=28.4min
[CV 2/3] END n_neighbors=6, weights=uniform; Accuracy: (test=0.638) F1: (test=0.599) total time=28.4min
[CV 3/3] END n_neighbors=6, weights=uniform; Accuracy: (test=0.592) F1: (test=0.533) total time=36.8min
[CV 1/3] END n_neighbors=16, weights=distance; Accuracy: (test=0.645) F1: (test=0.615) total time=28.0min
[CV 3/3] END n_neighbors=18, weights=uniform; Accuracy: (test=0.610) F1: (test=0.572) total time=28.1min
[CV 2/3] END n_neighbors=18, weights=uniform; Accuracy: (test=0.666) F1: (test=0.646) total time=28.1min
[CV 2/3] END n_neighbors=16, weights=distance; Accuracy: (test=0.659) F1: (test=0.634) total time=32.6min
[CV 3/3] END n_neighbors=16, weights=distance; Accuracy: (test=0.609) F1: (test=0.565) total time=26.

In [11]:
#Entrenando con k = 5
knn_temp = KNeighborsClassifier(n_neighbors=5)
knn_temp.fit(X, y)
#Calculando métricas
f1_test1 = f1_score(y_testing_1, knn_temp.predict(X_testing_1))
f1_test2 = f1_score(y_testing_2, knn_temp.predict(X_testing_2))
accuracy_test1 = accuracy_score(y_testing_1, knn_temp.predict(X_testing_1), average='macro')
accuracy_test2 = accuracy_score(y_testing_2, knn_temp.predict(X_testing_2), average='macro')
#Imprimiendo resultados
print("Resultaods con k = 5")
print("F1 Score Test1: ", f1_test1)
print("F1 Score Test2: ", f1_test2)
print("Accuracy Test1: ", accuracy_test1)
print("Accuracy Test2: ", accuracy_test2)


Resultaods con k = 5
F1 Score Test1:  0.988315122292647
F1 Score Test2:  0.9700495212941566
Accuracy Test1:  0.9774420568640331
Accuracy Test2:  0.9422268639987773


## Encontrando hiperparámetros para KNN con df Training balanceado

In [12]:
#Ajustando para training undersampled
with mlflow.start_run(run_name="KNN-Training-Undersampled"):
    knn2 = grid_search.fit(X_us, y_us)
    #Mejor modelo
    best_model = knn2.best_estimator_
    #Guardando resultados
    mlflow.log_params(knn2.best_params_)
    mlflow.log_metrics({"best_score":knn2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(knn2, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = knn2.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, knn2.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, knn2.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, knn2.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, knn2.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.0s
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.1s
[CV] END ....................n_neighbors=3, weights=distance; total time=   7.1s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.1s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.2s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.2s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   7.2s
[CV] END ....................n_neighbors=1, weights=distance; total time=   7.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   6.9s
[CV] END .....................n_neighbors=6, weights=uniform; total time=   7.2s
[CV] END .....................n_neighbors=1, wei