# Segundo enfoque ML

Según la interpretación de los resultados, el enfoque con validación cruzada produjo sobreajuste: el modelo tiene un desempeño excelente en el conjunto de prueba 1, pero falla rotundamente en el conjunto de prueba 2. Por tal motivo, se intentará seleccionar el modelo con mejor desempeño en un conjunto de datos de validación, en vez de recurrir a la validación cruzada.

In [2]:
# Importing libraries
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from Resources.mlTracker import *
from Python.Style.styles import  *
from scipy.stats import uniform, randint
from mlflow.tracking import MlflowClient
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [3]:
# Disable warnings: install a blanket "ignore" filter for the rest of the session.
# NOTE(review): this hides warnings of every category, not only the noisy ones.
import warnings
warnings.simplefilter('ignore')

In [13]:
# Read the three training views (plain, undersampled, oversampled) and the two
# test sets from the data mart, in that order.
training, training_us, training_os, testing1, testing2 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/oversampled_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/testing1.parquet",
        "../../Data/DataMart/Views/TrainingViews/testing2.parquet",
    )
)

In [14]:
# Write five randomly drawn training rows to a CSV file in the working directory.
training.sample(n=5).to_csv(path_or_buf="training_sample.csv", index=False)

In [16]:
# Show the distinct values stored in the 'Antena' column.
training.loc[:, 'Antena'].unique()

array(['0.0', '2.0', '1.0'], dtype=object)

In [17]:
# Columns to delete: everything listed here is excluded from the modelling frames.
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber','Antena']
total_columns = training.columns
training_cols = [column for column in total_columns if column not in tdc]

# Grouping: capture the serial numbers before the column is dropped below;
# they are later passed as `groups` to the group-aware cross-validation.
group_train, group_train_us, group_train_os = (
    training['serialNumber'],
    training_us['serialNumber'],
    training_os['serialNumber'],
)

# Keep the same retained columns in every frame.
# NOTE: the frames are rebound to their filtered versions, so re-running this
# cell without reloading the data fails on the 'serialNumber' lookups above.
training, training_us, training_os = (
    training[training_cols],
    training_us[training_cols],
    training_os[training_cols],
)
testing1, testing2 = testing1[training_cols], testing2[training_cols]

In [18]:
# Split every frame into its feature matrix and its 'Communicating' label column.
# Training
X, y = training.drop(columns='Communicating'), training['Communicating']
# Training, undersampled
X_us, y_us = training_us.drop(columns='Communicating'), training_us['Communicating']
# Training, oversampled
X_os, y_os = training_os.drop(columns='Communicating'), training_os['Communicating']
# Test set 1
X_testing_1, y_testing_1 = testing1.drop(columns='Communicating'), testing1['Communicating']
# Test set 2
X_testing_2, y_testing_2 = testing2.drop(columns='Communicating'), testing2['Communicating']

# GBoost training_us

In [None]:
# Select the MLflow experiment that the runs below are logged to.
experiment_name = "ML-2"
mlflow.set_experiment(experiment_name=experiment_name)

In [7]:
# Three-fold splitter that uses the serial-number series as groups.
group_kfold = GroupKFold(n_splits=3)
# Fold-index generators, one per training view.
# NOTE(review): the searches visible in this section pass `group_kfold` itself
# as `cv`, so these generators do not appear to be consumed here - confirm
# whether later cells still need them.
group_kfolds = group_kfold.split(X=training, y=training['Communicating'], groups=group_train)
group_kfolds_us = group_kfold.split(X=training_us, y=training_us['Communicating'], groups=group_train_us)
group_kfolds_os = group_kfold.split(X=training_os, y=training_os['Communicating'], groups=group_train_os)

In [8]:
# Gradient boosting search space shared by the randomized searches.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(50, 500),       # integers in [50, 500)
    max_depth=randint(3, 10),            # integers in [3, 10)
    min_samples_split=randint(2, 10),    # integers in [2, 10)
    min_samples_leaf=randint(1, 10),     # integers in [1, 10)
    learning_rate=uniform(0.01, 0.5),    # floats in [0.01, 0.51]
    subsample=uniform(0.5, 0.5),         # floats in [0.5, 1.0]
)
# Metrics of interest: plain accuracy and macro-averaged F1.
scoring = dict(
    Accuracy=make_scorer(accuracy_score),
    F1=make_scorer(f1_score, average='macro'),
)

In [9]:
# Model and RandomizedSearchCV configuration for the undersampled training view.
# Both the estimator and the search get a fixed random_state: the candidate
# sampling and the subsampled boosting stages are stochastic, so without a seed
# the run cannot be reproduced.
gb_model = GradientBoostingClassifier(random_state=42)
gb_randomized_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=10,                    # number of sampled candidates
    cv=group_kfold,               # group-aware folds; groups are supplied at fit time
    scoring=scoring,
    refit='F1',                   # the final model is refit using the best macro-F1 candidate
    return_train_score=False,
    n_jobs=4,
    random_state=42,
    verbose=3,
)

In [10]:
# Fit the randomized search on the undersampled training view and record its
# configuration, the selected model, the CV table and the test metrics in one
# MLflow run.
with mlflow.start_run(run_name="gkf_training_us"):
    mlflow.log_param("model","GradientBoostingClassifier")
    mlflow.log_param("data","training_undersampled")
    # Textual description of the sampled distributions (kept in sync with param_dist by hand)
    mlflow.log_param("dist_n_estimators","randint(50, 500)")
    mlflow.log_param("dist_max_depth","randint(3, 10)")
    mlflow.log_param("dist_min_samples_split","randint(2, 10)")
    mlflow.log_param("dist_min_samples_leaf","randint(1, 10)")
    mlflow.log_param("dist_learning_rate","uniform(0.01, 0.5)")
    mlflow.log_param("dist_subsample","uniform(0.5, 0.5)")
    # Read n_iter from the search object so the logged value cannot drift from the real one
    mlflow.log_param("n_iter", gb_randomized_search.n_iter)
    # Train: the serial numbers are passed as groups for the group-aware folds
    gbc1 = gb_randomized_search.fit(X_us, y_us, groups=group_train_us)
    # Best model (refit by the search)
    best_model = gbc1.best_estimator_
    # Log the winning hyper-parameters, its CV score and both fitted objects
    mlflow.log_params(gbc1.best_params_)
    mlflow.log_metrics({"best_score":gbc1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(gbc1, "RandomizedSearchCV")
    # Store the full cross-validation table as an artifact.
    # log_artifact's second argument is the destination *directory*, so the
    # file is written with its final name and logged under "results/", giving
    # the artifact "results/results.parquet". The temporary directory is
    # removed even if writing or logging raises.
    results = pd.DataFrame(gbc1.cv_results_)
    with tempfile.TemporaryDirectory() as tmp_dir:
        results_path = os.path.join(tmp_dir, "results.parquet")
        results.to_parquet(results_path)
        mlflow.log_artifact(results_path, "results")
    # Evaluate on both test sets; each set is predicted once and reused for both metrics
    pred_test1 = gbc1.predict(X_testing_1)
    pred_test2 = gbc1.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END learning_rate=0.15274512893576042, max_depth=9, min_samples_leaf=5, min_samples_split=2, n_estimators=255, subsample=0.7630178812459444; Accuracy: (test=0.574) F1: (test=0.485) total time= 3.7min
[CV 1/3] END learning_rate=0.1122456587680226, max_depth=9, min_samples_leaf=4, min_samples_split=8, n_estimators=325, subsample=0.938883983735987; Accuracy: (test=0.576) F1: (test=0.489) total time= 5.6min
[CV 2/3] END learning_rate=0.1122456587680226, max_depth=9, min_samples_leaf=4, min_samples_split=8, n_estimators=325, subsample=0.938883983735987; Accuracy: (test=0.576) F1: (test=0.487) total time= 5.6min
[CV 3/3] END learning_rate=0.1122456587680226, max_depth=9, min_samples_leaf=4, min_samples_split=8, n_estimators=325, subsample=0.938883983735987; Accuracy: (test=0.609) F1: (test=0.541) total time= 5.6min
[CV 2/3] END learning_rate=0.15274512893576042, max_depth=9, min_samples_leaf=5, min_samples_split=2, n_estim

# GBoost training_os

In [12]:
# GBoost training_os
# Model and RandomizedSearchCV configuration for the oversampled training view.
# Both the estimator and the search get a fixed random_state: the candidate
# sampling and the subsampled boosting stages are stochastic, so without a seed
# the run cannot be reproduced.
gb_model = GradientBoostingClassifier(random_state=42)
gb_randomized_search_2 = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=5,                     # number of sampled candidates
    cv=group_kfold,               # group-aware folds; groups are supplied at fit time
    scoring=scoring,
    refit='F1',                   # the final model is refit using the best macro-F1 candidate
    return_train_score=False,
    n_jobs=4,
    random_state=42,
    verbose=3,
)


In [14]:
# Fit the randomized search on the oversampled training view and record its
# configuration, the selected model, the CV table and the test metrics in one
# MLflow run.
with mlflow.start_run(run_name="gkf_training_os"):
    mlflow.log_param("model","GradientBoostingClassifier")
    mlflow.log_param("data","training_oversampled")
    # Textual description of the sampled distributions (kept in sync with param_dist by hand)
    mlflow.log_param("dist_n_estimators","randint(50, 500)")
    mlflow.log_param("dist_max_depth","randint(3, 10)")
    mlflow.log_param("dist_min_samples_split","randint(2, 10)")
    mlflow.log_param("dist_min_samples_leaf","randint(1, 10)")
    mlflow.log_param("dist_learning_rate","uniform(0.01, 0.5)")
    mlflow.log_param("dist_subsample","uniform(0.5, 0.5)")
    # Read n_iter from the search object so the logged value cannot drift from the real one
    mlflow.log_param("n_iter", gb_randomized_search_2.n_iter)
    # Train: the serial numbers are passed as groups for the group-aware folds
    gbc2 = gb_randomized_search_2.fit(X_os, y_os, groups=group_train_os)
    # Best model (refit by the search)
    best_model = gbc2.best_estimator_
    # Log the winning hyper-parameters, its CV score and both fitted objects
    mlflow.log_params(gbc2.best_params_)
    mlflow.log_metrics({"best_score":gbc2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(gbc2, "RandomizedSearchCV")
    # Store the full cross-validation table as an artifact.
    # log_artifact's second argument is the destination *directory*, so the
    # file is written with its final name and logged under "results/", giving
    # the artifact "results/results.parquet". The temporary directory is
    # removed even if writing or logging raises.
    results = pd.DataFrame(gbc2.cv_results_)
    with tempfile.TemporaryDirectory() as tmp_dir:
        results_path = os.path.join(tmp_dir, "results.parquet")
        results.to_parquet(results_path)
        mlflow.log_artifact(results_path, "results")
    # Evaluate on both test sets; each set is predicted once and reused for both metrics
    pred_test1 = gbc2.predict(X_testing_1)
    pred_test2 = gbc2.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 2/3] END learning_rate=0.22143479728043058, max_depth=5, min_samples_leaf=2, min_samples_split=7, n_estimators=339, subsample=0.553720302291829; Accuracy: (test=0.576) F1: (test=0.487) total time=58.0min
[CV 1/3] END learning_rate=0.22143479728043058, max_depth=5, min_samples_leaf=2, min_samples_split=7, n_estimators=339, subsample=0.553720302291829; Accuracy: (test=0.586) F1: (test=0.503) total time=58.2min
[CV 3/3] END learning_rate=0.22143479728043058, max_depth=5, min_samples_leaf=2, min_samples_split=7, n_estimators=339, subsample=0.553720302291829; Accuracy: (test=0.577) F1: (test=0.490) total time=58.3min
[CV 1/3] END learning_rate=0.3606667582493658, max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=55, subsample=0.5784539073172077; Accuracy: (test=0.596) F1: (test=0.520) total time=19.9min
[CV 2/3] END learning_rate=0.3606667582493658, max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estima