# Gradient Boosting Classifier

In [1]:
# Importing libraries
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from Resources.mlTracker import *
from Python.Style.styles import  *
from scipy.stats import uniform, randint
from mlflow.tracking import MlflowClient
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [2]:
#Desactivando wrnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the five parquet views used below, in the same order as the target names
training, training_us, training_os, testing1, testing2 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/oversampled_shuffled.parquet",
        "../../Data/DataMart/Views/TrainingViews/testing1.parquet",
        "../../Data/DataMart/Views/TrainingViews/testing2.parquet",
    )
)

In [4]:
# Columns that are dropped from every dataset
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber']
total_columns = training.columns
# Keep the original column order, minus the dropped ones
training_cols = [column for column in total_columns if column not in tdc]
# Apply the same column selection to all five frames
training, training_us, training_os = (
    training[training_cols],
    training_us[training_cols],
    training_os[training_cols],
)
testing1, testing2 = testing1[training_cols], testing2[training_cols]

In [5]:
# Split every dataset into features (X*) and the 'Communicating' label (y*)
def _split_features_target(frame, target='Communicating'):
    """Return (features, labels): `frame` without `target`, and the `target` column."""
    return frame.drop(columns=target), frame[target]

# Training
X, y = _split_features_target(training)
# Training, undersampled
X_us, y_us = _split_features_target(training_us)
# Training, oversampled
X_os, y_os = _split_features_target(training_os)
# Test set 1
X_testing_1, y_testing_1 = _split_features_target(testing1)
# Test set 2
X_testing_2, y_testing_2 = _split_features_target(testing2)

In [None]:
# Select the MLflow experiment that the runs below are logged under.
# NOTE(review): `mlflow` is not imported explicitly in the visible cells; presumably it
# comes from the wildcard import of Resources.mlTracker — confirm.
experiment_name = "ML"
mlflow.set_experiment(experiment_name)

In [7]:
# Search space sampled by the randomized hyperparameter search.
# scipy's uniform(loc, scale) spans [loc, loc + scale]; randint(low, high) excludes high.
param_dist = dict(
    n_estimators=randint(50, 500),        # number of trees
    max_depth=randint(3, 10),             # maximum depth of each tree
    learning_rate=uniform(0.01, 0.3),     # learning rate
    subsample=uniform(0.7, 0.3),          # fraction of samples used per tree
    min_samples_split=randint(2, 20),     # minimum samples needed to split a node
    min_samples_leaf=randint(1, 20),      # minimum samples required in a leaf
)
# Metrics computed for every candidate during cross-validation
accuracy_scorer = make_scorer(accuracy_score)
macro_f1_scorer = make_scorer(f1_score, average='macro')
scoring = {'Accuracy': accuracy_scorer, 'F1': macro_f1_scorer}

In [8]:
# Model and RandomizedSearchCV configuration.
# A fixed seed makes the sampled candidates and the subsampled boosting fits
# repeatable across kernel restarts.
RANDOM_STATE = 42
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
randomized_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring=scoring,
    refit='F1',  # with multi-metric scoring, refit names the metric that picks the best model
    return_train_score=False,
    n_jobs=4,
    random_state=RANDOM_STATE,
    verbose=3,
)

## Encontrando hiperparámetros para Gradient Boosting Classifier con conjunto de datos con submuestreo de la clase mayoritaria

In [11]:
# Randomized search on the undersampled training set, tracked as a single MLflow run
with mlflow.start_run(run_name="gbc_training_undersampled"):
    mlflow.log_param("model","GradientBoostingClassifier")
    mlflow.log_param("data","training_undersampled")
    # Record the search space as text (mirrors param_dist)
    mlflow.log_param("dist_n_estimators","randint(50, 500)")
    mlflow.log_param("dist_max_depth","randint(3, 10)")
    mlflow.log_param("dist_learning_rate","uniform(0.01, 0.3)")
    mlflow.log_param("dist_subsample","uniform(0.7, 0.3)")
    mlflow.log_param("dist_min_samples_split","randint(2, 20)")
    mlflow.log_param("dist_min_samples_leaf","randint(1, 20)")
    # Logged for parity with the oversampled run, which also records its n_iter
    mlflow.log_param("dist_n_iter", randomized_search.n_iter)
    # Fit the search
    gbc1 = randomized_search.fit(X_us, y_us)
    # Best estimator, refit on the F1 metric
    best_model = gbc1.best_estimator_
    # Log the winning hyperparameters, its CV score and both fitted objects
    mlflow.log_params(gbc1.best_params_)
    mlflow.log_metrics({"best_score":gbc1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(gbc1, "RandomizedSearchCV")
    # Persist the full cv_results_ table as a parquet artifact
    results = pd.DataFrame(gbc1.cv_results_)
    # Reserve a temp path and close the handle before writing to it by name
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        temp_path = temp.name
    try:
        try:
            results.to_parquet(temp_path)
        except Exception:
            # cv_results_ can hold object columns that fail to serialise;
            # fall back to an all-string table and write that instead
            results = results.astype(str)
            results.to_parquet(temp_path)
        # NOTE(review): the second argument is an artifact *directory*, so the file is
        # stored under "results/results.parquet/<temp name>"; kept unchanged on purpose.
        mlflow.log_artifact(temp_path,"results/results.parquet")
    finally:
        # Always remove the temp file, even when writing or logging fails
        os.remove(temp_path)
    # Score both test sets; predict once per set and reuse for both metrics
    pred_test1 = gbc1.predict(X_testing_1)
    pred_test2 = gbc1.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END learning_rate=0.03818087739594792, max_depth=9, min_samples_leaf=10, min_samples_split=9, n_estimators=259, subsample=0.8093923083779435; Accuracy: (test=0.984) F1: (test=0.984) total time= 4.1min
[CV 2/3] END learning_rate=0.23598549696520182, max_depth=6, min_samples_leaf=11, min_samples_split=17, n_estimators=489, subsample=0.7037728112408794; Accuracy: (test=0.991) F1: (test=0.991) total time= 4.7min
[CV 1/3] END learning_rate=0.23598549696520182, max_depth=6, min_samples_leaf=11, min_samples_split=17, n_estimators=489, subsample=0.7037728112408794; Accuracy: (test=0.990) F1: (test=0.990) total time= 4.7min
[CV 3/3] END learning_rate=0.23598549696520182, max_depth=6, min_samples_leaf=11, min_samples_split=17, n_estimators=489, subsample=0.7037728112408794; Accuracy: (test=0.991) F1: (test=0.991) total time= 4.7min
[CV 2/3] END learning_rate=0.11532514955824168, max_depth=4, min_samples_leaf=6, min_samples_spl

## Encontrando hiperparámetros para Gradient Boosting Classifier con conjunto de datos con sobremuestreo de la clase minoritaria - exp3

In [9]:
# Model and RandomizedSearchCV configuration for the oversampled data (fewer iterations).
# A fixed seed makes the sampled candidates and the subsampled boosting fits
# repeatable across kernel restarts.
RANDOM_STATE = 42
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
randomized_search_2 = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring=scoring,
    refit='F1',  # with multi-metric scoring, refit names the metric that picks the best model
    return_train_score=False,
    n_jobs=4,
    random_state=RANDOM_STATE,
    verbose=3,
)

In [10]:
# Randomized search on the oversampled training set, tracked as a single MLflow run
with mlflow.start_run(run_name="gbc_training_oversampled"):
    mlflow.log_param("model","GradientBoostingClassifier")
    mlflow.log_param("data","training_oversampled")
    # Record the search space as text (mirrors param_dist)
    mlflow.log_param("dist_n_estimators","randint(50, 500)")
    mlflow.log_param("dist_max_depth","randint(3, 10)")
    mlflow.log_param("dist_learning_rate","uniform(0.01, 0.3)")
    mlflow.log_param("dist_subsample","uniform(0.7, 0.3)")
    mlflow.log_param("dist_min_samples_split","randint(2, 20)")
    mlflow.log_param("dist_min_samples_leaf","randint(1, 20)")
    mlflow.log_param("dist_n_iter",5)
    # Fit the search
    gbc2 = randomized_search_2.fit(X_os, y_os)
    # Best estimator, refit on the F1 metric
    best_model = gbc2.best_estimator_
    # Log the winning hyperparameters, its CV score and both fitted objects
    mlflow.log_params(gbc2.best_params_)
    mlflow.log_metrics({"best_score":gbc2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(gbc2, "RandomizedSearchCV")
    # Persist the full cv_results_ table as a parquet artifact
    results = pd.DataFrame(gbc2.cv_results_)
    # Reserve a temp path and close the handle before writing to it by name
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        temp_path = temp.name
    try:
        try:
            results.to_parquet(temp_path)
        except Exception:
            # The original fallback converted to strings but never wrote the file again,
            # so an empty/partial parquet was logged; write the converted table here
            results = results.astype(str)
            results.to_parquet(temp_path)
        # NOTE(review): the second argument is an artifact *directory*, so the file is
        # stored under "results/results.parquet/<temp name>"; kept unchanged on purpose.
        mlflow.log_artifact(temp_path,"results/results.parquet")
    finally:
        # Always remove the temp file, even when writing or logging fails
        os.remove(temp_path)
    # Score both test sets; predict once per set and reuse for both metrics
    pred_test1 = gbc2.predict(X_testing_1)
    pred_test2 = gbc2.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END learning_rate=0.11293338338643794, max_depth=6, min_samples_leaf=13, min_samples_split=19, n_estimators=84, subsample=0.7812698247892147; Accuracy: (test=0.965) F1: (test=0.965) total time=20.1min
[CV 2/3] END learning_rate=0.11293338338643794, max_depth=6, min_samples_leaf=13, min_samples_split=19, n_estimators=84, subsample=0.7812698247892147; Accuracy: (test=0.967) F1: (test=0.967) total time=20.1min
[CV 3/3] END learning_rate=0.11293338338643794, max_depth=6, min_samples_leaf=13, min_samples_split=19, n_estimators=84, subsample=0.7812698247892147; Accuracy: (test=0.965) F1: (test=0.965) total time=20.1min
[CV 1/3] END learning_rate=0.07853035947338263, max_depth=4, min_samples_leaf=10, min_samples_split=14, n_estimators=167, subsample=0.9209126637387717; Accuracy: (test=0.932) F1: (test=0.932) total time=29.7min
[CV 3/3] END learning_rate=0.07853035947338263, max_depth=4, min_samples_leaf=10, min_samples_split

El error anterior es independiente del entrenamiento: fue causado por los tipos de datos de una columna en particular. Por tal motivo, los resultados pendientes se almacenarán en el run que fue interrumpido, sin volver a entrenar los modelos.

In [18]:
# Re-open the interrupted run and log what was still pending, without retraining.
# NOTE(review): `rf1` and the run name "RF-Training" are not defined anywhere in the
# visible notebook (the searches above are gbc1/gbc2) — presumably leftovers from a
# Random Forest notebook; confirm before running on a fresh kernel.
run_id = get_run_id(experiment_name, "RF-Training")
with mlflow.start_run(run_id=run_id):
    # Persist the full cv_results_ table as a parquet artifact
    results = pd.DataFrame(rf1.cv_results_)
    # Reserve a temp path and close the handle before writing to it by name
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        temp_path = temp.name
    try:
        try:
            results.to_parquet(temp_path)
        except Exception:
            # The original fallback converted to strings but never wrote the file again,
            # so an empty/partial parquet was logged; write the converted table here
            results = results.astype(str)
            results.to_parquet(temp_path)
        # NOTE(review): the second argument is an artifact *directory*, so the file is
        # stored under "results/results.parquet/<temp name>"; kept unchanged on purpose.
        mlflow.log_artifact(temp_path,"results/results.parquet")
    finally:
        # Always remove the temp file, even when writing or logging fails
        os.remove(temp_path)
    # Score both test sets; predict once per set and reuse for both metrics
    pred_test1 = rf1.predict(X_testing_1)
    pred_test2 = rf1.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})

## Encontrando hiperparámetros para Random Forest con df Training balanceado

In [19]:
# Randomized search on the undersampled training set, tracked as a single MLflow run.
# NOTE(review): the run is named "RF-Training-Undersampled" and the captured output shows
# Random Forest parameters, but `randomized_search` in this notebook wraps a
# GradientBoostingClassifier — this cell looks like a leftover; confirm it is still wanted.
with mlflow.start_run(run_name="RF-Training-Undersampled"):
    rf2 = randomized_search.fit(X_us, y_us)
    # Best estimator, refit on the F1 metric
    best_model = rf2.best_estimator_
    # Log the winning hyperparameters, its CV score and both fitted objects
    mlflow.log_params(rf2.best_params_)
    mlflow.log_metrics({"best_score":rf2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(rf2, "RandomizedSearchCV")
    # Persist the full cv_results_ table as a parquet artifact
    results = pd.DataFrame(rf2.cv_results_)
    # Reserve a temp path and close the handle before writing to it by name
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        temp_path = temp.name
    try:
        try:
            results.to_parquet(temp_path)
        except Exception:
            # The original fallback converted to strings but never wrote the file again,
            # so an empty/partial parquet was logged; write the converted table here
            results = results.astype(str)
            results.to_parquet(temp_path)
        # NOTE(review): the second argument is an artifact *directory*, so the file is
        # stored under "results/results.parquet/<temp name>"; kept unchanged on purpose.
        mlflow.log_artifact(temp_path,"results/results.parquet")
    finally:
        # Always remove the temp file, even when writing or logging fails
        os.remove(temp_path)
    # Score both test sets; predict once per set and reuse for both metrics
    pred_test1 = rf2.predict(X_testing_1)
    pred_test2 = rf2.predict(X_testing_2)
    f1_test1 = f1_score(y_testing_1, pred_test1, average='macro')
    f1_test2 = f1_score(y_testing_2, pred_test2, average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, pred_test1)
    accuracy_test2 = accuracy_score(y_testing_2, pred_test2)
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[CV 1/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.801) F1: (test=0.801) total time=  10.9s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.798) F1: (test=0.798) total time=  10.4s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.794) F1: (test=0.793) total time=  10.5s
[CV 1/3] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; Accuracy: (test=0.973) F1: (test=0.973) total time=  10.8s
[CV 1/3] END bootstrap=False, criterion=entropy, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; Accuracy: (test=0.981) F1: (test=0.981) total time=  46.3