# Random Forest

In [1]:
#Importando librearias
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from Resources.mlTracker import *
from Python.Style.styles import  *
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [2]:
#Desactivando wrnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
training = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet")
training_us = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet")
training_os = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/oversampled_shuffled.parquet")
testing1 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing1.parquet")
testing2 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing2.parquet")

In [4]:
#To delete columns
total_columns = training.columns
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber']
training_cols = list(filter(lambda x: x not in tdc, total_columns))
#Seleccionando columnas
training = training[training_cols]
training_us = training_us[training_cols]
training_os = training_os[training_cols]
testing1 = testing1[training_cols]
testing2 = testing2[training_cols]

In [5]:
#Separando etiquetas
#Training
y = training['Communicating']
X = training.drop('Communicating', axis=1)
#Training undersampled
y_us = training_us['Communicating']
X_us = training_us.drop('Communicating', axis=1)
#Training oversampled
y_os = training_os['Communicating']
X_os = training_os.drop('Communicating', axis=1)
#Test1
y_testing_1 = testing1['Communicating']
X_testing_1 = testing1.drop('Communicating', axis=1)
#Test2
y_testing_2 = testing2['Communicating']
X_testing_2 = testing2.drop('Communicating', axis=1)

In [None]:
#Seteando experimento
experiment_name = "ML"
mlflow.set_experiment(experiment_name)

In [7]:
#Parámetros de búsqueda
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 1000],
    'max_features': ['auto', 'sqrt', 'log2', 0.3],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 0.01],
    'min_samples_leaf': [1, 2, 4, 0.01],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}
#Métricas de interés
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score, average='macro')}

In [8]:
# Configuración del modelo y GridSearchCV
rf = RandomForestClassifier()
randomized_search = RandomizedSearchCV(
                            estimator= rf,
                            param_distributions= param_grid,
                            n_iter= 10,
                            cv= 3,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

## Encontrando hiperparámetros para Random Forest con df Training completo

In [None]:
#Ajustando para training
with mlflow.start_run(run_name="RF-Training"):
    rf1 = randomized_search.fit(X, y)
    #Mejor modelo
    best_model = rf1.best_estimator_
    #Guardando resultados
    mlflow.log_params(rf1.best_params_)
    mlflow.log_metrics({"best_score":rf1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(rf1, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = rf1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)


El error anterior es independiente del entrenamiento, fue causado debido a los tipos de datos en una columna particular. Por tal motivo se almacenará en el run que fue interrumpido, sin volver a entrenar los modelos

In [18]:
run_id = get_run_id(experiment_name, "RF-Training")
#Guardando resultados pendientes
with mlflow.start_run(run_id=run_id):
    #Guardando df con
    results = rf1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

## Encontrando hiperparámetros para Random Forest con df Training balanceado

In [19]:
#Ajustando para training undersampled
with mlflow.start_run(run_name="RF-Training-Undersampled"):
    rf2 = randomized_search.fit(X_us, y_us)
    #Mejor modelo
    best_model = rf2.best_estimator_
    #Guardando resultados
    mlflow.log_params(rf2.best_params_)
    mlflow.log_metrics({"best_score":rf2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(rf2, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = rf2.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf2.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf2.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf2.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf2.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[CV 1/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.801) F1: (test=0.801) total time=  10.9s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.798) F1: (test=0.798) total time=  10.4s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.794) F1: (test=0.793) total time=  10.5s
[CV 1/3] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; Accuracy: (test=0.973) F1: (test=0.973) total time=  10.8s
[CV 1/3] END bootstrap=False, criterion=entropy, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; Accuracy: (test=0.981) F1: (test=0.981) total time=  46.3

## Encontrando hiperparámetros para Random Forest con df Training balanceado con oversampling

In [10]:
# Configuración del modelo y GridSearchCV
rf = RandomForestClassifier()
randomized_search_3 = RandomizedSearchCV(
                            estimator= rf,
                            param_distributions= param_grid,
                            n_iter= 5,
                            cv= 3,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

In [11]:
#Ajustando para training undersampled
with mlflow.start_run(run_name="RF-Training-Oversampled"):
    #Guardando información de experimento
    mlflow.log_param("data","oversampled")
    mlflow.log_param("model","RandomForest")
    mlflow.log_param("dist_n_estimators","100, 200, 300, 400, 500, 1000")
    mlflow.log_param("dist_max_features","auto, sqrt, log2, 0.3")
    mlflow.log_param("dist_max_depth","None, 10, 20, 30, 40, 50")
    mlflow.log_param("dist_min_samples_split","2, 5, 10, 0.01")
    mlflow.log_param("dist_min_samples_leaf","1, 2, 4, 0.01")
    mlflow.log_param("dist_bootstrap","True, False")
    mlflow.log_param("dist_criterion","gini, entropy")
    mlflow.log_param("n_iter",5)
    rf3 = randomized_search_3.fit(X_os, y_os)
    #Mejor modelo
    best_model = rf3.best_estimator_
    #Guardando resultados
    mlflow.log_params(rf3.best_params_)
    mlflow.log_metrics({"best_score":rf3.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(rf3, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = rf3.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf3.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf3.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf3.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf3.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; Accuracy: (test=0.923) F1: (test=0.923) total time=32.2min
[CV 3/3] END bootstrap=False, criterion=entropy, max_depth=40, max_features=0.3, min_samples_leaf=1, min_samples_split=0.01, n_estimators=400; Accuracy: (test=0.919) F1: (test=0.919) total time=50.6min
[CV 2/3] END bootstrap=False, criterion=entropy, max_depth=40, max_features=0.3, min_samples_leaf=1, min_samples_split=0.01, n_estimators=400; Accuracy: (test=0.918) F1: (test=0.918) total time=50.7min
[CV 1/3] END bootstrap=True, criterion=entropy, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; Accuracy: (test=nan) F1: (test=nan) total time=   0.6s
[CV 2/3] END bootstrap=True, criterion=entropy, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400;