# Support Vector Machine (SVM|)

In [1]:
#Importando librearias
import sys
import os
import json
import tempfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append('../')
sys.path.append('../../')
from sklearn.svm import SVC
from Resources.mlTracker import *
from Python.Style.styles import  *
from scipy.stats import expon, randint
from mlflow.tracking import MlflowClient
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score

In [2]:
#Desactivando wrnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
training = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/training_shuffled.parquet")
training_us = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/undersampled_shuffled.parquet")
testing1 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing1.parquet")
testing2 = pd.read_parquet("../../Data/DataMart/Views/TrainingViews/testing2.parquet")

In [4]:
#To delete columns
total_columns = training.columns
tdc = ['serialNumber','serialNumber_neighbor','fixed_path','FE-Comments','Conjunto','PSNumber']
training_cols = list(filter(lambda x: x not in tdc, total_columns))
#Seleccionando columnas
training = training[training_cols]
training_us = training_us[training_cols]
testing1 = testing1[training_cols]
testing2 = testing2[training_cols]

In [5]:
#Separando etiquetas
#Training
y = training['Communicating']
X = training.drop('Communicating', axis=1)
#Training undersampled
y_us = training_us['Communicating']
X_us = training_us.drop('Communicating', axis=1)
#Test1
y_testing_1 = testing1['Communicating']
X_testing_1 = testing1.drop('Communicating', axis=1)
#Test2
y_testing_2 = testing2['Communicating']
X_testing_2 = testing2.drop('Communicating', axis=1)

In [None]:
#Seteando experimento
experiment_name = "ML"
mlflow.set_experiment(experiment_name)

In [7]:
#Parámetros de búsqueda
param_dist = {
    'C': expon(scale=100),
    'gamma': expon(scale=.1),
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'class_weight': ['balanced', None]
}
#Métricas de interés
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score, average='macro')}

In [10]:
# Configuración del modelo y GridSearchCV
svm = SVC()
randomized_search = RandomizedSearchCV(
                            estimator= svm,
                            param_distributions= param_dist,
                            n_iter= 5,
                            cv= 3,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

## Encontrando hiperparámetros para Support Vector Machine con conjunto de datos con submuestreo de la clase mayoritaria

In [11]:
#Ajustando para training
with mlflow.start_run(run_name="SVM-Undersampled-2"):
    svm1 = randomized_search.fit(X_us, y_us)
    #Mejor modelo
    best_model = svm1.best_estimator_
    #Guardando resultados
    mlflow.log_params(svm1.best_params_)
    mlflow.log_metrics({"best_score":svm1.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(svm1, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = svm1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, svm1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, svm1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, svm1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, svm1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END C=51.05083011988179, class_weight=None, gamma=0.13386430541158195, kernel=rbf; Accuracy: (test=0.917) F1: (test=0.917) total time= 5.5min
[CV 2/3] END C=51.05083011988179, class_weight=None, gamma=0.13386430541158195, kernel=rbf; Accuracy: (test=0.921) F1: (test=0.921) total time= 6.8min
[CV 3/3] END C=51.05083011988179, class_weight=None, gamma=0.13386430541158195, kernel=rbf; Accuracy: (test=0.917) F1: (test=0.917) total time= 6.6min
[CV 3/3] END C=47.455796263279495, class_weight=balanced, gamma=0.343890224078124, kernel=linear; Accuracy: (test=0.767) F1: (test=0.767) total time=29.8min
[CV 2/3] END C=47.455796263279495, class_weight=balanced, gamma=0.343890224078124, kernel=linear; Accuracy: (test=0.762) F1: (test=0.762) total time=29.8min
[CV 1/3] END C=47.455796263279495, class_weight=balanced, gamma=0.343890224078124, kernel=linear; Accuracy: (test=0.767) F1: (test=0.767) total time=30.0min
[CV 1/3] END C=1

## Encontrando hiperparámetros para Support Vector Machine con conjunto de datos con submuestreo de la clase mayoritaria - exp3

In [17]:
#Parámetros de búsqueda
param_dist_3 = {
    'C': expon(scale=10),
    'gamma': expon(scale=.1),
    'kernel': ['rbf', 'linear', 'poly'],
    'class_weight': ['balanced']
}
#Métricas de interés
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score, average='macro')}

In [18]:
# Configuración del modelo y GridSearchCV
svm = SVC()
randomized_search_3 = RandomizedSearchCV(
                            estimator= svm,
                            param_distributions= param_dist_3,
                            n_iter= 10,
                            cv= 3,
                            scoring= scoring,
                            refit= 'F1',
                            return_train_score= False,
                            n_jobs= 4,
                            verbose= 3)

In [25]:
#Ajustando para training
with mlflow.start_run(run_name="SVM-Undersampled-3"):
    #Agregando información
    mlflow.log_param("model","SVM")
    mlflow.log_param("data","Undersampled")
    mlflow.log_param("dist_C","expon(scale=10)")
    mlflow.log_param("dist_gamma","expon(scale=.1)")
    mlflow.log_param("dist_kernel","rbf, linear, poly")
    mlflow.log_param("dist_class_weight","balanced")
    mlflow.log_param("n_iter",10)
    svm3 = randomized_search_3.fit(X_us, y_us)
    #Mejor modelo
    best_model = svm3.best_estimator_
    #Guardando resultados
    mlflow.log_params(svm3.best_params_)
    mlflow.log_metrics({"best_score":svm3.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(svm3, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = svm3.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        results.to_parquet(temp.name)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, svm3.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, svm3.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, svm3.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, svm3.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

In [23]:
svm3.best_score_

0.9141182787887407

In [24]:
#Calculando métricas con conjuntos de testeo}
f1_test1 = f1_score(y_testing_1, svm3.predict(X_testing_1), average='macro')
f1_test2 = f1_score(y_testing_2, svm3.predict(X_testing_2), average='macro')
accuracy_test1 = accuracy_score(y_testing_1, svm3.predict(X_testing_1))
accuracy_test2 = accuracy_score(y_testing_2, svm3.predict(X_testing_2))
print(f1_test1, f1_test2, accuracy_test1, accuracy_test2)


0.6883754934247245 0.574162833861682 0.9022238904006756 0.8673994446850243


El error anterior es independiente del entrenamiento, fue causado debido a los tipos de datos en una columna particular. Por tal motivo se almacenará en el run que fue interrumpido, sin volver a entrenar los modelos

In [18]:
run_id = get_run_id(experiment_name, "RF-Training")
#Guardando resultados pendientes
with mlflow.start_run(run_id=run_id):
    #Guardando df con
    results = rf1.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf1.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf1.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf1.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf1.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

## Encontrando hiperparámetros para Random Forest con df Training balanceado

In [19]:
#Ajustando para training undersampled
with mlflow.start_run(run_name="RF-Training-Undersampled"):
    rf2 = randomized_search.fit(X_us, y_us)
    #Mejor modelo
    best_model = rf2.best_estimator_
    #Guardando resultados
    mlflow.log_params(rf2.best_params_)
    mlflow.log_metrics({"best_score":rf2.best_score_})
    mlflow.sklearn.log_model(best_model,"best_model")
    mlflow.sklearn.log_model(rf2, "RandomizedSearchCV")
    #Guardando diccionario de resultados
    results = rf2.cv_results_
    results = pd.DataFrame(results)
    with tempfile.NamedTemporaryFile(mode = "w+", suffix = ".parquet", delete= False) as temp:
        try:
            results.to_parquet(temp.name)
        except:
            results = results.astype(str)
        mlflow.log_artifact(temp.name,"results/results.parquet")
        temp_path = temp.name
    #Almacenando métricas con conjunto de datos
    f1_test1 = f1_score(y_testing_1, rf2.predict(X_testing_1), average='macro')
    f1_test2 = f1_score(y_testing_2, rf2.predict(X_testing_2), average='macro')
    accuracy_test1 = accuracy_score(y_testing_1, rf2.predict(X_testing_1))
    accuracy_test2 = accuracy_score(y_testing_2, rf2.predict(X_testing_2))
    mlflow.log_metrics({"f1_test1":f1_test1, "f1_test2":f1_test2, "accuracy_test1":accuracy_test1, "accuracy_test2":accuracy_test2})
    os.remove(temp_path)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[CV 1/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.801) F1: (test=0.801) total time=  10.9s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.798) F1: (test=0.798) total time=  10.4s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=0.01, min_samples_split=0.01, n_estimators=200; Accuracy: (test=0.794) F1: (test=0.793) total time=  10.5s
[CV 1/3] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; Accuracy: (test=0.973) F1: (test=0.973) total time=  10.8s
[CV 1/3] END bootstrap=False, criterion=entropy, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; Accuracy: (test=0.981) F1: (test=0.981) total time=  46.3