# Modelos

En este notebook ejecutamos todos los modelos entrenados para el trabajo y comparamos sus resultados.

In [5]:
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from scipy import stats

from sklearn import metrics
from os import path
from sklearn.svm import SVC
import _pickle as pickle
import gc
import numpy as np
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from auxiliar_functions import data_from_files
from auxiliar_functions import report
from auxiliar_functions import save_model
from auxiliar_functions import load_model

In [6]:
# Number of parameter settings sampled by each RandomizedSearchCV in this notebook.
n_iter_search = 70

# Train/test features and labels, as returned by the project helper.
X_train, y_train, X_test, y_test = data_from_files()

loading data...

Loaded data:
Train shape:  (4422186, 53)
Train shape Y:  (4422186,)
Test shape:  (1895223, 53)
Test shape Y:  (1895223,)


## Linear Discriminant Analysis

In [7]:
# Linear Discriminant Analysis with default settings, fitted on the training split.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train.values.ravel())

# Evaluate on the test split: overall accuracy first, then the per-class report.
predicted = lda.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)

print('Full report: \n', metrics.classification_report(y_test, predicted))

# The model is intentionally not handed to save_model here:
# save_model("lda",lda)

# Drop the estimator and force a collection before the next model is trained.
del lda
gc.collect()

Accuracy: 0.6295106169564215
Full report: 
              precision    recall  f1-score   support

          0       0.62      0.63      0.63    933345
          1       0.64      0.63      0.63    961878

avg / total       0.63      0.63      0.63   1895223



35

## Quadratic Discriminant Analysis

In [8]:
# Quadratic Discriminant Analysis with default settings, fitted on the training split.
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train.values.ravel())

# Evaluate on the test split: overall accuracy first, then the per-class report.
predicted = qda.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# The model is intentionally not handed to save_model here:
# save_model("qda",qda)

# Drop the estimator and force a collection before the next model is trained.
del qda
gc.collect()

Accuracy: 0.607134358331447
Full report: 
              precision    recall  f1-score   support

          0       0.62      0.54      0.57    933345
          1       0.60      0.67      0.64    961878

avg / total       0.61      0.61      0.61   1895223



0

## Regularized Discriminant Analysis

In [10]:
# Regularized discriminant analysis: QDA whose `reg_param` is tuned by random
# search. stats.uniform(0, 1) samples reg_param uniformly from [0, 1].
parameter_distributions = {'reg_param': stats.uniform(0, 1)}

# `priors` is documented as an array with one entry per class (or None).
# The original passed the scalar 2, which is not a documented value and is
# rejected by scikit-learn releases that validate constructor parameters.
# NOTE(review): where a scalar was accepted it could only contribute the same
# constant to every class, i.e. act like equal priors; explicit equal priors
# are used here on that basis. Use priors=None instead if the class
# frequencies of the training data should be used.
n_classes = np.unique(y_train.values).size
rda = QuadraticDiscriminantAnalysis(priors=[1.0 / n_classes] * n_classes)

# random_state fixes the sequence of sampled reg_param values so that the
# ranking printed below can be reproduced on a re-run.
random_search = RandomizedSearchCV(rda,
                                   param_distributions=parameter_distributions,
                                   n_iter=n_iter_search,
                                   pre_dispatch=2,
                                   n_jobs=-1,
                                   random_state=0)

random_search.fit(X_train, y_train.values.ravel())

# Summarise the cross-validation results, then score the search object on the
# test split.
report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# Hand the whole search object to the project's save_model helper, then free it.
save_model("rdacv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.621 (std: 0.001)
Parameters: {'reg_param': 0.7010416905212281}

Model with rank: 2
Mean validation score: 0.621 (std: 0.001)
Parameters: {'reg_param': 0.6936071243361783}

Model with rank: 3
Mean validation score: 0.621 (std: 0.001)
Parameters: {'reg_param': 0.714067729503771}

Accuracy: 0.6212023598278409
Full report: 
              precision    recall  f1-score   support

          0       0.62      0.61      0.61    933345
          1       0.62      0.64      0.63    961878

avg / total       0.62      0.62      0.62   1895223



6

## Naive Bayes

In [11]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
gaussianNaiveBayes = GaussianNB().fit(X_train, y_train.values.ravel())

# Evaluate on the test split: overall accuracy first, then the per-class report.
predicted = gaussianNaiveBayes.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# Hand the fitted model to the project's save_model helper, then free it.
save_model("naiveBayes", gaussianNaiveBayes)
del gaussianNaiveBayes
gc.collect()

Accuracy: 0.582232275568627
Full report: 
              precision    recall  f1-score   support

          0       0.57      0.60      0.59    933345
          1       0.59      0.56      0.58    961878

avg / total       0.58      0.58      0.58   1895223



0

## Perceptron

In [14]:
# Search space for the Perceptron.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so alpha is
# drawn from [0.001, 0.051] (not [0.001, 0.05]).
param_dist = {"penalty": [None, 'l2', 'l1', 'elasticnet'],
              "alpha": stats.uniform(0.001, 0.05),
              "fit_intercept": [True, False]
              }

per = Perceptron(n_jobs=-1, warm_start=True)

# random_state fixes the sampled parameter settings so that the ranking
# printed below can be reproduced on a re-run.
random_search = RandomizedSearchCV(per, param_distributions=param_dist,
                                   n_iter=n_iter_search, pre_dispatch=3,
                                   n_jobs=-1, random_state=0)

random_search.fit(X_train, y_train.values.ravel())

# Summarise the cross-validation results, then score the search object on the
# test split.
# NOTE(review): the recorded run predicts only class 1 on the test split;
# confirm whether data_from_files() scales the features, since that may matter
# for this model.
report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# Hand the whole search object to the project's save_model helper, then free it.
save_model("percv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.535 (std: 0.050)
Parameters: {'penalty': 'l1', 'fit_intercept': False, 'alpha': 0.04908952437413702}

Model with rank: 1
Mean validation score: 0.535 (std: 0.050)
Parameters: {'penalty': 'l1', 'fit_intercept': False, 'alpha': 0.05073649366231815}

Model with rank: 3
Mean validation score: 0.523 (std: 0.043)
Parameters: {'penalty': 'l1', 'fit_intercept': True, 'alpha': 0.0336165385384661}

Accuracy: 0.507527082564954
Full report: 
              precision    recall  f1-score   support

          0       0.00      0.00      0.00    933345
          1       0.51      1.00      0.67    961878

avg / total       0.26      0.51      0.34   1895223



66

## Multi Layer Perceptron

In [15]:
# Search space for the multi-layer perceptron.
# - alpha: uniform(loc, scale) samples from [0.0001, 0.0501].
# - hidden_layer_sizes: randint(4, 12) yields a single integer in 4..11, i.e.
#   one hidden layer of that width.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; with solver='adam' this dimension probably has no effect on
# the fitted model - confirm before interpreting it in the ranking.
param_dist = {'learning_rate': ['constant', 'invscaling', 'adaptive'],
              'alpha': stats.uniform(0.0001, 0.05),
              'hidden_layer_sizes': stats.randint(4, 12),
              'activation': ['identity', 'logistic', 'tanh', 'relu'],
              }

# Seed the network (weight initialisation / shuffling) and the search (sampled
# settings) so that the ranking printed below can be reproduced on a re-run.
mlp = MLPClassifier(solver='adam', warm_start=True, random_state=0)
random_search = RandomizedSearchCV(mlp, param_distributions=param_dist,
                                   n_iter=n_iter_search, pre_dispatch=3,
                                   n_jobs=-1, random_state=0)

random_search.fit(X_train, y_train.values.ravel())

# Summarise the cross-validation results, then score the search object on the
# test split.
report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# Hand the whole search object to the project's save_model helper, then free it.
save_model("mlpcv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.583 (std: 0.035)
Parameters: {'hidden_layer_sizes': 5, 'activation': 'relu', 'learning_rate': 'adaptive', 'alpha': 0.03155274287848141}

Model with rank: 2
Mean validation score: 0.568 (std: 0.039)
Parameters: {'hidden_layer_sizes': 7, 'activation': 'identity', 'learning_rate': 'constant', 'alpha': 0.008395026250799876}

Model with rank: 3
Mean validation score: 0.566 (std: 0.052)
Parameters: {'hidden_layer_sizes': 8, 'activation': 'identity', 'learning_rate': 'invscaling', 'alpha': 0.04163056803662689}

Accuracy: 0.5985622800060996
Full report: 
              precision    recall  f1-score   support

          0       0.56      0.89      0.69    933345
          1       0.75      0.31      0.44    961878

avg / total       0.66      0.60      0.56   1895223



66

## Random Forest

In [16]:
# Search space for the random forest.
# stats.randint(low, high) excludes `high`: max_features and min_samples_leaf
# are drawn from 1..10, min_samples_split from 2..10.
param_dist = {"max_depth": [3, None],
              "max_features": stats.randint(1, 11),
              "min_samples_split": stats.randint(2, 11),
              "min_samples_leaf": stats.randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Seed the forest (bootstrap / feature sampling) and the search (sampled
# settings) so that the ranking printed below can be reproduced on a re-run.
clf = RandomForestClassifier(n_estimators=20, random_state=0)

random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, pre_dispatch=3,
                                   n_jobs=-1, random_state=0)

random_search.fit(X_train, y_train.values.ravel())

# Summarise the cross-validation results, then score the search object on the
# test split.
report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

# Hand the whole search object to the project's save_model helper, then free it.
save_model("rfcv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.667 (std: 0.000)
Parameters: {'max_features': 8, 'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 6, 'min_samples_leaf': 9}

Model with rank: 2
Mean validation score: 0.666 (std: 0.000)
Parameters: {'max_features': 8, 'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 8}

Model with rank: 3
Mean validation score: 0.665 (std: 0.000)
Parameters: {'max_features': 9, 'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 7, 'min_samples_leaf': 5}

Accuracy: 0.6724232451801186
Full report: 
              precision    recall  f1-score   support

          0       0.67      0.66      0.66    933345
          1       0.67      0.69      0.68    961878

avg / total       0.67      0.67      0.67   1895223



78