# Modelos

En este notebook ejecutamos todos los modelos entrenados para el trabajo y comparamos sus resultados.

In [1]:
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from scipy import stats

from sklearn import metrics
from os import path
from sklearn.svm import SVC
import _pickle as pickle
import gc
import numpy as np
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from auxiliar_functions import data_from_files
from auxiliar_functions import report
from auxiliar_functions import save_model
from auxiliar_functions import load_model

In [2]:
(X_train, y_train,X_test, y_test) = data_from_files()

loading data...

Loaded data:
Train shape:  (4422186, 53)
Train shape Y:  (4422186,)
Test shape:  (1895223, 53)
Test shape Y:  (1895223,)


## Linear discriminant Analysis

In [4]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train.values.ravel())
predicted = lda.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:",acc)

print('Full report: \n', metrics.classification_report(y_test, predicted))
# save_model("lda",lda)
del lda
gc.collect()

Accuracy: 0.6298171771870645
Full report: 
              precision    recall  f1-score   support

          0       0.62      0.63      0.63    933345
          1       0.64      0.63      0.63    961878

avg / total       0.63      0.63      0.63   1895223



0

## Quadratic discriminant Analysis

In [6]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train.values.ravel())
predicted = qda.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))
# save_model("qda",qda)
del qda
gc.collect()

Accuracy: 0.6109592380421723
Full report: 
              precision    recall  f1-score   support

          0       0.64      0.49      0.55    933345
          1       0.59      0.73      0.66    961878

avg / total       0.62      0.61      0.60   1895223



182

## Regularized Discriminant analissi

In [7]:
parameter_distributions = {'reg_param': stats.uniform(0,1)}
rda = QuadraticDiscriminantAnalysis(priors=2)
random_search = RandomizedSearchCV(rda,param_distributions=parameter_distributions,n_iter=10,pre_dispatch=2, n_jobs=-1)

random_search.fit(X_train, y_train.values.ravel())

report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

save_model("rdacv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.602 (std: 0.000)
Parameters: {'reg_param': 0.18385151664937593}

Model with rank: 2
Mean validation score: 0.598 (std: 0.001)
Parameters: {'reg_param': 0.0740971638350002}

Model with rank: 3
Mean validation score: 0.596 (std: 0.000)
Parameters: {'reg_param': 0.36551448299537026}

Accuracy: 0.6013740863212403
Full report: 
              precision    recall  f1-score   support

          0       0.63      0.47      0.54    933345
          1       0.59      0.73      0.65    961878

avg / total       0.61      0.60      0.59   1895223



6

## Naive Bayes

In [8]:
gaussianNaiveBayes = GaussianNB()
gaussianNaiveBayes.fit(X_train, y_train.values.ravel())
predicted = gaussianNaiveBayes.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))
save_model("naiveBayes", gaussianNaiveBayes)
del gaussianNaiveBayes
gc.collect()

Accuracy: 0.5822058934489504
Full report: 
              precision    recall  f1-score   support

          0       0.57      0.60      0.59    933345
          1       0.59      0.56      0.58    961878

avg / total       0.58      0.58      0.58   1895223



0

# Logistic Regression

In [9]:
parameter_distributions = {'penalty':['l1','l2'], 'C':stats.expon(scale=100),
                           'fit_intercept':[True,False]}
lr = LogisticRegression(solver='saga')
random_search = RandomizedSearchCV(lr,param_distributions=parameter_distributions,n_iter=10,pre_dispatch=2, n_jobs=-1)

random_search.fit(X_train, y_train.values.ravel())

report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

save_model("lrcv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l2', 'C': 4.5624333593334, 'fit_intercept': True}

Model with rank: 1
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l2', 'C': 26.24720971166969, 'fit_intercept': True}

Model with rank: 1
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l2', 'C': 13.911634920069089, 'fit_intercept': True}

Accuracy: 0.4924745003622265
Full report: 
              precision    recall  f1-score   support

          0       0.49      1.00      0.66    933345
          1       1.00      0.00      0.00    961878

avg / total       0.75      0.49      0.33   1895223



72

## KNN

In [10]:
params = {'n_neighbors':stats.randint(1, 30), 'weights':['distance','uniform']}
knc = KNeighborsClassifier(n_jobs=-1)
random_search = RandomizedSearchCV(lr,param_distributions=parameter_distributions,n_iter=10,pre_dispatch=2, n_jobs=-1)

random_search.fit(X_train, y_train.values.ravel())

report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))

save_model("knncv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l2', 'C': 18.284665950843383, 'fit_intercept': True}

Model with rank: 2
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 92.4743004174456, 'fit_intercept': True}

Model with rank: 3
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 115.55403288350519, 'fit_intercept': False}

Model with rank: 3
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 8.078503060627156, 'fit_intercept': False}

Model with rank: 3
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 13.04561440421634, 'fit_intercept': False}

Model with rank: 3
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 7.61616415667644, 'fit_intercept': False}

Model with rank: 3
Mean validation score: 0.493 (std: 0.000)
Parameters: {'penalty': 'l1', 'C': 109.11932779713742, 'fit_intercept': False}

Accuracy: 0.492474500

72

## Random Forest

In [11]:
param_dist = {"max_depth": [3, None],
              "max_features": stats.randint(1, 11),
              "min_samples_split": stats.randint(2, 11),
              "min_samples_leaf": stats.randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

clf = RandomForestClassifier(n_estimators=20)

n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,pre_dispatch=3, n_jobs=-1)


random_search.fit(X_train, y_train.values.ravel())

report(random_search.cv_results_)
predicted = random_search.predict(X_test)
acc = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", acc)
print('Full report: \n', metrics.classification_report(y_test, predicted))
save_model("rfcv", random_search)
del random_search
gc.collect()

Model with rank: 1
Mean validation score: 0.662 (std: 0.000)
Parameters: {'min_samples_leaf': 10, 'max_depth': None, 'criterion': 'entropy', 'min_samples_split': 6, 'max_features': 5, 'bootstrap': True}

Model with rank: 2
Mean validation score: 0.661 (std: 0.000)
Parameters: {'min_samples_leaf': 7, 'max_depth': None, 'criterion': 'gini', 'min_samples_split': 7, 'max_features': 2, 'bootstrap': False}

Model with rank: 3
Mean validation score: 0.660 (std: 0.000)
Parameters: {'min_samples_leaf': 5, 'max_depth': None, 'criterion': 'entropy', 'min_samples_split': 8, 'max_features': 2, 'bootstrap': True}

Accuracy: 0.6676169506174208
Full report: 
              precision    recall  f1-score   support

          0       0.67      0.65      0.66    933345
          1       0.67      0.68      0.68    961878

avg / total       0.67      0.67      0.67   1895223



78

## Perceptron