# Avaliação dos algoritmos
* Árvore de Decisão: 98.20
* Random Forest: 98.40
* KNN: 98.60
* Regressão Logística: 96.40
* SVM: 98.80
* Redes Neurais: 99.60

## Bibliotecas

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Ligando o Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')
caminho = "/content/drive/MyDrive/CDSI/MaterialApoio/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Base - Credit Data

In [3]:
with open(caminho + 'credit.pkl', 'rb') as f:
  X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [4]:
X_credit_treinamento.shape, y_credit_treinamento.shape

((1500, 3), (1500,))

In [5]:
X_credit_teste.shape, y_credit_teste.shape

((500, 3), (500,))

In [6]:
X_credit = np.concatenate((X_credit_treinamento, X_credit_teste), axis = 0)
X_credit.shape

(2000, 3)

In [7]:
y_credit = np.concatenate((y_credit_treinamento, y_credit_teste), axis = 0)
y_credit.shape

(2000,)

## Árvore de Decisão

In [8]:
parametros = {'criterion':['gini','entropy'],
              'splitter':['best', 'random'],
              'min_samples_split':[2,5,10],
              'min_samples_leaf':[1,5,10]}

In [9]:
gri_search = GridSearchCV(estimator = DecisionTreeClassifier(),
                        param_grid = parametros)
gri_search.fit(X_credit, y_credit)
melhores_parametros = gri_search.best_params_
melhor_resultado = gri_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.983


## Random Forest

In [10]:
parametros = {'criterion':['gini','entropy'],
              'n_estimators':[10, 40, 100, 150],
              'min_samples_split':[2, 5, 10],
              'min_samples_leaf':[1, 5, 10]}

In [11]:
grid_search = GridSearchCV(estimator = RandomForestClassifier(),
                          param_grid = parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
0.9865


## KNN

In [12]:
parametros = {'n_neighbors':[3, 5, 10 ,20],
              'p': [1,2]}

In [13]:
grid_search = GridSearchCV(estimator = KNeighborsClassifier(),
                          param_grid = parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'n_neighbors': 20, 'p': 1}
0.9800000000000001


## Regressão Logística

In [14]:
parametros = {'tol':[0.0001, 0.00001, 0.000001],
              'C':[1.0, 1.5, 2.0],
              'solver':['lbfgs', 'sag', 'saga']}

In [15]:
grid_search = GridSearchCV(estimator = LogisticRegression(),
                          param_grid = parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 1.0, 'solver': 'lbfgs', 'tol': 1e-05}
0.9484999999999999


## SVM

In [16]:
parametros = {'tol':[0.001, 0.0001, 0.00001],
              'C':[1.0, 1.5, 2.0],
              'kernel':['rbf', 'linear', 'poly', 'sigmoid']}

In [17]:
grid_search = GridSearchCV(estimator = SVC(),
                          param_grid = parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 2.0, 'kernel': 'rbf', 'tol': 0.001}
0.983


## Redes Neurais

In [18]:
parametros = {'activation': ['relu', 'logistic', 'tanh'],
              'solver': ['adam', 'sgd'],
              'batch_size': [10, 56]}

In [19]:
grid_search = GridSearchCV(estimator = MLPClassifier(),
                          param_grid = parametros)
grid_search.fit(X_credit, y_credit)
melhores_parametros = grid_search.best_params_
melhor_resultado = grid_search.best_score_



{'activation': 'relu', 'batch_size': 56, 'solver': 'adam'}
0.9970000000000001




In [20]:
print(melhores_parametros)
print(melhor_resultado)

{'activation': 'relu', 'batch_size': 56, 'solver': 'adam'}
0.9970000000000001


## Validação Cruzada

In [21]:
resultados_arvore = []
resultados_random_forest = []
resultados_knn = []
resultados_logistica = []
resultados_svm = []
resultados_redes_neurais = []

for i in range(30):
    print(i)
    kfold = KFold(n_splits = 10, shuffle = True, random_state = i)

#{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
    arvore = DecisionTreeClassifier(criterion = 'entropy', min_samples_leaf = 1, min_samples_split = 5, splitter = 'best')
    scores = cross_val_score(arvore, X_credit, y_credit, cv = kfold)

    resultados_arvore.append(scores.mean())

#{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
    random_forest = RandomForestClassifier(criterion = 'entropy', min_samples_leaf = 1, min_samples_split = 2, n_estimators = 100)
    scores = cross_val_score(random_forest, X_credit, y_credit, cv = kfold)

    resultados_random_forest.append(scores.mean())

#{'n_neighbors': 20, 'p': 1}
    knn = KNeighborsClassifier(n_neighbors = 20, p = 1)
    scores = cross_val_score(knn, X_credit, y_credit, cv = kfold)

    resultados_knn.append(scores.mean())

#{'C': 1.0, 'solver': 'lbfgs', 'tol': 1e-05}
    logistica = LogisticRegression(C = 1.0, solver = 'lbfgs', tol = 1e-05)
    scores = cross_val_score(logistica, X_credit, y_credit, cv = kfold)

    resultados_logistica.append(scores.mean())

#{'C': 2.0, 'kernel': 'rbf', 'tol': 0.001}
    svm = SVC(C = 2.0, kernel = 'rbf', tol = 0.001)
    scores = cross_val_score(svm, X_credit, y_credit, cv = kfold)

    resultados_svm.append(scores.mean())

#{'activation': 'relu', 'batch_size': 56, 'solver': 'adam'}
    redes_neurais = MLPClassifier(activation = 'relu', batch_size = 56, solver = 'adam')
    scores = cross_val_score(redes_neurais, X_credit, y_credit, cv = kfold)
    resultados_redes_neurais.append(scores.mean())

0




1




2




3




4




5




6




7




8




9




10




11




12




13




14




15




16




17




18




19




20




21




22




23




24




25




26




27




28




29




In [27]:
resultados = pd.DataFrame({'Arvore': resultados_arvore,
                           'Random Forest': resultados_random_forest,
                           'KNN': resultados_knn,
                           'Logistica': resultados_logistica,
                           'SVM': resultados_svm,
                           'Redes Neurais': resultados_redes_neurais})
resultados

Unnamed: 0,Arvore,Random Forest,KNN,Logistica,SVM,Redes Neurais
0,0.9865,0.985,0.98,0.948,0.9845,0.997
1,0.9845,0.987,0.98,0.9465,0.9835,0.998
2,0.9905,0.987,0.98,0.947,0.985,0.997
3,0.9875,0.988,0.98,0.946,0.985,0.9965
4,0.9895,0.989,0.9815,0.9465,0.984,0.998
5,0.988,0.9875,0.9785,0.9465,0.984,0.997
6,0.9885,0.989,0.98,0.9475,0.9855,0.9965
7,0.9855,0.988,0.98,0.948,0.9855,0.9975
8,0.987,0.9875,0.981,0.9465,0.9845,0.996
9,0.986,0.9875,0.981,0.9465,0.984,0.997


In [28]:
resultados.describe()

Unnamed: 0,Arvore,Random Forest,KNN,Logistica,SVM,Redes Neurais
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.987233,0.987483,0.980267,0.947033,0.985,0.99715
std,0.001874,0.001477,0.001264,0.000765,0.001145,0.000494
min,0.983,0.985,0.9775,0.9455,0.983,0.996
25%,0.986125,0.98625,0.98,0.9465,0.984125,0.997
50%,0.9875,0.9875,0.98,0.947,0.985,0.997
75%,0.988375,0.9885,0.981,0.9475,0.9855,0.9975
max,0.9905,0.9905,0.9835,0.9485,0.9875,0.998


In [29]:
resultados.var()

Unnamed: 0,0
Arvore,3.512644e-06
Random Forest,2.180747e-06
KNN,1.598851e-06
Logistica,5.850575e-07
SVM,1.310345e-06
Redes Neurais,2.439655e-07


In [30]:
(resultados.std() / resultados.mean())

Unnamed: 0,0
Arvore,0.001898
Random Forest,0.001495
KNN,0.00129
Logistica,0.000808
SVM,0.001162
Redes Neurais,0.000495


In [31]:
pickle.dump(redes_neurais, open(caminho + 'rede_neural_finalizado.sav', 'wb'))

In [32]:
redes_neurais = pickle.load(open(caminho + 'rede_neural_finalizado.sav', 'rb'))