In [15]:
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, train_test_split
from itertools import combinations
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import entropy
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [20]:
# import Data
df = pd.read_excel('dados_artigo_2019.xlsx', index_col=[0])
df = df.dropna()
print(len(df))
#df.head()
X, y = df.drop('CEG',axis=1), df.CEG
y = y.apply(lambda x: 1 if x <= 7.62 else 0)
print(X.shape)
print(y.shape)

1949
(1949, 35)
(1949,)


In [21]:
# Treino e Teste
X_train, X_test, \
y_train, y_test = train_test_split(X, y, random_state=42)


In [32]:
# Classificadores
clf_RF = RandomForestClassifier(n_estimators=50, random_state=42)
clf_GB = GaussianNB()
clf_KNN = KNeighborsClassifier(9)

In [34]:
# Classificadores Isolados
for mod in [clf_RF, clf_GB, clf_KNN]:
    modelo = mod
    modelo.fit(X_train, y_train)
    pr = modelo.predict(X_test)
    hits = pr == y_test
    print(f'Precisao:{(sum(hits)/len(hits)):.3f}')

Precisao:0.791
Precisao:0.576
Precisao:0.699


In [46]:
# Combinacao de classificadores - https://github.com/fboldt/aulasml/blob/master/ensembles.ipynb
# Voting
from sklearn.ensemble import VotingClassifier
modelo_voting = VotingClassifier([
    ('RF', clf_RF),
    ('GB', clf_GB),
    ('KNN', clf_KNN)
], voting='soft')
modelo.fit(X_train, y_train)
vo_pr = modelo.predict(X_test)
vohits = vo_pr == y_test
print(f'Voting:{(sum(vohits)/len(vohits)):.3f}')

# Stacking
modelo_stacking = StackingClassifier([
    ('RF', clf_RF),
    ('GB', clf_GB),
    ('KNN', clf_KNN)
], cv=3, passthrough=True)
modelo.fit(X_train, y_train) 
sc_pr = modelo.predict(X_test)
schits = sc_pr == y_test
print(f'Stacking:{(sum(schits)/len(schits)):.3f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting:0.752
Stacking:0.752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# Usando o Gridsearch pra explorar parametros - 
# https://github.com/Vinicius42-00/MPCA20201/blob/main/Tarefa_8_Reconhecimento_Padroes_Kaggle_Titanic_Melhora_do_Score.ipynb
parametros_v = {
    'RF__n_estimators': [50, 100, 150],
    'RF__max_depth': [10, 15, 20],
    'RF__criterion': ['gini','entropy'],
    #'KNN__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'KNN__leaf_size': [20, 30, 40],
    #'KNN__n_neighbors': [1,5,10],
    'GB__var_smoothing': [1.0,2.0,3.0]
}
modelo_grid_vot = GridSearchCV(modelo_voting, 
                               param_grid=parametros_v, 
                               scoring='roc_auc_ovr_weighted',
                               n_jobs=-1)
modelo_grid_vot.fit(X_train, y_train) 
grid_vot_pr = modelo_grid_vot.predict(X_test)
grid_vot_hits = grid_vot_pr == y_test
print(f'Precisao do Grid Voting:{(sum(grid_vot_hits)/len(grid_vot_hits)):.3f}')

Precisao do Grid Voting:0.783


In [48]:
modelo_grid_stk = GridSearchCV(modelo_stacking, 
                               param_grid=parametros_v, 
                               scoring='roc_auc_ovr_weighted',
                               n_jobs=-1)
modelo_grid_stk.fit(X_train, y_train) 
grid_stk_pr = modelo_grid_stk.predict(X_test)
grid_stk_hits = grid_stk_pr == y_test
print(f'Precisao do Grid Stacking:{(sum(grid_stk_hits)/len(grid_stk_hits)):.3f}')

Precisao do Grid Stacking:0.748


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
# Melhora dos Classificadores Fracos
parametros_GB = {
    #'KNN__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #'KNN__leaf_size': [20, 30, 40],
    #'KNN__n_neighbors': [1,5,10],
   'var_smoothing': [1.0,2.0,3.0]
}

modelo_grid_gb = GridSearchCV(clf_GB, 
                           param_grid=parametros_GB, 
                           scoring='roc_auc_ovr_weighted',
                           n_jobs=-1)
modelo_grid_gb.fit(X_train, y_train)
pr = modelo_grid_gb.predict(X_test)
hits = pr == y_test
print(f'Precisao Melhorada do GB:{(sum(hits)/len(hits)):.3f}')

Precisao Melhorada do GB:0.654


In [53]:
parametros_KNN = {
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'n_neighbors': [1,5,10],
    #'GB__var_smoothing': [1.0,2.0,3.0]
}

modelo_grid_knn = GridSearchCV(clf_KNN, 
                           param_grid=parametros_KNN, 
                           scoring='roc_auc_ovr_weighted',
                           n_jobs=-1)
modelo_grid_knn.fit(X_train, y_train)
pr = modelo_grid_knn.predict(X_test)
hits = pr == y_test
print(f'Precisao Melhorada do GB:{(sum(hits)/len(hits)):.3f}')

Precisao Melhorada do GB:0.705


In [54]:
# testando Melhor classificador
# Voting 
modelo_voting_1 = VotingClassifier([
    ('RF1', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('RF2', RandomForestClassifier(n_estimators=40, random_state=41)),
    ('RF3', RandomForestClassifier(n_estimators=30, random_state=40))
], voting='soft')
modelo_voting_1.fit(X_train, y_train)
vo_pr = modelo_voting_1.predict(X_test)
vohits = vo_pr == y_test
print(f'Voting Melhor CLF:{(sum(vohits)/len(vohits)):.3f}')

# Stacking
modelo_stacking_2 = StackingClassifier([
    ('RF1', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('RF2', RandomForestClassifier(n_estimators=40, random_state=41)),
    ('RF3', RandomForestClassifier(n_estimators=30, random_state=40))

], cv=3, passthrough=True)
modelo_stacking_2.fit(X_train, y_train) 
sc_pr = modelo_stacking_2.predict(X_test)
schits = sc_pr == y_test
print(f'Stacking Melhor CLF:{(sum(schits)/len(schits)):.3f}')

Voting Melhor CLF:0.799
Stacking Melhor CLF:0.764


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
parametros_v_2 = {
    'RF__n_estimators': [50, 100, 150],
    'RF__max_depth': [10, 15, 20],
    'RF__criterion': ['gini','entropy'],
}
modelo_grid_vot_2 = GridSearchCV(modelo_voting, 
                               param_grid=parametros_v_2, 
                               scoring='roc_auc_ovr_weighted',
                               n_jobs=-1)
modelo_grid_vot_2.fit(X_train, y_train) 
grid_vot_pr = modelo_grid_vot_2.predict(X_test)
grid_vot_hits = grid_vot_pr == y_test
print(f'Precisao do Grid Voting Melhor CLF:{(sum(grid_vot_hits)/len(grid_vot_hits)):.3f}')

Precisao do Grid Voting Melhor CLF:0.703


In [56]:
modelo_grid_stk_2 = GridSearchCV(modelo_stacking, 
                               param_grid=parametros_v, 
                               scoring='roc_auc_ovr_weighted',
                               n_jobs=-1)
modelo_grid_stk_2.fit(X_train, y_train) 
grid_stk_pr = modelo_grid_stk_2.predict(X_test)
grid_stk_hits = grid_stk_pr == y_test
print(f'Precisao do Grid Stacking Melhor CLF:{(sum(grid_stk_hits)/len(grid_stk_hits)):.3f}')

Precisao do Grid Stacking Melhor CLF:0.748


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
