In [2]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

import matplotlib.pyplot as plt

In [3]:
enade_treino = pd.read_csv("../../concept_drift/tabela_final_2017_treinamento.csv")
enade_treino["origin"] = 0

In [4]:
enade_teste = pd.read_csv("../../concept_drift/tabela_final_2021_treinamento.csv")
enade_teste["origin"] = 1

In [5]:
dados_2017_2021 = pd.concat([enade_treino, enade_teste])

# No NannyML parece que não randomiza por causa da separção em chunks
dados_2017_2021 = dados_2017_2021.sample(frac = 1, ignore_index=True)

dados_2017_2021.head()

Unnamed: 0,Numero_Notas_Invalidas,Numero_Faltantes,Numero_Participantes,nulos_UF_Ensino_Medio,RO,AC,AM,RR,PA,AP,...,Não sei responder.4,nulos_Formacao,Muito boa,Boa,Regular,Fraca,Muito fraca,Não sei responder.5,Nota_Conceito_Faixa,origin
0,0.0,0.340426,0.659574,0.212766,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.212766,0.212766,0.255319,0.276596,0.021277,0.0,0.021277,3,1
1,0.0,0.529412,0.470588,0.470588,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.470588,0.235294,0.176471,0.058824,0.0,0.058824,0.0,3,0
2,0.0,0.272727,0.727273,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.181818,0.727273,0.090909,0.0,0.0,0.0,0.0,2,0
3,0.0,0.2,0.8,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.333333,0.266667,0.2,0.0,0.0,0.0,2,0
4,0.0,0.173913,0.826087,0.173913,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.173913,0.217391,0.26087,0.304348,0.043478,0.0,0.0,2,1


In [6]:
numero_caracteristicas = dados_2017_2021.shape[1] - 1
X_enade = dados_2017_2021.iloc[:, 0:numero_caracteristicas]
y_enade = dados_2017_2021.iloc[:, -1]

In [7]:
lista_caracteristicas = enade_treino.columns.values

In [17]:
# NannyML approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
y_true_splits = []
y_score_splits = []

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
#    parametros_para_busca = {
#    'boosting_type': ['gbdt', 'dart', 'rf'],
#    'num_leaves': [21, 31, 51, 91, 131],
#    'learning_rate': [0.01, 0.05, 0.1],
#    # porcentagem de features para usar
#    'feature_fraction': [0.7, 0.8, 0.9],
#    # porcentagem de dados para usar em cada bagging
#    'bagging_fraction': [0.6, 0.7, 0.8],
#    # número de iterações até o próximo bagging
#    'bagging_freq': [2, 3, 5],
#    }

#    busca = HalvingGridSearchCV(lgbmClassifier, parametros_para_busca, scoring="f1").fit(treinamento_normalizado, y_enade_treino) 
#    estimator = busca.best_estimator_
    
    # LGBMClassifier é o modelo utilizado na biblioteca NannyML
    lgbmClassifier = LGBMClassifier(objective='binary', importance_type='split', verbosity=-1, boosting_type='gbdt', num_leaves=31, learning_rate=0.05, feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5)
    lgbmClassifier.fit(treinamento_normalizado, y_enade_treino)
    
    result = permutation_importance(lgbmClassifier, teste_normalizado, y_enade_teste,
                            n_repeats=1,
                            random_state=0)
    
    # importances_mean: Mean of feature importance over n_repeats.
    # importances_std: Standard deviation over n_repeats.
    # argsort: retorna uma lista com os indices correspondentes ao resultado da ordenação por valor, do array em questão
    # [::-1]: retorna a lista na ordem inversa
    # [:5]: Os cinco primeiros elementos
    print(f"Fold {i}\n")
    for i in result.importances_mean.argsort()[::-1][:5]:
        # Se a média da importancia deste atributo menos duas vezes o desvio padrão deste atributo for maior que zero
        if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
            print(f"Atributo: {lista_caracteristicas[i]}"
                 f"\n\tMédia: {result.importances_mean[i]:.3f}"
                 f"\n\tDesvio padrão: +/- {result.importances_std[i]:.3f}\n")
            
    y_score_teste = lgbmClassifier.predict_proba(teste_normalizado)[:, 1]
    
    y_true_splits.extend(y_enade_teste)
    y_score_splits.extend(y_score_teste)
    
    #result = roc_auc_score(y_enade_teste, y_score_teste)
    #print(result)

result = roc_auc_score(y_true_splits, y_score_splits)
print(result)

Fold 0

Atributo: Nenhum
	Média: 0.088
	Desvio padrão: +/- 0.000

Atributo: Não sei responder.2
	Média: 0.030
	Desvio padrão: +/- 0.000

Atributo: Todos.1
	Média: 0.022
	Desvio padrão: +/- 0.000

Atributo: Tem renda/Sustenta
	Média: 0.021
	Desvio padrão: +/- 0.000

Atributo: Numero_Participantes
	Média: 0.021
	Desvio padrão: +/- 0.000

Fold 1

Atributo: Nenhum
	Média: 0.064
	Desvio padrão: +/- 0.000

Atributo: De 3 a 4,5 SM
	Média: 0.024
	Desvio padrão: +/- 0.000

Atributo: Numero_Faltantes
	Média: 0.014
	Desvio padrão: +/- 0.000

Atributo: Numero_Participantes
	Média: 0.014
	Desvio padrão: +/- 0.000

Atributo: Solteiro
	Média: 0.008
	Desvio padrão: +/- 0.000

Fold 2

Atributo: Nenhum
	Média: 0.064
	Desvio padrão: +/- 0.000

Atributo: Numero_Participantes
	Média: 0.045
	Desvio padrão: +/- 0.000

Atributo: Não sei responder.2
	Média: 0.037
	Desvio padrão: +/- 0.000

Atributo: De 3 a 4,5 SM
	Média: 0.021
	Desvio padrão: +/- 0.000

Atributo: Casado
	Média: 0.018
	Desvio padrão: +/- 0.000
