In [88]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import HistGradientBoostingClassifier

import matplotlib.pyplot as plt
import warnings

In [89]:
warnings.filterwarnings('ignore')

In [76]:
enade_treino = pd.read_csv("../../concept_drift/tabela_final_2017_treinamento.csv")
enade_treino["origin"] = 0

In [77]:
enade_teste = pd.read_csv("../../concept_drift/tabela_final_2021_treinamento.csv")
enade_teste["origin"] = 1

In [78]:
dados_2017_2021 = pd.concat([enade_treino, enade_teste])

# No NannyML parece que não randomiza por causa da separção em chunks
#dados_2017_2021 = dados_2017_2021.sample(frac = 1, ignore_index=True)

dados_2017_2021.head()

Unnamed: 0,Numero_Notas_Invalidas,Numero_Faltantes,Numero_Participantes,nulos_UF_Ensino_Medio,RO,AC,AM,RR,PA,AP,...,Não sei responder.4,nulos_Formacao,Muito boa,Boa,Regular,Fraca,Muito fraca,Não sei responder.5,Nota_Conceito_Faixa,origin
0,0.0,0.043478,0.956522,0.043478,0.043478,0.0,0.0,0.0,0.043478,0.0,...,0.043478,0.043478,0.304348,0.434783,0.217391,0.0,0.0,0.0,3,0
1,0.0,0.265823,0.734177,0.316456,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025316,0.316456,0.126582,0.177215,0.316456,0.037975,0.012658,0.012658,4,0
2,0.0,0.205128,0.794872,0.25641,0.0,0.0,0.025641,0.0,0.0,0.0,...,0.0,0.25641,0.25641,0.25641,0.205128,0.0,0.0,0.025641,4,0
3,0.0,0.184615,0.815385,0.169231,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.169231,0.261538,0.215385,0.276923,0.030769,0.030769,0.015385,3,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038462,0.0,0.230769,0.461538,0.269231,0.038462,0.0,0.0,5,0


In [79]:
numero_caracteristicas = dados_2017_2021.shape[1] - 1
X_enade = dados_2017_2021.iloc[:, 0:numero_caracteristicas]
y_enade = dados_2017_2021.iloc[:, -1]

In [80]:
lista_caracteristicas = enade_treino.columns.values

In [101]:
# NannyML approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
y_true_splits = []
y_score_splits = []

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
#    parametros_para_busca = {
#    'boosting_type': ['gbdt', 'dart', 'rf'],
#    'num_leaves': [21, 31, 51, 91, 131],
#    'learning_rate': [0.01, 0.05, 0.1],
#    # porcentagem de features para usar
#    'feature_fraction': [0.7, 0.8, 0.9],
#    # porcentagem de dados para usar em cada bagging
#    'bagging_fraction': [0.6, 0.7, 0.8],
#    # número de iterações até o próximo bagging
#    'bagging_freq': [2, 3, 5],
#    }

#    busca = HalvingGridSearchCV(lgbmClassifier, parametros_para_busca, scoring="f1").fit(treinamento_normalizado, y_enade_treino) 
#    estimator = busca.best_estimator_
    
    # LGBMClassifier é o modelo utilizado na biblioteca NannyML
    lgbmClassifier = LGBMClassifier(objective='binary', importance_type='split', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, 
                                    feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, colsample_bytree=None, subsample_freq=None, subsample=None)
    lgbmClassifier.fit(treinamento_normalizado, y_enade_treino)
            
    y_score_teste = lgbmClassifier.predict_proba(teste_normalizado)[:, 1]
    
    y_true_splits.extend(y_enade_teste)
    y_score_splits.extend(y_score_teste)
    
    print(f"\nFolder {i}")
    print(f"AUC score: {roc_auc_score(y_enade_teste, y_score_teste)}")

result = roc_auc_score(y_true_splits, y_score_splits)
print(f"\nAUC score Total: {result}")


Folder 0
AUC score: 0.8729137009758412

Folder 1
AUC score: 0.8714179612551836

Folder 2
AUC score: 0.8824554888490024

Folder 3
AUC score: 0.8569866518124241

Folder 4
AUC score: 0.8896040931691115

AUC score Total: 0.8745018670957891


In [103]:
# NannyML approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
top_3 = [0, 0, 0]
media_auc = 0
media_drift = 0

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
#    parametros_para_busca = {
#    'boosting_type': ['gbdt', 'dart', 'rf'],
#    'num_leaves': [21, 31, 51, 91, 131],
#    'learning_rate': [0.01, 0.05, 0.1],
#    # porcentagem de features para usar
#    'feature_fraction': [0.7, 0.8, 0.9],
#    # porcentagem de dados para usar em cada bagging
#    'bagging_fraction': [0.6, 0.7, 0.8],
#    # número de iterações até o próximo bagging
#    'bagging_freq': [2, 3, 5],
#    }

#    busca = HalvingGridSearchCV(lgbmClassifier, parametros_para_busca, scoring="f1").fit(treinamento_normalizado, y_enade_treino) 
#    estimator = busca.best_estimator_
    
    model = HistGradientBoostingClassifier(max_depth=2, max_iter=10, random_state=42)
    model.fit(treinamento_normalizado, y_enade_treino)
    
    result = permutation_importance(model, teste_normalizado, y_enade_teste, scoring="accuracy",
                            n_repeats=30,
                            random_state=42)

    feature_importance = result.importances_mean
    total = feature_importance.sum()
    feature_importance = feature_importance / total
        
    # importances_mean: Mean of feature importance over n_repeats.
    # importances_std: Standard deviation over n_repeats.
    # argsort: retorna uma lista com os indices correspondentes ao resultado da ordenação por valor, do array em questão
    # [::-1]: retorna a lista na ordem inversa
    # [:5]: Os cinco primeiros elementos
    print(f"\nFolder {i}")
    for i in feature_importance.argsort()[::-1][0:10]:
        match lista_caracteristicas[i]:
            case "Nenhum":
                print(f"Atributo: {lista_caracteristicas[i]}"
                      f"\n\tMédia: {feature_importance[i]:.3f}")
                
                top_3[0] += feature_importance[i]
            case "Não sei responder.2":
                print(f"Atributo: {lista_caracteristicas[i]}"
                      f"\n\tMédia: {feature_importance[i]:.3f}")
                
                top_3[1] += feature_importance[i]
            case "Numero_Participantes":
                print(f"Atributo: {lista_caracteristicas[i]}"
                      f"\n\tMédia: {feature_importance[i]:.3f}")
                
                top_3[2] += feature_importance[i]
        
    
    y_score_teste = model.predict_proba(teste_normalizado)[:, 1]
    auc = roc_auc_score(y_enade_teste, y_score_teste)
    drift = max(2 * auc - 1, 0)
    
    media_auc += auc
    media_drift += drift

    print(f"AUC score: {auc}")
    print(f"Drift score: {drift}")
    
print("\nMédias das iterações")
print(f"AUC score: {media_auc/5}")
print(f"Drift score: {media_drift/5}")
print(f"Nenhum: {top_3[0]/5}")
print(f"Não sei responder.2: {top_3[1]/5}")
print(f"Numero_Participantes: {top_3[2]/5}")


Folder 0
Atributo: Nenhum
	Média: 0.399
Atributo: Não sei responder.2
	Média: 0.289
Atributo: Numero_Participantes
	Média: 0.158
AUC score: 0.7706275474608012
Drift score: 0.5412550949216024

Folder 1
Atributo: Nenhum
	Média: 0.526
Atributo: Numero_Participantes
	Média: 0.259
Atributo: Não sei responder.2
	Média: 0.191
AUC score: 0.7639880843094774
Drift score: 0.5279761686189548

Folder 2
Atributo: Nenhum
	Média: 0.450
Atributo: Não sei responder.2
	Média: 0.285
Atributo: Numero_Participantes
	Média: 0.187
AUC score: 0.7612730877978923
Drift score: 0.5225461755957845

Folder 3
Atributo: Nenhum
	Média: 0.530
Atributo: Não sei responder.2
	Média: 0.256
Atributo: Numero_Participantes
	Média: 0.090
AUC score: 0.763561212499541
Drift score: 0.527122424999082

Folder 4
Atributo: Nenhum
	Média: 0.416
Atributo: Numero_Participantes
	Média: 0.226
Atributo: Não sei responder.2
	Média: 0.223
AUC score: 0.7499173796496897
Drift score: 0.4998347592993795

Médias das iterações
AUC score: 0.7618734