In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import HistGradientBoostingClassifier

import matplotlib.pyplot as plt
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
enade_treino = pd.read_csv("../../concept_drift/tabela_final_2017_tres_anos_treinamento.csv")
enade_treino["origin"] = 0

In [4]:
enade_teste = pd.read_csv("../../concept_drift/tabela_final_2021_tres_anos_treinamento.csv")
enade_teste["origin"] = 1

In [12]:
dados_2017_2021 = pd.concat([enade_treino, enade_teste])

# No NannyML parece que não randomiza por causa da separação em chunks
#dados_2017_2021 = dados_2017_2021.sample(frac = 1, ignore_index=True)

dados_2017_2021.tail()

Unnamed: 0,Numero_Notas_Invalidas,Numero_Faltantes,Numero_Participantes,ADS,BCC,EC,GTI,LCC,RC,SI,...,Disc. totalmente.41,Discordo.41,Disc. parc..41,Concordo parc..41,Concordo.41,Concordo Total..41,Não sei responder.41,Não se aplica.43,Nota_Conceito_Faixa,origin
3,0.0,0.184615,0.815385,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.046154,0.061538,0.076923,0.292308,0.138462,0.215385,0.0,0.0,3,0
947,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,0
1174,0.0,0.277778,0.722222,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.111111,0.333333,0.222222,0.0,0.111111,0.055556,0.0,0.055556,3,0
476,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.142857,0.785714,0.071429,0.0,3,1
415,0.0,0.423077,0.576923,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.076923,0.051282,0.115385,0.24359,0.24359,0.025641,0.012821,4,1


In [6]:
numero_caracteristicas = dados_2017_2021.shape[1] - 1
X_enade = dados_2017_2021.iloc[:, 0:numero_caracteristicas]
y_enade = dados_2017_2021.iloc[:, -1]

In [7]:
lista_caracteristicas = enade_treino.columns.values

In [8]:
# NannyML approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
y_true_splits = []
y_score_splits = []

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
    # LGBMClassifier é o modelo utilizado na biblioteca NannyML
    lgbmClassifier = LGBMClassifier(objective='binary', importance_type='split', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, 
                                    feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, colsample_bytree=None, subsample_freq=None, subsample=None)
    lgbmClassifier.fit(treinamento_normalizado, y_enade_treino)
            
    y_score_teste = lgbmClassifier.predict_proba(teste_normalizado)[:, 1]
    
    y_true_splits.extend(y_enade_teste)
    y_score_splits.extend(y_score_teste)
    
    print(f"\nFolder {i}")
    print(f"AUC score: {roc_auc_score(y_enade_teste, y_score_teste)}")

result = roc_auc_score(y_true_splits, y_score_splits)
print(f"\nAUC score Total: {result}")


Folder 0
AUC score: 0.9135384647229898

Folder 1
AUC score: 0.9056105815505946

Folder 2
AUC score: 0.9214352987445793

Folder 3
AUC score: 0.8872501837074755

Folder 4
AUC score: 0.9025367156208277

Folder 5
AUC score: 0.9031990975046833

Folder 6
AUC score: 0.9146251850011902

Folder 7
AUC score: 0.9317746659628858

Folder 8
AUC score: 0.9164260357479224

Folder 9
AUC score: 0.9157533041471316

AUC score Total: 0.9108705146914231


In [11]:
# Deepchecks approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
top_3 = [0, 0, 0]
media_auc = 0
media_drift = 0

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
    model = HistGradientBoostingClassifier(max_depth=2, max_iter=10, random_state=42)
    model.fit(treinamento_normalizado, y_enade_treino)
    
    result = permutation_importance(model, teste_normalizado, y_enade_teste, scoring="accuracy",
                            n_repeats=30,
                            random_state=42)

    feature_importance = result.importances_mean
    total = feature_importance.sum()
    feature_importance = feature_importance / total
        
    # importances_mean: Mean of feature importance over n_repeats.
    # importances_std: Standard deviation over n_repeats.
    # argsort: retorna uma lista com os indices correspondentes ao resultado da ordenação por valor, do array em questão
    # [::-1]: retorna a lista na ordem inversa
    # [:5]: Os cinco primeiros elementos
    print(f"\nFolder {i}")
    for i in feature_importance.argsort()[::-1][0:5]:
        print(f"Atributo: {lista_caracteristicas[i]}"
              f"\n\tMédia: {feature_importance[i]:.3f}")
        
    y_score_teste = model.predict_proba(teste_normalizado)[:, 1]
    auc = roc_auc_score(y_enade_teste, y_score_teste)
    drift = max(2 * auc - 1, 0)
    
    media_auc += auc
    media_drift += drift

    print(f"AUC score: {auc}")
    print(f"Drift score: {drift}")
    
print("\nMédias das iterações")
print(f"AUC score: {media_auc/5}")
print(f"Drift score: {media_drift/5}")


Folder 0
Atributo: Nenhum.2
	Média: 0.408
Atributo: Não sei responder.26
	Média: 0.220
Atributo: Disc. totalmente.38
	Média: 0.080
Atributo: Não sei responder.41
	Média: 0.072
Atributo: Prog CSF
	Média: 0.071
AUC score: 0.8111120821297813
Drift score: 0.6222241642595625

Folder 1
Atributo: Nenhum.2
	Média: 0.408
Atributo: Não sei responder.26
	Média: 0.267
Atributo: Disc. totalmente.38
	Média: 0.135
Atributo: Numero_Participantes
	Média: 0.095
Atributo: Prog CSF
	Média: 0.034
AUC score: 0.7946180523614151
Drift score: 0.5892361047228303

Folder 2
Atributo: Nenhum.2
	Média: 0.398
Atributo: Não sei responder.26
	Média: 0.213
Atributo: Numero_Participantes
	Média: 0.136
Atributo: Não sei responder.34
	Média: 0.062
Atributo: Disc. totalmente.38
	Média: 0.062
AUC score: 0.7954045774842234
Drift score: 0.5908091549684469

Folder 3
Atributo: Nenhum.2
	Média: 0.379
Atributo: Não sei responder.26
	Média: 0.243
Atributo: Não se aplica.36
	Média: 0.119
Atributo: Prog CSF
	Média: 0.085
Atributo: 