In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import HistGradientBoostingClassifier

import matplotlib.pyplot as plt
import warnings

In [3]:
warnings.filterwarnings('ignore')

In [18]:
enade_treino = pd.read_csv("../tabelas_finais/tabela_final_2011_treinamento.csv")
enade_treino["origin"] = 0

In [19]:
enade_teste = pd.read_csv("../tabelas_finais/tabela_final_2014_treinamento.csv")
enade_teste["origin"] = 1

In [20]:
dados_2017_2021 = pd.concat([enade_treino, enade_teste])

# No NannyML parece que não randomiza por causa da separação em chunks
#dados_2017_2021 = dados_2017_2021.sample(frac = 1, ignore_index=True)

dados_2017_2021.tail()

Unnamed: 0,Codigo_do_Curso,Numero_Notas_Invalidas,Numero_Faltantes,Numero_Participantes,ADS,BCC,EC,GTI,LCC,RC,...,Não sei responder_Plano_de_Ensino,nulos_Formacao,Muito boa_Formacao,Boa_Formacao,Regular_Formacao,Fraca_Formacao,Muito fraca_Formacao,Não sei responder_Formacao,Nota_Conceito_Faixa,origin
1500,5001073,0.0,0.034483,0.965517,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.448276,0.344828,0.172414,0.0,0.034483,0.0,3,1
1501,5001087,0.0,0.130435,0.869565,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.130435,0.173913,0.130435,0.434783,0.043478,0.0,0.086957,4,1
1502,5001088,0.0,0.064516,0.935484,0.0,0.0,0.0,0.0,1.0,0.0,...,0.016129,0.0,0.645161,0.145161,0.16129,0.016129,0.0,0.032258,3,1
1503,5001103,0.0,0.043478,0.956522,0.0,0.0,0.0,0.0,1.0,0.0,...,0.043478,0.0,0.913043,0.043478,0.043478,0.0,0.0,0.0,2,1
1504,5001162,0.0,0.422222,0.577778,0.0,0.0,0.0,0.0,1.0,0.0,...,0.044444,0.311111,0.311111,0.066667,0.222222,0.066667,0.022222,0.0,3,1


In [21]:
numero_caracteristicas = dados_2017_2021.shape[1] - 1
X_enade = dados_2017_2021.iloc[:, 0:numero_caracteristicas]
y_enade = dados_2017_2021.iloc[:, -1]

In [22]:
lista_caracteristicas = enade_treino.columns.values

In [16]:
# NannyML approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
y_true_splits = []
y_score_splits = []

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
    # LGBMClassifier é o modelo utilizado na biblioteca NannyML
    lgbmClassifier = LGBMClassifier(objective='binary', importance_type='split', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, 
                                    feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, colsample_bytree=None, subsample_freq=None, subsample=None)
    lgbmClassifier.fit(treinamento_normalizado, y_enade_treino)
            
    y_score_teste = lgbmClassifier.predict_proba(teste_normalizado)[:, 1]
    
    y_true_splits.extend(y_enade_teste)
    y_score_splits.extend(y_score_teste)
    
    print(f"\nFolder {i}")
    print(f"AUC score: {roc_auc_score(y_enade_teste, y_score_teste)}")

result = roc_auc_score(y_true_splits, y_score_splits)
print(f"\nAUC score Total: {result}")


Folder 0
AUC score: 0.909005288705354

Folder 1
AUC score: 0.8762380848883782

Folder 2
AUC score: 0.9058382753231699

Folder 3
AUC score: 0.9110545326585318

Folder 4
AUC score: 0.8943707889589219

Folder 5
AUC score: 0.9086326988956852

Folder 6
AUC score: 0.9203278790325085

Folder 7
AUC score: 0.8922076981194563

Folder 8
AUC score: 0.9087154966311671

Folder 9
AUC score: 0.8953747115016404

AUC score Total: 0.9016546092464371


In [23]:
# Deepchecks approach

stratifiedSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
top_3 = [0, 0, 0]
media_auc = 0
media_drift = 0

for i, (train_index, test_index) in enumerate(stratifiedSplit.split(X_enade, y_enade)):
    enade_treino = dados_2017_2021.iloc[train_index]
    X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
    y_enade_treino = enade_treino.iloc[:, -1]

    enade_teste = dados_2017_2021.iloc[test_index]
    X_enade_teste = enade_teste.iloc[:, 0:numero_caracteristicas]
    y_enade_teste = enade_teste.iloc[:, -1]
    
    normalizador_treinamento = StandardScaler()
    normalizador_treinamento.fit(X_enade_treino)
    treinamento_normalizado = normalizador_treinamento.transform(X_enade_treino)
    
    normalizador_teste = StandardScaler()
    normalizador_teste.fit(X_enade_teste)
    teste_normalizado = normalizador_teste.transform(X_enade_teste)
    
    model = HistGradientBoostingClassifier(max_depth=2, max_iter=10, random_state=42)
    model.fit(treinamento_normalizado, y_enade_treino)
    
    result = permutation_importance(model, teste_normalizado, y_enade_teste, scoring="accuracy",
                            n_repeats=30,
                            random_state=42)

    feature_importance = result.importances_mean
    total = feature_importance.sum()
    feature_importance = feature_importance / total
        
    # importances_mean: Mean of feature importance over n_repeats.
    # importances_std: Standard deviation over n_repeats.
    # argsort: retorna uma lista com os indices correspondentes ao resultado da ordenação por valor, do array em questão
    # [::-1]: retorna a lista na ordem inversa
    # [:5]: Os cinco primeiros elementos
    print(f"\nFolder {i}")
    for i in feature_importance.argsort()[::-1][0:5]:
        print(f"Atributo: {lista_caracteristicas[i]}"
              f"\n\tMédia: {feature_importance[i]:.3f}")
        
    y_score_teste = model.predict_proba(teste_normalizado)[:, 1]
    auc = roc_auc_score(y_enade_teste, y_score_teste)
    drift = max(2 * auc - 1, 0)
    
    media_auc += auc
    media_drift += drift

    print(f"AUC score: {auc}")
    print(f"Drift score: {drift}")
    
print("\nMédias das iterações")
print(f"AUC score: {media_auc/5}")
print(f"Drift score: {media_drift/5}")


Folder 0
Atributo: nulos_Condicao_Salas
	Média: 0.773
Atributo: Exterior_Tipo_Escola_EM
	Média: 0.138
Atributo: Não sei responder_Dominio_Professores
	Média: 0.063
Atributo: nulos_Disponibilidade_Professores
	Média: 0.039
Atributo: Nota_Conceito_Faixa
	Média: 0.000
AUC score: 0.9757574108999467
Drift score: 0.9515148217998934

Folder 1
Atributo: nulos_Disponibilidade_Professores
	Média: 0.788
Atributo: Exterior_Tipo_Escola_EM
	Média: 0.149
Atributo: Não sei responder_Dominio_Professores
	Média: 0.054
Atributo: nulos_Qtde_Livros
	Média: 0.009
Atributo: nulos_Politica_de_Ingresso
	Média: 0.001
AUC score: 0.973551066299144
Drift score: 0.947102132598288

Folder 2
Atributo: nulos_Condicao_Salas
	Média: 0.655
Atributo: nulos_Qtde_Livros
	Média: 0.141
Atributo: Exterior_Tipo_Escola_EM
	Média: 0.114
Atributo: Não sei responder_Dominio_Professores
	Média: 0.053
Atributo: nulos_Disponibilidade_Professores
	Média: 0.030
AUC score: 0.9737263649112626
Drift score: 0.9474527298225253

Folder 3
Atr