In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz 

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.metrics import top_k_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score

### Carregamento do conjunto de dados de todos os anos

In [2]:
ano = 2021
enade_ano = pd.read_csv(f"../../concept_drift/tabela_final_{ano}_treinamento.csv")

### Cross-validation (k = 5)

In [3]:
k = 5

In [4]:
array_folds = np.array_split(enade_ano, k)

In [10]:
for i_fold in range(k):
    
    # Separação dos folds
    
    folds_treinamento = array_folds.copy()
    fold_teste = array_folds[i_fold]
    
    del folds_treinamento[i_fold]
    
    folds_treinamento = pd.concat(folds_treinamento, sort=False)
    
    # Separação em X e y
    
    numero_caracteristicas = enade_ano.shape[1] - 1
    
    X_folds_treinamento = folds_treinamento.iloc[:, 0:numero_caracteristicas]
    y_folds_treinamento = folds_treinamento.iloc[:, -1]
    
    X_fold_teste = fold_teste.iloc[:, 0:numero_caracteristicas]
    y_fold_teste = fold_teste.iloc[:, -1]
    
    # Normalização de treino e teste
    
    normalizador_treinamento = StandardScaler()
    
    normalizador_treinamento.fit(X_folds_treinamento)
    treinamento_normalizado = normalizador_treinamento.transform(X_folds_treinamento)
    
    normalizador_teste = StandardScaler()
    
    normalizador_teste.fit(X_fold_teste)
    teste_normalizado = normalizador_teste.transform(X_fold_teste)
    
    # Busca por hyperparâmetros
    
    parametros_para_busca = {"criterion":['gini', 'entropy'],
                              "min_samples_leaf": [10, 30, 50],
                              "ccp_alpha": [0.001, 0.0015, 0.0017, 0.002]}
    
    classificador = tree.DecisionTreeClassifier()
    
    busca = HalvingGridSearchCV(classificador, parametros_para_busca, scoring="f1_weighted").fit(treinamento_normalizado, y_folds_treinamento)
    
    busca.best_estimator_.fit(treinamento_normalizado, y_folds_treinamento)
    y_predito = busca.best_estimator_.predict(teste_normalizado)
    
    melhor_classificador = str(busca.best_estimator_)
    importancia_features = busca.best_estimator_.feature_importances_
    resultado = f1_score(y_fold_teste, y_predito, average='weighted')
    
    lista_features = X_folds_treinamento.columns.tolist()
    
    df_importanca_feature = {}
    for i, feature in enumerate(lista_features):
        df_importanca_feature[feature] = round(importancia_features[i], 3)
    
    sorted_df = sorted(df_importanca_feature.items(), key=lambda x:x[1])
    
    print(f"Teste: fold {i_fold}")
    print(f"Classificador: \n\n{melhor_classificador}\n")
    print(f"Resultado: {resultado}\n")
    print(f"Características importantes: {sorted_df}\n")

Teste: fold 0
Classificador: 

DecisionTreeClassifier(ccp_alpha=0.0015, min_samples_leaf=10)

Resultado: 0.41401074702546015

Características importantes: [('Numero_Notas_Invalidas', 0.0), ('nulos_UF_Ensino_Medio', 0.0), ('RO', 0.0), ('AC', 0.0), ('AM', 0.0), ('RR', 0.0), ('PA', 0.0), ('AP', 0.0), ('TO', 0.0), ('MA', 0.0), ('PI', 0.0), ('CE', 0.0), ('RN', 0.0), ('PB', 0.0), ('PE', 0.0), ('AL', 0.0), ('SE', 0.0), ('BA', 0.0), ('MG', 0.0), ('ES', 0.0), ('RJ', 0.0), ('PR', 0.0), ('SC', 0.0), ('RS', 0.0), ('MS', 0.0), ('MT', 0.0), ('GO', 0.0), ('DF', 0.0), ('Não se aplica', 0.0), ('nulos_Tipo_Escola_EM', 0.0), ('Pública', 0.0), ('Exterior', 0.0), ('nulos_Modalidade_EM', 0.0), ('Tradicional', 0.0), ('Magistério', 0.0), ('Outra', 0.0), ('nulos_Sexo', 0.0), ('M', 0.0), ('F', 0.0), ('nulos_Cor_Raca', 0.0), ('Preta', 0.0), ('Amarela', 0.0), ('Indígena', 0.0), ('nulos_Estado_Civil', 0.0), ('Solteiro', 0.0), ('Casado', 0.0), ('Separado', 0.0), ('Viúvo', 0.0), ('nulos_Nl_Pai', 0.0), ('Nenhuma', 0.

### Visualização das mudanças nas características da árvore conforme aumentamos o valor de ccp_alpha

In [None]:
path = classfier.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)

In [None]:
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1)
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()

In [None]:
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

### Matriz de Confusão

In [None]:
y_pred = search.predict(X_test)

In [None]:
y_pred

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=search.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=search.classes_)
disp.plot()
plt.savefig('sample.png')
plt.show()