In [27]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz 

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.metrics import top_k_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [14]:
data = pd.read_csv("../teste_OHE_curso_treinamento.csv")

X = data.iloc[:, 0:225]
Y = data.iloc[:, -1]

In [None]:
mostrar_distribuicao(data, [1,2,3,4,5])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
pca = PCA(n_components=2)
pca.fit(X_test)
X_transformado = pca.transform(X_train)

In [None]:
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(X_transformado)

In [None]:
X_embedded.shape

In [16]:
scaler_train = StandardScaler()

In [17]:
scaler_train.fit(X_train)
normalized_train = scaler_train.transform(X_train)
normalized_train

array([[-0.03859939, -0.25927762,  0.26168505, ..., -0.01869269,
        -0.17718245,  0.79389553],
       [-0.03859939, -0.5443854 ,  0.54571519, ..., -0.47831106,
        -0.3534321 , -0.20274793],
       [-0.03859939,  1.50296002, -1.49389174, ..., -0.47831106,
        -0.3534321 , -0.46721672],
       ...,
       [-0.03859939, -0.71663802,  0.71731673, ..., -0.47831106,
        -0.3534321 ,  0.26007245],
       [-0.03859939, -1.09559378,  1.09484013, ...,  1.85425219,
        -0.3534321 ,  0.11461461],
       [-0.03859939, -0.64646103,  0.64740499, ...,  0.38560125,
        -0.3534321 , -0.03623055]])

In [18]:
scaler_test = StandardScaler()

In [19]:
scaler_test.fit(X_test)
normalized_test = scaler_test.transform(X_test)
normalized_test

array([[-0.10010154, -1.11727684,  1.11938765, ..., -0.45625419,
        -0.31319302, -0.4565165 ],
       [-0.10010154, -0.22195197,  0.22460016, ...,  0.20413202,
        -0.31319302, -0.4565165 ],
       [-0.10010154,  0.91212619, -0.90879733, ..., -0.45625419,
        -0.31319302, -0.4565165 ],
       ...,
       [-0.10010154,  0.12100298, -0.11814895, ...,  0.3048689 ,
        -0.31319302, -0.27487423],
       [-0.10010154, -0.3562507 ,  0.35881828, ..., -0.45625419,
        -0.31319302, -0.4565165 ],
       [-0.10010154,  2.36169979, -2.35750088, ..., -0.45625419,
        -0.31319302, -0.4565165 ]])

In [None]:
scaler_train_MM = MinMaxScaler()

normalized_train_MM = scaler_train_MM.fit_transform(X_train)
normalized_train_MM

In [None]:
scaler_test_MM = MinMaxScaler()

normalized_test_MM = scaler_test_MM.fit_transform(X_test)
normalized_test_MM

In [None]:
df_concat_X_y_train = pd.concat([X_train, y_train], axis=1)

In [None]:
def contar_classes(dataset, classes):
    dict_ocorrencias = {}
    for classe in classes:
        numero_de_ocorrencias = dataset["Nota_Conceito_Faixa"].value_counts()[classe]
        dict_ocorrencias[f"{classe}"] = numero_de_ocorrencias
    return dict_ocorrencias

def mostrar_distribuicao(dataset, classes):
    dict_ocorrencias = contar_classes(dataset, classes)
    
    plt.style.use('_mpl-gallery')
    
    x = dict_ocorrencias.keys()
   
    y = dict_ocorrencias.values()
    
    fig, ax = plt.subplots(figsize=(5,3))

    ax.bar(x, y, width=1, edgecolor="white")

    ax.set(xlim=(-1, 5), xticks=np.arange(0, 5, 1),
           ylim=(0, 3500), yticks=np.arange(0, 3500, 500))
    
    plt.show()

In [None]:
mostrar_distribuicao(df_concat_X_y_train, [1,2,3,4,5])

In [None]:
df_concat_X_y_test = pd.concat([X_test, y_test], axis=1)

In [None]:
mostrar_distribuicao(df_concat_X_y_test, [1,2,3,4,5])

In [20]:
classfier_HGS = tree.DecisionTreeClassifier(random_state = 42)

In [None]:
# Visualização das mudanças nas características da árvore conforme aumentamos o valor de ccp_alpha

In [None]:
path = classfier.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)

In [None]:
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1)
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()

In [None]:
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
# Término do experimento

In [None]:
classfier_HGS.get_params()

In [None]:
# Important parameters for HalvingGridSearchCV: param_grid (parameter's values of the estimator to test over the dataset), factor (3 -> selects the best 3 candidates parameters), agressive_elimination (True -> garantees that we will end up with factor candidates) and min_resources ('exhaust')
# Candidate = a parameter combination
# scoring = 'f1_score' ('accuracy_score' is set by default)
# Separate data to the HalvingGridSearchCV and for the test of the result of it
# error_score=0 -> prevents the search to fail during the process
# cv is set to 5 by default, using StratifiedKFold (used Kfold keeping the classes percentage in each fold)
# parm_grid = {"criterion":['gini', 'entropy'],
#               "random_state": 42,
#               "min_impurity_decrease": [0.1, 0.2, 0.25]
#               "min_samples_leaf": [10, 30, 50],
#               "ccp_alpha": [0.001, 0.0015, 0.0017, 0.002]
                

In [None]:
# 0 -> score: 0.23~0.24
#param_grid = {"criterion":['gini', 'entropy'],
#              "random_state": [42],
#              "min_impurity_decrease": [0.1, 0.2, 0.25],
#              "min_samples_leaf": [10, 30, 50],
#              "ccp_alpha": [0.001, 0.0015, 0.0017, 0.002]}

In [21]:
# 1 -> score: 0.42(train) 0.44(test)
param_grid = {"criterion":['gini', 'entropy'],
              "min_samples_leaf": [10, 30, 50],
              "ccp_alpha": [0.001, 0.0015, 0.0017, 0.002]}

In [None]:
# 2 -> score: 0.42(train e test)
param_grid = {"criterion":['gini', 'entropy'],
              "min_samples_leaf": [10, 30, 50]}

In [22]:
search_normalized = HalvingGridSearchCV(classfier_HGS, param_grid, scoring="f1_weighted").fit(normalized_train, y_train) 

In [23]:
search_normalized.best_params_ 

{'ccp_alpha': 0.0015, 'criterion': 'gini', 'min_samples_leaf': 50}

In [None]:
search_normalized.score(normalized_test, y_test)

In [31]:
classfier = tree.DecisionTreeClassifier(ccp_alpha=0.0015, criterion='gini', min_samples_leaf=50)

In [32]:
cv = KFold(n_splits=5)
dt_cv = cross_val_score(classfier, X, Y, cv=cv, scoring='f1_macro')

In [33]:
dt_cv

array([0.28277025, 0.2404459 , 0.26814106, 0.27564838, 0.25813067])

In [34]:
dt_cv.mean()

0.2650272532589847

In [None]:
y_score = search_normalized.predict_proba(normalized_test_MM)

In [None]:
y_score

In [None]:
top_k_accuracy_score(y_test, y_score, k=3, labels=[1, 2, 3, 4, 5])

In [None]:
# Matriz de Confusão

In [None]:
y_pred = search.predict(X_test)

In [None]:
y_pred

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=search.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=search.classes_)
disp.plot()
plt.savefig('sample.png')
plt.show()