In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

import warnings

In [2]:
from sklearn.ensemble import VotingClassifier

In [3]:
warnings.filterwarnings('ignore')

In [4]:
dtClassifier = tree.DecisionTreeClassifier()
knnClassifier = KNeighborsClassifier()
rfClassifier = RandomForestClassifier()
svmClassifier = SVC(probability=True)
mlpClassifier = MLPClassifier()

In [5]:
votingClassifier = VotingClassifier(estimators=[('DT', dtClassifier), ('KNN', knnClassifier), 
                                                ('RF', rfClassifier), ('SVM', svmClassifier), 
                                                ('MLP', mlpClassifier)], voting='soft', weights=[1, 1, 2, 3, 3])

In [6]:
enade_todos_anos = pd.read_csv("../teste_OHE_curso_treinamento.csv")

In [7]:
k = 5

array_folds = np.array_split(enade_todos_anos, k)

for i_fold in range(k):
    
    # Separação dos folds
    
    folds_treinamento = array_folds.copy()
    fold_teste = array_folds[i_fold]
    
    del folds_treinamento[i_fold]
    
    folds_treinamento = pd.concat(folds_treinamento, sort=False)
    
    # Separação em X e y
    
    numero_caracteristicas = enade_todos_anos.shape[1] - 1
    
    X_folds_treinamento = folds_treinamento.iloc[:, 0:numero_caracteristicas]
    y_folds_treinamento = folds_treinamento.iloc[:, -1]
    
    X_fold_teste = fold_teste.iloc[:, 0:numero_caracteristicas]
    y_fold_teste = fold_teste.iloc[:, -1]
    
    # Normalização de treino e teste
    
    normalizador_treinamento = StandardScaler()
    
    normalizador_treinamento.fit(X_folds_treinamento)
    treinamento_normalizado = normalizador_treinamento.transform(X_folds_treinamento)
    
    normalizador_teste = StandardScaler()
    
    normalizador_teste.fit(X_fold_teste)
    teste_normalizado = normalizador_teste.transform(X_fold_teste)
    
    # parametros_para_busca = {"KNN__n_neighbors": [3, 5, 10],
    #                          "KNN__weights": ["uniform", "distance"],
    #                          "KNN__metric": ["euclidean", "manhattan", "nan_euclidean"],
    #                         "DT__criterion":['gini', 'entropy'],
    #                           "DT__min_samples_leaf": [10, 30, 50],
    #                           "DT__ccp_alpha": [0.001, 0.0015, 0.0017, 0.002],
    #                         "MLP__hidden_layer_sizes": [10, 20, 10], 
    #                           "MLP__activation": ["relu", "tanh"],
    #                           "MLP__solver": ["adam", "sgd"],
    #                           "MLP__learning_rate_init": [0.001, 0.01, 0.1],
    #                           "MLP__tol": [0.1, 0.001, 0.0001],
    #                           "MLP__alpha": [0.0001, 0.01, 0.1],
    #                           "MLP__batch_size": [32, 60, 100],
    #                         "RF__n_estimators": [60, 100, 200],
    #                           "RF__criterion":['gini', 'entropy'],
    #                           "RF__min_samples_leaf": [10, 30, 50],
    #                           "RF__ccp_alpha": [0.001, 0.0015, 0.0017, 0.002],
    #                         "SVM__C": [0.01, 0.1, 1, 10, 100], 
    #                           "SVM__kernel": ["linear", "poly", "rbf"],
    #                           "SVM__gamma": [0.1, 1, 10]}
    
#     buscaParam = HalvingGridSearchCV(votingClassifier, parametros_para_busca, scoring="f1_weighted").fit(treinamento_normalizado, y_folds_treinamento)
#     buscaParam.best_estimator_.fit(treinamento_normalizado, y_folds_treinamento)
    
#     y_predito = buscaParam.best_estimator_.predict(teste_normalizado)

    votingClassifier.fit(treinamento_normalizado, y_folds_treinamento)
    y_predito = votingClassifier.predict(teste_normalizado)
    
    resultado = f1_score(y_fold_teste, y_predito, average='weighted')
    
    # print(f"Teste: fold {i_fold}")
    # print(f"Classificador: \n\n{melhor_classificador}\n")
    print(f"{resultado} ", end="")

0.482060458808624 0.468899166095766 0.4801840964107076 0.4484126091660778 0.40554445848435267 