In [34]:
# Standard library
import os
import pickle
import warnings

# General-purpose libraries
import numpy as np
import pandas as pd

# Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Regression models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    r2_score,
    mean_absolute_error,
    mean_squared_error
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [13]:
# Column the models will predict, and the CSV file (read from the data/ folder) holding it.
target_column = "Performance Index"
csv_name = "Student_Performance.csv"

df = pd.read_csv(f"data/{csv_name}")

In [14]:
# Render the first rows of the loaded frame (head() defaults to five) as a rich table.
display(df.head())

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [15]:
# Turn every text (object-dtype) FEATURE column into numeric columns:
#   * values that parse as dates   -> cyclical sin/cos features for month and day-of-month
#   * at most 10 distinct values   -> one-hot columns named "<column>_<category>"
#   * anything else                -> integer codes from LabelEncoder
#
# The target column is deliberately skipped. The model-selection cell decides between
# classification and regression with `y.dtype == object`; label-encoding a text target
# here turned it into integers, so that check could never be true and the
# classification models were unreachable. scikit-learn classifiers accept string labels.
#
# NOTE(review): the encoders are fitted on the whole frame before the train/test split
# and are not kept afterwards, so the pickled model cannot be fed raw rows later —
# consider a sklearn Pipeline/ColumnTransformer if the model is to be reused.
MAX_ONE_HOT_CATEGORIES = 10

for col, dtype in df.dtypes.items():
    if dtype != object or col == target_column:
        continue

    # Probe whether the column holds dates. Only the parse itself is guarded, so a
    # failure while building the features below is no longer silently swallowed.
    try:
        with warnings.catch_warnings():
            # The probe is expected to fail on ordinary text; silence the UserWarning
            # pandas emits while trying to guess a date format.
            warnings.simplefilter("ignore", UserWarning)
            parsed = pd.to_datetime(df[col], errors='raise')
    except (ValueError, TypeError, OverflowError):
        parsed = None

    if parsed is not None and pd.api.types.is_datetime64_any_dtype(parsed):
        # Map month (period 12) and day (period 31) onto the unit circle so that
        # December/January and day 31/day 1 end up close to each other.
        month_angle = 2*np.pi*parsed.dt.month/12
        day_angle = 2*np.pi*parsed.dt.day/31

        df[col+"_month_sin"] = np.sin(month_angle)
        df[col+"_month_cos"] = np.cos(month_angle)
        df[col+"_day_sin"]   = np.sin(day_angle)
        df[col+"_day_cos"]   = np.cos(day_angle)

        df = df.drop(col, axis=1)
        continue

    if df[col].nunique() <= MAX_ONE_HOT_CATEGORIES:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = ohe.fit_transform(df[[col]])

        # Readable names for the generated indicator columns
        encoded_cols = [f"{col}_{cat}" for cat in ohe.categories_[0]]

        # Replace the original column with its one-hot expansion, keeping row alignment
        df = pd.concat([df.drop(columns=[col]), pd.DataFrame(encoded, columns=encoded_cols, index=df.index)], axis=1)
    else:
        # High-cardinality text: fall back to a single integer-coded column
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])


  df[col] = pd.to_datetime(df[col], errors='raise')


In [17]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Candidate `ccp_alpha` values for the decision-tree grids.
#
# The pruning path is only meaningful for a classification target, and fitting a
# DecisionTreeClassifier on a continuous target raises, so it is computed only under
# the same condition the model-selection cell uses to pick classifiers
# (`y.dtype == object`). Otherwise a neutral single-value grid (no pruning) is used so
# the name is always defined.
if y.dtype == object:
    # random_state fixed so the path (and therefore the grid) is reproducible
    result = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_train, y_train)
    # Drop the largest alpha — per the scikit-learn pruning guide it corresponds to the
    # trivial single-node tree.
    ccp_alphas = result.ccp_alphas[:-1]
    if ccp_alphas.size == 0:
        # GridSearchCV rejects an empty parameter list; fall back to "no pruning"
        ccp_alphas = np.array([0.0])
else:
    ccp_alphas = np.array([0.0])

In [23]:

# Choose the candidate estimators, their hyper-parameter grids and the CV metric.
# A non-object target is treated as regression (scored with R²); an object-dtype
# target is treated as classification (scored with accuracy).
if y.dtype != object:

    scoring = 'r2'

    models = {
        "Linear Regressor": LinearRegression(),
        "KNN Regressor": KNeighborsRegressor(),
    }

    param_grids = {
        "Linear Regressor": {
            "fit_intercept": [True, False],
            "copy_X": [True],
            "positive": [False, True],
        },
        "KNN Regressor": {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
            "p": [1, 2],  # 1 = Manhattan, 2 = Euclidean
            "algorithm": ["auto", "ball_tree", "kd_tree"],
        },
    }
else:

    scoring = 'accuracy'

    # Both decision trees search the same space; only the split criterion differs,
    # and that is fixed on the estimator itself.
    tree_grid = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 5, 10],
        'splitter': ['best', 'random'],
        'ccp_alpha': ccp_alphas,
        'class_weight': [None, 'balanced']
    }

    models = {
        "Decision Tree Gini": DecisionTreeClassifier(criterion='gini', random_state=42),
        "Decision Tree Entropy": DecisionTreeClassifier(criterion='entropy', random_state=42),
        "Naive Bayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "BernoulliNB": BernoulliNB(),
    }

    param_grids = {
        "Decision Tree Gini": dict(tree_grid),
        "Decision Tree Entropy": dict(tree_grid),
        "Naive Bayes": {
            'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
        },
        "KNN": {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski', 'cosine'],
        },
        "BernoulliNB": {
            'alpha': [0.5, 1.0, 1.5],
            'binarize': [0.0, 0.5, 1.0],
            'fit_prior': [True, False]
        },
    }

In [None]:
# Tune each candidate with an exhaustive grid search on the training split and keep
# the refitted best estimator of every model family, keyed by its display name.
search_options = dict(
    cv=5,             # 5-fold cross-validation
    scoring=scoring,
    n_jobs=-1,        # use every available CPU core
    verbose=1,
)

best_models = {}

for name, model in models.items():
    print(f"Treinando e ajustando {name}...")

    # fit() returns the fitted search object itself
    grid = GridSearchCV(model, param_grids[name], **search_options).fit(X_train, y_train)

    print(f"Melhores parâmetros para {name}: {grid.best_params_}")
    print(f"Melhor {scoring} de validação: {grid.best_score_:.4f}")

    best_models[name] = grid.best_estimator_


Treinando e ajustando Linear Regressor...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parâmetros para Linear Regressor: {'copy_X': True, 'fit_intercept': True, 'positive': False}
Melhor acurácia de validação: 0.9887
Treinando e ajustando KNN Regressor...
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Melhores parâmetros para KNN Regressor: {'algorithm': 'ball_tree', 'n_neighbors': 11, 'p': 2, 'weights': 'uniform'}
Melhor acurácia de validação: 0.9842


In [35]:
# Report held-out test metrics for every tuned model: regression metrics when the
# search optimised R², classification metrics when it optimised accuracy.
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")

    if scoring != 'accuracy':
        # MSE feeds both the MSE and the RMSE lines, so compute it once
        mse = mean_squared_error(y_test, y_pred)

        print(f"R² no teste: {r2_score(y_test, y_pred):.4f}")
        print(f"MAE (Erro Absoluto Médio): {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"MSE (Erro Quadrático Médio): {mse:.4f}")
        print(f"RMSE (Raiz do Erro Quadrático Médio): {np.sqrt(mse):.4f}")
        print("Resumo estatístico:")
        print(f"Valor real médio: {np.mean(y_test):.4f}")
        print(f"Valor previsto médio: {np.mean(y_pred):.4f}")
        print(f"Desvio padrão das previsões: {np.std(y_pred):.4f}")
        continue

    print(f"Acurácia no teste: {accuracy_score(y_test, y_pred):.4f}")
    print("Matriz de confusão:")
    print(confusion_matrix(y_test, y_pred))
    print("Relatório de classificação:")
    print(classification_report(y_test, y_pred))



=== Linear Regressor ===
R² no teste: 0.9890
MAE (Erro Absoluto Médio): 1.6111
MSE (Erro Quadrático Médio): 4.0826
RMSE (Raiz do Erro Quadrático Médio): 2.0206
Resumo estatístico:
Valor real médio: 54.8780
Valor previsto médio: 54.9578
Desvio padrão das previsões: 19.0249

=== KNN Regressor ===
R² no teste: 0.9852
MAE (Erro Absoluto Médio): 1.8638
MSE (Erro Quadrático Médio): 5.4670
RMSE (Raiz do Erro Quadrático Médio): 2.3382
Resumo estatístico:
Valor real médio: 54.8780
Valor previsto médio: 54.8415
Desvio padrão das previsões: 18.8617


In [31]:
# Pick the overall winner by cross-validated score on the TRAINING split.
#
# Ranking the models by their test-set score (as before) uses the held-out data for
# model selection, which makes the test metrics reported for the winner optimistically
# biased. Both supported metrics ('accuracy' and 'r2') are higher-is-better, so a
# single max() covers the classification and the regression case.
cv_scores = {
    model_name: cross_val_score(
        estimator, X_train, y_train,
        cv=5,             # same fold count as the grid search
        scoring=scoring,
        n_jobs=-1,
    ).mean()
    for model_name, estimator in best_models.items()
}

best_model_name = max(cv_scores, key=cv_scores.get)
best_model = best_models[best_model_name]

print(f"\nMelhor modelo geral: {best_model_name}")


Melhor modelo geral: Linear Regressor


In [36]:
# Persist the winning estimator with pickle under models/.
# The output folder is created first so a fresh checkout does not fail with
# FileNotFoundError. The file name is kept exactly as before (it embeds the model's
# display name, spaces included) so existing loaders keep working.
# NOTE(review): only the estimator is saved — whatever preprocessing was applied to the
# features before training must be reproduced by the consumer of this file.
os.makedirs("models", exist_ok=True)

with open(f"models/melhor_modelo_{best_model_name}.pkl", "wb") as f:
    pickle.dump(best_model, f)