## Importando Libs

In [47]:
# Bibliotecas gerais
import pandas as pd
import pickle
import numpy as np
import os

# Modelos categóricos
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Modelos regressores
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    r2_score,
    mean_absolute_error,
    mean_squared_error
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## Extraindo dados

In [48]:
csv_name = "wine_quality.csv"
target_column = "type"

df = pd.read_csv("data/"+csv_name)

## Primeira visualização

In [49]:
display(df.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


## Tratando colunas não numéricas

In [50]:
for col, dtype in df.dtypes.items():
    if dtype == object:
        try:
            # Tentativa de conversão para datetime
            df[col] = pd.to_datetime(df[col], errors='raise')

            df[col+"_month_sin"] = np.sin(2*np.pi*df[col].dt.month/12)
            df[col+"_month_cos"] = np.cos(2*np.pi*df[col].dt.month/12)
            df[col+"_day_sin"]   = np.sin(2*np.pi*df[col].dt.day/31)
            df[col+"_day_cos"]   = np.cos(2*np.pi*df[col].dt.day/31)

            df = df.drop(col, axis=1)
        except Exception:
            n_categories = df[col].nunique()
            if n_categories <= 10 and col != target_column:
                ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                encoded = ohe.fit_transform(df[[col]])

                # Cria nomes de colunas legíveis
                encoded_cols = [f"{col}_{cat}" for cat in ohe.categories_[0]]

                # Concatena com o DataFrame original
                df = pd.concat([df.drop(columns=[col]), pd.DataFrame(encoded, columns=encoded_cols, index=df.index)], axis=1)
            elif col != target_column:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col])


  df[col] = pd.to_datetime(df[col], errors='raise')


## Separação teste e treino

In [51]:
X = df.drop(target_column, axis=1)
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,        # porcentagem para teste
    random_state=42,      # garante reprodutibilidade
)

## Setando os modelos a serem treinados

In [52]:
print(y.dtype)
if y.dtype == object or y.dtype == bool:
    
    result = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = result.ccp_alphas
    ccp_alphas = np.delete(ccp_alphas, -1)
    
    scoring = 'accuracy'
    
    models = {
        "Decision Tree Gini": DecisionTreeClassifier(criterion='gini', random_state=42),
        "Decision Tree Entropy": DecisionTreeClassifier(criterion='entropy', random_state=42),
        "Naive Bayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "BernoulliNB": BernoulliNB(),
    }

    param_grids = {
        "Decision Tree Gini": {
            'max_depth': [None, 10, 20, 30],
            'min_samples_leaf': [1, 5, 10],
            'min_samples_split': [2, 5, 10],
            'splitter': ['best', 'random'],
            'ccp_alpha': ccp_alphas,
            'class_weight': [None, 'balanced']
        },
        "Decision Tree Entropy": {
            'max_depth': [None, 10, 20, 30],
            'min_samples_leaf': [1, 5, 10],
            'min_samples_split': [2, 5, 10],
            'splitter': ['best', 'random'],
            'ccp_alpha': ccp_alphas,
            'class_weight': [None, 'balanced']
        },
        "Naive Bayes": {
                'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
            },
        "KNN": {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski', 'cosine'],
        },
        "BernoulliNB": {
            'alpha': [0.5, 1.0, 1.5],
            'binarize': [0.0, 0.5, 1.0],
            'fit_prior': [True, False]
        },
    }
else:
    
    scoring = 'r2'
    
    models = {
        "Linear Regressor": LinearRegression(),
        "KNN Regressor": KNeighborsRegressor(),
    }

    param_grids = {
        "Linear Regressor": {
        "fit_intercept": [True, False],
        "copy_X": [True],
        "positive": [False, True],
    },
    "KNN Regressor": {
        "n_neighbors": [3, 5, 7, 9, 11],
        "weights": ["uniform", "distance"],
        "p": [1, 2],  # 1 = Manhattan, 2 = Euclidiana
        "algorithm": ["auto", "ball_tree", "kd_tree"],
    },
    }

object


## Treinando os modelos

In [53]:
best_models = {}

for name, model in models.items():
    print(f"Treinando e ajustando {name}...")

    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=5,  # validação cruzada com 5 folds
        scoring=scoring,
        n_jobs=-1,  # usa todos os núcleos do processador
        verbose=1
    )
    grid.fit(X_train, y_train)  # treino

    print(f"Melhores parâmetros para {name}: {grid.best_params_}")
    print(f"Melhor {scoring} de validação: {grid.best_score_:.4f}")

    best_models[name] = grid.best_estimator_    


Treinando e ajustando Decision Tree Gini...
Fitting 5 folds for each of 6624 candidates, totalling 33120 fits
Melhores parâmetros para Decision Tree Gini: {'ccp_alpha': 0.00012809227729298028, 'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
Melhor accuracy de validação: 0.9861
Treinando e ajustando Decision Tree Entropy...
Fitting 5 folds for each of 6624 candidates, totalling 33120 fits
Melhores parâmetros para Decision Tree Entropy: {'ccp_alpha': 0.00037299625523600866, 'class_weight': None, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
Melhor accuracy de validação: 0.9867
Treinando e ajustando Naive Bayes...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Melhores parâmetros para Naive Bayes: {'var_smoothing': 1e-09}
Melhor accuracy de validação: 0.9715
Treinando e ajustando KNN...
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Melhores parâmetros 

## Mostrando as métricas de cada modelo

In [54]:
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    if scoring == 'accuracy':
        print(f"Acurácia no teste: {accuracy_score(y_test, y_pred):.4f}")
        print("Matriz de confusão:")
        print(confusion_matrix(y_test, y_pred))
        print("Relatório de classificação:")
        print(classification_report(y_test, y_pred))
    else:
        print(f"R² no teste: {r2_score(y_test, y_pred):.4f}")
        print(f"MAE (Erro Absoluto Médio): {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"MSE (Erro Quadrático Médio): {mean_squared_error(y_test, y_pred):.4f}")
        print(f"RMSE (Raiz do Erro Quadrático Médio): {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
        print("Resumo estatístico:")
        print(f"Valor real médio: {np.mean(y_test):.4f}")
        print(f"Valor previsto médio: {np.mean(y_pred):.4f}")
        print(f"Desvio padrão das previsões: {np.std(y_pred):.4f}")



=== Decision Tree Gini ===
Acurácia no teste: 0.9846
Matriz de confusão:
[[330  11]
 [  9 950]]
Relatório de classificação:
              precision    recall  f1-score   support

         red       0.97      0.97      0.97       341
       white       0.99      0.99      0.99       959

    accuracy                           0.98      1300
   macro avg       0.98      0.98      0.98      1300
weighted avg       0.98      0.98      0.98      1300


=== Decision Tree Entropy ===
Acurácia no teste: 0.9754
Matriz de confusão:
[[324  17]
 [ 15 944]]
Relatório de classificação:
              precision    recall  f1-score   support

         red       0.96      0.95      0.95       341
       white       0.98      0.98      0.98       959

    accuracy                           0.98      1300
   macro avg       0.97      0.97      0.97      1300
weighted avg       0.98      0.98      0.98      1300


=== Naive Bayes ===
Acurácia no teste: 0.9685
Matriz de confusão:
[[330  11]
 [ 30 929]]
Rel

## Definindo melhor modelo

In [55]:
if scoring == 'accuracy':
    best_model_name = max(best_models, key=lambda n: accuracy_score(y_test, best_models[n].predict(X_test)))
else:
    best_model_name = max(best_models, key=lambda n: r2_score(y_test, best_models[n].predict(X_test)))
best_model = best_models[best_model_name]

print(f"\nMelhor modelo geral: {best_model_name}")


Melhor modelo geral: Decision Tree Gini


## Salvando o melhor modelo

In [56]:
os.makedirs("models", exist_ok=True)

with open(f"models/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)