In [2]:
# PREPROCESS & TRAIN ----------------------------------------------------------
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on in the current process.
# NOTE(review): this also hides any warnings raised by the training code below;
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (classification_report, f1_score, make_scorer,
                             ConfusionMatrixDisplay)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# -----------------------------------------------------------------------------#
# 1. Load and basic cleaning
# -----------------------------------------------------------------------------#
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Name of the label column.
TARGET = "TIRED"

# The 15 feature columns fed to the model.
TOP15 = [
    "bpm",
    "calories",
    "resting_hr",
    "steps",
    "daily_temperature_variation",
    "minutesAsleep",
    "sedentary_minutes",
    "nightly_temperature",
    "minutesAwake",
    "sleep_light_ratio",
    "bmi",
    "full_sleep_breathing_rate",
    "nremhr",
    "rmssd",
    "sleep_deep_ratio",
]

# Read the CSV, discard the "TENSE/ANXIOUS" column, and keep only the rows
# that have a value for the target.
df = (
    pd.read_csv("../data/df_cleaned.csv")
    .drop(columns=["TENSE/ANXIOUS"])
    .dropna(subset=[TARGET])
)

print(f"Shape (amb target): {df.shape}")

# -----------------------------------------------------------------------------#
# 2. Dataset design
# -----------------------------------------------------------------------------#
# Feature matrix: an independent copy of the TOP15 columns.
X = df.loc[:, TOP15].copy()
# Label vector, cast to integer dtype.
y = df[TARGET].astype(int)

# -----------------------------------------------------------------------------#
# 3. TRAIN / TEST split
# -----------------------------------------------------------------------------#
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Relative class frequencies in the training labels.
print(
    "\nDistribució y_train:\n",
    y_train.value_counts(normalize=True),
)

# -----------------------------------------------------------------------------#
# 4. Preprocessing + SMOTE + MLP pipeline
# -----------------------------------------------------------------------------#
# Oversampler and classifier, both seeded for repeatability.
smote = SMOTE(random_state=42)
mlp = MLPClassifier(max_iter=500, random_state=42)

# Numeric branch: fill missing values with the median, then standardize.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# All 15 selected features are numeric, so there is a single "num" branch.
# To handle categorical columns, add a ("cat", cat_pipeline, cat_cols) entry.
preprocess = ColumnTransformer([("num", num_pipeline, TOP15)])

# Full chain. Preprocessing lives inside the pipeline to avoid data leakage.
pipe = ImbPipeline(
    [
        ("preprocess", preprocess),
        ("smote", smote),
        ("clf", mlp),
    ]
)

# -----------------------------------------------------------------------------#
# 5. Hyperparameter search
# -----------------------------------------------------------------------------#
# Grid over the MLP step ("clf") of the pipeline:
# 5 architectures x 3 alpha values x 2 learning rates = 30 candidates.
param_grid = {
    "clf__hidden_layer_sizes": [(32,), (64,), (32, 16), (100,), (100, 50)],
    "clf__alpha": [1e-4, 1e-3, 1e-2],
    "clf__learning_rate_init": [1e-3, 5e-3],
}

# Model selection is driven by the F1 score of class 1.
f1_cls1 = make_scorer(f1_score, pos_label=1)
cv      = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 30 candidates x 5 folds = 150 pipeline fits, run on all available cores.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=f1_cls1,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

gs.fit(X_train, y_train)

best_model = gs.best_estimator_
y_pred     = best_model.predict(X_test)

# f1_score expects (y_true, y_pred); pos_label is explicit so the test metric
# matches the scorer used during the search.
f1_test = f1_score(y_test, y_pred, pos_label=1)

# Report the cross-validated score of the winning candidate separately from
# the held-out test score, so the two are not confused.
print("\nMillor combinació:", gs.best_params_)
print("Millor F1 CV (classe 1):", gs.best_score_)
print("F1 TEST (classe 1):", f1_test)

# -----------------------------------------------------------------------------#
# 6. Evaluation on TEST
# -----------------------------------------------------------------------------#
# Per-class precision / recall / F1 on the test labels, to 4 decimals.
print("\n== CLASSIFICATION REPORT (TEST) ==")
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)

# Confusion matrix for labels 0 and 1, drawn from the test predictions.
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    labels=[0, 1],
    cmap="Blues",
)
cm_display.ax_.set_title("TIRED — Top‑15 MLP (GridSearch)")
plt.show()


Shape (amb target): (2290, 24)

Distribució y_train:
 TIRED
0    0.615721
1    0.384279
Name: proportion, dtype: float64
Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 