# LifeSnaps Preprocessing Experiments

#### Import de llibreries necessaries

In [10]:
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform
import joblib

# Hyperparameter tuning
from xgboost import XGBClassifier

# Imbalanced data pipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Core utilities
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Model definitions
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Model selection and evaluation
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    learning_curve
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    make_scorer,
    roc_auc_score,
)

from ai_health_assistant.utils.train_helpers import train_models, append_results, plot_learning_curve, mat_confusio


# Configuració de pandas
pd.set_option('display.max_columns', None)

# Carrega de dades
df = pd.read_csv('../data/df_preprocessed.csv')
print(f"Shape: {df.shape}")


Shape: (2290, 42)


## Lectura de dades i split de train / test

In [11]:
# Comprovem quina es les estructura de les nostres dades faltants en el target
TARGET = 'TIRED'

df_complete = df.dropna(subset=[TARGET])

df_null_targets = df[df["TIRED"].isnull()]

print(f"Estructura dades amb target:\n{df_complete.shape}")
print(f"Estructura dades target null:\n{df_null_targets.shape}")

Estructura dades amb target:
(2290, 42)
Estructura dades target null:
(0, 42)


In [12]:
# Difinim X i el target y
# Prediccio de TIRED
X = df_complete.drop(columns=[TARGET])
y = df_complete[TARGET]

numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

print(f"\nCol. numeriques ({len(numerical_features)}): \n{numerical_features}")
print(f"Col. categoriques ({len(categorical_features)}): \n{categorical_features}")




Col. numeriques (41): 
['num__bmi', 'num__calories', 'num__steps', 'num__lightly_active_minutes', 'num__moderately_active_minutes', 'num__very_active_minutes', 'num__sedentary_minutes', 'num__resting_hr', 'num__minutes_below_default_zone_1', 'num__minutes_in_default_zone_1', 'num__minutes_in_default_zone_2', 'num__minutes_in_default_zone_3', 'num__minutesToFallAsleep', 'num__minutesAsleep', 'num__minutesAwake', 'num__minutesAfterWakeup', 'num__sleep_efficiency', 'num__sleep_deep_ratio', 'num__sleep_light_ratio', 'num__sleep_rem_ratio', 'num__sleep_wake_ratio', 'num__daily_temperature_variation', 'num__rmssd', 'num__spo2', 'num__full_sleep_breathing_rate', 'num__wake_after_sleep_pct', 'num__steps_norm_cal', 'num__deep_sleep_score', 'num__active_sedentary_ratio', 'num__sleep_activity_balance', 'num__bmi_hr_interaction', 'num__sleep_quality_index', 'num__hr_zone_variability', 'num__recovery_factor', 'num__sleep_eff_rmssd', 'num__active_to_rest_transition', 'num__active_to_total_ratio', '

### Train / Test Split

In [13]:
# Estratifiquem respecte un dels targets per tal d'assegurar el bon split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)


print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print('\n','--'*50)
print(f"\nDistribució train:\n{y_train.value_counts(normalize=True)}")
print(f"\nDistribució test:\n{y_test.value_counts(normalize=True)}")



Train shape: (1832, 41)
Test shape: (458, 41)

 ----------------------------------------------------------------------------------------------------

Distribució train:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Distribució test:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64


### Classifiers i Grids que utilitzarem

Anem modificant tant els hiperparametres com els classifiers per tal d'anar ajustant el overfitting que hem detectat que tenim

In [14]:
from ai_health_assistant.utils.model_config import get_classifier_config, PARAM_GRIDS, CLASSIFIERS, BALANCING_METHODS

## ENTRENAMENT DEL MODEL BASE

In [15]:
# # Provarem de apuntar al accuracy o al f1 score de la classe 1, 
# f1_cls1 = make_scorer(f1_score, pos_label=1)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

display_cols = ['Target', 'Experiment',	'Model', 'Train F1 (1)', 'Test F1 (1)',	'Train F1 (macro global)', 'Test F1 (macro global)', 'Train Accuracy', 'Test Accuracy']

balance_name = "SMOTETomek"
balance_method = BALANCING_METHODS[balance_name]



#### Filtre de prova amb alguns models

In [16]:
# Proves amb els models
PROVA_MODELS = ["BalancedRandomForest", "LGBM"]

# Filtre per no haver de entrenar tots el models
CLASSIFIERS_FILTER = {k: v for k, v in CLASSIFIERS.items() if k in PROVA_MODELS}

### Regressió Logistica

In [17]:

reg_results = []
reg_models = {}

reg_pipeline = Pipeline(steps=[
    ("classifier", LogisticRegression(
        max_iter=2000,            # convergència assegurada
        class_weight="balanced", # tracta l’imbalance de la classe 1
        solver="lbfgs",          # ràpid i estable per datasets petits/mitjans
    ))
])

reg_param_grid = {
    "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000]
}

best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
        X_train, 
        y_train,
        X_test,
        y_test, 
        reg_pipeline, 
        reg_param_grid,
        search_type='grid'
    )

reg_models["Logistic Regression"] = best_est

reg_results_df = append_results(
    reg_results,
    "Logistic Regression",
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="Entrenament basic"
)

display(reg_results_df[display_cols].round(4))




plt.show()


Train F1 (1): 0.5892 | Test F1 (1): 0.5722 | Train Acc: 0.6545 | Test Acc: 0.6507
              precision    recall  f1-score   support

         0.0     0.7346    0.6773    0.7048       282
         1.0     0.5404    0.6080    0.5722       176

    accuracy                         0.6507       458
   macro avg     0.6375    0.6426    0.6385       458
weighted avg     0.6600    0.6507    0.6538       458



Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,Entrenament basic,Logistic Regression,0.5892,0.5722,0.6455,0.6385,0.6545,0.6507


In [18]:
mat_confusio(
    "LogisticRegression",
    y_test,
    y_test_pred,
    )

In [19]:
plot_learning_curve(
    "Logistic Regression",
    best_est, 
    X_train, 
    y_train
    )

In [20]:
base_results = []
base_models = {}

for model, classifier in CLASSIFIERS_FILTER.items():
    pipeline = ImbPipeline([
        ("balance", balance_method),
        ("classifier", classifier)
    ])

    print(f"\n==== {model} ====")
    best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
        X_train,
        y_train,
        X_test,
        y_test, 
        pipeline,
        PARAM_GRIDS[model]
    )

    base_models[model] = best_est

    base_results_df = append_results(
    base_results,
    model,
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="Entrenament basic"
)


==== BalancedRandomForest ====

Train F1 (1): 0.8041 | Test F1 (1): 0.6250 | Train Acc: 0.8532 | Test Acc: 0.7249
              precision    recall  f1-score   support

         0.0     0.7617    0.8050    0.7828       282
         1.0     0.6562    0.5966    0.6250       176

    accuracy                         0.7249       458
   macro avg     0.7090    0.7008    0.7039       458
weighted avg     0.7212    0.7249    0.7221       458


==== LGBM ====

Train F1 (1): 0.9618 | Test F1 (1): 0.6516 | Train Acc: 0.9700 | Test Acc: 0.7314
              precision    recall  f1-score   support

         0.0     0.7829    0.7801    0.7815       282
         1.0     0.6497    0.6534    0.6516       176

    accuracy                         0.7314       458
   macro avg     0.7163    0.7168    0.7165       458
weighted avg     0.7317    0.7314    0.7316       458



In [21]:
display(base_results_df[display_cols].round(4))


Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,Entrenament basic,BalancedRandomForest,0.8041,0.625,0.8433,0.7039,0.8532,0.7249
1,TIRED,Entrenament basic,LGBM,0.9618,0.6516,0.9685,0.7165,0.97,0.7314


In [27]:
print(base_results_df[base_results_df['Model'] == "RandomForest"]['Best Params'].values[0])

IndexError: index 0 is out of bounds for axis 0 with size 0

In [22]:
# El model que vulguem inspecciona
model_name = "LGBM" # RandomForest, MLP, GradientBoosting, SVM
csf = base_models[model_name]


# Visualització del train
y_train_pred = csf.predict(X_train)

mat_confusio(
    f'Entrenament basic {model_name} (Train)',
    y_train,
    y_train_pred
)

# Visualització del test
y_test_pred = csf.predict(X_test)

mat_confusio(
    f"Entrenament basic {model_name} (Test)",
    y_test,
    y_test_pred
)

In [23]:
# Observem la corba d'aprenentatge:

plot_learning_curve(
    model_name,
    base_models[model_name],
    X_train,
    y_train
)

KeyboardInterrupt: 

## EXPERIMENT 1: Importancia de les caracteristiques

Entrenem RandomForest per indentificar les caracteristiques més importants (10-15), posteriorment entrenem els models utilitzant aquestes 10-15 característiques, per veure si augmenta el rendiment del model. Proavarem tambe amb permutation importances.

### Feature Importance

La Gini importance d’una feature és: La suma de totes les reduccions d’impuresa (Gini) que ha causat al llarg de tots els arbres i de totes les seves aparicions.

In [24]:
# Param grid & pipeline bàsic de random forest
rf_name = "BalancedRandomForest"

pipeline = ImbPipeline([
        ("classifier", CLASSIFIERS[rf_name])
    ])

# Entrenament del model
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
        X_train, 
        y_train,
        X_test,
        y_test, 
        pipeline, 
        PARAM_GRIDS[rf_name],
        search_type='grid'
    )


all_features = X_train.columns.tolist()

# Importàncies Gini i agregació per feature “base”
importances_raw = pd.Series(best_est.named_steps["classifier"].feature_importances_,index=all_features)

def base_name(feat):
    if feat.startswith("cat__") and "_" in feat[6:]:
        return feat.rsplit("_", 1)[0]
    return feat

agg_importances = (
    importances_raw.groupby(base_name).sum().sort_values(ascending=False)
)


# Top-10 i Top-15 importàncies

top10 = agg_importances.head(10).index.tolist()
top15 = agg_importances.head(15).index.tolist()
print("Top-10 features:", top10)
print("Top-15 features:", top15)


# Visualització de les Top-15 importàncies
plt.figure(figsize=(10, 6))
plt.bar(agg_importances.head(15).index, agg_importances.head(15).values)
plt.xticks(rotation=45, ha="right")
plt.title("Top-15 Gini Importances")
plt.xlabel("Feature")
plt.ylabel("Importància (Gini)")
plt.tight_layout()
plt.show()



Train F1 (1): 0.6946 | Test F1 (1): 0.6500 | Train Acc: 0.6621 | Test Acc: 0.6026
              precision    recall  f1-score   support

         0.0     0.9386    0.3794    0.5404       282
         1.0     0.4913    0.9602    0.6500       176

    accuracy                         0.6026       458
   macro avg     0.7149    0.6698    0.5952       458
weighted avg     0.7667    0.6026    0.5825       458

Top-10 features: ['num__minutes_in_default_zone_2', 'num__sedentary_minutes', 'num__very_active_minutes', 'num__moderately_active_minutes', 'num__hr_zone_variability', 'num__minutes_below_default_zone_1', 'num__minutes_in_default_zone_1', 'num__lightly_active_minutes', 'num__bmi', 'num__steps_norm_cal']
Top-15 features: ['num__minutes_in_default_zone_2', 'num__sedentary_minutes', 'num__very_active_minutes', 'num__moderately_active_minutes', 'num__hr_zone_variability', 'num__minutes_below_default_zone_1', 'num__minutes_in_default_zone_1', 'num__lightly_active_minutes', 'num__bmi', 'nu

### Rentrenament de models 10-15 millors features

In [25]:
# Definim els dos conjunts de features
feature_sets = {
    "Top10": top10,   # llista de 10 noms de feature “base”
    "Top15": top15    # llista de 15 noms de feature “base”
}

# Reentrenament i avaluació per a cada subset
fi_results = []
fi_models = {}

for label, feats in feature_sets.items():
    print(f"\nEntrenament models amb {label}")
    num_feats = [f for f in feats if f in numerical_features]
    cat_feats = [f for f in feats if f in categorical_features]

    for model, classifier in CLASSIFIERS_FILTER.items():

        print(f'==== {model} ====')
        pipe = ImbPipeline([
            ("balance", balance_method),
            ("classifier", classifier)
        ])


        best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
            X_train[feats],
            y_train,
            X_test[feats],
            y_test,
            pipe,
            PARAM_GRIDS[model]
        )

        fi_models[f"{model}_{label}"] = best_est

        append_results(
            fi_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"FI_{label}"
        )


Entrenament models amb Top10
==== BalancedRandomForest ====

Train F1 (1): 0.6569 | Test F1 (1): 0.5667 | Train Acc: 0.7697 | Test Acc: 0.7162
              precision    recall  f1-score   support

         0.0     0.7275    0.8617    0.7890       282
         1.0     0.6855    0.4830    0.5667       176

    accuracy                         0.7162       458
   macro avg     0.7065    0.6723    0.6778       458
weighted avg     0.7114    0.7162    0.7035       458

==== LGBM ====

Train F1 (1): 0.9288 | Test F1 (1): 0.5889 | Train Acc: 0.9438 | Test Acc: 0.6616
              precision    recall  f1-score   support

         0.0     0.7471    0.6809    0.7124       282
         1.0     0.5522    0.6307    0.5889       176

    accuracy                         0.6616       458
   macro avg     0.6497    0.6558    0.6506       458
weighted avg     0.6722    0.6616    0.6649       458


Entrenament models amb Top15
==== BalancedRandomForest ====

Train F1 (1): 0.7282 | Test F1 (1): 0.5968

In [26]:
feature_importance_results_df = pd.DataFrame(fi_results)

display(feature_importance_results_df[display_cols].round(4))

Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,FI_Top10,BalancedRandomForest,0.6569,0.5667,0.7418,0.6778,0.7697,0.7162
1,TIRED,FI_Top10,LGBM,0.9288,0.5889,0.9412,0.6506,0.9438,0.6616
2,TIRED,FI_Top15,BalancedRandomForest,0.7282,0.5968,0.7885,0.6928,0.8057,0.7227
3,TIRED,FI_Top15,LGBM,0.9368,0.6053,0.9479,0.6627,0.9503,0.6725


### Permutation Importance

Per avaluar la importància de les característiques del model. Serveix per determinar quines característiques tenen més impacte en el rendiment del model.

In [27]:
# Calcular permutation importance
best_rf_model=base_models["BalancedRandomForest"]
result = permutation_importance(
    best_rf_model,      # el teu model entrenat per al sol target
    X_test, 
    y_test,
    n_repeats=200,
    n_jobs=-1,
    random_state=42
)



perm_importances = (
    pd.Series(result.importances_mean, index=all_features).sort_values(ascending=False)
)


# Extreure Top-10 i Top-15
perm_top_features = {
    10: perm_importances.head(10).index.tolist(),
    15: perm_importances.head(15).index.tolist()
}


# Mostrem la taula i la gràfica  de Top-15

print("\nTop-15 features (Permutation):")
display(perm_importances.head(15).to_frame("Importancia"))


plt.figure(figsize=(10, 5))
top15 = perm_importances.head(15)
plt.bar(top15.index, top15.values)
plt.title("Top-15 Permutation Importance")
plt.ylabel("Mean F1 (test)")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()



Top-15 features (Permutation):


Unnamed: 0,Importancia
num__sedentary_minutes,0.043876
num__minutes_in_default_zone_2,0.036583
num__bmi,0.022336
num__full_sleep_breathing_rate,0.007183
num__recovery_factor,0.007063
num__daily_temperature_variation,0.005841
num__steps,0.005197
num__sleep_rem_ratio,0.005197
num__active_to_rest_transition,0.00488
num__rmssd,0.004814


### Reentrenament Permutation importance

In [28]:

# Resultats i models per a Top-k
perm_results = []
perm_models  = {}

for k in [10, 15]:
    sel_feats = perm_top_features[k]

    for model, classifier in CLASSIFIERS_FILTER.items():
        pipe = ImbPipeline([
            ("balance",        balance_method),
            ("classifier",   classifier)
        ])

        best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models( 
            X_train[sel_feats], 
            y_train,
            X_test[sel_feats],
            y_test, 
            pipe, 
            PARAM_GRIDS[model]
        )
        
        perm_models[f'{model}_Top{k}'] = best_est
      
        append_results(
            perm_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment= f"Perm_Top{k}"
        )


Train F1 (1): 0.7398 | Test F1 (1): 0.5893 | Train Acc: 0.8150 | Test Acc: 0.7140
              precision    recall  f1-score   support

         0.0     0.7397    0.8262    0.7806       282
         1.0     0.6573    0.5341    0.5893       176

    accuracy                         0.7140       458
   macro avg     0.6985    0.6802    0.6850       458
weighted avg     0.7080    0.7140    0.7071       458


Train F1 (1): 0.9403 | Test F1 (1): 0.6054 | Train Acc: 0.9531 | Test Acc: 0.6812
              precision    recall  f1-score   support

         0.0     0.7576    0.7092    0.7326       282
         1.0     0.5773    0.6364    0.6054       176

    accuracy                         0.6812       458
   macro avg     0.6674    0.6728    0.6690       458
weighted avg     0.6883    0.6812    0.6837       458


Train F1 (1): 0.7985 | Test F1 (1): 0.6272 | Train Acc: 0.8488 | Test Acc: 0.7249
              precision    recall  f1-score   support

         0.0     0.7635    0.8014    0.782

In [29]:
perm_topk_results_df = pd.DataFrame(perm_results)
display(perm_topk_results_df[display_cols].round(4))

Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,Perm_Top10,BalancedRandomForest,0.7398,0.5893,0.7981,0.685,0.815,0.714
1,TIRED,Perm_Top10,LGBM,0.9403,0.6054,0.9508,0.669,0.9531,0.6812
2,TIRED,Perm_Top15,BalancedRandomForest,0.7985,0.6272,0.8388,0.7046,0.8488,0.7249
3,TIRED,Perm_Top15,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878


## EXPERIMENT 2: PCA


Es realitza una anàlisi de components principals (PCA) per examinar com evolucionen els components més rellevants del conjunt de dades en termes de variància explicada acumulada, considerant els primers 5, 10, 15, 20 i 25 components

In [30]:
pca = PCA(random_state=42)
pca.fit(X_train)

# 3) Calcular la varianza explicada acumulada
explained_cumsum = pca.explained_variance_ratio_.cumsum()*100

# 4) Definir los puntos de interés y extraer sus valores
ks = [5, 10, 15, 20, 25]
cums = explained_cumsum[[k-1 for k in ks]]

# 5) Dibujar la curva completa y señalar los ks elegidos
plt.figure(figsize=(8, 4))
plt.plot(
    range(1, len(explained_cumsum) + 1),
    explained_cumsum,
)
plt.scatter(ks, cums)
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title('Evolución de la varianza explicada según n_components')
plt.grid(True)
plt.tight_layout()
plt.show()

In [31]:
n_components_list = [5, 10, 15, 20, 25]
for k in n_components_list:
    # Ajusta PCA
    pca = PCA(n_components=k, random_state=42)
    pca.fit(X_train)

    # loadings: matriz (n_features, k)
    loadings = pca.components_.T

    # importancia = suma de cargas absolutes de cada feature en tots els components
    importance = np.sum(np.abs(loadings), axis=1)

    # crea DataFrame, ordena top-k
    df_imp = pd.DataFrame({
        'feature':    all_features,
        'importance': importance
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    topk = df_imp.head(k)

    plt.figure()
    plt.barh(topk['feature'][::-1], topk['importance'][::-1])
    plt.xlabel('Importancia (suma de |carregues|)')
    plt.title(f'Top {k} features segons PCA')
    plt.tight_layout()
    plt.show()

Aquests gràfics mostren, per cada valor de *k* (5, 10, 15, 20 i 25 components), quines variables original s’aporten més a l’espai de la PCA i, per tant, expliquen més variància del conjunt de dades.

* **Components principals**: són noves variables creades com a combinacions lineals de les variables originals.
* **Càrregues (loadings)**: cada component té un coeficient per a cada variable; aquell coeficient indica quant “pesa” la variable en aquest eix.
* **Importància de la variable**: per a cada variable, sumem el valor absolut de les càrregues als primers *k* components. Una suma més alta vol dir que la variable contribueix de manera rellevant a la variació capturada per aquests *k* eixos.

Així podem veure quines són les *k* variables que més pesen ens diu quins atributs són més informatius (i quins, en canvi, aporten informació redundant).
Els tops *k* ajuden a identificar les característiques més representatives del dataset segons la PCA.


In [32]:
# Guarda resultats i models
pca_results = []
pca_models = {}


for k in n_components_list:
    print(f"\n-- PCA - {k} components --")

    for model, classifier in CLASSIFIERS_FILTER.items():
        # Pipeline amb preprocessor, SMOTE, PCA i classificador
        pipeline = ImbPipeline([
            ("balance",       balance_method),
            ("pca",          PCA(n_components=k, random_state=42)),
            ("classifier",   classifier)
        ])

        best_est, y_pred, report, best_params, best_score, best_params, best_score = train_models( 
        X_train, 
        y_train,
        X_test,
        y_test,
        pipeline, 
        PARAM_GRIDS[model],
        search_type="grid"
        )

        # Guarda el millor model
        pca_models[f'{model}_PCA{k}'] = best_est

        df_pca_results = append_results(
            pca_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"PCA_{k}",
        )



-- PCA - 5 components --

Train F1 (1): 0.7491 | Test F1 (1): 0.4985 | Train Acc: 0.8160 | Test Acc: 0.6354
              precision    recall  f1-score   support

         0.0     0.6910    0.7376    0.7136       282
         1.0     0.5287    0.4716    0.4985       176

    accuracy                         0.6354       458
   macro avg     0.6098    0.6046    0.6060       458
weighted avg     0.6286    0.6354    0.6309       458


Train F1 (1): 0.7073 | Test F1 (1): 0.5000 | Train Acc: 0.7118 | Test Acc: 0.4629
              precision    recall  f1-score   support

         0.0     0.6268    0.3156    0.4198       282
         1.0     0.3892    0.6989    0.5000       176

    accuracy                         0.4629       458
   macro avg     0.5080    0.5072    0.4599       458
weighted avg     0.5355    0.4629    0.4506       458


-- PCA - 10 components --

Train F1 (1): 0.8208 | Test F1 (1): 0.5784 | Train Acc: 0.8608 | Test Acc: 0.6594
              precision    recall  f1-score 

In [33]:
pca_results_df = pd.DataFrame(pca_results)
display(pca_results_df[display_cols].round(4))

Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,PCA_5,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
1,TIRED,PCA_5,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
2,TIRED,PCA_10,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
3,TIRED,PCA_10,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
4,TIRED,PCA_15,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
5,TIRED,PCA_15,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
6,TIRED,PCA_20,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
7,TIRED,PCA_20,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
8,TIRED,PCA_25,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
9,TIRED,PCA_25,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878


## Anàlisi de resultats

In [34]:
# --- Results Compilation ---
all_results_dfs = [base_results_df]
if 'feature_importance_results_df' in locals() and not feature_importance_results_df.empty: 
    all_results_dfs.append(feature_importance_results_df)

if 'perm_topk_results_df' in locals() and not perm_topk_results_df.empty: 
    all_results_dfs.append(perm_topk_results_df)

if 'pca_results_df' in locals() and not pca_results_df.empty: 
    all_results_dfs.append(pca_results_df)

if len(all_results_dfs) > 1:
    combined_results_df = pd.concat(all_results_dfs, ignore_index=True)
    print("\nCombined results from all experiments.")
else:
    print("\nOnly baseline results available.")
    combined_results_df = base_results_df

final_cols = ["Target", "Experiment", "Model", "Test Accuracy","Test F1 (1)", "Test F1 (macro global)","Best Params"]
 
combined_results_df = combined_results_df[final_cols]

# --- Analysis ---
print("\n--- Overall Performance Analysis (Sorted by Test F1-Macro) ---")
combined_results_sorted = combined_results_df.sort_values(by=["Test Accuracy"], ascending=[False]).reset_index(drop=True)


print(f"\n--- Target: {TARGET} ---")
display(combined_results_sorted[final_cols].head().round(4))


Combined results from all experiments.

--- Overall Performance Analysis (Sorted by Test F1-Macro) ---

--- Target: TIRED ---


Unnamed: 0,Target,Experiment,Model,Test Accuracy,Test F1 (1),Test F1 (macro global),Best Params
0,TIRED,Entrenament basic,LGBM,0.7314,0.6516,0.7165,"{'classifier__subsample': 0.7590327755184709, ..."
1,TIRED,Entrenament basic,BalancedRandomForest,0.7249,0.625,0.7039,"{'classifier__n_estimators': 1163, 'classifier..."
2,TIRED,Perm_Top15,BalancedRandomForest,0.7249,0.6272,0.7046,"{'classifier__n_estimators': 1163, 'classifier..."
3,TIRED,FI_Top15,BalancedRandomForest,0.7227,0.5968,0.6928,"{'classifier__n_estimators': 1163, 'classifier..."
4,TIRED,FI_Top10,BalancedRandomForest,0.7162,0.5667,0.6778,"{'classifier__n_estimators': 1163, 'classifier..."


In [35]:
tired_top5 = combined_results_sorted.head(5)

# Mostra la taula resum dels 5 millors
print(f"\nTop 5 models per a {TARGET} segons Test Accuracy:\n")
display(tired_top5[["Model","Test Accuracy","Test F1 (1)"]])

# Dibuixa la matriu de confusió de cada un
for model in tired_top5["Model"]:
    clf = base_models[model] 
    y_pred = clf.predict(X_test)  
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix — {TARGET} — {model}")
    plt.show()


Top 5 models per a TIRED segons Test Accuracy:



Unnamed: 0,Model,Test Accuracy,Test F1 (1)
0,LGBM,0.731441,0.651558
1,BalancedRandomForest,0.724891,0.625
2,BalancedRandomForest,0.724891,0.627219
3,BalancedRandomForest,0.722707,0.596825
4,BalancedRandomForest,0.716157,0.566667
