# LifeSnaps Preprocessing Experiments

#### Import de llibreries necessaries

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Imbalanced data pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Core utilities
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Model definitions fora dels defints
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


############## IMPORTS DEL NOSTRE PAQUET ###################
# Obtenim les funcions per entrenar i evaluar els models i registrar les mètriques
from ai_health_assistant.utils.train_helpers import train_models, append_results, plot_learning_curve, mat_confusio, update_experiments_file

# Obtenim els classificadors i els seus parametres
from ai_health_assistant.utils.model_config import get_classifier_config, PARAM_GRIDS, CLASSIFIERS, BALANCING_METHODS

# Obtenim el target, features el la construcció del preprocessador
from ai_health_assistant.utils.prep_helpers import TARGET, build_preprocessor, FEATURES


# Configuració de pandas
pd.set_option('display.max_columns', None)

# Carrega de dades, netes i amb fe aplicat
df_train = pd.read_csv('../data/df_engineered_train.csv')
df_test = pd.read_csv('../data/df_engineered_test.csv')
    
print(f"Shape: {df_train.shape}")
print(f"Shape: {df_test.shape}")


Shape: (1832, 38)
Shape: (458, 38)


## Lectura de dades i split de train / test

### Train / Test Split

In [2]:
# Fem l'split de les dades, separant les features i el target
X_train = df_train.drop(columns=[TARGET])
y_train = df_train[TARGET]

X_test = df_test.drop(columns=[TARGET])
y_test = df_test[TARGET]

print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print('\n','--'*50)
print(f"\nDistribució train:\n{y_train.value_counts(normalize=True)}")
print(f"\nDistribució test:\n{y_test.value_counts(normalize=True)}")

numerical_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=['number']).columns.tolist()

print(f"\nCol. numeriques ({len(numerical_features)}): \n{numerical_features}")
print(f"Col. categoriques ({len(categorical_features)}): \n{categorical_features}")



Train shape: (1832, 37)
Test shape: (458, 37)

 ----------------------------------------------------------------------------------------------------

Distribució train:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Distribució test:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Col. numeriques (35): 
['bmi', 'calories', 'steps', 'lightly_active_minutes', 'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes', 'resting_hr', 'minutes_below_default_zone_1', 'minutes_in_default_zone_1', 'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'minutesAsleep', 'minutesAwake', 'sleep_efficiency', 'sleep_deep_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'sleep_wake_ratio', 'daily_temperature_variation', 'rmssd', 'spo2', 'full_sleep_breathing_rate', 'wake_after_sleep_pct', 'steps_norm_cal', 'deep_sleep_score', 'active_sedentary_ratio', 'sleep_activity_balance', 'bmi_hr_interaction', 'sleep_quality_index', 'hr_zone_variab

### Definim el preprocessador

In [3]:
preprocessor = build_preprocessor(df_train, FEATURES)

## ENTRENAMENT DEL MODEL BASE

In [4]:
display_cols = ['Experiment', 'Train F1 (1)', 'Train F1 (macro global)','Train Accuracy', 'Test Recall (1)', 'Test Precision (1)', 'Test F1 (1)', 'Test F1 (macro global)','Test Accuracy']

# --------------------------------------------------------------
# Filtre de selecció dels models a entrenar
PROVA_MODELS = ["BalancedRandomForest", "LGBM"]
# Selecció del mètode de balanceig
balance_name = "SMOTETomek"
# --------------------------------------------------------------

# Filtre per no haver de entrenar tots el models
CLASSIFIERS_FILTER = {k: v for k, v in CLASSIFIERS.items() if k in PROVA_MODELS}
balance_method = BALANCING_METHODS[balance_name]

### Regressió Logistica

In [5]:

reg_results = []
reg_models = {}

reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(
        max_iter=2000,            # convergència assegurada
        class_weight="balanced", # tracta l’imbalance de la classe 1
        solver="lbfgs",          # ràpid i estable per datasets petits/mitjans
    ))
])

reg_param_grid = {
    "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000]
}

best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
        X_train, 
        y_train,
        X_test,
        y_test, 
        reg_pipeline, 
        reg_param_grid,
        search_type='grid',
    )

reg_results_df = append_results(
    reg_results,
    "LogisticRegression",
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="EntrenamentBasic"
)

# update_experiments_file(reg_results_df)
display(reg_results_df[display_cols])
update_experiments_file(reg_results_df)

Entrenant model...

Train F1 (1): 0.5156 | Test F1 (1): 0.4800 | Train Acc: 0.5939 | Test Acc: 0.5742
              precision    recall  f1-score   support

         0.0     0.6680    0.6135    0.6396       282
         1.0     0.4523    0.5114    0.4800       176

    accuracy                         0.5742       458
   macro avg     0.5601    0.5624    0.5598       458
weighted avg     0.5851    0.5742    0.5782       458



Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,LogisticRegression_EntrenamentBasic,0.51562,0.583,0.59389,0.51136,0.45226,0.48,0.55978,0.57424



Métriques guardades a ../results/02_experiments/experiments.csv



In [6]:
base_results = []

for model, classifier in CLASSIFIERS_FILTER.items():

    if model == "BalancedRandomForest":
        pipeline = ImbPipeline([
            ("preprocessor", preprocessor),
            ("classifier", classifier)
        ])
    else:
        pipeline = ImbPipeline([
            ("preprocessor", preprocessor),
            ("balance", balance_method),
            ("classifier", classifier)
        ])

    print(f"\n==== {model} ====")
    best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
        X_train,
        y_train,
        X_test,
        y_test, 
        pipeline,
        PARAM_GRIDS[model]
    )

    base_results_df = append_results(
    base_results,
    model,
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="EntrenamentBasic"
)


==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458


==== LGBM ====
Entrenant model...

Train F1 (1): 0.8812 | Test F1 (1): 0.5736 | Train Acc: 0.9023 | Test Acc: 0.6266
              precision    recall  f1-score   support

         0.0     0.7382    0.6099    0.6680       282
         1.0     0.5111    0.6534    0.5736       176

    accuracy                         0.6266       458
   macro avg     0.6247    0.6317    0.6208       458
weighted avg     0.6509    0.6266    0.6317       458



In [7]:
update_experiments_file(base_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 1: Importancia de les caracteristiques

Entrenem RandomForest per indentificar les caracteristiques més importants (10-15), posteriorment entrenem els models utilitzant aquestes 10-15 característiques, per veure si augmenta el rendiment del model. Proavarem tambe amb permutation importances.

### Feature Importance

La Gini importance d’una feature és: La suma de totes les reduccions d’impuresa (Gini) que ha causat al llarg de tots els arbres i de totes les seves aparicions.

In [None]:
# Param grid & pipeline bàsic de random forest
rf_name = "BalancedRandomForest"

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", CLASSIFIERS[rf_name])
])

# Entrenament del model
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train, 
    y_train,
    X_test,
    y_test, 
    pipeline, 
    PARAM_GRIDS[rf_name],
    search_type='grid'
)
preprocessor = best_est.named_steps['preprocessor']

feature_names = preprocessor.get_feature_names_out()

# Serie em els noms
importances_raw = pd.Series(
    best_est.named_steps["classifier"].feature_importances_,
    index=feature_names
)

agg_importances = (
    importances_raw.groupby(feature_names).sum().sort_values(ascending=False)
)

# Top-10 i Top-15 importàncies

top10 = agg_importances.head(10).index.tolist()
top15 = agg_importances.head(15).index.tolist()
print("Top-10 features:", top10)
print("Top-15 features:", top15)

# Visualització de les Top-15 importàncies
plt.figure(figsize=(10, 6))
plt.bar(top15, agg_importances.head(15).values)
plt.xticks(rotation=45, ha="right")
plt.title("Top-15 Gini Importances")
plt.xlabel("Feature")
plt.ylabel("Importància (Gini)")
plt.tight_layout()
plt.show()


Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458

Top-10 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_variation', 'num__recovery_factor', 'num__hr_zone_variability', 'num__lightly_active_minutes', 'num__minutesAsleep']
Top-15 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_variation', 'num__recovery_factor', 'num__hr_zone_variability', 'num__lightly_active_minutes', 'num__minutesAsleep', 'num__active_to_total_ratio', 'num__sedentary_minutes', '

### Rentrenament de models 10-15 millors features

In [9]:
# Definim els dos conjunts de features
feature_sets = {
    "Top10": top10,   # llista de 10 noms de feature “base”
    "Top15": top15    # llista de 15 noms de feature “base”
}

# Reentrenament i avaluació per a cada subset
fi_results = []

preprocessor = build_preprocessor(X_train, FEATURES)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Generem el train transformat i el test transformat
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)


for label, feats in feature_sets.items():
    print(f"\nEntrenament models amb {label}")
    for model, classifier in CLASSIFIERS_FILTER.items():

        print(f'==== {model} ====')
        if model == "BalancedRandomForest": # Em dona millors resutats sense balance
            pipe = ImbPipeline([
                ("classifier", classifier)
            ])
        else:
            pipe = ImbPipeline([
                ("balance", balance_method),
                ("classifier", classifier)
            ])

        best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
            X_train_transformed[feats],
            y_train,
            X_test_transformed[feats],
            y_test,
            pipe,
            PARAM_GRIDS[model]
        )

        fi_results_df =append_results(
            fi_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"FI_{label}"
        )


Entrenament models amb Top10
==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.7248 | Test F1 (1): 0.5847 | Train Acc: 0.7156 | Test Acc: 0.5502
              precision    recall  f1-score   support

         0.0     0.7754    0.3794    0.5095       282
         1.0     0.4531    0.8239    0.5847       176

    accuracy                         0.5502       458
   macro avg     0.6142    0.6016    0.5471       458
weighted avg     0.6515    0.5502    0.5384       458

==== LGBM ====
Entrenant model...

Train F1 (1): 0.8374 | Test F1 (1): 0.5425 | Train Acc: 0.8597 | Test Acc: 0.5764
              precision    recall  f1-score   support

         0.0     0.7095    0.5284    0.6057       282
         1.0     0.4637    0.6534    0.5425       176

    accuracy                         0.5764       458
   macro avg     0.5866    0.5909    0.5741       458
weighted avg     0.6151    0.5764    0.5814       458


Entrenament models amb Top15
==== BalancedRandomForest ====
Entre

In [10]:
display(fi_results_df[display_cols])
update_experiments_file(fi_results_df)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_FI_Top10,0.72478,0.7153,0.71561,0.82386,0.45312,0.58468,0.5471,0.55022
1,LGBM_FI_Top10,0.83744,0.85703,0.85972,0.65341,0.46371,0.54245,0.57407,0.57642
2,BalancedRandomForest_FI_Top15,0.72263,0.70897,0.70961,0.80114,0.44904,0.57551,0.54362,0.54585
3,LGBM_FI_Top15,0.85605,0.87406,0.87664,0.64205,0.45382,0.53176,0.56323,0.5655



Métriques guardades a ../results/02_experiments/experiments.csv



### Permutation Importance

Per avaluar la importància de les característiques del model. Serveix per determinar quines característiques tenen més impacte en el rendiment del model.

In [11]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Train el random forest
rf_name = 'BalancedRandomForest'
clf_rf, param_grid_rf = get_classifier_config(rf_name)

# nou pipeline
pipe = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", clf_rf)
])

# Train el model
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models( 
    X_train, 
    y_train,
    X_test,
    y_test, 
    pipe, 
    param_grid_rf,
    search_type='grid'
)

best_rf_model = best_est

preprocessor = best_rf_model.named_steps['preprocessor']
classifier = best_rf_model.named_steps['classifier']

X_test_transformed = preprocessor.transform(X_test)

feature_names = preprocessor.get_feature_names_out()

# pipeline nomes classifier
final_estimator = Pipeline([
    ('classifier', classifier)
])

# Calculem la importància permutativa
result = permutation_importance(
    final_estimator,
    X_test_transformed, 
    y_test,
    n_repeats=200,
    n_jobs=-1,
    random_state=42
)

# Creem Series amb els noms de les caracteristiques
perm_importances = pd.Series(
    result.importances_mean, 
    index=feature_names
).sort_values(ascending=False)

# Top 10 i 15 features
perm_top_features = {
    10: perm_importances.head(10).index.tolist(),
    15: perm_importances.head(15).index.tolist()
}

print("\nTop-15 features (Permutation):")
display(perm_importances.head(15).to_frame("Importancia"))

# Plot
plt.figure(figsize=(12, 6))
top15 = perm_importances.head(15)
bars = plt.barh(top15.index, top15.values)
plt.title("Top-15 Permutation Importance", fontsize=14)
plt.xlabel("Mean Decrease in F1 Score", fontsize=12)
plt.gca().invert_yaxis()  # Most important features on top
plt.grid(axis='x', linestyle='--', alpha=0.6)

# Add value labels on the bars
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height()/2., 
             f'{width:.3f}', 
             ha='left', va='center')

plt.tight_layout()
plt.show()

Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458


Top-15 features (Permutation):


Unnamed: 0,Importancia
num__bmi,0.025666
num__recovery_factor,0.020742
num__minutesAsleep,0.019367
num__full_sleep_breathing_rate,0.013919
num__daily_temperature_variation,0.012609
num__minutes_in_default_zone_1,0.01143
num__wake_after_sleep_pct,0.010993
num__calories,0.00929
num__active_to_rest_transition,0.008319
num__rmssd,0.008166


### Reentrenament Permutation importance

In [16]:
# Definim els dos conjunts de features basats en importància de permutació
perm_sets = {
    "Perm_Top10": perm_top_features[10],  # Índexos de les 10 millors característiques
    "Perm_Top15": perm_top_features[15]   # Índexos de les 15 millors característiques
}

# Reentrenament i avaluació per a cada subset
perm_results = []

# Preprocessem un cop fora del bucle per eficiència
preprocessor = build_preprocessor(X_train, FEATURES)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convertim a DataFrame mantenint els índexs
feature_names = preprocessor.get_feature_names_out()
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

for label, feat_indices in perm_sets.items():
    print(f"\nEntrenament models amb {label}")
    
    # Obtenim els noms de les característiques seleccionades
    selected_feats = [feature_names[i] for i in feat_indices]
    
    for model_name, classifier in CLASSIFIERS_FILTER.items():
        print(f'==== {model_name} ====')
        
        if model_name == "BalancedRandomForest":
            pipe = ImbPipeline([
                ("classifier", classifier)
            ])
        else:
            pipe = ImbPipeline([
                ("balance", balance_method),
                ("classifier", classifier)
            ])

        best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
            X_train_transformed[selected_feats],
            y_train,
            X_test_transformed[selected_feats],
            y_test,
            pipe,
            PARAM_GRIDS[model_name]
        )

        perm_results_df = append_results(
            perm_results,
            model_name,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=label  # Usem directament l'etiqueta (Perm_Top10 o Perm_Top15)
        )


Entrenament models amb Perm_Top10


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [29]:
perm_topk_results_df = pd.DataFrame(perm_results)
display(perm_topk_results_df[display_cols].round(4))

Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,Perm_Top10,BalancedRandomForest,0.7398,0.5893,0.7981,0.685,0.815,0.714
1,TIRED,Perm_Top10,LGBM,0.9403,0.6054,0.9508,0.669,0.9531,0.6812
2,TIRED,Perm_Top15,BalancedRandomForest,0.7985,0.6272,0.8388,0.7046,0.8488,0.7249
3,TIRED,Perm_Top15,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878


In [None]:
update_experiments_file(base_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 2: PCA


Es realitza una anàlisi de components principals (PCA) per examinar com evolucionen els components més rellevants del conjunt de dades en termes de variància explicada acumulada, considerant els primers 5, 10, 15, 20 i 25 components

In [30]:
pca = PCA(random_state=42)
pca.fit(X_train)

# 3) Calcular la varianza explicada acumulada
explained_cumsum = pca.explained_variance_ratio_.cumsum()*100

# 4) Definir los puntos de interés y extraer sus valores
ks = [5, 10, 15, 20, 25]
cums = explained_cumsum[[k-1 for k in ks]]

# 5) Dibujar la curva completa y señalar los ks elegidos
plt.figure(figsize=(8, 4))
plt.plot(
    range(1, len(explained_cumsum) + 1),
    explained_cumsum,
)
plt.scatter(ks, cums)
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title('Evolución de la varianza explicada según n_components')
plt.grid(True)
plt.tight_layout()
plt.show()

In [31]:
n_components_list = [5, 10, 15, 20, 25]
for k in n_components_list:
    # Ajusta PCA
    pca = PCA(n_components=k, random_state=42)
    pca.fit(X_train)

    # loadings: matriz (n_features, k)
    loadings = pca.components_.T

    # importancia = suma de cargas absolutes de cada feature en tots els components
    importance = np.sum(np.abs(loadings), axis=1)

    # crea DataFrame, ordena top-k
    df_imp = pd.DataFrame({
        'feature':    all_features,
        'importance': importance
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    topk = df_imp.head(k)

    plt.figure()
    plt.barh(topk['feature'][::-1], topk['importance'][::-1])
    plt.xlabel('Importancia (suma de |carregues|)')
    plt.title(f'Top {k} features segons PCA')
    plt.tight_layout()
    plt.show()

Aquests gràfics mostren, per cada valor de *k* (5, 10, 15, 20 i 25 components), quines variables original s’aporten més a l’espai de la PCA i, per tant, expliquen més variància del conjunt de dades.

* **Components principals**: són noves variables creades com a combinacions lineals de les variables originals.
* **Càrregues (loadings)**: cada component té un coeficient per a cada variable; aquell coeficient indica quant “pesa” la variable en aquest eix.
* **Importància de la variable**: per a cada variable, sumem el valor absolut de les càrregues als primers *k* components. Una suma més alta vol dir que la variable contribueix de manera rellevant a la variació capturada per aquests *k* eixos.

Així podem veure quines són les *k* variables que més pesen ens diu quins atributs són més informatius (i quins, en canvi, aporten informació redundant).
Els tops *k* ajuden a identificar les característiques més representatives del dataset segons la PCA.


In [32]:
# Guarda resultats i models
pca_results = []
pca_models = {}


for k in n_components_list:
    print(f"\n-- PCA - {k} components --")

    for model, classifier in CLASSIFIERS_FILTER.items():
        # Pipeline amb preprocessor, SMOTE, PCA i classificador
        pipeline = ImbPipeline([
            ("balance",       balance_method),
            ("pca",          PCA(n_components=k, random_state=42)),
            ("classifier",   classifier)
        ])

        best_est, y_pred, report, best_params, best_score, best_params, best_score = train_models( 
        X_train, 
        y_train,
        X_test,
        y_test,
        pipeline, 
        PARAM_GRIDS[model],
        search_type="grid"
        )

        # Guarda el millor model
        pca_models[f'{model}_PCA{k}'] = best_est

        df_pca_results = append_results(
            pca_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"PCA_{k}",
        )



-- PCA - 5 components --

Train F1 (1): 0.7491 | Test F1 (1): 0.4985 | Train Acc: 0.8160 | Test Acc: 0.6354
              precision    recall  f1-score   support

         0.0     0.6910    0.7376    0.7136       282
         1.0     0.5287    0.4716    0.4985       176

    accuracy                         0.6354       458
   macro avg     0.6098    0.6046    0.6060       458
weighted avg     0.6286    0.6354    0.6309       458


Train F1 (1): 0.7073 | Test F1 (1): 0.5000 | Train Acc: 0.7118 | Test Acc: 0.4629
              precision    recall  f1-score   support

         0.0     0.6268    0.3156    0.4198       282
         1.0     0.3892    0.6989    0.5000       176

    accuracy                         0.4629       458
   macro avg     0.5080    0.5072    0.4599       458
weighted avg     0.5355    0.4629    0.4506       458


-- PCA - 10 components --

Train F1 (1): 0.8208 | Test F1 (1): 0.5784 | Train Acc: 0.8608 | Test Acc: 0.6594
              precision    recall  f1-score 

In [33]:
pca_results_df = pd.DataFrame(pca_results)
display(pca_results_df[display_cols].round(4))

Unnamed: 0,Target,Experiment,Model,Train F1 (1),Test F1 (1),Train F1 (macro global),Test F1 (macro global),Train Accuracy,Test Accuracy
0,TIRED,PCA_5,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
1,TIRED,PCA_5,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
2,TIRED,PCA_10,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
3,TIRED,PCA_10,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
4,TIRED,PCA_15,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
5,TIRED,PCA_15,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
6,TIRED,PCA_20,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
7,TIRED,PCA_20,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
8,TIRED,PCA_25,BalancedRandomForest,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878
9,TIRED,PCA_25,LGBM,0.9474,0.6207,0.9566,0.6777,0.9585,0.6878


## Anàlisi de resultats

In [34]:
# --- Results Compilation ---
all_results_dfs = [base_results_df]
if 'feature_importance_results_df' in locals() and not feature_importance_results_df.empty: 
    all_results_dfs.append(feature_importance_results_df)

if 'perm_topk_results_df' in locals() and not perm_topk_results_df.empty: 
    all_results_dfs.append(perm_topk_results_df)

if 'pca_results_df' in locals() and not pca_results_df.empty: 
    all_results_dfs.append(pca_results_df)

if len(all_results_dfs) > 1:
    combined_results_df = pd.concat(all_results_dfs, ignore_index=True)
    print("\nCombined results from all experiments.")
else:
    print("\nOnly baseline results available.")
    combined_results_df = base_results_df

final_cols = ["Target", "Experiment", "Model", "Test Accuracy","Test F1 (1)", "Test F1 (macro global)","Best Params"]
 
combined_results_df = combined_results_df[final_cols]

# --- Analysis ---
print("\n--- Overall Performance Analysis (Sorted by Test F1-Macro) ---")
combined_results_sorted = combined_results_df.sort_values(by=["Test Accuracy"], ascending=[False]).reset_index(drop=True)


print(f"\n--- Target: {TARGET} ---")
display(combined_results_sorted[final_cols].head().round(4))


Combined results from all experiments.

--- Overall Performance Analysis (Sorted by Test F1-Macro) ---

--- Target: TIRED ---


Unnamed: 0,Target,Experiment,Model,Test Accuracy,Test F1 (1),Test F1 (macro global),Best Params
0,TIRED,Entrenament basic,LGBM,0.7314,0.6516,0.7165,"{'classifier__subsample': 0.7590327755184709, ..."
1,TIRED,Entrenament basic,BalancedRandomForest,0.7249,0.625,0.7039,"{'classifier__n_estimators': 1163, 'classifier..."
2,TIRED,Perm_Top15,BalancedRandomForest,0.7249,0.6272,0.7046,"{'classifier__n_estimators': 1163, 'classifier..."
3,TIRED,FI_Top15,BalancedRandomForest,0.7227,0.5968,0.6928,"{'classifier__n_estimators': 1163, 'classifier..."
4,TIRED,FI_Top10,BalancedRandomForest,0.7162,0.5667,0.6778,"{'classifier__n_estimators': 1163, 'classifier..."


In [35]:
tired_top5 = combined_results_sorted.head(5)

# Mostra la taula resum dels 5 millors
print(f"\nTop 5 models per a {TARGET} segons Test Accuracy:\n")
display(tired_top5[["Model","Test Accuracy","Test F1 (1)"]])

# Dibuixa la matriu de confusió de cada un
for model in tired_top5["Model"]:
    clf = base_models[model] 
    y_pred = clf.predict(X_test)  
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix — {TARGET} — {model}")
    plt.show()


Top 5 models per a TIRED segons Test Accuracy:



Unnamed: 0,Model,Test Accuracy,Test F1 (1)
0,LGBM,0.731441,0.651558
1,BalancedRandomForest,0.724891,0.625
2,BalancedRandomForest,0.724891,0.627219
3,BalancedRandomForest,0.722707,0.596825
4,BalancedRandomForest,0.716157,0.566667
