# LifeSnaps Feature Selection Experiments v2

Experiments amb un nombre menor de features.

#### Import de llibreries necessàries

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Imbalanced data pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Core utilities
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Model definitions fora dels defints
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


############## IMPORTS DEL NOSTRE PAQUET ###################
# Obtenim les funcions per entrenar i evaluar els models i registrar les mètriques
from ai_health_assistant.utils.train_helpers import train_models, append_results, plot_learning_curve, mat_confusio, update_experiments_file

# Obtenim els classificadors i els seus parametres
from ai_health_assistant.utils.model_config import get_classifier_config, PARAM_GRIDS, CLASSIFIERS, BALANCING_METHODS

# Obtenim el target, features el la construcció del preprocessador
from ai_health_assistant.utils.prep_helpers import TARGET, build_preprocessor, FEATURES



# Pandas display configuration: never truncate the column list when rendering a frame
pd.set_option('display.max_columns', None)

# Load the cleaned train/test splits that already have feature engineering applied
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/df_engineered_train.csv', '../data/df_engineered_test.csv')
)

# Report the dimensions of both splits (train first, then test)
for split_frame in (df_train, df_test):
    print(f"Shape: {split_frame.shape}")


Shape: (1832, 40)
Shape: (458, 40)


## Lectura de dades i split de train / test

### Train / Test Split

In [2]:
# Separate the feature matrix from the target vector in each split
X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

# Dimensions of the feature matrices
print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print('\n','--'*50)

# Relative class frequencies of the target in each split
print(f"\nDistribució train:\n{y_train.value_counts(normalize=True)}")
print(f"\nDistribució test:\n{y_test.value_counts(normalize=True)}")

# Partition the feature columns into numeric and non-numeric by dtype
numerical_features = list(X_train.select_dtypes(include=['number']).columns)
categorical_features = list(X_train.select_dtypes(exclude=['number']).columns)

print(f"\nCol. numeriques ({len(numerical_features)}): \n{numerical_features}")
print(f"Col. categoriques ({len(categorical_features)}): \n{categorical_features}")



Train shape: (1832, 37)
Test shape: (458, 37)

 ----------------------------------------------------------------------------------------------------

Distribució train:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Distribució test:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Col. numeriques (35): 
['bmi', 'calories', 'steps', 'lightly_active_minutes', 'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes', 'resting_hr', 'minutes_below_default_zone_1', 'minutes_in_default_zone_1', 'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'minutesAsleep', 'minutesAwake', 'sleep_efficiency', 'sleep_deep_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'sleep_wake_ratio', 'daily_temperature_variation', 'rmssd', 'spo2', 'full_sleep_breathing_rate', 'wake_after_sleep_pct', 'steps_norm_cal', 'deep_sleep_score', 'active_sedentary_ratio', 'sleep_activity_balance', 'bmi_hr_interaction', 'sleep_quality_index', 'hr_zone_variab

### Definim el preprocessador

In [3]:
# Build the preprocessing transformer for the FEATURES columns of the training frame.
# build_preprocessor is a project helper (ai_health_assistant.utils.prep_helpers);
# the exact transformations it applies are not visible in this notebook.
preprocessor = build_preprocessor(df_train, FEATURES)

## ENTRENAMENT DEL MODEL BASE

In [4]:
# Columns shown whenever a results table is displayed in this notebook
display_cols = [
    'Experiment',
    'Train F1 (1)',
    'Train F1 (macro global)',
    'Train Accuracy',
    'Test Recall (1)',
    'Test Precision (1)',
    'Test F1 (1)',
    'Test F1 (macro global)',
    'Test Accuracy',
]

# --------------------------------------------------------------
# Models to train in this run (keys of CLASSIFIERS)
PROVA_MODELS = ["BalancedRandomForest", "LGBM"]
# Resampling strategy to use (key of BALANCING_METHODS)
balance_name = "SMOTETomek"
# --------------------------------------------------------------

# Keep only the selected classifiers so the whole catalogue is not trained
CLASSIFIERS_FILTER = {
    name: estimator
    for name, estimator in CLASSIFIERS.items()
    if name in PROVA_MODELS
}
balance_method = BALANCING_METHODS[balance_name]

### Regressió Logística

In [5]:

# Accumulators for the logistic-regression baseline
reg_results = []
reg_models = {}

# Baseline classifier:
#   - max_iter=2000 gives the solver ample room to converge
#   - class_weight="balanced" compensates for the under-represented class 1
#   - lbfgs is fast and stable on small/medium datasets
log_reg = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2000,
)

reg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", log_reg),
])

# Values of C (inverse regularisation strength) explored by the grid search
reg_param_grid = {"classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000]}

# Tune, fit and evaluate the pipeline with the project helper
(
    best_est,
    y_train_pred,
    train_report,
    y_test_pred,
    test_report,
    best_params,
    best_score,
) = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    reg_pipeline,
    reg_param_grid,
    search_type='grid',
)

# Record the metrics under the "EntrenamentBasic" experiment label
reg_results_df = append_results(
    reg_results,
    "LogisticRegression",
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="EntrenamentBasic",
)

display(reg_results_df[display_cols])
update_experiments_file(reg_results_df)

Entrenant model...

Train F1 (1): 0.5156 | Test F1 (1): 0.4800 | Train Acc: 0.5939 | Test Acc: 0.5742
              precision    recall  f1-score   support

         0.0     0.6680    0.6135    0.6396       282
         1.0     0.4523    0.5114    0.4800       176

    accuracy                         0.5742       458
   macro avg     0.5601    0.5624    0.5598       458
weighted avg     0.5851    0.5742    0.5782       458



Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,LogisticRegression_EntrenamentBasic,0.51562,0.583,0.59389,0.51136,0.45226,0.48,0.55978,0.57424



Métriques guardades a ../results/02_experiments/experiments.csv



In [6]:
# Baseline run: every selected classifier trained on the full feature set
base_results = []

for model, classifier in CLASSIFIERS_FILTER.items():

    # Assemble the pipeline steps; the "_prova" variant is trained without resampling
    steps = [("preprocessor", preprocessor)]
    if model != "BalancedRandomForest_prova":
        steps.append(("balance", balance_method))
    steps.append(("classifier", classifier))
    pipeline = ImbPipeline(steps)

    print(f"\n==== {model} ====")
    (
        best_est,
        y_train_pred,
        train_report,
        y_test_pred,
        test_report,
        best_params,
        best_score,
    ) = train_models(
        X_train,
        y_train,
        X_test,
        y_test,
        pipeline,
        PARAM_GRIDS[model],
        search_type='grid',
    )

    # append_results accumulates into base_results and returns the table so far
    base_results_df = append_results(
        base_results,
        model,
        train_report,
        test_report,
        best_params,
        best_score,
        experiment="EntrenamentBasic",
    )


==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.8101 | Test F1 (1): 0.5029 | Train Acc: 0.8564 | Test Acc: 0.6245
              precision    recall  f1-score   support

         0.0     0.6910    0.7057    0.6982       282
         1.0     0.5118    0.4943    0.5029       176

    accuracy                         0.6245       458
   macro avg     0.6014    0.6000    0.6006       458
weighted avg     0.6221    0.6245    0.6232       458


==== LGBM ====
Entrenant model...

Train F1 (1): 0.7795 | Test F1 (1): 0.5391 | Train Acc: 0.7953 | Test Acc: 0.5371
              precision    recall  f1-score   support

         0.0     0.7011    0.4326    0.5351       282
         1.0     0.4366    0.7045    0.5391       176

    accuracy                         0.5371       458
   macro avg     0.5689    0.5686    0.5371       458
weighted avg     0.5995    0.5371    0.5366       458



In [7]:
# Persist the baseline metrics gathered in the previous cell via the project helper
# (base_results_df only exists if the loop above ran at least one iteration).
update_experiments_file(base_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 1: Importància de les característiques

Entrenem un RandomForest per identificar les característiques més importants (des del top-3 fins al top-13) i, posteriorment, reentrenem els models utilitzant només aquests subconjunts de característiques per veure si augmenta el rendiment del model. També ho provarem amb permutation importance.

### Feature Importance

La Gini importance d’una feature és: La suma de totes les reduccions d’impuresa (Gini) que ha causat al llarg de tots els arbres i de totes les seves aparicions.

In [9]:
# Basic random-forest pipeline (preprocess -> resample -> classify) and its parameter grid
rf_name = "BalancedRandomForest"

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("balance", BALANCING_METHODS[balance_name]),
    ("classifier", CLASSIFIERS[rf_name])
])

# Tune, fit and evaluate the model
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipeline,
    PARAM_GRIDS[rf_name],
    search_type='grid'
)

# Keep the fitted preprocessor under its own name so the shared `preprocessor`
# variable used to build other pipelines is not silently replaced.
fitted_preprocessor = best_est.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()

# Impurity-based importances of the fitted forest, labelled with the transformed feature names
importances_raw = pd.Series(
    best_est.named_steps["classifier"].feature_importances_,
    index=feature_names
)

# Sum importances per feature name and rank them, most important first.
# NOTE(review): grouping by the transformed names only merges rows when names repeat;
# with unique names this is just a sort — confirm whether one-hot columns were meant
# to be folded back into their source feature.
agg_importances = (
    importances_raw.groupby(feature_names).sum().sort_values(ascending=False)
)

# Top-3 to Top-13 most important features, keyed by subset size
top_features_by_k = {
    k: agg_importances.head(k).index.tolist() for k in range(3, 14)
}

# Individual names kept because the retraining cell refers to top3 ... top13 directly
(top3, top4, top5, top6, top7, top8, top9, top10, top11, top12, top13) = (
    top_features_by_k[k] for k in range(3, 14)
)

for k, feats in top_features_by_k.items():
    print(f"Top-{k} features:", feats)


Entrenant model...

Train F1 (1): 0.8101 | Test F1 (1): 0.5029 | Train Acc: 0.8564 | Test Acc: 0.6245
              precision    recall  f1-score   support

         0.0     0.6910    0.7057    0.6982       282
         1.0     0.5118    0.4943    0.5029       176

    accuracy                         0.6245       458
   macro avg     0.6014    0.6000    0.6006       458
weighted avg     0.6221    0.6245    0.6232       458

Top-3 features: ['num__bmi', 'num__bmi_hr_interaction', 'num__calories']
Top-4 features: ['num__bmi', 'num__bmi_hr_interaction', 'num__calories', 'num__resting_hr']
Top-5 features: ['num__bmi', 'num__bmi_hr_interaction', 'num__calories', 'num__resting_hr', 'num__daily_temperature_variation']
Top-6 features: ['num__bmi', 'num__bmi_hr_interaction', 'num__calories', 'num__resting_hr', 'num__daily_temperature_variation', 'num__steps_norm_cal']
Top-7 features: ['num__bmi', 'num__bmi_hr_interaction', 'num__calories', 'num__resting_hr', 'num__daily_temperature_variation',

### Reentrenament de models amb les 3-13 millors features

In [10]:
# Feature subsets (Top-3 ... Top-13 by impurity importance) to retrain on
feature_sets = {
    "Top3": top3,
    "Top4": top4,
    "Top5": top5,
    "Top6": top6,
    "Top7": top7,
    "Top8": top8,
    "Top9": top9,
    "Top10": top10,
    "Top11": top11,
    "Top12": top12,
    "Top13": top13,
}

# Results accumulator for this experiment
fi_results = []

# Preprocess once, outside the loops: fit on train, apply to train and test
preprocessor = build_preprocessor(X_train, FEATURES)
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed matrices as DataFrames so subsets can be selected by column name
X_train_transformed = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

# Retrain and evaluate every selected model on every subset
for label, feats in feature_sets.items():
    print(f"\nEntrenament amb {label}")
    for model, classifier in CLASSIFIERS_FILTER.items():

        print(f'==== {model} ====')

        # The "_prova" variant is trained without resampling (author note: it gave better results)
        steps = []
        if model != "BalancedRandomForest_prova":
            steps.append(("balance", balance_method))
        steps.append(("classifier", classifier))
        pipe = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[feats],
            y_train,
            X_test_transformed[feats],
            y_test,
            pipe,
            PARAM_GRIDS[model],
        )

        # Experiments are tagged FI_<subset label>
        fi_results_df = append_results(
            fi_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"FI_{label}",
        )


Entrenament amb Top3
==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.6680 | Test F1 (1): 0.4973 | Train Acc: 0.7287 | Test Acc: 0.5983
              precision    recall  f1-score   support

         0.0     0.6828    0.6489    0.6655       282
         1.0     0.4789    0.5170    0.4973       176

    accuracy                         0.5983       458
   macro avg     0.5809    0.5830    0.5814       458
weighted avg     0.6045    0.5983    0.6008       458

==== LGBM ====
Entrenant model...

Train F1 (1): 0.6181 | Test F1 (1): 0.5521 | Train Acc: 0.5677 | Test Acc: 0.4934
              precision    recall  f1-score   support

         0.0     0.7155    0.2943    0.4171       282
         1.0     0.4181    0.8125    0.5521       176

    accuracy                         0.4934       458
   macro avg     0.5668    0.5534    0.4846       458
weighted avg     0.6012    0.4934    0.4690       458


Entrenament amb Top4
==== BalancedRandomForest ====
Entrenant model...

T

In [12]:
# Show the summary columns of the feature-importance experiments, then persist them
display(fi_results_df[display_cols])
update_experiments_file(fi_results_df)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_FI_Top3,0.668,0.71933,0.72871,0.51705,0.47895,0.49727,0.58136,0.59825
1,LGBM_FI_Top3,0.61813,0.56001,0.56769,0.8125,0.41813,0.55212,0.4846,0.49345
2,BalancedRandomForest_FI_Top4,0.68712,0.7391,0.74945,0.46591,0.46591,0.46591,0.56629,0.58952
3,LGBM_FI_Top4,0.63284,0.59332,0.59716,0.76705,0.42857,0.5499,0.51495,0.51747
4,BalancedRandomForest_FI_Top5,0.69161,0.74563,0.7571,0.5,0.49718,0.49858,0.5921,0.61354
5,LGBM_FI_Top5,0.65705,0.63515,0.63646,0.76136,0.44224,0.5595,0.53833,0.5393
6,BalancedRandomForest_FI_Top6,0.71368,0.76646,0.77838,0.45455,0.4878,0.47059,0.57904,0.60699
7,LGBM_FI_Top6,0.66189,0.63819,0.63974,0.82386,0.46326,0.59305,0.5635,0.5655
8,BalancedRandomForest_FI_Top7,0.72664,0.77543,0.78603,0.49432,0.49432,0.49432,0.58936,0.61135
9,LGBM_FI_Top7,0.67979,0.6617,0.66266,0.79545,0.44164,0.56795,0.5322,0.53493



Métriques guardades a ../results/02_experiments/experiments.csv



### Permutation Importance

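La permutation importance avalua la importància de cada característica mesurant quant empitjora el rendiment del model quan es permuten aleatòriament els valors d'aquella característica. Serveix per determinar quines característiques tenen més impacte en el rendiment del model.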

In [14]:
# permutation_importance, plt and pd are imported in the first cell of the notebook.

# Reference model: the balanced random forest and its parameter grid
rf_name = 'BalancedRandomForest'
clf_rf, param_grid_rf = get_classifier_config(rf_name)

# Full pipeline: preprocess -> resample -> classify
pipe = ImbPipeline([
    ("preprocessor", preprocessor),
    ("balancing", BALANCING_METHODS[balance_name]),
    ("classifier", clf_rf)
])

# Tune, fit and evaluate the model
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipe,
    param_grid_rf,
    search_type='grid'
)

best_rf_model = best_est

# Fitted components of the best pipeline. The fitted preprocessor gets its own name
# so the shared `preprocessor` variable is not silently replaced.
fitted_preprocessor = best_rf_model.named_steps['preprocessor']
classifier = best_rf_model.named_steps['classifier']

# Permutation has to happen in the transformed feature space the classifier was trained on
X_test_transformed = fitted_preprocessor.transform(X_test)
feature_names = fitted_preprocessor.get_feature_names_out()

# Score the already-fitted classifier directly. Wrapping it in a fresh, never-fitted
# pipeline with a sampler adds nothing: samplers only act during fit, and calling
# predict/score on an unfitted Pipeline is deprecated in recent scikit-learn releases.
final_estimator = classifier

# NOTE(review): the importances are computed on the test split, and the subsets derived
# from them are later evaluated on that same split. That makes the reported test metrics
# of the Perm_Top* experiments optimistic; consider ranking on a validation split instead.
result = permutation_importance(
    final_estimator,
    X_test_transformed,
    y_test,
    n_repeats=200,
    n_jobs=-1,
    random_state=42
)

# Mean importance per transformed feature, most important first
perm_importances = pd.Series(
    result.importances_mean,
    index=feature_names
).sort_values(ascending=False)

# Top-3 to Top-13 feature subsets, keyed by subset size
perm_top_features = {
    k: perm_importances.head(k).index.tolist() for k in range(3, 14)
}

print("\nTop-13 features (Permutation):")
display(perm_importances.head(13).to_frame("Importancia"))

Entrenant model...

Train F1 (1): 0.8101 | Test F1 (1): 0.5029 | Train Acc: 0.8564 | Test Acc: 0.6245
              precision    recall  f1-score   support

         0.0     0.6910    0.7057    0.6982       282
         1.0     0.5118    0.4943    0.5029       176

    accuracy                         0.6245       458
   macro avg     0.6014    0.6000    0.6006       458
weighted avg     0.6221    0.6245    0.6232       458


Top-15 features (Permutation):


Unnamed: 0,Importancia
num__bmi,0.028952
num__daily_temperature_variation,0.023755
num__resting_hr,0.009967
num__sleep_eff_rmssd,0.009061
num__sleep_deep_ratio,0.008996
cat__gender_FEMALE,0.008886
num__lightly_active_minutes,0.007347
num__calories,0.007194
num__bmi_hr_interaction,0.007129
num__full_sleep_breathing_rate,0.007074


### Reentrenament Permutation importance

In [15]:
# Feature subsets based on permutation importance, labelled Perm_Top3 ... Perm_Top13
perm_sets = {f"Perm_Top{k}": perm_top_features[k] for k in range(3, 14)}

print(perm_top_features)

# Results accumulator for this experiment
perm_results = []

# Preprocess once outside the loops for efficiency: fit on train, apply to train and test
preprocessor = build_preprocessor(X_train, FEATURES)
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Wrap as DataFrames (original row indices kept) so subsets can be selected by column name
feature_names = preprocessor.get_feature_names_out()
X_train_transformed = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

# Retrain and evaluate every selected model on every subset
for label, feat_indices in perm_sets.items():
    print(f"\nEntrenament models amb {label}")

    for model_name, classifier in CLASSIFIERS_FILTER.items():
        print(f'==== {model_name} ====')

        # The "_prova" variant is trained without the resampling step
        steps = []
        if model_name != "BalancedRandomForest_prova":
            steps.append(("balance", balance_method))
        steps.append(("classifier", classifier))
        pipe = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[feat_indices],
            y_train,
            X_test_transformed[feat_indices],
            y_test,
            pipe,
            PARAM_GRIDS[model_name],
        )

        # The subset label (Perm_TopX) is used directly as the experiment tag
        perm_results_df = append_results(
            perm_results,
            model_name,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=label,
        )

{3: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr'], 4: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd'], 5: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd', 'num__sleep_deep_ratio'], 6: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd', 'num__sleep_deep_ratio', 'cat__gender_FEMALE'], 7: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd', 'num__sleep_deep_ratio', 'cat__gender_FEMALE', 'num__lightly_active_minutes'], 8: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd', 'num__sleep_deep_ratio', 'cat__gender_FEMALE', 'num__lightly_active_minutes', 'num__calories'], 9: ['num__bmi', 'num__daily_temperature_variation', 'num__resting_hr', 'num__sleep_eff_rmssd', 'num__sleep_deep_ratio', 'cat__gender_FEMALE', 'num__lightly_active_minutes', 'num__calories', 'num__bmi_hr

In [16]:
# Show the summary columns of the permutation-importance experiments
display(perm_results_df[display_cols])

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_Perm_Top3,0.63686,0.69897,0.71179,0.5,0.48352,0.49162,0.58273,0.60262
1,LGBM_Perm_Top3,0.62022,0.56434,0.57151,0.8125,0.42059,0.55426,0.48963,0.49782
2,BalancedRandomForest_Perm_Top4,0.67,0.73297,0.74782,0.47159,0.52866,0.4985,0.60602,0.63537
3,LGBM_Perm_Top4,0.62374,0.56455,0.5726,0.83523,0.42486,0.56322,0.49227,0.50218
4,BalancedRandomForest_Perm_Top5,0.68551,0.74774,0.7631,0.44886,0.50968,0.47734,0.59081,0.62227
5,LGBM_Perm_Top5,0.64798,0.60436,0.60917,0.80682,0.41765,0.55039,0.48519,0.49345
6,BalancedRandomForest_Perm_Top6,0.68189,0.74727,0.76419,0.44886,0.51974,0.48171,0.5963,0.62882
7,LGBM_Perm_Top6,0.63774,0.58855,0.59443,0.84659,0.43567,0.57529,0.51126,0.51965
8,BalancedRandomForest_Perm_Top7,0.70776,0.7657,0.78002,0.51136,0.55215,0.53097,0.62771,0.65284
9,LGBM_Perm_Top7,0.67115,0.65176,0.65284,0.78409,0.44373,0.56674,0.53745,0.5393


In [17]:
# Persist the permutation-importance experiment metrics via the project helper
update_experiments_file(perm_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## Top correlated

In [20]:
# ---------------------------------------------------
# Automatic column selection by correlation with the target
# ---------------------------------------------------
# Absolute correlation of every numeric training column with the target, strongest first.
# NOTE(review): this ranks all numeric columns of df_train, not only FEATURES — confirm
# that no column outside FEATURES can end up in the selected subsets.
target_corr = df_train.corr(numeric_only=True)[TARGET]
correlacions = target_corr.abs().sort_values(ascending=False)

# Drop the target's correlation with itself
correlacions = correlacions.drop(TARGET, errors='ignore')

# Top-3 to Top-15 most correlated columns, keyed by subset size
corr_top_feats = {k: correlacions.head(k).index.tolist() for k in range(3, 16)}
print(corr_top_feats[15])

# Same subsets keyed by the experiment label (Corr_3 ... Corr_15)
corr_sets = {f"Corr_{k}": corr_top_feats[k] for k in range(3, 16)}

# Results accumulator for this experiment
corr_results = []

for i, feats in corr_sets.items():
    print(f"Entrenant {i}")
    for model_name, classifier in CLASSIFIERS_FILTER.items():
        print(f'==== {model_name} ====')

        # Preprocessor restricted to the current subset of columns
        preprocessor = build_preprocessor(df_train, feats)

        # The "_prova" variant is trained without the resampling step
        steps = [("preprocessor", preprocessor)]
        if model_name != "BalancedRandomForest_prova":
            steps.append(("balance", balance_method))
        steps.append(("classifier", classifier))
        pipe = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train,
            y_train,
            X_test,
            y_test,
            pipe,
            PARAM_GRIDS[model_name],
        )

        # The subset label (Corr_X) is used directly as the experiment tag
        corr_results_df = append_results(
            corr_results,
            model_name,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=i,
        )




['bmi', 'calories', 'full_sleep_breathing_rate', 'deep_sleep_score', 'sedentary_minutes', 'lightly_active_minutes', 'sleep_deep_ratio', 'active_to_total_ratio', 'sleep_efficiency', 'daily_temperature_variation', 'steps', 'active_to_rest_transition', 'active_sedentary_ratio', 'sleep_rem_ratio', 'minutesAsleep']
Entrenant Corr_3
==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.6211 | Test F1 (1): 0.5183 | Train Acc: 0.7003 | Test Acc: 0.6266
              precision    recall  f1-score   support

         0.0     0.6989    0.6915    0.6952       282
         1.0     0.5140    0.5227    0.5183       176

    accuracy                         0.6266       458
   macro avg     0.6064    0.6071    0.6067       458
weighted avg     0.6278    0.6266    0.6272       458

==== LGBM ====
Entrenant model...

Train F1 (1): 0.6130 | Test F1 (1): 0.5360 | Train Acc: 0.5699 | Test Acc: 0.4934
              precision    recall  f1-score   support

         0.0     0.6866    0.3262    0.

In [21]:
# Show the summary columns of the correlation-based experiments, then persist them
display(corr_results_df[display_cols])
update_experiments_file(corr_results_df)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_Corr_3,0.62112,0.68663,0.70033,0.52273,0.51397,0.51831,0.60675,0.62664
1,LGBM_Corr_3,0.61297,0.56447,0.56987,0.76136,0.41358,0.536,0.48915,0.49345
2,BalancedRandomForest_Corr_4,0.63425,0.70153,0.7167,0.51705,0.50556,0.51124,0.60026,0.62009
3,LGBM_Corr_4,0.61028,0.54898,0.55731,0.83523,0.43881,0.57534,0.51977,0.5262
4,BalancedRandomForest_Corr_5,0.67633,0.7323,0.744,0.52841,0.50543,0.51667,0.60186,0.62009
5,LGBM_Corr_5,0.64012,0.60163,0.60535,0.8125,0.43731,0.56859,0.52158,0.5262
6,BalancedRandomForest_Corr_6,0.7133,0.7634,0.77402,0.49432,0.47541,0.48468,0.57627,0.59607
7,LGBM_Corr_6,0.65383,0.62496,0.62718,0.78409,0.41945,0.54653,0.49468,0.5
8,BalancedRandomForest_Corr_7,0.68864,0.74512,0.75764,0.50568,0.49721,0.50141,0.59295,0.61354
9,LGBM_Corr_7,0.66358,0.64229,0.64356,0.78409,0.43533,0.55984,0.52342,0.5262



Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 2: PCA


Es realitza una anàlisi de components principals (PCA) per examinar com evoluciona la variància explicada acumulada del conjunt de dades a mesura que s'afegeixen components, considerant els primers 3 a 15 components.

In [22]:
# Fit a PCA with all components on the transformed training matrix
pca = PCA(random_state=42)
pca.fit(X_train_transformed)

# Cumulative explained variance, expressed as a percentage
explained_cumsum = np.cumsum(pca.explained_variance_ratio_) * 100

# Component counts highlighted on the curve (k components -> position k-1)
ks = list(range(3, 16))
cums = explained_cumsum[np.array(ks) - 1]

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    range(1, len(explained_cumsum) + 1),
    explained_cumsum,
)
ax.scatter(ks, cums)
ax.set_xlabel('Número de componentes')
ax.set_ylabel('Varianza explicada acumulada')
ax.set_title('Evolución de la varianza explicada según n_components')
ax.grid(True)
fig.tight_layout()
plt.show()

In [23]:
# Numbers of principal components to analyse (also reused by the PCA training cell)
n_components_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

# Per-k ranking of the original (transformed) features by their PCA loadings.
# Kept in a dict so each ranking survives the loop instead of being overwritten.
pca_top_features = {}

for k in n_components_list:
    # Fit a PCA with k components
    pca = PCA(n_components=k, random_state=42)
    pca.fit(X_train_transformed)

    # Loadings matrix of shape (n_features, k)
    loadings = pca.components_.T

    # Importance = sum of the absolute loadings of each feature over all k components
    importance = np.sum(np.abs(loadings), axis=1)

    # Rank the features and keep the k most important ones
    df_imp = pd.DataFrame({
        'feature':    feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    topk = df_imp.head(k)
    pca_top_features[k] = topk

# Compact summary: one line per k with the names of the top-k features
for k, ranking in pca_top_features.items():
    print(f"PCA {k} components - top features: {ranking['feature'].tolist()}")

In [25]:
# Results accumulator for the PCA experiment
pca_results = []

for k in n_components_list:
    print(f"\n-- PCA - {k} components --")

    for model, classifier in CLASSIFIERS_FILTER.items():
        # Header per model, as in the other training loops, so each report can be attributed
        print(f'==== {model} ====')

        preprocessor = build_preprocessor(df_train, FEATURES)

        # Steps: preprocess -> (resample) -> PCA with k components -> classify.
        # The "_prova" variant is trained without the resampling step.
        steps = [("preprocessor", preprocessor)]
        if model != "BalancedRandomForest_prova":
            steps.append(("balance", balance_method))
        steps.append(("pca", PCA(n_components=k, random_state=42)))
        steps.append(("classifier", classifier))
        pipeline = ImbPipeline(steps)

        best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
            X_train,
            y_train,
            X_test,
            y_test,
            pipeline,
            PARAM_GRIDS[model],
            search_type="grid"
        )

        # Experiments are tagged PCA_<number of components>
        df_pca_results = append_results(
            pca_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"PCA_{k}",
        )



-- PCA - 3 components --
Entrenant model...

Train F1 (1): 0.6852 | Test F1 (1): 0.4343 | Train Acc: 0.7533 | Test Acc: 0.5677
              precision    recall  f1-score   support

         0.0     0.6479    0.6525    0.6502       282
         1.0     0.4368    0.4318    0.4343       176

    accuracy                         0.5677       458
   macro avg     0.5423    0.5422    0.5422       458
weighted avg     0.5668    0.5677    0.5672       458

Entrenant model...

Train F1 (1): 0.6034 | Test F1 (1): 0.5263 | Train Acc: 0.5257 | Test Acc: 0.4498
              precision    recall  f1-score   support

         0.0     0.6471    0.2340    0.3438       282
         1.0     0.3933    0.7955    0.5263       176

    accuracy                         0.4498       458
   macro avg     0.5202    0.5147    0.4350       458
weighted avg     0.5495    0.4498    0.4139       458


-- PCA - 4 components --
Entrenant model...

Train F1 (1): 0.7277 | Test F1 (1): 0.4103 | Train Acc: 0.7904 | Test 

In [26]:
# Show the summary columns of the PCA experiments, then persist them
display(df_pca_results[display_cols])
update_experiments_file(df_pca_results)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_PCA_3,0.68524,0.74118,0.75328,0.43182,0.43678,0.43429,0.54223,0.56769
1,LGBM_PCA_3,0.60338,0.50671,0.52566,0.79545,0.39326,0.52632,0.43503,0.44978
2,BalancedRandomForest_PCA_4,0.72766,0.77865,0.79039,0.40909,0.41143,0.41026,0.52194,0.54803
3,LGBM_PCA_4,0.62846,0.55658,0.56823,0.73295,0.38393,0.50391,0.4376,0.44541
4,BalancedRandomForest_PCA_5,0.74286,0.79192,0.80349,0.42045,0.42775,0.42407,0.53479,0.56114
5,LGBM_PCA_5,0.64391,0.58047,0.59007,0.81818,0.39669,0.53432,0.43427,0.45197
6,BalancedRandomForest_PCA_6,0.75917,0.80589,0.81714,0.43182,0.43429,0.43305,0.54042,0.5655
7,LGBM_PCA_6,0.65458,0.59642,0.6048,0.86364,0.40426,0.55072,0.4347,0.45852
8,BalancedRandomForest_PCA_7,0.76685,0.80932,0.81878,0.44886,0.44633,0.44759,0.55062,0.57424
9,LGBM_PCA_7,0.66244,0.61685,0.62227,0.82955,0.39891,0.53875,0.43515,0.45415



Métriques guardades a ../results/02_experiments/experiments.csv



## Anàlisi de resultats

In [27]:
# Read the accumulated experiments table.
# NOTE(review): taking the first rows as "the best" assumes the file is already sorted
# by F1 when it is written — confirm against update_experiments_file.
experiments_df = pd.read_csv("../results/02_experiments/experiments.csv")

# Use a dedicated name: `top5` already holds the Top-5 feature list consumed by the
# feature-importance retraining cell, and rebinding it to a DataFrame would break
# that cell if it were re-run afterwards.
top5_experiments = experiments_df.head(5)

# Show the five best experiments
display(top5_experiments)


Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Precision (1),Test Recall (1),Test F1 (1),Test F1 (macro global),Test Accuracy,Best Params
0,LGBM_Perm_Top9,0.68856,0.67874,0.67904,0.47855,0.82386,0.60543,0.58647,0.58734,"{'classifier__subsample': 0.7511807565247405, ..."
1,LGBM_Perm_Top12,0.69565,0.68641,0.68668,0.46667,0.83523,0.59878,0.56762,0.56987,"{'classifier__subsample': 0.7511807565247405, ..."
2,LGBM_Perm_Top10,0.68535,0.67488,0.67522,0.46774,0.82386,0.59671,0.57045,0.57205,"{'classifier__subsample': 0.7511807565247405, ..."
3,LGBM_FI_Top6,0.66189,0.63819,0.63974,0.46326,0.82386,0.59305,0.5635,0.5655,
4,LGBM_Perm_Top11,0.69288,0.68426,0.6845,0.46254,0.80682,0.58799,0.5642,0.5655,"{'classifier__subsample': 0.7511807565247405, ..."
