# LifeSnaps Feature Selection Experiments v2

Experiments amb un menor nombre de features.

#### Import de llibreries necessaries

In [12]:
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Imbalanced data pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Core utilities
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Model definitions fora dels defints
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


############## IMPORTS DEL NOSTRE PAQUET ###################
# Obtenim les funcions per entrenar i evaluar els models i registrar les mètriques
from ai_health_assistant.utils.train_helpers import train_models, append_results, plot_learning_curve, mat_confusio, update_experiments_file

# Obtenim els classificadors i els seus parametres
from ai_health_assistant.utils.model_config import get_classifier_config, PARAM_GRIDS, CLASSIFIERS, BALANCING_METHODS

# Obtenim el target, features el la construcció del preprocessador
from ai_health_assistant.utils.prep_helpers import TARGET, build_preprocessor, FEATURES

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Load the cleaned, feature-engineered train and test splits.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/df_engineered_train.csv',
        '../data/df_engineered_test.csv',
    )
)

# Report the dimensions of both splits (train first, then test).
for split_frame in (df_train, df_test):
    print(f"Shape: {split_frame.shape}")


Shape: (1832, 40)
Shape: (458, 40)


## Lectura de dades i split de train / test

### Train / Test Split

In [13]:
# Separate the predictor columns (FEATURES) from the target column (TARGET) in each split.
X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print('\n','--'*50)
# Relative class frequencies, to compare the class balance of both splits.
print(f"\nDistribució train:\n{y_train.value_counts(normalize=True)}")
print(f"\nDistribució test:\n{y_test.value_counts(normalize=True)}")

# Numeric columns are picked by dtype; every remaining column is treated as categorical.
numerical_features = list(X_train.select_dtypes(include=['number']).columns)
categorical_features = [
    column for column in X_train.columns if column not in numerical_features
]

print(f"\nCol. numeriques ({len(numerical_features)}): \n{numerical_features}")
print(f"Col. categoriques ({len(categorical_features)}): \n{categorical_features}")



Train shape: (1832, 37)
Test shape: (458, 37)

 ----------------------------------------------------------------------------------------------------

Distribució train:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Distribució test:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Col. numeriques (35): 
['bmi', 'calories', 'steps', 'lightly_active_minutes', 'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes', 'resting_hr', 'minutes_below_default_zone_1', 'minutes_in_default_zone_1', 'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'minutesAsleep', 'minutesAwake', 'sleep_efficiency', 'sleep_deep_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'sleep_wake_ratio', 'daily_temperature_variation', 'rmssd', 'spo2', 'full_sleep_breathing_rate', 'wake_after_sleep_pct', 'steps_norm_cal', 'deep_sleep_score', 'active_sedentary_ratio', 'sleep_activity_balance', 'bmi_hr_interaction', 'sleep_quality_index', 'hr_zone_variab

### Definim el preprocessador

In [14]:
# Build the shared preprocessing step for the FEATURES columns with the project helper.
# This object is reused as the "preprocessor" step of the pipelines defined in later cells.
# NOTE(review): what build_preprocessor does internally is not visible in this notebook.
preprocessor = build_preprocessor(df_train, FEATURES)

## ENTRENAMENT DEL MODEL BASE

In [15]:
# Metric columns shown whenever a results table is displayed.
display_cols = [
    'Experiment',
    'Train F1 (1)',
    'Train F1 (macro global)',
    'Train Accuracy',
    'Test Recall (1)',
    'Test Precision (1)',
    'Test F1 (1)',
    'Test F1 (macro global)',
    'Test Accuracy',
]

# --------------------------------------------------------------
# Names of the models to train (keys of CLASSIFIERS)
PROVA_MODELS = ["LGBM"]
# Name of the class-balancing method (key of BALANCING_METHODS)
balance_name = "SMOTETomek"
# --------------------------------------------------------------

# Keep only the selected classifiers so that not every model has to be trained.
CLASSIFIERS_FILTER = {
    name: CLASSIFIERS[name] for name in CLASSIFIERS if name in PROVA_MODELS
}
balance_method = BALANCING_METHODS[balance_name]

### Regressió Logistica

In [None]:

reg_results = []
reg_models = {}

# Logistic-regression baseline: shared preprocessing followed by the classifier.
log_reg = LogisticRegression(
    max_iter=2000,            # generous iteration cap for the solver
    class_weight="balanced",  # reweights classes to compensate for the minority class 1
    solver="lbfgs",
)
reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", log_reg),
])

# Only the regularisation strength C is searched.
reg_param_grid = {
    "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000]
}

# Search, fit and evaluate through the project helper.
(
    best_est,
    y_train_pred,
    train_report,
    y_test_pred,
    test_report,
    best_params,
    best_score,
) = train_models(
    X_train, y_train, X_test, y_test,
    reg_pipeline,
    reg_param_grid,
    search_type='grid',
)

# Record this run's metrics and get the accumulated results table back.
reg_results_df = append_results(
    reg_results,
    "LogisticRegression",
    train_report,
    test_report,
    best_params,
    best_score,
    experiment="EntrenamentBasic",
)

display(reg_results_df[display_cols])
update_experiments_file(reg_results_df)

Entrenant model...


In [None]:
base_results = []

for model, classifier in CLASSIFIERS_FILTER.items():

    # Every model gets the shared preprocessor; all except BalancedRandomForest
    # also get the external balancing step before the classifier.
    steps = [("preprocessor", preprocessor)]
    if model != "BalancedRandomForest":
        steps.append(("balance", balance_method))
    steps.append(("classifier", classifier))
    pipeline = ImbPipeline(steps)

    print(f"\n==== {model} ====")
    (
        best_est,
        y_train_pred,
        train_report,
        y_test_pred,
        test_report,
        best_params,
        best_score,
    ) = train_models(
        X_train, y_train, X_test, y_test,
        pipeline,
        PARAM_GRIDS[model],
        search_type='grid',
    )

    # Accumulate this model's metrics; the returned table is persisted in the next cell.
    base_results_df = append_results(
        base_results,
        model,
        train_report,
        test_report,
        best_params,
        best_score,
        experiment="EntrenamentBasic",
    )


==== LGBM ====
Entrenant model...

Train F1 (1): 0.7993 | Test F1 (1): 0.5151 | Train Acc: 0.8215 | Test Acc: 0.5437
              precision    recall  f1-score   support

         0.0     0.6798    0.4894    0.5691       282
         1.0     0.4353    0.6307    0.5151       176

    accuracy                         0.5437       458
   macro avg     0.5575    0.5600    0.5421       458
weighted avg     0.5858    0.5437    0.5483       458



In [None]:
# Persist the base-model metrics gathered by the training loop in the previous cell.
update_experiments_file(base_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 1: Importancia de les caracteristiques

Entrenem un BalancedRandomForest per identificar les característiques més importants (del Top-3 al Top-20); posteriorment reentrenem els models utilitzant cadascun d'aquests subconjunts de característiques, per veure si augmenta el rendiment del model. Provarem també amb permutation importance.

### Feature Importance

La Gini importance d’una feature és: La suma de totes les reduccions d’impuresa (Gini) que ha causat al llarg de tots els arbres i de totes les seves aparicions.

In [None]:
# Base pipeline for the balanced random forest: shared preprocessing plus the classifier.
rf_name = "BalancedRandomForest"

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", CLASSIFIERS[rf_name])
])

# Hyper-parameter search, fit and evaluation through the project helper.
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipeline,
    PARAM_GRIDS[rf_name],
    search_type='grid'
)

# Read the fitted preprocessor of the best estimator under its own name, so the
# shared `preprocessor` object used to build other pipelines is not rebound here
# (rebinding it made later cells depend on which cell had run last).
fitted_preprocessor = best_est.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()

# Classifier importances labelled with the transformed column names.
importances_raw = pd.Series(
    best_est.named_steps["classifier"].feature_importances_,
    index=feature_names
)

# Sum the importances that share a column name and rank them, highest first.
agg_importances = (
    importances_raw.groupby(feature_names).sum().sort_values(ascending=False)
)

# Top-3 to Top-20 feature subsets, keyed by subset size.
fi_top_features = {
    size: agg_importances.head(size).index.tolist() for size in range(3, 21)
}

# The individual topN names are kept because the retraining cell reads them directly.
(top3, top4, top5, top6, top7, top8, top9, top10, top11,
 top12, top13, top14, top15, top16, top17, top18, top19, top20) = (
    fi_top_features[size] for size in range(3, 21)
)

for size, top_feats in fi_top_features.items():
    print(f"Top-{size} features:", top_feats)



Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458

Top-3 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi']
Top-4 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr']
Top-5 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal']
Top-6 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_variation']
Top-7 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_v

### Reentrenament de models amb les 3-20 millors features

In [None]:
# Feature subsets to evaluate, labelled Top3 ... Top20 in increasing size.
feature_sets = {
    f"Top{size}": subset
    for size, subset in zip(
        range(3, 21),
        (top3, top4, top5, top6, top7, top8, top9, top10, top11,
         top12, top13, top14, top15, top16, top17, top18, top19, top20),
    )
}

# Retrain and evaluate on every subset.
fi_results = []

# Preprocess once, outside the loops; each subset is then a column selection.
preprocessor = build_preprocessor(X_train, FEATURES)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed data in DataFrames so columns can be selected by name.
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)


for label, feats in feature_sets.items():
    print(f"\nEntrenament amb {label}")
    for model, classifier in CLASSIFIERS_FILTER.items():

        print(f'==== {model} ====')
        # BalancedRandomForest skips the external balancing step
        # (the author noted it gave better results without it).
        steps = []
        if model != "BalancedRandomForest":
            steps.append(("balance", balance_method))
        steps.append(("classifier", classifier))
        pipe = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[feats],
            y_train,
            X_test_transformed[feats],
            y_test,
            pipe,
            PARAM_GRIDS[model],
        )

        # Accumulate the metrics of this (subset, model) pair.
        fi_results_df = append_results(
            fi_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"FI_{label}",
        )


Entrenament amb Top3
==== LGBM ====
Entrenant model...

Train F1 (1): 0.6195 | Test F1 (1): 0.5277 | Train Acc: 0.6070 | Test Acc: 0.5153
              precision    recall  f1-score   support

         0.0     0.6829    0.3972    0.5022       282
         1.0     0.4218    0.7045    0.5277       176

    accuracy                         0.5153       458
   macro avg     0.5523    0.5509    0.5150       458
weighted avg     0.5826    0.5153    0.5120       458


Entrenament amb Top4
==== LGBM ====
Entrenant model...

Train F1 (1): 0.6464 | Test F1 (1): 0.5431 | Train Acc: 0.6386 | Test Acc: 0.5371
              precision    recall  f1-score   support

         0.0     0.7059    0.4255    0.5310       282
         1.0     0.4375    0.7159    0.5431       176

    accuracy                         0.5371       458
   macro avg     0.5717    0.5707    0.5370       458
weighted avg     0.6027    0.5371    0.5356       458


Entrenament amb Top5
==== LGBM ====
Entrenant model...

Train F1 (1

In [None]:
# Show the selected metric columns of the feature-importance runs, then persist the full table.
display(fi_results_df[display_cols])
update_experiments_file(fi_results_df)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,LGBM_FI_Top3,0.61945,0.60656,0.60699,0.70455,0.42177,0.52766,0.51495,0.51528
1,LGBM_FI_Top4,0.64637,0.63847,0.63865,0.71591,0.4375,0.5431,0.53704,0.53712
2,LGBM_FI_Top5,0.66988,0.66307,0.66321,0.67045,0.43704,0.52915,0.54117,0.54148
3,LGBM_FI_Top6,0.67466,0.67413,0.67413,0.77841,0.48754,0.59956,0.60043,0.60044
4,LGBM_FI_Top7,0.70303,0.70576,0.70579,0.71023,0.4417,0.54466,0.54367,0.54367
5,LGBM_FI_Top8,0.69934,0.70194,0.70197,0.72159,0.45357,0.55702,0.55894,0.55895
6,LGBM_FI_Top9,0.71973,0.73047,0.7309,0.70455,0.45421,0.55234,0.56097,0.56114
7,LGBM_FI_Top10,0.72675,0.73955,0.74017,0.69886,0.46067,0.5553,0.56941,0.56987
8,LGBM_FI_Top11,0.73946,0.75307,0.75382,0.70455,0.45091,0.54989,0.55666,0.55677
9,LGBM_FI_Top12,0.73781,0.74996,0.75055,0.72159,0.4652,0.5657,0.57407,0.57424



Métriques guardades a ../results/02_experiments/experiments.csv



### Permutation Importance

Per avaluar la importància de les característiques del model. Serveix per determinar quines característiques tenen més impacte en el rendiment del model.

In [None]:
# permutation_importance and plt are already imported in the notebook's first cell,
# so they are not imported again here.

# Balanced random forest and its parameter grid from the project configuration.
rf_name = 'BalancedRandomForest'
clf_rf, param_grid_rf = get_classifier_config(rf_name)

# Fresh pipeline: shared preprocessing plus the classifier.
pipe = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", clf_rf)
])

# Hyper-parameter search, fit and evaluation through the project helper.
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipe,
    param_grid_rf,
    search_type='grid'
)

best_rf_model = best_est

# Use a dedicated name for the fitted preprocessor so the shared `preprocessor`
# object used to build other pipelines is not rebound by this cell.
fitted_preprocessor = best_rf_model.named_steps['preprocessor']
classifier = best_rf_model.named_steps['classifier']

X_test_transformed = fitted_preprocessor.transform(X_test)
feature_names = fitted_preprocessor.get_feature_names_out()

# Pipeline holding only the fitted classifier, since the data is already transformed.
final_estimator = Pipeline([
    ('classifier', classifier)
])

# Permutation importance measured on data the model was not fitted on.
# NOTE(review): the features ranked here on the test set are later used to retrain
# models that are scored on this same test set, which can bias those scores upwards.
# Consider ranking on a separate validation split instead.
result = permutation_importance(
    final_estimator,
    X_test_transformed,
    y_test,
    n_repeats=200,
    n_jobs=-1,
    random_state=42
)

# Mean importance per transformed column, highest first.
perm_importances = pd.Series(
    result.importances_mean,
    index=feature_names
).sort_values(ascending=False)

# Top-3 to Top-20 feature subsets, keyed by subset size.
perm_top_features = {
    size: perm_importances.head(size).index.tolist() for size in range(3, 21)
}

# The label now matches the 20 rows that are displayed (it previously said Top-15).
print("\nTop-20 features (Permutation):")
display(perm_importances.head(20).to_frame("Importancia"))

Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458



KeyboardInterrupt: 

### Reentrenament Permutation importance

In [None]:
# Feature subsets ranked by permutation importance, labelled Perm_Top3 ... Perm_Top20.
perm_sets = {f"Perm_Top{size}": perm_top_features[size] for size in range(3, 21)}

print(perm_top_features[20])

# Retrain and evaluate on every subset.
perm_results = []

# Preprocess once, outside the loops, for efficiency.
preprocessor = build_preprocessor(X_train, FEATURES)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Wrap the transformed data in DataFrames, keeping the original row indexes.
feature_names = preprocessor.get_feature_names_out()
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

for label, selected_cols in perm_sets.items():
    print(f"\nEntrenament models amb {label}")

    for model_name, classifier in CLASSIFIERS_FILTER.items():
        print(f'==== {model_name} ====')

        # BalancedRandomForest skips the external balancing step.
        steps = []
        if model_name != "BalancedRandomForest":
            steps.append(("balance", balance_method))
        steps.append(("classifier", classifier))
        pipe = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[selected_cols],
            y_train,
            X_test_transformed[selected_cols],
            y_test,
            pipe,
            PARAM_GRIDS[model_name],
        )

        # The subset label (Perm_TopX) is used directly as the experiment name.
        perm_results_df = append_results(
            perm_results,
            model_name,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=label,
        )

{3: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep'], 4: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate'], 5: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate', 'num__daily_temperature_variation'], 6: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate', 'num__daily_temperature_variation', 'num__minutes_in_default_zone_1'], 7: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate', 'num__daily_temperature_variation', 'num__minutes_in_default_zone_1', 'num__wake_after_sleep_pct'], 8: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate', 'num__daily_temperature_variation', 'num__minutes_in_default_zone_1', 'num__wake_after_sleep_pct', 'num__calories'], 9: ['num__bmi', 'num__recovery_factor', 'num__minutesAsleep', 'num__full_sleep_breathing_rate', 'num__daily_temperature_variation'

In [None]:
# Show the selected metric columns of the permutation-importance retraining runs.
display(perm_results_df[display_cols])

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_Perm_Top3,0.66561,0.65409,0.65448,0.76136,0.46207,0.57511,0.56755,0.56769
1,LGBM_Perm_Top3,0.60874,0.53303,0.54531,0.84091,0.42651,0.56597,0.49418,0.50437
2,BalancedRandomForest_Perm_Top4,0.67388,0.67082,0.67085,0.76136,0.46048,0.57388,0.56533,0.5655
3,LGBM_Perm_Top4,0.6137,0.54693,0.55677,0.80114,0.4184,0.54971,0.48825,0.49563
4,BalancedRandomForest_Perm_Top5,0.67676,0.67355,0.67358,0.74432,0.45017,0.56103,0.55223,0.5524
5,LGBM_Perm_Top5,0.63061,0.57519,0.58242,0.80682,0.42262,0.55469,0.49517,0.50218
6,BalancedRandomForest_Perm_Top6,0.69617,0.69265,0.69269,0.74432,0.44558,0.55745,0.54554,0.54585
7,LGBM_Perm_Top6,0.66127,0.63194,0.63428,0.77841,0.42947,0.55354,0.5143,0.51747
8,BalancedRandomForest_Perm_Top7,0.6971,0.69206,0.69214,0.75568,0.44333,0.55882,0.54078,0.54148
9,LGBM_Perm_Top7,0.67289,0.6546,0.65557,0.71023,0.44014,0.54348,0.54148,0.54148


In [None]:
# Persist the permutation-importance retraining metrics.
update_experiments_file(perm_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 2: PCA


Es realitza una anàlisi de components principals (PCA) per examinar com evolucionen els components més rellevants del conjunt de dades en termes de variància explicada acumulada, considerant des dels primers 5 fins als primers 28 components.

In [None]:
# Fit a PCA with all components on the transformed training data
# (X_train_transformed comes from the preceding retraining cell).
pca = PCA(random_state=42)
pca.fit(X_train_transformed)

# Cumulative explained variance, as a percentage.
explained_cumsum = pca.explained_variance_ratio_.cumsum()*100

# Component counts to highlight; entry k-1 holds the cumulative value for k components.
ks = list(range(5, 29))
cums = explained_cumsum[[k-1 for k in ks]]

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    range(1, len(explained_cumsum) + 1),
    explained_cumsum,
)
ax.scatter(ks, cums)
ax.set_xlabel('Número de componentes')
ax.set_ylabel('Varianza explicada acumulada')
ax.set_title('Evolución de la varianza explicada según n_components')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Component counts to analyse; the PCA training cell below reuses this list.
n_components_list = list(range(5, 29))
for k in n_components_list:
    # Fit a k-component PCA on the transformed training data.
    pca = PCA(n_components=k, random_state=42).fit(X_train_transformed)

    # Loadings matrix with one row per feature and one column per component.
    loadings = pca.components_.T

    # Importance of a feature = sum of its absolute loadings over all components.
    importance = np.abs(loadings).sum(axis=1)

    # Rank the features and keep the k most important ones.
    df_imp = (
        pd.DataFrame({'feature': feature_names, 'importance': importance})
        .sort_values('importance', ascending=False)
        .reset_index(drop=True)
    )
    topk = df_imp.head(k)

    # Horizontal bars, reversed so the most important feature is drawn at the top.
    plt.figure()
    plt.barh(topk['feature'][::-1], topk['importance'][::-1])
    plt.xlabel('Importancia (suma de |carregues|)')
    plt.title(f'Top {k} features segons PCA')
    plt.tight_layout()
    plt.show()

In [None]:
# Accumulates the metrics of every (n_components, model) run.
pca_results = []

for k in n_components_list:
    print(f"\n-- PCA - {k} components --")

    for model, classifier in CLASSIFIERS_FILTER.items():
        # Order: preprocessing, optional balancing, PCA, classifier.
        # BalancedRandomForest skips the external balancing step.
        steps = [("preprocessor", preprocessor)]
        if model != "BalancedRandomForest":
            steps.append(("balance", balance_method))
        steps.append(("pca", PCA(n_components=k, random_state=42)))
        steps.append(("classifier", classifier))
        pipeline = ImbPipeline(steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train, y_train, X_test, y_test,
            pipeline,
            PARAM_GRIDS[model],
            search_type="grid",
        )

        df_pca_results = append_results(
            pca_results,
            model,
            train_report,
            test_report,
            best_params,
            best_score,
            experiment=f"PCA_{k}",
        )



-- PCA - 5 components --
Entrenant model...

Train F1 (1): 0.6504 | Test F1 (1): 0.5576 | Train Acc: 0.5939 | Test Acc: 0.4803
              precision    recall  f1-score   support

         0.0     0.7292    0.2482    0.3704       282
         1.0     0.4144    0.8523    0.5576       176

    accuracy                         0.4803       458
   macro avg     0.5718    0.5502    0.4640       458
weighted avg     0.6082    0.4803    0.4423       458

Entrenant model...

Train F1 (1): 0.6439 | Test F1 (1): 0.5343 | Train Acc: 0.5901 | Test Acc: 0.4520
              precision    recall  f1-score   support

         0.0     0.6632    0.2234    0.3342       282
         1.0     0.3967    0.8182    0.5343       176

    accuracy                         0.4520       458
   macro avg     0.5299    0.5208    0.4343       458
weighted avg     0.5608    0.4520    0.4111       458


-- PCA - 8 components --
Entrenant model...

Train F1 (1): 0.7017 | Test F1 (1): 0.5444 | Train Acc: 0.6905 | Test 

In [None]:
# Show the selected metric columns of the PCA runs, then persist the full table.
display(df_pca_results[display_cols])
update_experiments_file(df_pca_results)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_PCA_5,0.65038,0.583,0.59389,0.85227,0.41436,0.55762,0.464,0.48035
1,LGBM_PCA_5,0.64391,0.58047,0.59007,0.81818,0.39669,0.53432,0.43427,0.45197
2,BalancedRandomForest_PCA_8,0.70174,0.69006,0.6905,0.78409,0.41692,0.54438,0.48979,0.49563
3,LGBM_PCA_8,0.68209,0.6525,0.65502,0.85227,0.41899,0.5618,0.47462,0.48908
4,BalancedRandomForest_PCA_10,0.70601,0.69502,0.69541,0.81818,0.41618,0.55172,0.47891,0.48908
5,LGBM_PCA_10,0.69411,0.66958,0.6714,0.84091,0.41457,0.55535,0.46827,0.48253
6,BalancedRandomForest_PCA_12,0.70361,0.6899,0.6905,0.84659,0.4221,0.56333,0.48321,0.49563
7,LGBM_PCA_12,0.70582,0.68721,0.68832,0.82955,0.42319,0.56046,0.49036,0.5
8,BalancedRandomForest_PCA_15,0.72816,0.72485,0.72489,0.84091,0.43658,0.57476,0.51431,0.52183
9,LGBM_PCA_15,0.73661,0.73415,0.73417,0.82955,0.43713,0.57255,0.5178,0.52402



Métriques guardades a ../results/02_experiments/experiments.csv



## Anàlisi de resultats

In [None]:
# Load the experiments table. The original note says the file is already sorted by F1,
# so the first rows are taken as the best runs.
# NOTE(review): that ordering is not enforced here; confirm update_experiments_file
# keeps the file sorted.
experiments_df = pd.read_csv("../results/02_experiments/experiments.csv")

# Use a dedicated name: `top5` already holds the Top-5 feature list built in the
# feature-importance cell, and overwriting it with a DataFrame would break a re-run
# of the retraining cell.
top5_experiments = experiments_df.head(5)

# Show the five best experiments.
display(top5_experiments)


Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Precision (1),Test Recall (1),Test F1 (1),Test F1 (macro global),Test Accuracy,Best Params
0,LGBM_Perm_Top13,0.7139,0.71122,0.71124,0.47841,0.81818,0.60377,0.58662,0.58734,"{'classifier__subsample': 0.7511807565247405, ..."
1,LGBM_Perm_Top15,0.71538,0.71507,0.71507,0.47213,0.81818,0.59875,0.57754,0.5786,"{'classifier__subsample': 0.7511807565247405, ..."
2,LGBM_Perm_Top12,0.69345,0.68307,0.68341,0.47,0.80114,0.59244,0.57576,0.57642,"{'classifier__subsample': 0.7511807565247405, ..."
3,BalancedRandomForest_Perm_Top13,0.69537,0.687,0.68723,0.46405,0.80682,0.58921,0.5665,0.56769,"{'classifier__n_estimators': 1163, 'classifier..."
4,LGBM_FI_Top10,0.70198,0.69584,0.69596,0.46254,0.80682,0.58799,0.5642,0.5655,
