# LifeSnaps Preprocessing Experiments

Experiments amb un menor nombre de features.

#### Import de llibreries necessàries

In [5]:
import warnings
warnings.filterwarnings('ignore')

# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Imbalanced data pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Core utilities
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Model definitions fora dels defints
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


############## IMPORTS DEL NOSTRE PAQUET ###################
# Obtenim les funcions per entrenar i evaluar els models i registrar les mètriques
from ai_health_assistant.utils.train_helpers import train_models, append_results, plot_learning_curve, mat_confusio, update_experiments_file

# Obtenim els classificadors i els seus parametres
from ai_health_assistant.utils.model_config import get_classifier_config, PARAM_GRIDS, CLASSIFIERS, BALANCING_METHODS

# Obtenim el target, les features i la construcció del preprocessador
from ai_health_assistant.utils.prep_helpers import TARGET, build_preprocessor, FEATURES

# pandas display setup: never truncate the column list when rendering frames
pd.set_option('display.max_columns', None)

# Read the cleaned, feature-engineered train and test splits from disk
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/df_engineered_train.csv',
        '../data/df_engineered_test.csv',
    )
)

# One "Shape" line per split, train first
print(f"Shape: {df_train.shape}", f"Shape: {df_test.shape}", sep="\n")


Shape: (1832, 40)
Shape: (458, 40)


## Lectura de dades i split de train / test

### Train / Test Split

In [6]:
# Separate the predictors (FEATURES) from the label (TARGET) in both splits
X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

# Report the size of each feature matrix
print(f"\nTrain shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print('\n','--'*50)

# Relative class frequencies of the target in each split
print(f"\nDistribució train:\n{y_train.value_counts(normalize=True)}")
print(f"\nDistribució test:\n{y_test.value_counts(normalize=True)}")

# Partition the feature columns by dtype: numeric vs. everything else
numerical_features = list(X_train.select_dtypes(include=['number']).columns)
categorical_features = list(X_train.select_dtypes(exclude=['number']).columns)

print(f"\nCol. numeriques ({len(numerical_features)}): \n{numerical_features}")
print(f"Col. categoriques ({len(categorical_features)}): \n{categorical_features}")



Train shape: (1832, 37)
Test shape: (458, 37)

 ----------------------------------------------------------------------------------------------------

Distribució train:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Distribució test:
TIRED
0.0    0.615721
1.0    0.384279
Name: proportion, dtype: float64

Col. numeriques (35): 
['bmi', 'calories', 'steps', 'lightly_active_minutes', 'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes', 'resting_hr', 'minutes_below_default_zone_1', 'minutes_in_default_zone_1', 'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'minutesAsleep', 'minutesAwake', 'sleep_efficiency', 'sleep_deep_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'sleep_wake_ratio', 'daily_temperature_variation', 'rmssd', 'spo2', 'full_sleep_breathing_rate', 'wake_after_sleep_pct', 'steps_norm_cal', 'deep_sleep_score', 'active_sedentary_ratio', 'sleep_activity_balance', 'bmi_hr_interaction', 'sleep_quality_index', 'hr_zone_variab

### Definim el preprocessador

In [7]:
# Build the preprocessing transformer with the project helper, from the
# training frame and the FEATURES list; later cells plug it into their pipelines.
preprocessor = build_preprocessor(df_train, FEATURES)

## ENTRENAMENT DEL MODEL BASE

In [8]:
# Metric columns shown whenever a results table is displayed
display_cols = [
    'Experiment',
    'Train F1 (1)',
    'Train F1 (macro global)',
    'Train Accuracy',
    'Test Recall (1)',
    'Test Precision (1)',
    'Test F1 (1)',
    'Test F1 (macro global)',
    'Test Accuracy',
]

# --------------------------------------------------------------
# Names of the models to train (keys of CLASSIFIERS)
PROVA_MODELS = ["BalancedRandomForest", "LGBM"]
# Key of the balancing method looked up in BALANCING_METHODS
balance_name = "SMOTETomek"
# --------------------------------------------------------------

# Keep only the selected classifiers so not every model has to be trained
CLASSIFIERS_FILTER = {
    name: estimator
    for name, estimator in CLASSIFIERS.items()
    if name in PROVA_MODELS
}
balance_method = BALANCING_METHODS[balance_name]

### Regressió Logística

In [9]:

# Accumulator handed to append_results for this section
reg_results = []
reg_models = {}

# Logistic regression stacked on the shared preprocessor
reg_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    (
        "classifier",
        LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",  # reweight classes to offset the class imbalance
            max_iter=2000,            # generous iteration budget for the solver
        ),
    ),
])

# Regularisation strengths (C) explored by the search
reg_param_grid = {"classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 500, 1000]}

# Search, fit and evaluate on both splits
(
    best_est,
    y_train_pred,
    train_report,
    y_test_pred,
    test_report,
    best_params,
    best_score,
) = train_models(
    X_train, y_train, X_test, y_test,
    reg_pipeline, reg_param_grid,
    search_type='grid',
)

# Record the metrics under the "EntrenamentBasic" experiment label
reg_results_df = append_results(
    reg_results, "LogisticRegression",
    train_report, test_report,
    best_params, best_score,
    experiment="EntrenamentBasic",
)

display(reg_results_df[display_cols])
update_experiments_file(reg_results_df)

Entrenant model...

Train F1 (1): 0.5156 | Test F1 (1): 0.4800 | Train Acc: 0.5939 | Test Acc: 0.5742
              precision    recall  f1-score   support

         0.0     0.6680    0.6135    0.6396       282
         1.0     0.4523    0.5114    0.4800       176

    accuracy                         0.5742       458
   macro avg     0.5601    0.5624    0.5598       458
weighted avg     0.5851    0.5742    0.5782       458



Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,LogisticRegression_EntrenamentBasic,0.51562,0.583,0.59389,0.51136,0.45226,0.48,0.55978,0.57424



Métriques guardades a ../results/02_experiments/experiments.csv



In [10]:
# Accumulator handed to append_results for the baseline runs
base_results = []

for model, classifier in CLASSIFIERS_FILTER.items():

    # Every pipeline starts with the preprocessor and ends with the classifier;
    # the resampling step is inserted for all models except BalancedRandomForest.
    pipeline_steps = [("preprocessor", preprocessor)]
    if model != "BalancedRandomForest":
        pipeline_steps.append(("balance", balance_method))
    pipeline_steps.append(("classifier", classifier))
    pipeline = ImbPipeline(pipeline_steps)

    print(f"\n==== {model} ====")
    (
        best_est,
        y_train_pred,
        train_report,
        y_test_pred,
        test_report,
        best_params,
        best_score,
    ) = train_models(
        X_train, y_train, X_test, y_test,
        pipeline, PARAM_GRIDS[model],
        search_type='grid',
    )

    # The value returned by the last iteration is the table persisted afterwards
    base_results_df = append_results(
        base_results, model,
        train_report, test_report,
        best_params, best_score,
        experiment="EntrenamentBasic",
    )


==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458


==== LGBM ====
Entrenant model...

Train F1 (1): 0.7795 | Test F1 (1): 0.5391 | Train Acc: 0.7953 | Test Acc: 0.5371
              precision    recall  f1-score   support

         0.0     0.7011    0.4326    0.5351       282
         1.0     0.4366    0.7045    0.5391       176

    accuracy                         0.5371       458
   macro avg     0.5689    0.5686    0.5371       458
weighted avg     0.5995    0.5371    0.5366       458



In [11]:
# Persist the baseline metrics table through the project helper
update_experiments_file(base_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 1: Importancia de les caracteristiques

Entrenem un RandomForest per identificar les característiques més importants; posteriorment reentrenem els models utilitzant només les N millors (de 3 a 15) per veure si augmenta el rendiment del model. Provarem també amb permutation importance.

### Feature Importance

La Gini importance d’una feature és: La suma de totes les reduccions d’impuresa (Gini) que ha causat al llarg de tots els arbres i de totes les seves aparicions.

In [12]:
# Basic BalancedRandomForest pipeline: shared preprocessor + classifier,
# searched over that model's parameter grid.
rf_name = "BalancedRandomForest"

pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", CLASSIFIERS[rf_name])
])

# Hyper-parameter search, fit and evaluation on both splits
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipeline,
    PARAM_GRIDS[rf_name],
    search_type='grid'
)

# Read the fitted preprocessing step under its own name, so the notebook-level
# `preprocessor` is not silently replaced by a fitted instance (which would make
# earlier cells behave differently when re-run).
fitted_preprocessor = best_est.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()

# Impurity-based importances of the fitted forest, labelled by output feature name
importances_raw = pd.Series(
    best_est.named_steps["classifier"].feature_importances_,
    index=feature_names
)

# Sum the importance per feature name and rank from most to least important
agg_importances = (
    importances_raw.groupby(feature_names).sum().sort_values(ascending=False)
)

# Top-3 to Top-15 feature subsets, keyed by subset size
top_features_by_k = {
    n_top: agg_importances.head(n_top).index.tolist() for n_top in range(3, 16)
}

# The individual names are kept because the retraining cell refers to them
(
    top3, top4, top5, top6, top7, top8, top9,
    top10, top11, top12, top13, top14, top15,
) = (top_features_by_k[n_top] for n_top in range(3, 16))

for n_top, feats in top_features_by_k.items():
    print(f"Top-{n_top} features:", feats)


Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458

Top-3 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi']
Top-4 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr']
Top-5 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal']
Top-6 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_variation']
Top-7 features: ['num__calories', 'num__bmi_hr_interaction', 'num__bmi', 'num__resting_hr', 'num__steps_norm_cal', 'num__daily_temperature_v

### Reentrenament de models amb les 3-15 millors features

In [13]:
# Feature subsets to evaluate, labelled "Top3" … "Top15"
feature_sets = {
    f"Top{size}": subset
    for size, subset in zip(
        range(3, 16),
        (top3, top4, top5, top6, top7, top8, top9,
         top10, top11, top12, top13, top14, top15),
    )
}

# Accumulator handed to append_results for the feature-importance runs
fi_results = []

# Preprocess once, outside the loops: fit on train, apply to test
preprocessor = build_preprocessor(X_train, FEATURES)
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed matrices as DataFrames so subsets can be selected by name
X_train_transformed = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)


for label, feats in feature_sets.items():
    print(f"\nEntrenament amb {label}")
    for model, classifier in CLASSIFIERS_FILTER.items():

        print(f'==== {model} ====')
        # BalancedRandomForest runs without the resampling step
        # (author's note: it gave better results that way)
        pipeline_steps = [("classifier", classifier)]
        if model != "BalancedRandomForest":
            pipeline_steps.insert(0, ("balance", balance_method))
        pipe = ImbPipeline(pipeline_steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[feats], y_train,
            X_test_transformed[feats], y_test,
            pipe, PARAM_GRIDS[model]
        )

        # The value returned by the last iteration is the table shown afterwards
        fi_results_df = append_results(
            fi_results, model,
            train_report, test_report,
            best_params, best_score,
            experiment=f"FI_{label}"
        )


Entrenament amb Top3
==== BalancedRandomForest ====
Entrenant model...

Train F1 (1): 0.6505 | Test F1 (1): 0.5620 | Train Acc: 0.6064 | Test Acc: 0.5066
              precision    recall  f1-score   support

         0.0     0.7373    0.3085    0.4350       282
         1.0     0.4265    0.8239    0.5620       176

    accuracy                         0.5066       458
   macro avg     0.5819    0.5662    0.4985       458
weighted avg     0.6178    0.5066    0.4838       458

==== LGBM ====
Entrenant model...

Train F1 (1): 0.6168 | Test F1 (1): 0.5614 | Train Acc: 0.5693 | Test Acc: 0.5087
              precision    recall  f1-score   support

         0.0     0.7355    0.3156    0.4417       282
         1.0     0.4273    0.8182    0.5614       176

    accuracy                         0.5087       458
   macro avg     0.5814    0.5669    0.5015       458
weighted avg     0.6171    0.5087    0.4877       458


Entrenament amb Top4
==== BalancedRandomForest ====
Entrenant model...

T

In [14]:
# Show the selected metric columns of the feature-importance runs,
# then persist the full table through the project helper.
display(fi_results_df[display_cols])
update_experiments_file(fi_results_df)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,BalancedRandomForest_FI_Top3,0.65051,0.60008,0.60644,0.82386,0.42647,0.56202,0.49851,0.50655
1,LGBM_FI_Top3,0.6168,0.56261,0.56932,0.81818,0.4273,0.5614,0.50155,0.50873
2,BalancedRandomForest_FI_Top4,0.65955,0.61068,0.61681,0.78409,0.42462,0.5509,0.50436,0.50873
3,LGBM_FI_Top4,0.63524,0.59476,0.5988,0.77273,0.43871,0.55967,0.531,0.53275
4,BalancedRandomForest_FI_Top5,0.64891,0.58822,0.59716,0.80114,0.41716,0.54864,0.48576,0.49345
5,LGBM_FI_Top5,0.65312,0.62509,0.62718,0.73864,0.43046,0.54393,0.52311,0.52402
6,BalancedRandomForest_FI_Top6,0.66634,0.62198,0.62718,0.78977,0.42638,0.55378,0.50636,0.51092
7,LGBM_FI_Top6,0.66019,0.63531,0.63701,0.82386,0.45597,0.58704,0.55182,0.55459
8,BalancedRandomForest_FI_Top7,0.70062,0.68112,0.68231,0.77273,0.44013,0.56082,0.53331,0.53493
9,LGBM_FI_Top7,0.68046,0.66161,0.66266,0.78977,0.43849,0.56389,0.52781,0.53057



Métriques guardades a ../results/02_experiments/experiments.csv



### Permutation Importance

Per avaluar la importància de les característiques del model. Serveix per determinar quines característiques tenen més impacte en el rendiment del model.

In [15]:
# Fit the BalancedRandomForest whose predictions are probed by permutation importance.
# (`permutation_importance` and `plt` are already imported in the notebook's import cell.)
rf_name = 'BalancedRandomForest'
clf_rf, param_grid_rf = get_classifier_config(rf_name)

# Fresh pipeline: shared preprocessor + classifier
pipe = ImbPipeline([
    ("preprocessor", preprocessor),
    ("classifier", clf_rf)
])

# Hyper-parameter search, fit and evaluation on both splits
best_est, y_train_pred, train_report, y_test_pred, test_report, best_params, best_score = train_models(
    X_train,
    y_train,
    X_test,
    y_test,
    pipe,
    param_grid_rf,
    search_type='grid'
)

best_rf_model = best_est

# Fitted steps get their own names so the notebook-level `preprocessor` and the
# loop variable `classifier` used by other cells are not overwritten.
fitted_preprocessor = best_rf_model.named_steps['preprocessor']
fitted_classifier = best_rf_model.named_steps['classifier']

# Test split in the classifier's input space; stored under a separate name so the
# DataFrame `X_test_transformed` built by the retraining cells is left untouched.
X_test_perm = fitted_preprocessor.transform(X_test)

feature_names = fitted_preprocessor.get_feature_names_out()

# Permutation importance of the fitted classifier, scored on the test split.
# NOTE(review): the features ranked here are later used to retrain models that
# are scored on this same test split, so the selection has seen the test data.
# Consider ranking on a held-out validation split instead.
result = permutation_importance(
    fitted_classifier,
    X_test_perm,
    y_test,
    n_repeats=200,
    n_jobs=-1,
    random_state=42
)

# Mean importance per feature, ranked from most to least important
perm_importances = pd.Series(
    result.importances_mean,
    index=feature_names
).sort_values(ascending=False)

# Top-3 to Top-15 feature subsets, keyed by subset size
perm_top_features = {
    n_top: perm_importances.head(n_top).index.tolist() for n_top in range(3, 16)
}

print("\nTop-15 features (Permutation):")
display(perm_importances.head(15).to_frame("Importancia"))

Entrenant model...

Train F1 (1): 0.7136 | Test F1 (1): 0.5793 | Train Acc: 0.6960 | Test Acc: 0.5306
              precision    recall  f1-score   support

         0.0     0.7724    0.3369    0.4691       282
         1.0     0.4418    0.8409    0.5793       176

    accuracy                         0.5306       458
   macro avg     0.6071    0.5889    0.5242       458
weighted avg     0.6453    0.5306    0.5115       458


Top-15 features (Permutation):


Unnamed: 0,Importancia
num__bmi,0.025666
num__recovery_factor,0.020742
num__minutesAsleep,0.019367
num__full_sleep_breathing_rate,0.013919
num__daily_temperature_variation,0.012609
num__minutes_in_default_zone_1,0.01143
num__wake_after_sleep_pct,0.010993
num__calories,0.00929
num__active_to_rest_transition,0.008319
num__rmssd,0.008166


### Reentrenament Permutation importance

In [35]:
# Two feature subsets chosen by permutation importance: the 3 and the 5
# best-ranked feature names
perm_sets = {
    f"Perm_Top{size}": perm_top_features[size] for size in (3, 5)
}

print(perm_top_features)

# Accumulator handed to append_results for the permutation-importance runs
perm_results = []

# Preprocess once, outside the loops: fit on train, apply to test
preprocessor = build_preprocessor(X_train, FEATURES)
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Wrap the matrices as DataFrames (original indices kept) so subsets can be
# selected by feature name
feature_names = preprocessor.get_feature_names_out()
X_train_transformed = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test_transformed = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

for label, selected_features in perm_sets.items():
    print(f"\nEntrenament models amb {label}")

    for model_name, classifier in CLASSIFIERS_FILTER.items():
        print(f'==== {model_name} ====')

        # BalancedRandomForest runs without the resampling step
        pipeline_steps = [("classifier", classifier)]
        if model_name != "BalancedRandomForest":
            pipeline_steps.insert(0, ("balance", balance_method))
        pipe = ImbPipeline(pipeline_steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train_transformed[selected_features], y_train,
            X_test_transformed[selected_features], y_test,
            pipe, PARAM_GRIDS[model_name]
        )

        # The subset label (Perm_Top3 / Perm_Top5) is used directly as the experiment name
        perm_results_df = append_results(
            perm_results, model_name,
            train_report, test_report,
            best_params, best_score,
            experiment=label
        )

{3: ['num__bmi', 'num__calories', 'num__recovery_factor'], 5: ['num__bmi', 'num__calories', 'num__recovery_factor', 'num__full_sleep_breathing_rate', 'num__minutesAsleep']}

Entrenament models amb Perm_Top3
==== MLP ====
Entrenant model...

Train F1 (1): 0.4444 | Test F1 (1): 0.4294 | Train Acc: 0.5961 | Test Acc: 0.5939
              precision    recall  f1-score   support

         0.0     0.6558    0.7163    0.6847       282
         1.0     0.4667    0.3977    0.4294       176

    accuracy                         0.5939       458
   macro avg     0.5613    0.5570    0.5571       458
weighted avg     0.5831    0.5939    0.5866       458

==== SVM ====
Entrenant model...

Train F1 (1): 0.4977 | Test F1 (1): 0.4570 | Train Acc: 0.5901 | Test Acc: 0.5590
              precision    recall  f1-score   support

         0.0     0.6527    0.6064    0.6287       282
         1.0     0.4337    0.4830    0.4570       176

    accuracy                         0.5590       458
   macro avg    

In [36]:
# Show the selected metric columns of the permutation-importance runs
display(perm_results_df[display_cols])

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,MLP_Perm_Top3,0.44444,0.56356,0.59607,0.39773,0.46667,0.42945,0.5571,0.59389
1,SVM_Perm_Top3,0.49766,0.57571,0.59007,0.48295,0.43367,0.45699,0.54283,0.55895
2,GradientBoosting_Perm_Top3,0.54827,0.59591,0.60153,0.55682,0.4689,0.50909,0.57658,0.58734
3,BalancedRandomForest_Perm_Top3,0.63749,0.59824,0.60207,0.80114,0.44062,0.56855,0.52951,0.53275
4,LGBM_Perm_Top3,0.61762,0.59601,0.59716,0.71591,0.40909,0.52066,0.49181,0.49345
5,MLP_Perm_Top5,0.55452,0.57932,0.58079,0.64205,0.4502,0.52927,0.55912,0.56114
6,SVM_Perm_Top5,0.53171,0.57613,0.58079,0.58523,0.44978,0.50864,0.5596,0.5655
7,GradientBoosting_Perm_Top5,0.55717,0.61016,0.61736,0.5625,0.47596,0.51562,0.583,0.59389
8,BalancedRandomForest_Perm_Top5,0.65528,0.63538,0.63646,0.78409,0.45246,0.5738,0.55127,0.5524
9,LGBM_Perm_Top5,0.63559,0.6,0.60317,0.73295,0.41613,0.53086,0.50032,0.50218


In [37]:
# Persist the permutation-importance metrics table through the project helper
update_experiments_file(perm_results_df)


Métriques guardades a ../results/02_experiments/experiments.csv



## EXPERIMENT 2: PCA


Es realitza una anàlisi de components principals (PCA) per examinar com evolucionen els components més rellevants del conjunt de dades en termes de variància explicada acumulada, considerant els primers 3, 4, 5, 6, 7, 8, 9, 10 components

In [None]:
# Unrestricted PCA on the transformed training data (uses the
# X_train_transformed frame produced by the preceding retraining cell)
pca = PCA(random_state=42).fit(X_train_transformed)

# Cumulative explained variance, expressed as a percentage
explained_cumsum = 100 * np.cumsum(pca.explained_variance_ratio_)

# Component counts to highlight, and the cumulative variance reached at each
ks = list(range(3, 11))
cums = explained_cumsum[np.array(ks) - 1]

# Full curve plus markers at the highlighted component counts
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(1, explained_cumsum.size + 1), explained_cumsum)
ax.scatter(ks, cums)
ax.set_xlabel('Número de componentes')
ax.set_ylabel('Varianza explicada acumulada')
ax.set_title('Evolución de la varianza explicada según n_components')
ax.grid(True)
fig.tight_layout()
plt.show()

In [39]:
# Numbers of principal components to inspect (reused by the PCA training cell)
n_components_list = [3, 4, 5, 6, 7, 8, 9, 10]
for k in n_components_list:
    # PCA restricted to k components, fitted on the transformed training data
    pca = PCA(n_components=k, random_state=42).fit(X_train_transformed)

    # Loadings matrix, one row per feature and one column per component
    loadings = pca.components_.T

    # Per-feature importance: sum of absolute loadings across all k components
    importance = np.abs(loadings).sum(axis=1)

    # Rank the features and keep the k highest
    df_imp = (
        pd.DataFrame({'feature': feature_names, 'importance': importance})
        .sort_values('importance', ascending=False)
        .reset_index(drop=True)
    )
    topk = df_imp.head(k)

    # Horizontal bars, reversed so the most important feature is drawn at the top
    fig, ax = plt.subplots()
    ax.barh(topk['feature'].iloc[::-1], topk['importance'].iloc[::-1])
    ax.set_xlabel('Importancia (suma de |carregues|)')
    ax.set_title(f'Top {k} features segons PCA')
    fig.tight_layout()
    plt.show()

In [40]:
# Accumulator handed to append_results for the PCA runs
pca_results = []

for k in n_components_list:
    print(f"\n-- PCA - {k} components --")

    for model, classifier in CLASSIFIERS_FILTER.items():
        # preprocessor -> [balance] -> pca -> classifier; the resampling step is
        # skipped for BalancedRandomForest and otherwise placed before the PCA.
        pipeline_steps = [("preprocessor", preprocessor)]
        if model != "BalancedRandomForest":
            pipeline_steps.append(("balance", balance_method))
        pipeline_steps += [
            ("pca", PCA(n_components=k, random_state=42)),
            ("classifier", classifier),
        ]
        pipeline = ImbPipeline(pipeline_steps)

        (
            best_est,
            y_train_pred,
            train_report,
            y_test_pred,
            test_report,
            best_params,
            best_score,
        ) = train_models(
            X_train, y_train, X_test, y_test,
            pipeline, PARAM_GRIDS[model],
            search_type="grid"
        )

        # The value returned by the last iteration is the table shown afterwards
        df_pca_results = append_results(
            pca_results, model,
            train_report, test_report,
            best_params, best_score,
            experiment=f"PCA_{k}",
        )



-- PCA - 3 components --
Entrenant model...

Train F1 (1): 0.4928 | Test F1 (1): 0.4853 | Train Acc: 0.5606 | Test Acc: 0.5786
              precision    recall  f1-score   support

         0.0     0.6718    0.6170    0.6433       282
         1.0     0.4573    0.5170    0.4853       176

    accuracy                         0.5786       458
   macro avg     0.5646    0.5670    0.5643       458
weighted avg     0.5894    0.5786    0.5826       458

Entrenant model...

Train F1 (1): 0.4976 | Test F1 (1): 0.5039 | Train Acc: 0.5437 | Test Acc: 0.5786
              precision    recall  f1-score   support

         0.0     0.6816    0.5922    0.6338       282
         1.0     0.4601    0.5568    0.5039       176

    accuracy                         0.5786       458
   macro avg     0.5709    0.5745    0.5688       458
weighted avg     0.5965    0.5786    0.5839       458

Entrenant model...

Train F1 (1): 0.5237 | Test F1 (1): 0.4114 | Train Acc: 0.6059 | Test Acc: 0.5502
              

In [41]:
# Show the selected metric columns of the PCA runs,
# then persist the full table through the project helper.
display(df_pca_results[display_cols])
update_experiments_file(df_pca_results)

Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Recall (1),Test Precision (1),Test F1 (1),Test F1 (macro global),Test Accuracy
0,MLP_PCA_3,0.49275,0.55259,0.56059,0.51705,0.45729,0.48533,0.56429,0.5786
1,SVM_PCA_3,0.4976,0.5398,0.54367,0.55682,0.46009,0.50386,0.56882,0.5786
2,GradientBoosting_PCA_3,0.52375,0.59381,0.6059,0.40909,0.41379,0.41143,0.52374,0.55022
3,BalancedRandomForest_PCA_3,0.61726,0.51676,0.53766,0.84659,0.40822,0.55083,0.45142,0.46943
4,LGBM_PCA_3,0.64516,0.60578,0.60972,0.74432,0.41066,0.52929,0.48792,0.49127
5,MLP_PCA_4,0.54714,0.57124,0.5726,0.63636,0.448,0.52582,0.55679,0.55895
6,SVM_PCA_4,0.49384,0.56281,0.57369,0.5,0.45596,0.47696,0.56207,0.5786
7,GradientBoosting_PCA_4,0.54631,0.61768,0.631,0.4375,0.41622,0.42659,0.52681,0.54803
8,BalancedRandomForest_PCA_4,0.666,0.63425,0.63701,0.79545,0.4142,0.54475,0.48133,0.48908
9,LGBM_PCA_4,0.65806,0.62012,0.62391,0.75,0.3964,0.51866,0.45835,0.46507



Métriques guardades a ../results/02_experiments/experiments.csv



## Anàlisi de resultats

In [42]:
# Read the experiments table (per the original note, the file is already
# stored sorted by F1)
experiments_df = pd.read_csv("../results/02_experiments/experiments.csv")

# Stored as `top5_experiments` rather than `top5`, so the list of feature names
# called `top5` from the feature-importance section is not overwritten with a
# DataFrame.
top5_experiments = experiments_df.head(5)

# Show the five best experiments
display(top5_experiments)


Unnamed: 0,Experiment,Train F1 (1),Train F1 (macro global),Train Accuracy,Test Precision (1),Test Recall (1),Test F1 (1),Test F1 (macro global),Test Accuracy,Best Params
0,BalancedRandomForest_Perm_Top5,0.65528,0.63538,0.63646,0.45246,0.78409,0.5738,0.55127,0.5524,"{'classifier__n_estimators': 1021, 'classifier..."
1,BalancedRandomForest_FI_Top5,0.67946,0.66173,0.66266,0.44872,0.79545,0.57377,0.54389,0.54585,
2,LGBM_EntrenamentBasic,0.88122,0.89912,0.90229,0.51111,0.65341,0.57357,0.62076,0.62664,
3,BalancedRandomForest_Perm_Top3,0.63749,0.59824,0.60207,0.44062,0.80114,0.56855,0.52951,0.53275,"{'classifier__n_estimators': 1021, 'classifier..."
4,BalancedRandomForest_PCA_8,0.67928,0.6562,0.65775,0.43114,0.81818,0.56471,0.50895,0.51528,{'classifier__class_weight': 'balanced_subsamp...
