In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PowerTransformer, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial

# 1. Caricamento dati
df_train = pd.read_csv('/content/drive/MyDrive/data_calories/train.csv')
df_test  = pd.read_csv('/content/drive/MyDrive/data_calories/test.csv')

# 2. Feature engineering avanzato
def make_features(df):
    df = df.copy()
    df['Sex']               = df['Sex'].map({'male': 0, 'female': 1})
    df['BMI']               = df['Weight'] / (df['Height'] / 100) ** 2
    df['BSA']               = 0.007184 * df['Weight']**0.425 * df['Height']**0.725
    #df['Delta_Temp']        = df['Body_Temp'] - 37.0
    df['HR_per_kg']         = df['Heart_Rate'] / df['Weight']
    df['Heart_Beats_Total'] = df['Heart_Rate'] * df['Duration']
    # Interazioni aggiuntive
    df['BMI_x_Duration']    = df['BMI'] * df['Duration']
    df['HBT_per_BSA']       = df['Heart_Beats_Total'] / df['BSA']
    df['BMI_squared']       = df['BMI'] ** 2
    return df

train_fe = make_features(df_train)
test_fe  = make_features(df_test)

# 3. EDA rapida per verificare le feature (opzionale, commentato per esecuzione)
"""
plt.figure(figsize=(12, 8))
corr = train_fe.drop('id', axis=1).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Matrice di Correlazione')
plt.tight_layout()
plt.show()
"""

# 4. Definizione di X e y
X = train_fe.drop(columns=['id', 'Calories'])
y = train_fe['Calories']
X_test_raw = test_fe.drop(columns=['id'])

# 5. Identificazione delle feature asimmetriche
skew_thresh  = 0.40
skewness     = X.skew().abs()
skewed_feats = skewness[skewness > skew_thresh].index.tolist()
pos_feats    = [f for f in skewed_feats if (X[f] > 0).all()]
nonpos_feats = [f for f in skewed_feats if f not in pos_feats]

# 6. Preprocessing
# 6.1 Fit del PowerTransformer sui nonpos_feats
pt = None
if nonpos_feats:
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    pt.fit(X[nonpos_feats])

# 6.2 Trasformazione log1p per pos_feats e PowerTransformer per nonpos_feats
X_p = X.copy()
for f in pos_feats:
    X_p[f] = np.log1p(X_p[f])
if nonpos_feats:
    X_p[nonpos_feats] = pt.transform(X_p[nonpos_feats])

# 6.3 Robust Scaling
scaler = RobustScaler()
scaler.fit(X_p)
X_proc = pd.DataFrame(scaler.transform(X_p), columns=X.columns, index=X.index)

# 6.4 Preprocessing del test set
X_test_p = X_test_raw.copy()
for f in pos_feats:
    X_test_p[f] = np.log1p(X_test_p[f])
if nonpos_feats:
    X_test_p[nonpos_feats] = pt.transform(X_test_p[nonpos_feats])
X_test_proc = pd.DataFrame(scaler.transform(X_test_p), columns=X_test_p.columns)

# 7. Log-transform del target
y_log = np.log1p(y)

# 8. Definizione dello split per hold-out (10%)
X_train_main, X_hold, y_train_main, y_hold = train_test_split(
    X_proc, y_log, test_size=0.1, random_state=42
)

# 9. Iperparametri ottimali da Optuna (trial 17)
best_params = {
    'learning_rate':    0.004523596335091698,
    'max_depth':        9,
    'subsample':        0.9708888609967907,
    'colsample_bytree': 0.7684560546712256,
    'min_child_weight': 5,
    'reg_alpha':        2.9753791923207157e-05,
    'reg_lambda':       2.986233138083534
}

# 10. Definizione metrica custom RMSLE per cross-validation
def rmsle_scorer(y_true, y_pred):
    y_true_exp = np.expm1(y_true)
    y_pred_exp = np.expm1(y_pred)
    return np.sqrt(mean_squared_log_error(y_true_exp, y_pred_exp))

custom_rmsle_scorer = make_scorer(rmsle_scorer, greater_is_better=False)

# 11. Cross-validation con KFold
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Inizializzazione del modello base per CV
cv_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    **best_params
)

# Esecuzione CV sul training set principale (escludendo hold-out)
cv_results = cross_val_score(
    cv_model,
    X_train_main,
    y_train_main,
    cv=kf,
    scoring=custom_rmsle_scorer,
    n_jobs=-1,
    verbose=1
)

# Stampa risultati CV
cv_rmsle_scores = -cv_results  # Converto da negative a positive scores
print(f"Cross-Validation RMSLE Scores: {cv_rmsle_scores}")
print(f"Mean CV RMSLE: {cv_rmsle_scores.mean():.4f}, Std: {cv_rmsle_scores.std():.4f}")

# 12. Allenamento con early stopping su hold-out
model_es = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    n_estimators=2000,
    early_stopping_rounds=50,
    verbosity=1,
    **best_params
)
model_es.fit(
    X_train_main, y_train_main,
    eval_set=[(X_hold, y_hold)],
    verbose=True
)

print(f"Numero ottimale di boosting rounds: {model_es.best_iteration}")

# 13. Valutazione su hold-out
y_pred_log = model_es.predict(X_hold)
y_pred     = np.expm1(y_pred_log)
y_true     = np.expm1(y_hold)
rmse  = np.sqrt(mean_squared_error(y_true, y_pred))
rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
print(f"Hold-out → RMSE: {rmse:.4f}, RMSLE: {rmsle:.4f}")

# 14. Re-fit su tutto il training set con CV ensemble
final_models = []
fold_predictions = []

print("\nAddestramento modelli CV ensemble...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_proc)):
    print(f"\nTraining fold {fold+1}/{n_folds}")
    X_train_fold, X_val_fold = X_proc.iloc[train_idx], X_proc.iloc[val_idx]
    y_train_fold, y_val_fold = y_log.iloc[train_idx], y_log.iloc[val_idx]

    # Modello fold con early stopping interno
    fold_model = XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        random_state=42,
        n_estimators=model_es.best_iteration,  # Numero di round ottimale trovato precedentemente
        verbosity=1,
        **best_params
    )

    # Fit con early stopping su validation set del fold
    fold_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )

    # Validazione fold
    fold_preds = fold_model.predict(X_val_fold)
    fold_preds_exp = np.expm1(fold_preds)
    fold_true_exp = np.expm1(y_val_fold)
    fold_rmsle = np.sqrt(mean_squared_log_error(fold_true_exp, fold_preds_exp))
    print(f"Fold {fold+1} RMSLE: {fold_rmsle:.4f}")

    # Salvare previsioni e modello
    fold_predictions.append((val_idx, fold_preds))
    final_models.append(fold_model)

# 15. Validazione OOF (Out-of-fold)
oof_true_indices = np.concatenate([idx for idx, _ in fold_predictions])
oof_preds = np.concatenate([pred for _, pred in sorted(fold_predictions, key=lambda x: x[0][0])])
oof_preds_exp = np.expm1(oof_preds)
oof_true_exp = np.expm1(y_log.iloc[oof_true_indices])

oof_rmsle = np.sqrt(mean_squared_log_error(oof_true_exp, oof_preds_exp))
print(f"\nOut-of-fold RMSLE: {oof_rmsle:.4f}")

# 16. Previsioni finali (media degli ensemble models)
test_preds = []
for model in final_models:
    test_pred = model.predict(X_test_proc)
    test_preds.append(test_pred)

# Media delle previsioni log-scale e trasformazione inversa
y_test_log_ensemble = np.mean(test_preds, axis=0)
y_test_ensemble = np.expm1(y_test_log_ensemble)

# 17. Feature importance media
importance_df = pd.DataFrame()
importance_df['Feature'] = X.columns
importance_df['Importance'] = np.mean([model.feature_importances_ for model in final_models], axis=0)
importance_df = importance_df.sort_values('Importance', ascending=False).reset_index(drop=True)

print("\nTop 10 feature importances:")
print(importance_df.head(10))

# 18. Creazione submission
df_test['Pred_Calories'] = y_test_ensemble
df_test[['id', 'Pred_Calories']].to_csv('submission.csv', index=False)
print("\nPreview delle predizioni:")
print(df_test[['id', 'Pred_Calories']].head())

submission = pd.read_csv('/content/drive/MyDrive/data_calories/sample_submission.csv')
submission['Calories'] = df_test['Pred_Calories']
submission.to_csv('/content/drive/MyDrive/data_calories/submission_cv_ensemble.csv', index=False)
print("\nCreato file pronto per il submit con ensemble CV!")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


Cross-Validation RMSLE Scores: [0.61804133 0.61533677 0.61718857 0.61653014 0.62105008 0.61675175
 0.61506709 0.6143131  0.61890586 0.61661916]
Mean CV RMSLE: 0.6170, Std: 0.0019
[0]	validation_0-rmse:0.96014
[1]	validation_0-rmse:0.95584
[2]	validation_0-rmse:0.95161
[3]	validation_0-rmse:0.94735
[4]	validation_0-rmse:0.94311
[5]	validation_0-rmse:0.93893
[6]	validation_0-rmse:0.93478
[7]	validation_0-rmse:0.93064
[8]	validation_0-rmse:0.92648
[9]	validation_0-rmse:0.92234
[10]	validation_0-rmse:0.91822
[11]	validation_0-rmse:0.91411
[12]	validation_0-rmse:0.91002
[13]	validation_0-rmse:0.90595
[14]	validation_0-rmse:0.90194
[15]	validation_0-rmse:0.89795
[16]	validation_0-rmse:0.89394
[17]	validation_0-rmse:0.88995
[18]	validation_0-rmse:0.88597
[19]	validation_0-rmse:0.88203
[20]	validation_0-rmse:0.87808
[21]	validation_0-rmse:0.87420
[22]	validation_0-rmse:0.87030
[23]	validation_0-rmse:0.86642
[24]	validation_0-rmse:0.86259
[25]	validation_0-rmse:0.85874
[26]	validation_0-rmse:0.

In [2]:
import time
from google.colab import runtime

print("Il runtime verrà interrotto tra 30 secondi...")
time.sleep(30)
runtime.unassign()

Il runtime verrà interrotto tra 30 secondi...
