In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# 1) Load the competition CSVs (train first, then test)
train_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\train.csv'
test_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# 2) Funzione di feature engineering (NO target leakage: non uso mai df['Calories'])
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Reads only the columns ``Sex``, ``Age``, ``Height``, ``Weight``,
    ``Duration``, ``Heart_Rate`` and ``Body_Temp``; the target column is
    never accessed, so the function can be applied to train and test alike.

    ``Sex`` labels are matched case-insensitively and ignoring surrounding
    whitespace ('male' -> 0, 'female' -> 1). Any other label becomes NaN and
    its ``BMR`` is NaN as well, instead of silently receiving the female
    coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Sex`` encoded numerically plus the new columns.
    """
    df = df.copy()

    # Encode Sex; normalise the label first so 'Male' / ' female ' still map.
    sex_label = df['Sex'].astype(str).str.strip().str.lower()
    df['Sex'] = sex_label.map({'male': 0, 'female': 1})

    # BMI and Body Surface Area (Height is divided by 100 before squaring,
    # i.e. it is assumed to be in centimetres -- TODO confirm units).
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df['BSA'] = 0.007184 * df['Weight'] ** 0.425 * df['Height'] ** 0.725

    # Temperature offset from 37.0
    df['Delta_Temp'] = df['Body_Temp'] - 37.0

    # Heart rate per unit of weight
    df['HR_per_kg'] = df['Heart_Rate'] / df['Weight']

    # Heart_Rate x Duration and monotone transforms of Duration
    df['Heart_Beats_Total'] = df['Heart_Rate'] * df['Duration']
    df['log_Duration'] = np.log1p(df['Duration'])
    df['sqrt_Duration'] = np.sqrt(df['Duration'])

    # BMR (Harris-Benedict): one coefficient set per sex, NaN when sex is unknown.
    bmr_male = 88.362 + 13.397 * df['Weight'] + 4.799 * df['Height'] - 5.677 * df['Age']
    bmr_female = 447.593 + 9.247 * df['Weight'] + 3.098 * df['Height'] - 4.330 * df['Age']
    df['BMR'] = np.select(
        [df['Sex'] == 0, df['Sex'] == 1],
        [bmr_male, bmr_female],
        default=np.nan,
    )

    # Age-based maximum heart rate and the fraction of it that was reached
    df['MaxHR'] = 220 - df['Age']
    df['pct_MaxHR'] = df['Heart_Rate'] / df['MaxHR']

    # Heart_Rate x Duration^2
    df['HR_Dur2'] = df['Heart_Rate'] * (df['Duration'] ** 2)
    return df

# 3) Run the same feature engineering on train and test
train_fe = make_features(df_train)
test_fe  = make_features(df_test)

# 4) Target and design matrix (id and target are not features)
y = train_fe['Calories']
X = train_fe.drop(columns=['id', 'Calories'])

# 5) Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6) Exhaustive grid search over a small XGBoost grid (3-fold CV, scored by MSE)
param_grid = {
    'n_estimators':  [100, 200, 250],
    'max_depth':     [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
}
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

# best_score_ is the negated MSE, so flip the sign for display
print("Best params:", grid.best_params_)
print("Best CV MSE:", -grid.best_score_)

# 7) Score the refitted best model on the hold-out set
y_val_pred = grid.predict(X_val)
val_mse = (y_val - y_val_pred).pow(2).mean()
print("Validation MSE:", val_mse)

# 8) Predict on the test set, with columns put in the same order as X_train
X_test = test_fe.drop(columns=['id'])[X_train.columns]

df_test['Pred_Calories'] = grid.predict(X_test)
print(df_test[['id', 'Pred_Calories']].head())


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 250}
Best CV MSE: 13.086534212895705
Validation MSE: 12.880139844022148
       id  Pred_Calories
0  750000      27.780828
1  750001     107.989243
2  750002      86.624619
3  750003     124.263741
4  750004      75.725960


In [2]:
# RMSE is the square root of the validation MSE computed in the previous cell.
val_rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {val_rmse:.4f}")

Validation RMSE: 3.5889


In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# 1) Load the competition CSVs (train first, then test)
train_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\train.csv'
test_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\test.csv'
df_train, df_test = map(pd.read_csv, (train_path, test_path))

# 2) Funzione di feature engineering (no target leakage)
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Reads only the columns ``Sex``, ``Age``, ``Height``, ``Weight``,
    ``Duration``, ``Heart_Rate`` and ``Body_Temp``; the target column is
    never accessed, so the function can be applied to train and test alike.

    ``Sex`` labels are matched case-insensitively and ignoring surrounding
    whitespace ('male' -> 0, 'female' -> 1). Any other label becomes NaN and
    its ``BMR`` is NaN as well, instead of silently receiving the female
    coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Sex`` encoded numerically plus the new columns.
    """
    df = df.copy()

    # Encode Sex; normalise the label first so 'Male' / ' female ' still map.
    sex_label = df['Sex'].astype(str).str.strip().str.lower()
    df['Sex'] = sex_label.map({'male': 0, 'female': 1})

    # BMI and Body Surface Area (Height is divided by 100 before squaring,
    # i.e. it is assumed to be in centimetres -- TODO confirm units).
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df['BSA'] = 0.007184 * df['Weight'] ** 0.425 * df['Height'] ** 0.725

    # Temperature offset from 37.0 and heart rate per unit of weight
    df['Delta_Temp'] = df['Body_Temp'] - 37.0
    df['HR_per_kg'] = df['Heart_Rate'] / df['Weight']

    # Heart_Rate x Duration and monotone transforms of Duration
    df['Heart_Beats_Total'] = df['Heart_Rate'] * df['Duration']
    df['log_Duration'] = np.log1p(df['Duration'])
    df['sqrt_Duration'] = np.sqrt(df['Duration'])

    # BMR: one coefficient set per sex, NaN when the sex is unknown.
    bmr_male = 88.362 + 13.397 * df['Weight'] + 4.799 * df['Height'] - 5.677 * df['Age']
    bmr_female = 447.593 + 9.247 * df['Weight'] + 3.098 * df['Height'] - 4.330 * df['Age']
    df['BMR'] = np.select(
        [df['Sex'] == 0, df['Sex'] == 1],
        [bmr_male, bmr_female],
        default=np.nan,
    )

    # Age-based maximum heart rate and the fraction of it that was reached
    df['MaxHR'] = 220 - df['Age']
    df['pct_MaxHR'] = df['Heart_Rate'] / df['MaxHR']

    # Heart_Rate x Duration^2
    df['HR_Dur2'] = df['Heart_Rate'] * (df['Duration'] ** 2)
    return df

# 3) Run feature engineering on train and test
train_fe = make_features(df_train)
test_fe  = make_features(df_test)

# 4) Raw feature matrices (id and target removed)
X_test_raw = test_fe.drop(columns=['id'])
X_raw      = train_fe.drop(columns=['id', 'Calories'])

# 5) Log-transform every feature.
#    A column whose minimum (over train and test together) is <= 0 is first
#    shifted by (-min + 1); otherwise no shift is applied.
X      = X_raw.copy()
X_test = X_test_raw.copy()
for col in X.columns:
    min_val = min(X[col].min(), X_test[col].min())
    if min_val <= 0:
        shift = -min_val + 1
    else:
        shift = 0
    X[col]      = np.log1p(X[col] + shift)
    X_test[col] = np.log1p(X_test[col] + shift)

# 6) Hold out 20% of the rows for validation
y = train_fe['Calories']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 7) Exhaustive grid search over a small XGBoost grid (3-fold CV, scored by MSE)
param_grid = {
    'n_estimators':  [100, 200, 250],
    'max_depth':     [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
}
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

# 8) CV and hold-out errors (best_score_ is the negated MSE)
best_mse_cv  = -grid.best_score_
best_rmse_cv = np.sqrt(best_mse_cv)
y_val_pred   = grid.predict(X_val)
val_mse      = mean_squared_error(y_val, y_val_pred)
val_rmse     = np.sqrt(val_mse)

print("Best params:", grid.best_params_)
print(f"CV MSE: {best_mse_cv:.4f}   CV RMSE: {best_rmse_cv:.4f}")
print(f"Validation MSE: {val_mse:.4f}   Validation RMSE: {val_rmse:.4f}")

# 9) Predict on the test set
df_test['Pred_Calories'] = grid.predict(X_test)
print(df_test[['id', 'Pred_Calories']].head())


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 250}
CV MSE: 13.0865   CV RMSE: 3.6175
Validation MSE: 12.8801   Validation RMSE: 3.5889
       id  Pred_Calories
0  750000      27.780828
1  750001     107.989243
2  750002      86.624619
3  750003     124.263741
4  750004      75.725960


In [4]:
# Root Mean Squared Logarithmic Error (RMSLE) on the validation set.
# The regressor is unconstrained, so a prediction can come out negative;
# log1p is undefined at or below -1 and would turn the whole metric into NaN.
# Predictions are therefore floored at 0 inside the metric only
# (y_val_pred itself is left untouched).
rmsle = np.sqrt(np.mean((np.log1p(np.clip(y_val_pred, 0, None)) - np.log1p(y_val)) ** 2))
print(f"Validation RMSLE: {rmsle:.4f}")

Validation RMSLE: 0.0614


In [15]:
# Fit XGBoost with hand-picked hyperparameters (deeper trees, more rounds
# than the grid-searched model).
xgb_custom = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.05,
    max_depth=10,
    n_estimators=1000,
    random_state=42
)
xgb_custom.fit(X_train, y_train)

# Evaluation on validation set
y_val_pred_custom = xgb_custom.predict(X_val)
mse_custom = mean_squared_error(y_val, y_val_pred_custom)
rmse_custom = np.sqrt(mse_custom)
print(f"Custom model Validation MSE: {mse_custom:.4f}, RMSE: {rmse_custom:.4f}")
# RMSLE: floor predictions at 0 inside the metric, because log1p is undefined
# at or below -1 and a single such prediction would make the metric NaN.
rmsle_custom = np.sqrt(np.mean(
    (np.log1p(np.clip(y_val_pred_custom, 0, None)) - np.log1p(y_val)) ** 2
))
print(f"Custom model Validation RMSLE: {rmsle_custom:.4f}")
# Predict on test set
df_test['Pred_Calories_custom'] = xgb_custom.predict(X_test)
print(df_test[['id', 'Pred_Calories_custom']].head())

Custom model Validation MSE: 13.1928, RMSE: 3.6322
Custom model Validation RMSLE: 0.0607
       id  Pred_Calories_custom
0  750000             27.780058
1  750001            107.488564
2  750002             87.484024
3  750003            125.975304
4  750004             75.889946


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# 1) Load the competition CSVs (train first, then test)
train_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\train.csv'
test_path = r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 22 06-05\kaggle_competition\data\test.csv'
df_train, df_test = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]

# 2) Funzione di feature engineering (senza target leakage)
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Reads only the columns ``Sex``, ``Age``, ``Height``, ``Weight``,
    ``Duration``, ``Heart_Rate`` and ``Body_Temp``; the target column is
    never accessed, so the function can be applied to train and test alike.

    ``Sex`` labels are matched case-insensitively and ignoring surrounding
    whitespace ('male' -> 0, 'female' -> 1). Any other label becomes NaN and
    its ``BMR`` is NaN as well, instead of silently receiving the female
    coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Sex`` encoded numerically plus the new columns.
    """
    df = df.copy()

    # Encode Sex; normalise the label first so 'Male' / ' female ' still map.
    sex_label = df['Sex'].astype(str).str.strip().str.lower()
    df['Sex'] = sex_label.map({'male': 0, 'female': 1})

    # BMI and Body Surface Area (Height is divided by 100 before squaring,
    # i.e. it is assumed to be in centimetres -- TODO confirm units).
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df['BSA'] = 0.007184 * df['Weight'] ** 0.425 * df['Height'] ** 0.725

    # Temperature offset from 37.0 and heart rate per unit of weight
    df['Delta_Temp'] = df['Body_Temp'] - 37.0
    df['HR_per_kg'] = df['Heart_Rate'] / df['Weight']

    # Heart_Rate x Duration and monotone transforms of Duration
    df['Heart_Beats_Total'] = df['Heart_Rate'] * df['Duration']
    df['log_Duration'] = np.log1p(df['Duration'])
    df['sqrt_Duration'] = np.sqrt(df['Duration'])

    # BMR: one coefficient set per sex, NaN when the sex is unknown.
    bmr_male = 88.362 + 13.397 * df['Weight'] + 4.799 * df['Height'] - 5.677 * df['Age']
    bmr_female = 447.593 + 9.247 * df['Weight'] + 3.098 * df['Height'] - 4.330 * df['Age']
    df['BMR'] = np.select(
        [df['Sex'] == 0, df['Sex'] == 1],
        [bmr_male, bmr_female],
        default=np.nan,
    )

    # Age-based maximum heart rate and the fraction of it that was reached
    df['MaxHR'] = 220 - df['Age']
    df['pct_MaxHR'] = df['Heart_Rate'] / df['MaxHR']

    # Heart_Rate x Duration^2
    df['HR_Dur2'] = df['Heart_Rate'] * (df['Duration'] ** 2)
    return df

# 3) Run feature engineering on train and test
train_fe = make_features(df_train)
test_fe  = make_features(df_test)

# 4) Raw feature matrices and the target on its original scale
X_raw      = train_fe.drop(columns=['id', 'Calories'])
y_orig     = train_fe['Calories']
X_test_raw = test_fe.drop(columns=['id'])

# 5) Log-transform every feature.
#    A column whose minimum (over train and test together) is <= 0 is first
#    shifted by (-min + 1); otherwise no shift is applied.
X      = X_raw.copy()
X_test = X_test_raw.copy()
for col in X.columns:
    min_val = min(X[col].min(), X_test[col].min())
    shift   = -min_val + 1 if min_val <= 0 else 0
    X[col]      = np.log1p(X[col] + shift)
    X_test[col] = np.log1p(X_test[col] + shift)

# 6) Train/validation split (the original-scale target is kept for the metrics)
X_train, X_val, y_train_orig, y_val_orig = train_test_split(
    X, y_orig, test_size=0.2, random_state=42
)
# Train on log1p(target): squared error in this space is the squared log error
# on the original scale, so minimising MSE here minimises RMSLE.
y_train = np.log1p(y_train_orig)
y_val   = np.log1p(y_val_orig)

# 7) Hyperparameter search space
param_dist = {
    'learning_rate':    [0.01, 0.05, 0.1],
    'max_depth':        [5, 7, 10, 12],
    'min_child_weight': [1, 5, 10],
    'gamma':            [0, 0.1, 0.2, 0.5],
    'subsample':        [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha':        [0, 0.01, 0.1, 1],
    'reg_lambda':       [1, 2, 5],
    'booster':          ['gbtree', 'dart']
}

# 8) RandomizedSearchCV with early stopping.
#    eval_metric / early_stopping_rounds are estimator parameters: passing them
#    to fit() was deprecated in XGBoost 1.6 and is rejected by later releases.
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=1000,
    verbosity=0,
    eval_metric='rmse',
    early_stopping_rounds=50
)
# The target is already log1p-transformed, so the score must be plain MSE.
# 'neg_mean_squared_log_error' would take the log a second time and the
# number reported as "CV RMSLE" below would not be an RMSLE.
search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)
# NOTE(review): the hold-out set also drives early stopping in every fit, so the
# validation scores printed below are somewhat optimistic.
search.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

best = search.best_estimator_

# 9) CV and hold-out metrics.
#    Predictions are mapped back with expm1 and floored at 0:
#    mean_squared_log_error raises on negative values.
cv_rmsle   = np.sqrt(-search.best_score_)
y_val_log  = best.predict(X_val)
y_val_pred = np.clip(np.expm1(y_val_log), 0, None)
val_rmse   = np.sqrt(mean_squared_error(y_val_orig, y_val_pred))
val_rmsle  = np.sqrt(mean_squared_log_error(y_val_orig, y_val_pred))

print("Best params:", search.best_params_)
print(f"CV RMSLE: {cv_rmsle:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}, RMSLE: {val_rmsle:.4f}")

# 10) Final test-set predictions (log inverted, floored at 0 like the validation ones)
df_test['Pred_Calories'] = np.clip(np.expm1(best.predict(X_test)), 0, None)
print(df_test[['id', 'Pred_Calories']].head())



In [6]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def elimina_variabili_vif_pvalue(X_train, y_train, vif_threshold=10.0, pvalue_threshold=0.05):
    """
    Iteratively drop features that are both collinear and non-significant.

    At each step an OLS model with an intercept is fitted on the remaining
    features. A feature is a removal candidate only when its VIF is above
    ``vif_threshold`` AND its OLS p-value is above ``pvalue_threshold``. The
    candidate with the highest VIF is dropped, then VIFs and p-values are
    recomputed. The loop ends when no candidate is left. Progress is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; it is copied, never modified.
    y_train : array-like
        Target passed to ``sm.OLS``.
    vif_threshold : float, default 10.0
        A feature must exceed this VIF to be a removal candidate.
    pvalue_threshold : float, default 0.05
        A feature must exceed this p-value to be a removal candidate.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_train`` restricted to the surviving columns.
    """
    # Work on a copy so the caller's frame is never touched.
    X_current = X_train.copy()

    while True:
        # has_constant='add' guarantees a 'const' column. The default ('skip')
        # omits it when a feature is already constant, which would make the
        # .drop('const') below raise.
        X_const = sm.add_constant(X_current, has_constant='add')

        # OLS p-values, intercept excluded.
        model = sm.OLS(y_train, X_const).fit()
        pvalues = model.pvalues.drop('const')

        # variance_inflation_factor regresses one column on all the others and
        # adds no intercept itself, so the VIF must be computed on the design
        # matrix that already contains the constant. add_constant prepends it,
        # hence the features sit at positions 1..k.
        exog = X_const.values
        vif_values = [variance_inflation_factor(exog, i) for i in range(1, exog.shape[1])]

        # One row per feature; pvalues follows the column order of X_current.
        stats = pd.DataFrame({
            "Feature": X_current.columns,
            "VIF": vif_values,
            "p-value": pvalues.values,
        })

        # Candidates must fail both tests: high VIF and high p-value.
        candidates = stats[(stats["VIF"] > vif_threshold) & (stats["p-value"] > pvalue_threshold)]

        if candidates.empty:
            print("\nNessuna variabile da eliminare. Selezione completata.")
            break

        # Drop the candidate with the highest VIF.
        worst = candidates.sort_values(by="VIF", ascending=False).iloc[0]
        worst_feature = worst["Feature"]
        print(f"Rimuovo '{worst_feature}' con VIF = {worst['VIF']:.2f} "
              f"e p-value = {worst['p-value']:.4f}")

        X_current = X_current.drop(columns=[worst_feature])

    print("\nFeature finali selezionate:")
    print(X_current.columns.tolist())

    return X_current

# Run the VIF / p-value selection on the training split
X_current = elimina_variabili_vif_pvalue(
    X_train,
    y_train,
    vif_threshold=10.0,
    pvalue_threshold=0.05,
)


Nessuna variabile da eliminare. Selezione completata.

Feature finali selezionate:
['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI', 'BSA', 'Delta_Temp', 'HR_per_kg', 'Heart_Beats_Total', 'log_Duration', 'sqrt_Duration', 'BMR', 'MaxHR', 'pct_MaxHR', 'HR_Dur2']


In [7]:
# Per-column IQR fences computed on X_train (1.5 * IQR beyond the quartiles)
Q1, Q3 = (X_train.quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Summary table of the fences
iqr_limits = pd.DataFrame(dict(Q1=Q1, Q3=Q3, IQR=IQR, lower=lower_limit, upper=upper_limit))
print(iqr_limits)

# Keep only the rows that have no value outside the fences in any column,
# and apply the same row selection to y_train
mask = ~(X_train.lt(lower_limit) | X_train.gt(upper_limit)).any(axis=1)
X_train_iqr = X_train.loc[mask]
y_train_iqr = y_train[mask]

print(f"Record prima del filtro: {X_train.shape[0]}")
print(f"Record dopo filtro IQR:  {X_train_iqr.shape[0]}")

                         Q1         Q3       IQR     lower      upper
Sex                0.693147   1.098612  0.405465  0.084950   1.706810
Age                3.367296   3.970292  0.602996  2.462802   4.874786
Height             5.105945   5.225747  0.119801  4.926244   5.405448
Weight             4.158883   4.477337  0.318454  3.681202   4.955017
Duration           2.197225   3.178054  0.980829  0.725981   4.649298
Heart_Rate         4.488636   4.644391  0.155755  4.255005   4.878023
Body_Temp          3.703768   3.730501  0.026733  3.663668   3.770601
BMI                3.188624   3.276570  0.087946  3.056704   3.408489
BSA                0.988039   1.132696  0.144657  0.771054   1.349681
Delta_Temp         1.280934   1.547563  0.266629  0.880991   1.947506
HR_per_kg          0.739428   0.919793  0.180365  0.468880   1.190342
Heart_Beats_Total  6.591674   7.751045  1.159371  4.852617   9.490102
log_Duration       1.162283   1.429846  0.267562  0.760939   1.831189
sqrt_Duration      1

In [9]:
# Fit XGBoost on the IQR-filtered training data, reusing the grid search's best params
xgb_iqr = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **grid.best_params_
)
xgb_iqr.fit(X_train_iqr, y_train_iqr)

# Evaluation on the (unfiltered) validation set
y_val_pred_iqr = xgb_iqr.predict(X_val[X_train_iqr.columns])
mse_iqr = mean_squared_error(y_val, y_val_pred_iqr)
rmse_iqr = np.sqrt(mse_iqr)
print(f"IQR model Validation MSE: {mse_iqr:.4f}, RMSE: {rmse_iqr:.4f}")
# RMSLE: floor predictions at 0 inside the metric, because log1p is undefined
# at or below -1 and a single such prediction would make the metric NaN.
rmsle_iqr = np.sqrt(np.mean(
    (np.log1p(np.clip(y_val_pred_iqr, 0, None)) - np.log1p(y_val)) ** 2
))
print(f"IQR model Validation RMSLE: {rmsle_iqr:.4f}")
# Prediction on the test set
df_test['Pred_Calories_iqr'] = xgb_iqr.predict(X_test[X_train_iqr.columns])
print(df_test[['id', 'Pred_Calories_iqr']].head())

IQR model Validation MSE: 14.1958, RMSE: 3.7677
IQR model Validation RMSLE: 0.1390
       id  Pred_Calories_iqr
0  750000          27.572760
1  750001         108.658775
2  750002          86.211700
3  750003         124.607124
4  750004          76.455338


In [10]:
from sklearn.ensemble import RandomForestRegressor

# 1) Train a 100-tree random forest on the training split
rf = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

# 2) Validation RMSLE: root of the mean squared difference of the log1p values
y_val_pred_rf = rf.predict(X_val)
rmsle_rf = np.sqrt(np.mean(np.square(np.log1p(y_val_pred_rf) - np.log1p(y_val))))
print(f"Random Forest Validation RMSLE: {rmsle_rf:.4f}")

# 3) Test-set predictions, stored as a new column of df_test
df_test['Pred_Calories_rf'] = rf.predict(X_test)
print(df_test[['id', 'Pred_Calories_rf']].head())

Random Forest Validation RMSLE: 0.0625
       id  Pred_Calories_rf
0  750000            27.550
1  750001           108.520
2  750002            88.350
3  750003           124.850
4  750004            75.548
