# XGBoost on NN Residuals - Three-Stage Pipeline

This notebook implements the final stage of the residual modeling pipeline:
1. Linear Regression (exp_005) - captures linear patterns
2. Neural Network on LR residuals (exp_006) - captures non-linear patterns  
3. XGBoost on NN residuals (exp_007) - captures remaining patterns

Expected to improve upon the 0.02047 baseline from direct XGBoost modeling.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

## Load Data

In [2]:
# Load training data
train_df = pd.read_csv('/home/nonroot/snapshots/playground-series-s5e5/21025419367/code/data/train.csv')
test_df = pd.read_csv('/home/nonroot/snapshots/playground-series-s5e5/21025419367/code/data/test.csv')

# Load residuals from NN stage (exp_006)
residuals_nn = pd.read_csv('/home/code/experiments/006_neural_network_residuals/residuals_after_nn.csv')
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Residuals from NN shape: {residuals_nn.shape}")
print(f"\nResiduals statistics:")
print(residuals_nn['residual'].describe())

Training data shape: (8000, 9)
Test data shape: (2000, 9)
Residuals from NN shape: (8000, 2)

Residuals statistics:
count    8000.000000
mean       -0.050198
std        20.603560
min       -81.094214
25%       -13.991381
50%         0.005975
75%        13.958964
max        74.479004
Name: residual, dtype: float64


## Feature Engineering

Create features for XGBoost stage. Based on winning solutions analysis:
- Log1p transforms of numeric features
- Product features (pairwise interactions)
- Binned features for categorical handling
- Groupby z-score features

In [3]:
def engineer_features(df):
    """Engineer features for XGBoost on residuals"""
    df = df.copy()
    
    # Original numeric features
    numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    
    # 1. Log1p transforms
    for col in numeric_features:
        df[f'{col}_log1p'] = np.log1p(df[col])
    
    # 2. Product features (pairwise interactions) - these are valuable but caused overfitting in direct modeling
    # Now applied to residuals where they may capture remaining patterns
    df['product_Weight_Duration'] = df['Weight'] * df['Duration']
    df['product_Duration_Heart_Rate'] = df['Duration'] * df['Heart_Rate']
    df['product_Height_Weight'] = df['Height'] * df['Weight']
    df['product_Age_Weight'] = df['Age'] * df['Weight']
    df['product_Duration_Body_Temp'] = df['Duration'] * df['Body_Temp']
    
    # 3. Ratio features
    df['ratio_Weight_Height'] = df['Weight'] / (df['Height'] + 1e-6)
    df['ratio_BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2 + 1e-6)
    df['ratio_Duration_Weight'] = df['Duration'] / (df['Weight'] + 1e-6)
    df['ratio_Heart_Rate_Age'] = df['Heart_Rate'] / (df['Age'] + 1e-6)
    
    # 4. Sum features
    df['sum_Weight_Duration'] = df['Weight'] + df['Duration']
    df['sum_Height_Weight'] = df['Height'] + df['Weight']
    df['sum_Age_Weight'] = df['Age'] + df['Weight']
    
    # 5. Difference features
    df['diff_Height_Weight'] = df['Height'] - df['Weight']
    df['diff_Age_Weight'] = df['Age'] - df['Weight']
    df['diff_Duration_Heart_Rate'] = df['Duration'] - df['Heart_Rate']
    
    # 6. Binned features (equal-width bins)
    for col in numeric_features:
        df[f'{col}_binned'] = pd.cut(df[col], bins=9, labels=False)
    
    # 7. One-hot encode Sex
    if 'Sex' in df.columns:
        sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex')
        df = pd.concat([df, sex_dummies], axis=1)
    
    return df

# Apply feature engineering to train and test data
print("Engineering features for training data...")
train_features = engineer_features(train_df)
print(f"Train features shape: {train_features.shape}")

print("Engineering features for test data...")
test_features = engineer_features(test_df)
print(f"Test features shape: {test_features.shape}")

# Define feature columns (exclude id and target)
feature_cols = [col for col in train_features.columns if col not in ['id', 'Calories']]
print(f"Number of features: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # Show first 10

Engineering features for training data...
Train features shape: (8000, 38)
Engineering features for test data...
Test features shape: (2000, 38)
Number of features: 36
Feature columns: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Age_log1p', 'Height_log1p', 'Weight_log1p']...


## Prepare Data for XGBoost

Target: residuals from NN stage (remaining patterns after Linear+NN)
Features: engineered features above

In [None]:
# Target is residuals from NN stage
y_residuals = residuals_nn['residual'].values

# Features for XGBoost
X_train = train_features[feature_cols]
X_test = test_features[feature_cols]

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_residuals shape: {y_residuals.shape}")
print(f"y_residuals range: [{y_residuals.min():.2f}, {y_residuals.max():.2f}]")

## Cross-Validation Setup

Use 5-fold CV with same seed as previous stages to ensure consistency.

In [None]:
# 5-fold CV
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Initialize arrays for OOF predictions
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

# Store fold scores
fold_scores = []

print(f"Starting {n_splits}-fold CV...")

## Load OOF Predictions from Previous Stages

Need Linear Regression and Neural Network OOF predictions to combine with XGBoost predictions.

In [None]:
# Load OOF predictions from Linear Regression (exp_005)
lr_oof = pd.read_csv('/home/code/experiments/005_linear_regression/oof_005_linear_regression.csv')
lr_predictions = lr_oof['Calories_pred'].values

# Load OOF predictions from Neural Network (exp_006)  
nn_oof = pd.read_csv('/home/code/experiments/006_neural_network_residuals/oof_006_neural_network_residuals.csv')
nn_residual_predictions = nn_oof['residual_pred'].values

print(f"LR OOF shape: {lr_predictions.shape}")
print(f"NN residual OOF shape: {nn_residual_predictions.shape}")
print(f"LR predictions range: [{lr_predictions.min():.2f}, {lr_predictions.max():.2f}]")
print(f"NN residual predictions range: [{nn_residual_predictions.min():.2f}, {nn_residual_predictions.max():.2f}]")

## Train XGBoost on Residuals

Train XGBoost to capture remaining patterns in residuals after Linear+NN.

In [None]:
# XGBoost parameters (conservative, as recommended in strategy)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 5,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 800,
    'random_state': SEED,
    'n_jobs': 4
}

print("Training XGBoost on residuals...")
print(f"Parameters: {params}")

fold = 1
for train_idx, val_idx in kf.split(X_train):
    print(f"\nFold {fold}/{n_splits}")
    
    # Split data
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_residuals[train_idx], y_residuals[val_idx]
    
    # Train XGBoost model
    model = xgb.XGBRegressor(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predict on validation set (residuals)
    val_residual_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_residual_pred
    
    # Predict on test set
    test_fold_pred = model.predict(X_test)
    test_predictions += test_fold_pred / n_splits
    
    # Calculate fold score (RMSLE on combined predictions)
    # Combined prediction = LR + NN residual + XGB residual
    val_combined_pred = (lr_predictions[val_idx] + 
                        nn_residual_predictions[val_idx] + 
                        val_residual_pred)
    
    # Clip predictions to avoid log(negative)
    val_combined_pred = np.clip(val_combined_pred, 1e-6, None)
    
    fold_score = np.sqrt(mean_squared_log_error(train_df['Calories'].iloc[val_idx], val_combined_pred))
    fold_scores.append(fold_score)
    
    print(f"  Fold {fold} RMSLE: {fold_score:.6f}")
    
    fold += 1

print(f"\n{'='*50}")
print(f"CV RMSLE: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
print(f"Fold scores: {fold_scores}")

## Analyze Results

Compare performance to baseline and previous stages.

In [None]:
# Calculate combined OOF predictions
oof_combined = lr_predictions + nn_residual_predictions + oof_predictions

# Clip to avoid log issues
oof_combined = np.clip(oof_combined, 1e-6, None)

# Overall CV score
cv_score = np.sqrt(mean_squared_log_error(train_df['Calories'], oof_combined))

print(f"{'='*50}")
print(f"FINAL PIPELINE RESULTS")
print(f"{'='*50}")
print(f"Linear Regression only: 0.208762 (from exp_005)")
print(f"Linear + NN:             0.201961 (from exp_006)")
print(f"Linear + NN + XGBoost:   {cv_score:.6f}")
print(f"Improvement from NN:     {0.208762 - 0.201961:.6f}")
print(f"Improvement from XGB:    {0.201961 - cv_score:.6f}")
print(f"Total improvement:       {0.208762 - cv_score:.6f}")
print(f"\nBaseline XGBoost:        0.020470 (from exp_000)")
print(f"Pipeline vs Baseline:    {cv_score - 0.020470:.6f}")

# Check if we beat the baseline
if cv_score < 0.02047:
    print(f"\n✅ SUCCESS: Pipeline BEATS baseline by {0.02047 - cv_score:.6f}")
else:
    print(f"\n❌ Pipeline does NOT beat baseline (difference: {cv_score - 0.02047:.6f})")

## Save Predictions

In [None]:
# Save OOF predictions
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'Calories_actual': train_df['Calories'],
    'Calories_pred': oof_combined,
    'lr_pred': lr_predictions,
    'nn_residual_pred': nn_residual_predictions,
    'xgb_residual_pred': oof_predictions
})

oof_df.to_csv('/home/code/experiments/007_xgboost_on_nn_residuals/oof_007_xgboost_residuals.csv', index=False)
print(f"OOF predictions saved. Shape: {oof_df.shape}")

# Save test predictions (combined pipeline)
test_combined_pred = (nn_oof['Calories_pred'].iloc[:len(test_df)].values +  # Use NN test predictions
                      test_predictions)

# For the linear component, we need to generate test predictions
# Load linear regression model and predict on test
from sklearn.linear_model import Ridge

# Re-train linear model on full data for test predictions
linear_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Sex_M', 'Sex_F']
X_train_linear = train_features[linear_features]
X_test_linear = test_features[linear_features]

lr_model = Ridge(alpha=1.0, random_state=SEED)
lr_model.fit(X_train_linear, train_df['Calories'])
lr_test_pred = lr_model.predict(X_test_linear)

# Final test predictions
test_final_pred = lr_test_pred + test_predictions

# Clip predictions
test_final_pred = np.clip(test_final_pred, train_df['Calories'].min(), train_df['Calories'].max())

# Save test predictions
test_pred_df = pd.DataFrame({
    'id': test_df['id'],
    'Calories_pred': test_final_pred
})

test_pred_df.to_csv('/home/code/experiments/007_xgboost_on_nn_residuals/test_007_xgboost_residuals.csv', index=False)
print(f"Test predictions saved. Shape: {test_pred_df.shape}")
print(f"Test predictions range: [{test_final_pred.min():.2f}, {test_final_pred.max():.2f}]")

## Generate Submission

In [None]:
# Create submission file
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_final_pred
})

# Ensure proper format and clipping
submission_df['Calories'] = submission_df['Calories'].clip(
    lower=train_df['Calories'].min(), 
    upper=train_df['Calories'].max()
)

# Save submission
submission_path = '/home/submission/submission_007_xgboost_residuals.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"\nSubmission statistics:")
print(submission_df['Calories'].describe())

# Show first few rows
print(f"\nFirst 5 rows:")
print(submission_df.head())