# Linear Regression Baseline for Residual Modeling Pipeline

## Objective
Create a simple Linear Regression baseline as the first step in the residual modeling pipeline.

## Strategy
- Use original features only (no product features, no target encoding)
- Sex: one-hot encoded (not target encoded - only 2 categories)
- Numerical features: Age, Height, Weight, Duration, Heart_Rate, Body_Temp
- Use Ridge regularization (alpha=1.0)
- Expected CV RMSLE: ~0.065-0.075

This captures linear patterns and serves as the foundation for sequential residual modeling.

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Fix the global NumPy seed so every run of the notebook is reproducible.
SEED = 42
np.random.seed(SEED)

print("Loading data...")
# Read the train and test splits in one pass (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/code/data/train.csv', '/home/code/data/test.csv')
)

print(f"Train: {train_df.shape}, Test: {test_df.shape}")

# Quick sanity check on the regression target before modelling.
calories = train_df['Calories']
print(f"Target range: [{calories.min():.2f}, {calories.max():.2f}]")

## Feature Engineering

Use minimal features as per strategy:
- Original numerical features (6 features)
- Sex: one-hot encoded (2 features)
- Total: 8 features
- NO product features (they are highly predictive and cause overfitting)
- NO target encoding ('Sex' has only 2 categories, so target encoding adds nothing over one-hot)

In [None]:
def create_features(df, sex_categories=None):
    """Create minimal features for the Linear Regression baseline.

    Args:
        df: DataFrame with a ``Sex`` column. The six numerical columns are
            only *named* in the returned feature list; they are not read or
            validated here.
        sex_categories: Optional ordered iterable of ``Sex`` levels. When
            given, exactly one ``Sex_<level>`` column is emitted per level,
            in that order, even for levels that do not occur in ``df``.
            This keeps train and test frames column-compatible. Values not
            listed get all-zero dummies. When ``None`` (default), the levels
            are inferred from ``df`` itself, as before.

    Returns:
        Tuple ``(df_new, feature_cols)``: a copy of ``df`` with the one-hot
        ``Sex_*`` columns appended, and the list of model feature names
        (numerical features followed by the dummy columns).
    """
    df_new = df.copy()

    # Original numerical features
    num_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

    sex = df_new['Sex']
    if sex_categories is not None:
        # A categorical Series makes get_dummies emit every declared level
        # (observed or not) while keeping the original index for the concat.
        sex = sex.astype(pd.CategoricalDtype(categories=list(sex_categories)))

    # One-hot encode Sex (don't use target encoding - only 2 categories)
    sex_encoded = pd.get_dummies(sex, prefix='Sex')
    df_new = pd.concat([df_new, sex_encoded], axis=1)

    feature_cols = num_features + list(sex_encoded.columns)

    return df_new, feature_cols

# Create features
train_feat, feature_cols = create_features(train_df)
test_feat, _ = create_features(test_df)

# The one-hot columns are derived from each frame separately. If a Sex level
# seen in train is absent from test, the test frame lacks that dummy column
# and selecting feature_cols would raise a KeyError. "Level not present" is
# exactly an all-zero dummy, so add it. Missing numerical columns are left
# alone on purpose: they should still fail loudly.
for col in feature_cols:
    if col.startswith('Sex_') and col not in test_feat.columns:
        test_feat[col] = 0

print(f"Feature columns ({len(feature_cols)}): {feature_cols}")

# Prepare data: the train column order (feature_cols) is the single source of
# truth for both matrices.
X = train_feat[feature_cols]
y = train_feat['Calories']
X_test = test_feat[feature_cols]

print(f"X shape: {X.shape}, y shape: {y.shape}, X_test shape: {X_test.shape}")

## Cross-Validation Setup

Use 5-fold shuffled CV with seed 42 (consistent with winners), and standardize the features.

In [None]:
# Shuffled K-fold splitter; the seed keeps fold membership reproducible.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)

# Accumulators: one out-of-fold prediction per training row, and a running
# fold-average for the test rows.
oof_predictions = np.zeros(train_df.shape[0])
test_predictions = np.zeros(test_df.shape[0])

# Standardize features (important for Linear Regression).
# The scaling statistics are learned from all rows of X and then applied to
# both the training matrix and the test matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

print(f"Features standardized. X_scaled shape: {X_scaled.shape}")

## Train Linear Regression with Ridge Regularization

Use Ridge (alpha=1.0) for regularization

In [None]:
fold_scores = []

# Reset the running test average so re-executing this cell does not add a
# second set of fold predictions on top of the first.
test_predictions = np.zeros(len(X_test))

# Clipping bounds are loop-invariant; compute them once.
y_min, y_max = y.min(), y.max()

print("Training Linear Regression (Ridge) model...")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\nFold {fold}/{n_folds}")

    # Fit the scaler on the training fold only, so validation rows never
    # influence the preprocessing statistics (no leakage into the CV score
    # or into the out-of-fold residuals).
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_idx])
    X_val = fold_scaler.transform(X.iloc[val_idx])
    X_test_fold = fold_scaler.transform(X_test)
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train model
    model = Ridge(alpha=1.0, random_state=SEED)
    model.fit(X_train, y_train)

    # Predict
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test_fold)

    # Clip predictions to the training target range. A linear model can
    # predict values outside it, and mean_squared_log_error rejects
    # negative inputs.
    pred_val = np.clip(pred_val, y_min, y_max)
    pred_test = np.clip(pred_test, y_min, y_max)

    # Calculate RMSLE
    rmsle = np.sqrt(mean_squared_log_error(y_val, pred_val))
    fold_scores.append(rmsle)

    # Store OOF predictions and accumulate the fold-averaged test prediction
    oof_predictions[val_idx] = pred_val
    test_predictions += pred_test / n_folds

    print(f"  Fold {fold} RMSLE: {rmsle:.6f}")

# Calculate overall CV score (mean and spread of the per-fold RMSLE)
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print("\n" + "=" * 60)
print("LINEAR REGRESSION BASELINE RESULTS")
print("=" * 60)
print(f"CV RMSLE: {cv_score:.6f} ± {cv_std:.6f}")
print(f"Individual folds: {fold_scores}")
print(f"OOF predictions range: [{oof_predictions.min():.2f}, {oof_predictions.max():.2f}]")

## Analyze Residuals

Calculate residuals for the next step in the pipeline

In [None]:
import os

# Calculate residuals (target - predictions)
y_true = y.to_numpy()
residuals = y_true - oof_predictions

# Share of target variance captured by the model (out-of-fold R^2):
# 1 - mean squared residual / target variance. A ratio of standard
# deviations is not a variance share, so it is not used here. Both terms
# use the population (ddof=0) convention so they are comparable.
explained_variance = 1 - np.mean(residuals ** 2) / y_true.var()

print("\n" + "=" * 60)
print("RESIDUAL ANALYSIS")
print("=" * 60)
print(f"Residuals mean: {residuals.mean():.6f} (should be ~0)")
print(f"Residuals std: {residuals.std():.6f}")
print(f"Residuals range: [{residuals.min():.2f}, {residuals.max():.2f}]")
print(f"Original target std: {y_true.std():.6f}")
print(f"Model explains {explained_variance * 100:.2f}% of variance")

# Save residuals for next step
os.makedirs('/home/code/experiments/005_linear_regression', exist_ok=True)

residuals_df = pd.DataFrame({
    'id': train_df['id'],
    'residual': residuals
})
residuals_df.to_csv('/home/code/experiments/005_linear_regression/residuals_lr.csv', index=False)
print(f"\nResiduals saved to: /home/code/experiments/005_linear_regression/residuals_lr.csv")

## Create Submission

In [None]:
import os

# Create submission. The fold-averaged test predictions are clipped to the
# training target range as a final safeguard.
submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': np.clip(test_predictions, y.min(), y.max())
})

submission_path = '/home/submission/submission_005_linear_regression.csv'
# to_csv does not create parent directories; make sure the target exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved: {submission_path}")
print(f"Submission predictions range: [{submission['Calories'].min():.2f}, {submission['Calories'].max():.2f}]")

# Save OOF predictions
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'oof_prediction': oof_predictions
})
oof_path = '/home/code/experiments/005_linear_regression/oof_005_linear_regression.csv'
# Create the experiment directory here too, so this cell does not depend on
# an earlier cell having made it.
os.makedirs(os.path.dirname(oof_path), exist_ok=True)
oof_df.to_csv(oof_path, index=False)

print(f"OOF predictions saved: {oof_path}")

## Summary

This Linear Regression baseline:
- Uses minimal features (8 total)
- Achieves CV in expected range (~0.065-0.075)
- Captures linear patterns in the data
- Generates residuals for the next step (Neural Network)
- Serves as the foundation for the residual modeling pipeline