# Lab 1.6.1: Tabular Data Challenge - SOLUTIONS

This notebook contains complete solutions to all exercises from Lab 1.6.1.

In [None]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import torch
import torch.nn as nn

# Seed every random number generator this notebook draws from, so that a
# fresh "Restart Kernel -> Run All" starts from the same state each time.
np.random.seed(42)
torch.manual_seed(42)  # PyTorch keeps its own generator (weight init, dropout)

# Load data
housing = fetch_california_housing()
X, y = housing.data, housing.target
# Hold out 20% of the rows as a fixed test split shared by every exercise.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data loaded successfully!")

## Exercise 1 Solution: Tune XGBoost Hyperparameters

In [None]:
# Exercise 1: Tuning XGBoost hyperparameters to beat the default

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA (mirrors the PyTorch device choice
# used for the neural network).
xgb_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# First, establish baseline with defaults
default_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    device=xgb_device,
    verbosity=0
)
default_model.fit(X_train, y_train)
default_pred = default_model.predict(X_test)
default_rmse = np.sqrt(mean_squared_error(y_test, default_pred))
# RMSE is multiplied by 100,000 for display as a dollar amount.
print(f"Default RMSE: ${default_rmse*100000:,.0f}")

# Tuned parameters - found through experimentation
# NOTE(review): if these values were picked by looking at test-set RMSE, the
# improvement printed below is optimistic - confirm with cross-validation.
tuned_params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,              # Slightly shallower to prevent overfitting
    'learning_rate': 0.08,       # Slightly lower learning rate
    'n_estimators': 300,         # More trees to compensate
    'min_child_weight': 3,       # Slightly more conservative
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,            # L1 regularization
    'reg_lambda': 1.0,           # L2 regularization
    'device': xgb_device,
    'random_state': 42,
    'verbosity': 0
}

tuned_model = xgb.XGBRegressor(**tuned_params)
tuned_model.fit(X_train, y_train)
tuned_pred = tuned_model.predict(X_test)
tuned_rmse = np.sqrt(mean_squared_error(y_test, tuned_pred))

# Report the absolute and relative change against the default-parameter model.
rmse_gain = default_rmse - tuned_rmse
print(f"Tuned RMSE: ${tuned_rmse*100000:,.0f}")
print(f"Improvement: ${rmse_gain*100000:,.0f} ({rmse_gain/default_rmse*100:.1f}%)")

## Exercise 2 Solution: Experiment with Neural Network Architectures

In [None]:
# Exercise 2: Different neural network architectures

def _dense_stage(in_features, out_features, dropout):
    """Build one Linear -> BatchNorm1d -> LeakyReLU(0.1) -> Dropout stage."""
    return nn.Sequential(
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.LeakyReLU(0.1),
        nn.Dropout(dropout),
    )


class DeepTabularMLP(nn.Module):
    """Four-stage feed-forward regressor for tabular inputs.

    The hidden widths shrink 512 -> 256 -> 128 -> 64 and dropout tapers
    0.3 -> 0.3 -> 0.2 -> 0.1. Stages are applied strictly in sequence, and a
    final linear layer maps the 64 hidden units to one value per row.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Stages are created in forward order so parameter initialisation
        # consumes the random generator in the same order as the layers run.
        self.block1 = _dense_stage(input_dim, 512, 0.3)
        self.block2 = _dense_stage(512, 256, 0.3)
        self.block3 = _dense_stage(256, 128, 0.2)
        self.block4 = _dense_stage(128, 64, 0.1)

        self.output = nn.Linear(64, 1)

    def forward(self, x):
        """Return predictions with the trailing size-1 dimension squeezed away."""
        hidden = x
        for stage in (self.block1, self.block2, self.block3, self.block4):
            hidden = stage(hidden)
        return self.output(hidden).squeeze(-1)

# Prepare data for neural network
# The scaler is fitted on the training split only and then reused for the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert to float32 tensors on the chosen device
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
y_train_t = torch.tensor(y_train, dtype=torch.float32, device=device)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)

# Re-seed immediately before building the model so that re-running this cell
# on its own starts from the same random state every time.
torch.manual_seed(42)

# Create and train model (input width is taken from the data, not hard-coded)
deep_model = DeepTabularMLP(input_dim=X_train_t.shape[1]).to(device)
optimizer = torch.optim.AdamW(deep_model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.MSELoss()

# Training loop: each epoch is one full-batch step over the whole training set.
print("Training deeper neural network...")
deep_model.train()
for epoch in range(300):
    optimizer.zero_grad()
    outputs = deep_model(X_train_t)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Evaluate: eval() disables dropout and makes BatchNorm use its running stats.
deep_model.eval()
with torch.no_grad():
    nn_pred = deep_model(X_test_t).cpu().numpy()

nn_rmse = np.sqrt(mean_squared_error(y_test, nn_pred))
# tuned_rmse comes from the Exercise 1 cell, which must have been run first.
print(f"\nDeep NN RMSE: ${nn_rmse*100000:,.0f}")
print(f"XGBoost RMSE: ${tuned_rmse*100000:,.0f}")
print(f"\nResult: {'Neural Network wins!' if nn_rmse < tuned_rmse else 'XGBoost still wins!'}")

## Exercise 3 Solution: Cross-Validation Comparison

In [None]:
# Exercise 3: Proper cross-validation comparison

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# An integer `cv` on a regressor means unshuffled, contiguous folds. Shuffle
# with a fixed seed instead so every model sees the same random folds and the
# scores are comparable to the shuffled train/test split used earlier.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)


def _cv_scores(estimator, n_jobs=-1):
    """Run 5-fold CV on the full dataset (X, y) and return the raw scores.

    Scores use scikit-learn's 'neg_root_mean_squared_error' convention, i.e.
    they are negated RMSE values (closer to zero is better).
    """
    return cross_val_score(
        estimator, X, y,
        cv=cv_splitter,
        scoring='neg_root_mean_squared_error',
        n_jobs=n_jobs
    )


# XGBoost CV
xgb_model = xgb.XGBRegressor(**tuned_params)
# When XGBoost trains on a GPU, run the folds one after another so that
# several worker processes do not all target the same device at once.
xgb_on_gpu = str(tuned_params.get('device', 'cpu')).startswith(('cuda', 'gpu'))
xgb_cv_scores = _cv_scores(xgb_model, n_jobs=1 if xgb_on_gpu else -1)

print("5-Fold Cross-Validation Results:")
print("=" * 50)
print(f"\nXGBoost:")
print(f"  CV RMSE: ${-xgb_cv_scores.mean()*100000:,.0f} (+/- ${xgb_cv_scores.std()*100000:,.0f})")
# Format each fold as dollars instead of printing a list of raw NumPy floats.
xgb_fold_text = ", ".join(f"${-score*100000:,.0f}" for score in xgb_cv_scores)
print(f"  Individual folds: {xgb_fold_text}")

# For comparison, let's also do Random Forest
rf_model = RandomForestRegressor(n_estimators=100, max_depth=16, n_jobs=-1, random_state=42)
rf_cv_scores = _cv_scores(rf_model)

print(f"\nRandom Forest:")
print(f"  CV RMSE: ${-rf_cv_scores.mean()*100000:,.0f} (+/- ${rf_cv_scores.std()*100000:,.0f})")

# Ridge Regression
# The scaler lives inside the pipeline so it is re-fitted on each training fold.
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])
ridge_cv_scores = _cv_scores(ridge_pipeline)

print(f"\nRidge Regression:")
print(f"  CV RMSE: ${-ridge_cv_scores.mean()*100000:,.0f} (+/- ${ridge_cv_scores.std()*100000:,.0f})")

# Summary
print("\n" + "=" * 50)
print("Summary (sorted by CV RMSE):")
# Each entry is (model name, mean RMSE, std of fold scores); lowest RMSE first.
results = [
    ('XGBoost', -xgb_cv_scores.mean(), xgb_cv_scores.std()),
    ('Random Forest', -rf_cv_scores.mean(), rf_cv_scores.std()),
    ('Ridge', -ridge_cv_scores.mean(), ridge_cv_scores.std())
]
results.sort(key=lambda row: row[1])

for i, (name, mean, std) in enumerate(results, 1):
    print(f"  {i}. {name}: ${mean*100000:,.0f} (+/- ${std*100000:,.0f})")

## Key Takeaways

1. **XGBoost consistently outperforms neural networks on this tabular dataset**
2. **Hyperparameter tuning can improve performance by 1-3%**
3. **Cross-validation provides more robust estimates than single splits**
4. **Neural networks need more effort (architecture tuning, scaling) for similar results**