# Loop 6 Analysis: Residual Structure Verification and Linear Component Optimization

## Objectives
1. Verify residual structure from exp_005 (Linear Regression baseline)
2. Optimize Linear Regression hyperparameters (alpha sweep)
3. Analyze feature importance for linear component
4. Create direct Neural Network baseline for comparison
5. Prepare for Neural Network on residuals

## Key Questions (from Evaluator)
- Do residuals contain predictable structure? (Critical for pipeline viability)
- Is alpha=1.0 optimal for Ridge regression?
- Which features are most important for linear component?
- Does direct modeling beat residual modeling?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy's global RNG; SEED is also the value handed to
# random_state arguments elsewhere in this notebook.
SEED = 42
np.random.seed(SEED)

# Inputs: the train/test tables plus the residual table stored under the
# 005_linear_regression experiment directory.
print("Loading data...")
train_df, test_df, residuals_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/code/data/train.csv',
        '/home/code/data/test.csv',
        '/home/code/experiments/005_linear_regression/residuals_lr.csv',
    )
)

# Quick sanity report on what was loaded.
calories = train_df['Calories']
print(f"Train: {train_df.shape}, Test: {test_df.shape}")
print(f"Residuals: {residuals_df.shape}")
print(f"Target range: [{calories.min():.2f}, {calories.max():.2f}]")

## 1. Residual Structure Analysis

Analyze whether residuals contain predictable patterns or are just noise.

In [None]:
# Sanity check: does a very shallow tree, scored out-of-fold, predict the
# residuals with an RMSE below the residuals' own standard deviation?
print("\n" + "=" * 60, "TESTING RESIDUAL PREDICTABILITY", "=" * 60, sep="\n")

# Work on a copy so the loaded residual table stays untouched
analysis_df = residuals_df.copy()
features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# Inputs are the raw feature columns; the target is the 'residual' column
X = analysis_df[features].copy()
y_residual = analysis_df['residual'].copy()

residual_std = y_residual.std()
print(f"Residuals shape: {y_residual.shape}")
print(f"Residuals range: [{y_residual.min():.2f}, {y_residual.max():.2f}]")
print(f"Residuals std: {residual_std:.6f}")

# Standardize the inputs (fit on every row, before the folds are drawn)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5-fold CV of a depth-3 tree, which can pick up non-linear patterns
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
dt_scores = []

for fold, (fit_rows, holdout_rows) in enumerate(kf.split(X_scaled), 1):
    tree = DecisionTreeRegressor(max_depth=3, random_state=SEED)
    tree.fit(X_scaled[fit_rows], y_residual.iloc[fit_rows])

    # Residuals can be negative, so score with plain RMSE rather than RMSLE
    holdout_error = y_residual.iloc[holdout_rows] - tree.predict(X_scaled[holdout_rows])
    rmse = np.sqrt(np.mean(holdout_error ** 2))
    dt_scores.append(rmse)

    print(f"Fold {fold}: RMSE = {rmse:.6f}")

# An RMSE below the residual std is read as "there is structure to learn"
dt_cv = np.mean(dt_scores)
print(f"\nDecision Tree CV RMSE: {dt_cv:.6f}")
print(f"Residual std: {residual_std:.6f}")
verdict = (
    "✓ RESIDUALS CONTAIN PREDICTABLE STRUCTURE → Pipeline viable!"
    if dt_cv < residual_std
    else "⚠️  Residuals may be mostly noise"
)
print(verdict)

In [None]:
# Two diagnostic panels: residual histogram (left) and residuals against the
# out-of-fold predictions (right).
fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(analysis_df['residual'], kde=True, bins=50, ax=ax_hist)
ax_hist.set_title('Residual Distribution')
ax_hist.set_xlabel('Residual')
ax_hist.set_ylabel('Count')

sns.scatterplot(x='oof_prediction', y='residual', data=analysis_df, alpha=0.5, ax=ax_scatter)
ax_scatter.set_title('Residuals vs Predictions')
ax_scatter.set_xlabel('OOF Prediction')
ax_scatter.set_ylabel('Residual')
# Zero line: points should scatter evenly around it if there is no bias
ax_scatter.axhline(y=0, color='r', linestyle='--')

fig.tight_layout()
fig.savefig('/home/code/exploration/residual_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# Pearson correlation of the residual with each raw feature.
# NOTE(review): if the baseline is linear in these same inputs, near-zero
# correlations are expected and do not by themselves show the residuals are
# noise; treat this as a weak check next to the tree-based test.
print("\n" + "=" * 60, "RESIDUAL CORRELATIONS WITH ORIGINAL FEATURES", "=" * 60, sep="\n")

features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
correlations = [analysis_df['residual'].corr(analysis_df[name]) for name in features]

for name, corr in zip(features, correlations):
    print(f"{name:12s}: {corr:7.4f}")

# Flag features whose |correlation| exceeds the 0.1 threshold
significant_corrs = [abs(c) for c in correlations if abs(c) > 0.1]
if significant_corrs:
    print(f"\n✓ Found {len(significant_corrs)} features with |correlation| > 0.1")
    print("✓ Residuals contain structure that can be predicted!")
else:
    print("\n⚠️  No strong correlations found - residuals may be mostly noise")

In [None]:
# Verify residual predictability: cross-validate a shallow decision tree on
# the residuals and compare its out-of-fold RMSE with the residual std.
print("\n" + "=" * 60)
print("TESTING RESIDUAL PREDICTABILITY")
print("=" * 60)

# Use original features to predict residuals
X = analysis_df[features].copy()
y_residual = analysis_df['residual'].copy()

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Test with Decision Tree (can capture non-linear patterns)
dt_scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled), 1):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y_residual.iloc[train_idx], y_residual.iloc[val_idx]

    # Decision Tree (depth=3 - simple model)
    dt = DecisionTreeRegressor(max_depth=3, random_state=SEED)
    dt.fit(X_train, y_train)
    dt_pred = dt.predict(X_val)

    # Residuals are signed, so RMSLE is undefined here: sklearn's
    # mean_squared_log_error raises ValueError on negative values. Plain
    # RMSE is also in the same units as the residual std it is compared to.
    dt_rmse = np.sqrt(np.mean((y_val - dt_pred) ** 2))
    dt_scores.append(dt_rmse)

    print(f"Fold {fold}: Decision Tree RMSE = {dt_rmse:.6f}")

dt_cv = np.mean(dt_scores)
residual_std = y_residual.std()
print(f"\nDecision Tree CV RMSE on residuals: {dt_cv:.6f} ± {np.std(dt_scores):.6f}")

# The residual std is used as the reference error level; a tree RMSE below
# it is read as evidence of learnable structure.
if dt_cv < residual_std:
    print("✓ Residuals are predictable! Decision Tree can reduce residual error")
    print(f"✓ Potential improvement: {residual_std:.6f} → {dt_cv:.6f}")
else:
    print("⚠️  Decision Tree cannot predict residuals better than mean")
    print("⚠️  Residuals may be mostly noise")

## 2. Linear Regression Hyperparameter Optimization

Sweep the Ridge regularization strength (`alpha`) with 5-fold cross-validation and compare the resulting RMSLE scores.

In [None]:
# Section banner for the alpha sweep output
print("\n" + "=" * 60, "RIDGE REGRESSION ALPHA SWEEP", "=" * 60, sep="\n")

# Define feature creation function
def create_lr_features(df):
    """Append one-hot ``Sex`` columns to a copy of ``df`` and name the model features.

    Args:
        df: DataFrame containing a ``Sex`` column. The six numeric feature
            names are only listed in the result, not read here.

    Returns:
        Tuple ``(frame, feature_cols)``. ``frame`` is a new DataFrame made of
        ``df``'s columns followed by the ``Sex_*`` indicator columns;
        ``feature_cols`` is the six numeric feature names followed by the
        indicator column names.
    """
    numeric_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex')
    # concat returns a new frame, so the caller's DataFrame is left as it was
    frame = pd.concat([df, sex_dummies], axis=1)
    return frame, [*numeric_cols, *sex_dummies.columns]

# Build the design matrix and target from the training table
train_feat, feature_cols = create_lr_features(train_df)
X = train_feat[feature_cols]
y = train_feat['Calories']

# Standardize features.
# NOTE(review): the scaler is fit on all rows before the folds are drawn, so
# validation rows influence the scaling statistics; the effect is probably
# small, but fitting inside each fold would be the leak-free protocol.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate regularization strengths; each result is (alpha, mean CV, per-fold scores)
alphas = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]
alpha_results = []

print("Testing alpha values:", alphas)
print("-" * 60)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Clip bounds do not change across folds or alphas, so compute them once.
# Clipping keeps predictions inside the observed target range before RMSLE.
y_lo, y_hi = y.min(), y.max()

for alpha in alphas:
    fold_scores = []

    # Same seeded splitter for every alpha, so all alphas see identical folds
    for fit_rows, holdout_rows in kf.split(X_scaled):
        ridge = Ridge(alpha=alpha, random_state=SEED)
        ridge.fit(X_scaled[fit_rows], y.iloc[fit_rows])

        clipped = np.clip(ridge.predict(X_scaled[holdout_rows]), y_lo, y_hi)
        fold_scores.append(np.sqrt(mean_squared_log_error(y.iloc[holdout_rows], clipped)))

    cv_score = np.mean(fold_scores)
    alpha_results.append((alpha, cv_score, fold_scores))
    print(f"Alpha {alpha:5.1f}: CV = {cv_score:.6f}")

# Pick the alpha with the lowest mean CV RMSLE
best_alpha, best_score, _ = min(alpha_results, key=lambda result: result[1])
reference_cv = next(score for a, score, _ in alpha_results if a == 1.0)
print("\n" + "-" * 60)
print(f"Best alpha: {best_alpha}")
print(f"Best CV: {best_score:.6f}")
print(f"Improvement over alpha=1.0: {reference_cv - best_score:.6f}")

## 3. Feature Importance Analysis

Analyze which features are most important for the linear component.

In [None]:
# Rank features by the size of their Ridge coefficient, then check whether
# dropping the weakest one changes the CV score.
print("\n" + "=" * 60, "FEATURE IMPORTANCE ANALYSIS", "=" * 60, sep="\n")

# Refit on every row with the alpha chosen by the sweep
best_model = Ridge(alpha=best_alpha, random_state=SEED)
best_model.fit(X_scaled, y)

# Inputs were standardized, so coefficient magnitudes are on a common scale
coefficients = best_model.coef_
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print("Feature coefficients (sorted by absolute value):")
print(feature_importance.to_string(index=False))

# Side-by-side bar charts: signed coefficients and their absolute values
plt.figure(figsize=(12, 6))
top_features = feature_importance.head(8)
panels = [
    ('coefficient', 'Feature Coefficients (Best Ridge)', 'Coefficient Value'),
    ('abs_coefficient', 'Feature Importance (Absolute Coefficients)', 'Absolute Coefficient Value'),
]
for position, (column, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(data=top_features, x=column, y='feature')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Feature')

plt.tight_layout()
plt.savefig('/home/code/exploration/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

# The table is sorted descending, so the tail holds the weakest features
print("\nLeast important features:")
print(feature_importance.tail(3).to_string(index=False))

# Ablation: drop the single weakest feature and re-run the same CV
least_important = feature_importance.iloc[-1]['feature']
print(f"\nTesting model without {least_important}...")

features_reduced = [name for name in feature_cols if name != least_important]
X_reduced = train_feat[features_reduced]
scaler_reduced = StandardScaler()
X_reduced_scaled = scaler_reduced.fit_transform(X_reduced)

y_lo, y_hi = y.min(), y.max()
fold_scores_reduced = []
for fit_rows, holdout_rows in kf.split(X_reduced_scaled):
    reduced_model = Ridge(alpha=best_alpha, random_state=SEED)
    reduced_model.fit(X_reduced_scaled[fit_rows], y.iloc[fit_rows])

    # Same clipping and metric as the alpha sweep so the scores are comparable
    clipped = np.clip(reduced_model.predict(X_reduced_scaled[holdout_rows]), y_lo, y_hi)
    fold_scores_reduced.append(np.sqrt(mean_squared_log_error(y.iloc[holdout_rows], clipped)))

cv_reduced = np.mean(fold_scores_reduced)
print(f"CV without {least_important}: {cv_reduced:.6f}")
print(f"Difference: {cv_reduced - best_score:.6f}")

## 4. Direct Neural Network Baseline

Train a neural network directly on the original target for comparison.

In [None]:
# Baseline: fit an MLP straight on the target (no residual stage) and compare
# its 5-fold RMSLE with the recorded linear-regression score.
print("\n" + "=" * 60, "DIRECT NEURAL NETWORK BASELINE", "=" * 60, sep="\n")

# Linear-regression CV score used as the yardstick below (hard-coded value)
LR_BASELINE_CV = 0.208762

# Same features and target as the Ridge experiments
train_feat, feature_cols = create_lr_features(train_df)
X = train_feat[feature_cols]
y = train_feat['Calories']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Small two-layer network; only the seed varies from fold to fold
mlp_params = dict(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    alpha=0.001,  # L2 regularization
    batch_size=32,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=10,
)

mlp_scores = []
oof_mlp = np.zeros(len(train_df))  # out-of-fold predictions, filled per fold

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
y_lo, y_hi = y.min(), y.max()

for fold, (fit_rows, holdout_rows) in enumerate(kf.split(X_scaled), 1):
    print(f"\nFold {fold}/5")

    mlp = MLPRegressor(**mlp_params, random_state=SEED + fold)
    mlp.fit(X_scaled[fit_rows], y.iloc[fit_rows])

    # Clip into the observed target range before computing RMSLE
    pred_val = np.clip(mlp.predict(X_scaled[holdout_rows]), y_lo, y_hi)

    rmsle = np.sqrt(mean_squared_log_error(y.iloc[holdout_rows], pred_val))
    mlp_scores.append(rmsle)
    oof_mlp[holdout_rows] = pred_val

    print(f"  RMSLE: {rmsle:.6f}")

mlp_cv = np.mean(mlp_scores)
print(f"\nMLP CV RMSLE: {mlp_cv:.6f}")
print(f"Linear Regression CV: {LR_BASELINE_CV:.6f}")
print(f"Improvement: {LR_BASELINE_CV - mlp_cv:.6f}")

# Lower RMSLE wins
if mlp_cv < LR_BASELINE_CV:
    print("\u2713 Direct MLP beats Linear Regression!")
else:
    print("\u26a0\ufe0f Linear Regression still better - residual modeling may help")

## Summary and Recommendations

Based on the analysis, provide clear recommendations for next steps.

In [None]:
# Final report: restate the headline numbers produced by the earlier cells
# (dt_cv, best_alpha, best_score, alpha_results, feature_importance,
# cv_reduced, mlp_cv, y, analysis_df) and print the recommended next steps.
print("=" * 60)
print("ANALYSIS SUMMARY AND RECOMMENDATIONS")
print("=" * 60)

residual_std = analysis_df['residual'].std()
# Explained variance is a ratio of variances, 1 - Var(residual) / Var(target);
# a ratio of standard deviations would under-report it.
# NOTE(review): assumes the residuals are in the same units as `y` - confirm.
variance_explained = 1 - analysis_df['residual'].var() / y.var()

print("\n1. RESIDUAL STRUCTURE VERIFICATION:")
print(f"   - Residuals std: {residual_std:.6f}")
print(f"   - Variance explained: {variance_explained * 100:.2f}%")
print(f"   - Decision Tree can predict residuals: CV = {dt_cv:.6f}")
if dt_cv < residual_std:
    print("   ✓ RESIDUALS CONTAIN PREDICTABLE STRUCTURE → Pipeline viable!")
else:
    print("   ⚠️  Residuals may be mostly noise")

# CV score of the alpha=1.0 setting, the reference the sweep is compared to
reference_cv = next(score for a, score, _ in alpha_results if a == 1.0)

print("\n2. HYPERPARAMETER OPTIMIZATION:")
print(f"   - Best alpha: {best_alpha}")
print(f"   - Best CV: {best_score:.6f}")
print(f"   - Improvement over alpha=1.0: {reference_cv - best_score:.6f}")
if best_alpha != 1.0:
    print("   ✓ Different alpha improves performance - use best alpha for final model")
else:
    print("   ✓ Alpha=1.0 is already optimal")

# feature_importance is sorted by |coefficient| descending: first row is the
# strongest feature, last row the weakest.
strongest = feature_importance.iloc[0]
weakest = feature_importance.iloc[-1]

print("\n3. FEATURE IMPORTANCE:")
print(f"   - Most important: {strongest['feature']} (coeff: {strongest['coefficient']:.4f})")
print(f"   - Least important: {weakest['feature']} (coeff: {weakest['coefficient']:.4f})")
if cv_reduced < best_score:
    print("   ✓ Removing least important feature improves CV")
else:
    print("   ✓ All features contribute positively")

print("\n4. DIRECT vs RESIDUAL MODELING:")
print(f"   - Linear Regression CV: {best_score:.6f}")
print(f"   - Direct MLP CV: {mlp_cv:.6f}")
if mlp_cv < best_score:
    print("   ✓ Direct MLP beats Linear Regression - consider direct approach")
else:
    print("   ✓ Linear Regression is strong baseline")

print("\n" + "=" * 60)
print("NEXT STEPS RECOMMENDATION")
print("=" * 60)
print(f"1. Re-train Linear Regression with best alpha ({best_alpha})")
print("2. Generate new residuals with optimized model")
print("3. Train Neural Network on residuals (if dt_cv < residual_std)")
print("4. Compare sequential approach vs direct MLP")
print("5. Proceed with full residual modeling pipeline")