# Ridge Regression Method

This notebook implements Ridge regression with engineered features:
- Lag features (1-6 months)
- Rolling statistics (mean, geometric mean)
- Time-series cross-validation
- Regularization tuning
- Competition metric evaluation

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path('..').resolve()))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.data import DatasetPaths, load_all_training_tables
from src.features import build_time_lagged_features
from src.models import competition_score, build_linear_pipeline

# Notebook-wide default figure size (width, height) in inches; individual plots may override it.
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load data: point the dataset paths at the notebook's parent directory
# and read every training table in one call.
project_root = Path('..').resolve()
paths = DatasetPaths(root_dir=str(project_root))
train = load_all_training_tables(paths)

# The new-house transactions table is the one used for features and target below.
nht = train['new_house_transactions']
print(f"Training records: {len(nht)}")

## Feature Engineering

In [None]:
# Build lag and rolling features from the new-house transactions table.
lag_feats = build_time_lagged_features(nht)
print(f"Feature table shape: {lag_feats.shape}")
print(f"\nFeatures:")

# Model inputs are the columns whose names start with 'lag_' or 'roll_';
# every other column of the feature table is excluded from X.
feature_prefixes = ('lag_', 'roll_')
feature_cols = [col for col in lag_feats.columns if col.startswith(feature_prefixes)]
print(feature_cols)

In [None]:
# Prepare X, y: reshape the wide target table to long form, attach it to the
# feature table, and keep only rows usable for modeling.
from src.data import prepare_train_target
target_wide, sector_index = prepare_train_target(nht)

# unstack() yields one row per (column label, index label) pair of target_wide.
# NOTE(review): the rename below relies on target_wide's column axis being
# unnamed (so reset_index emits 'level_0') and on its index being named 'time'
# -- confirm against prepare_train_target.
y_long = target_wide.unstack().reset_index(name='y')
y_long = y_long.rename(columns={'level_0': 'sector_id'})

df_merged = lag_feats.merge(y_long, on=['time', 'sector_id'], how='left')

# The left merge keeps feature rows that have no matching target, leaving NaN
# in 'y'. Drop those together with rows that have incomplete features, so no
# NaN target reaches model fitting or metric evaluation.
n_missing_target = int(df_merged['y'].isna().sum())
df = df_merged.dropna(subset=[*feature_cols, 'y']).copy()

X = df[feature_cols]
y = df['y']

print(f"Rows without a target value (excluded): {n_missing_target}")
print(f"\nModeling dataset: {X.shape}")
print(f"Target statistics:")
print(y.describe())

## Hyperparameter Tuning: Alpha (Regularization)

In [None]:
# Grid search over alpha: 15 log-spaced values from 1e-2 to 1e3.
alphas = np.logspace(-2, 3, 15)

# Time-series split: train on t <= 54, validate on t > 54
last_train_time = 54
mask_train = df['time'] <= last_train_time
X_tr, y_tr = X.loc[mask_train], y.loc[mask_train]
X_va, y_va = X.loc[~mask_train], y.loc[~mask_train]

print(f"Train: {X_tr.shape}, Validation: {X_va.shape}")

# Fit one ridge pipeline per alpha on the training window and score it on the
# validation window with the competition metric.
results = []
for alpha in alphas:
    pipe = build_linear_pipeline(alpha=alpha, kind='ridge')
    pipe.fit(X_tr, y_tr)
    yhat = pipe.predict(X_va)

    score_dict = competition_score(y_va.values, yhat)
    results.append({
        'alpha': alpha,
        'score': score_dict['score'],
        'good_rate': score_dict['good_rate']
    })

results_df = pd.DataFrame(results)

# Select the alpha with the HIGHEST validation score and list the best rows
# first. The metric is treated as higher-is-better; the previous idxmin() /
# ascending sort selected and displayed the lowest-scoring alphas instead.
# NOTE(review): confirm the metric direction against src.models.competition_score.
best_alpha = results_df.loc[results_df['score'].idxmax(), 'alpha']
print(f"\nBest alpha: {best_alpha:.4f}")
print(results_df.sort_values('score', ascending=False).head())

In [None]:
# Plot regularization curves: one panel per validation metric, both on a log
# alpha axis with the selected alpha marked by a dashed red line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (results column, panel title, y-axis label, extra line styling)
panel_specs = [
    ('score', 'Competition Score vs Alpha', 'Competition Score', {}),
    ('good_rate', 'Good Rate vs Alpha', 'Good Rate', {'color': 'orange'}),
]

for panel, (metric_col, panel_title, y_label, line_style) in zip(axes, panel_specs):
    panel.plot(results_df['alpha'], results_df[metric_col], marker='o', linewidth=2, **line_style)
    panel.axvline(best_alpha, color='red', linestyle='--', label=f'Best: {best_alpha:.4f}')
    panel.set_xscale('log')
    panel.set_title(panel_title)
    panel.set_xlabel('Alpha (regularization)')
    panel.set_ylabel(y_label)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Final Model Training & Validation

In [None]:
# Train final model with best alpha on the training window, then score it on
# the held-out validation window.
pipe_final = build_linear_pipeline(alpha=best_alpha, kind='ridge')
pipe_final.fit(X_tr, y_tr)

yhat_va = pipe_final.predict(X_va)
final_score = competition_score(y_va.values, yhat_va)

# Count negative and exactly-zero predictions alongside the metric values.
n_negative = (yhat_va < 0).sum()
n_zero = (yhat_va == 0).sum()

report_lines = [
    f"\nFinal Validation Results:",
    f"  Competition Score: {final_score['score']:.4f}",
    f"  Good Rate: {final_score['good_rate']:.4f}",
    f"  Predictions < 0: {n_negative}",
    f"  Predictions = 0: {n_zero}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Feature importance (coefficient magnitudes): read the fitted coefficients
# from the pipeline step named 'ridge' and rank features by absolute value.
ridge_step = pipe_final.named_steps['ridge']
coefs = ridge_step.coef_

feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'coefficient': coefs})
    .assign(abs_coefficient=lambda table: table['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print(f"\nTop 10 Features by Coefficient Magnitude:")
print(feature_importance.head(10))

# Horizontal bar chart of the 15 largest magnitudes; the y-axis is inverted so
# the largest coefficient appears at the top.
top_features = feature_importance.head(15)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['abs_coefficient'], edgecolor='black')
ax.set_xlabel('Absolute Coefficient')
ax.set_title('Top 15 Features by Importance')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Error Analysis

In [None]:
# APE distribution: absolute percentage error per validation row. The
# denominator is floored at 1e-12 so a zero true value does not divide by zero.
y_true_va = y_va.values
ape = np.abs((y_true_va - yhat_va) / np.maximum(y_true_va, 1e-12))

fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of APE values below 2, with the 100% line marked.
ax_hist.hist(ape[ape < 2], bins=50, edgecolor='black', alpha=0.7)
ax_hist.axvline(1.0, color='red', linestyle='--', label='APE = 100%')
ax_hist.set_title('APE Distribution')
ax_hist.set_xlabel('Absolute Percentage Error')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Right panel: predictions against true values with the y = x reference line.
diagonal = [0, y_va.max()]
ax_scatter.scatter(y_true_va, yhat_va, alpha=0.3, s=10)
ax_scatter.plot(diagonal, diagonal, 'r--', label='Perfect')
ax_scatter.set_title('Predictions vs True')
ax_scatter.set_xlabel('True Values')
ax_scatter.set_ylabel('Predictions')
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

share_below_100 = (ape < 1.0).mean()
print(f"\nAPE Statistics:")
print(f"  Mean APE: {ape.mean():.4f}")
print(f"  Median APE: {np.median(ape):.4f}")
print(f"  APE < 100%: {share_below_100:.2%}")

## Summary

**Method**: Ridge regression with lag/rolling features

**Advantages**:
- Captures temporal dependencies
- Interpretable coefficients
- Regularization (the L2 penalty, tuned via alpha) reduces overfitting

**Disadvantages**:
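- Scores worse than simple baselines, because the competition metric is highly sensitive to large percentage errors
- Produces some negative predictions (the model output is not constrained to be non-negative)
- Does not handle zero targets well: the absolute percentage error blows up when the true value is 0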