In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ===============================
# Load datasets (from ../data/)
features = pd.read_csv("../data/clean_va_price.csv")
labels = pd.read_csv("../data/resilience_metrics.csv")

# Inner-join the two tables on Industry, then discard every row that is
# missing any of the columns used later as model inputs or as the target.
ml_df = features.merge(labels, on="Industry", how="inner").dropna(
    subset=["GrowthRate", "Volatility", "Baseline", "Recovered_Years"]
)

print("✅ Merged dataset shape:", ml_df.shape)
display(ml_df.head())

# ===============================
# Features + Labels
# Inputs are the three raw columns plus a GrowthRate * Volatility
# interaction column; the regression target is Recovered_Years.
X = ml_df[["GrowthRate", "Volatility", "Baseline"]].assign(
    Growth_x_Volatility=lambda frame: frame["GrowthRate"] * frame["Volatility"]
)
y = ml_df["Recovered_Years"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
# The scaler is fitted on the training split only, then the same fitted
# scaler transforms both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train ridge regression
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

# Predict + evaluate
y_pred = ridge.predict(X_test_scaled)

print("\n--- Ridge Regression Results ---")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² :", r2_score(y_test, y_pred))

# Side-by-side table of held-out targets and the model's predictions,
# indexed like y_test.
results = y_test.to_frame(name="Actual").assign(Predicted=y_pred)
display(results.head(10))

# ===============================
# Save model + scaler (into ../models/)
os.makedirs("../models", exist_ok=True)

# Persist the fitted estimator and the fitted scaler as separate files.
for artifact, target in (
    (ridge, "../models/ridge_model.pkl"),
    (scaler, "../models/scaler.pkl"),
):
    joblib.dump(artifact, target)

print("✅ ridge_model.pkl and scaler.pkl saved in ../models/")
