# In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def save_results(model_name, rmsle_cv, rmsle_train, mae_train,
                 results_path='model_comparison_results.csv'):
    """Record one model's metrics in the model-comparison CSV file.

    Any row already stored under ``model_name`` is replaced, so re-running a
    model updates its entry instead of appending a duplicate. The new row is
    always placed last.

    Args:
        model_name: Label written to the ``Model`` column; also the key used
            to find and replace a previous entry.
        rmsle_cv: Value stored in the ``CV_RMSLE_Mean`` column.
        rmsle_train: Value stored in the ``Train_RMSLE`` column.
        mae_train: Value stored in the ``Train_MAE`` column.
        results_path: CSV file to read and rewrite. Defaults to
            ``'model_comparison_results.csv'`` in the working directory.
    """
    columns = ['Model', 'CV_RMSLE_Mean', 'Train_RMSLE', 'Train_MAE']

    try:
        results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable history yet: the file is missing, or it exists but is
        # zero bytes (read_csv raises EmptyDataError for that). Start fresh.
        results_df = pd.DataFrame(columns=columns)

    new_result = pd.DataFrame({
        'Model': [model_name],
        'CV_RMSLE_Mean': [rmsle_cv],
        'Train_RMSLE': [rmsle_train],
        'Train_MAE': [mae_train],
    })

    # Drop any previous entry for this model. The filter is a no-op when the
    # name is absent, so no separate membership check is needed.
    results_df = results_df[results_df['Model'] != model_name]

    results_df = pd.concat([results_df, new_result], ignore_index=True)
    results_df.to_csv(results_path, index=False)


# Load the inputs for this script: the processed feature matrices, the
# processed training target, and the Id column of the raw test set (used
# later to label the prediction file).
try:
    X_train = pd.read_csv('X_train_processed.csv')
    X_test = pd.read_csv('X_test_processed.csv')
    y_train = np.load('y_train_processed.npy')
    # Only the Id column is used, so skip parsing the rest of test.csv.
    X_test_ID = pd.read_csv('test.csv', usecols=['Id'])['Id']
except FileNotFoundError as err:
    print("Error: Processed data files not found. Ensure 'main_data_preprocessing.ipynb' was run first.")
    print(f"Missing file: {err.filename}")
    # `exit()` is an interactive helper injected by the `site` module and
    # exits with status 0; raise SystemExit directly so the failure is
    # reported with a non-zero status and works without `site`.
    raise SystemExit(1) from err

def rmsle(y, y_pred):
    """Return the square root of the mean squared error of y_pred against y.

    No logarithm is applied here.
    NOTE(review): the name suggests callers pass values that are already on
    a log scale, which would make this an RMSLE; confirm at the call sites.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

print("--- Implementing Ridge Regression ---")

# One fixed, shuffled 10-fold split, shared by the grid search and by the
# cross-validation report further down.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

ridge = Ridge(random_state=42)
param_grid = {
    'alpha': [1.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']
}

grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=kfolds,
                           n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set with the winning parameters. Fitting it a
# second time here would only repeat that work.
best_ridge = grid_search.best_estimator_
best_alpha = grid_search.best_params_['alpha']
print(f"\nBest Hyperparameters found: {grid_search.best_params_}")

y_train_pred = best_ridge.predict(X_train)

# Per-fold scores for the selected configuration. The scorer returns negated
# MSE, hence the sign flip before taking the square root.
# NOTE(review): these folds are the ones used to pick the hyperparameters, so
# this estimate is likely optimistic.
cv_scores = cross_val_score(best_ridge, X_train, y_train,
                            scoring='neg_mean_squared_error', cv=kfolds)
cv_rmse_scores = np.sqrt(-cv_scores)


final_cv_rmsle = cv_rmse_scores.mean()
final_train_rmsle = rmsle(y_train, y_train_pred)
final_train_mae = mean_absolute_error(y_train, y_train_pred)

print("\n--- Evaluation on Training Data (Log-Transformed) ---")
print(f"Ridge CV Mean RMSE: {final_cv_rmsle:.4f} (Std: {cv_rmse_scores.std():.4f})")
print(f"Ridge Training RMSLE: {final_train_rmsle:.4f}")
print(f"Ridge Training MAE: {final_train_mae:.4f}")


# Predict for the test set, then map the predictions back with expm1.
# NOTE(review): this assumes the target was transformed with log1p during
# preprocessing; confirm against the preprocessing notebook.
y_test_pred_log = best_ridge.predict(X_test)
y_test_pred_price = np.expm1(y_test_pred_log)


# Coefficients sorted from largest to smallest: head() gives the largest
# values, tail() the smallest.
coefficients = pd.Series(best_ridge.coef_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 8))
coefficients.head(10).plot(kind='barh', title='Top 10 Positive Feature Coefficients (Ridge)')
plt.show()

plt.figure(figsize=(10, 8))
coefficients.tail(10).plot(kind='barh', title='Top 10 Negative Feature Coefficients (Ridge)')
plt.show()


# Residuals (actual minus predicted) against the training predictions, with a
# zero reference line.
plt.figure(figsize=(10, 6))
plt.scatter(y_train_pred, y_train - y_train_pred, c='blue', marker='o', alpha=0.5)
plt.hlines(y=0, xmin=y_train_pred.min(), xmax=y_train_pred.max(), color='red')
plt.title('Residual Plot (Ridge Regression)')
plt.xlabel('Predicted Log(SalePrice)')
plt.ylabel('Residuals')
plt.show()


# Write the test-set predictions, keyed by Id.
ridge_results = pd.DataFrame({'Id': X_test_ID, 'SalePrice_Predicted': y_test_pred_price})
ridge_results.to_csv('ridge_predictions.csv', index=False)

# Record this model's metrics in the comparison file.
save_results(
    model_name='Ridge Regression',
    rmsle_cv=final_cv_rmsle,
    rmsle_train=final_train_rmsle,
    mae_train=final_train_mae
)

print("\nRidge Regression Model implementation complete.")
print(f"Test predictions (in $): First 5 values: {y_test_pred_price[:5]}")