In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.base import BaseEstimator, RegressorMixin, clone
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
def save_results(model_name, rmsle_cv, rmsle_train, mae_train,
                 results_path='model_comparison_results.csv'):
    """Record one model's metrics in the model-comparison CSV.

    An existing row with the same ``model_name`` is replaced, so re-running a
    model updates its entry instead of adding a duplicate.

    Args:
        model_name: Label stored in the ``Model`` column; acts as the row key.
        rmsle_cv: Value written to the ``CV_RMSLE_Mean`` column.
        rmsle_train: Value written to the ``Train_RMSLE`` column.
        mae_train: Value written to the ``Train_MAE`` column.
        results_path: CSV file to update. It is created when missing; a
            zero-byte file is treated the same as a missing one.
    """
    columns = ['Model', 'CV_RMSLE_Mean', 'Train_RMSLE', 'Train_MAE']
    try:
        results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable history yet (file absent or completely empty).
        results_df = pd.DataFrame(columns=columns)

    new_result = pd.DataFrame({
        'Model': [model_name],
        'CV_RMSLE_Mean': [rmsle_cv],
        'Train_RMSLE': [rmsle_train],
        'Train_MAE': [mae_train],
    })

    # Drop any previous entry for this model; a no-op when there is none.
    results_df = results_df[results_df['Model'] != model_name]

    if results_df.empty:
        # Skip concatenating with an empty frame (deprecated dtype handling
        # in recent pandas); the new row is the whole table.
        results_df = new_result
    else:
        results_df = pd.concat([results_df, new_result], ignore_index=True)
    results_df.to_csv(results_path, index=False)
# Load the preprocessed feature tables, the training target array and the
# Id column of the raw test file (used later to label the predictions).
try:
    X_train = pd.read_csv('X_train_processed.csv')
    X_test = pd.read_csv('X_test_processed.csv')
    y_train = np.load('y_train_processed.npy')
    X_test_ID = pd.read_csv('test.csv')['Id']
except FileNotFoundError:
    print("Error: Processed data files not found. Ensure 'main_data_preprocessing.ipynb' was run first.")
    # `exit()` is an interactive convenience installed by the `site` module
    # and signals success; raise SystemExit directly so execution reliably
    # stops here with a failing status instead of continuing without data.
    raise SystemExit(1)
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y``.

    No log transform is applied inside this function.
    NOTE(review): the name implies callers pass values that are already on a
    log scale, which would make this an RMSLE -- confirm against the
    preprocessing step.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
# Announce which modelling approach this section runs.
print(
    "--- Implementing Stacking Ensemble Model ---",
    "Approach: Stacking Generalization Regression using Ridge, Lasso, ElasticNet, KRR, GBR as base models and Lasso as meta-model.",
    sep="\n",
)

class StackingAveragedModels(BaseEstimator, RegressorMixin):
    """Stacked regressor: a meta-model trained on out-of-fold base predictions.

    During ``fit`` every base model is cloned and trained once per K-fold
    split; its predictions on the held-out rows become one column of the
    meta-model's training matrix. During ``predict`` the per-fold clones of
    each base model are averaged to produce that model's meta-feature.

    Args:
        base_models: Iterable of unfitted regressors used as first-level
            learners. They are cloned, never fitted in place.
        meta_model: Unfitted regressor trained on the out-of-fold predictions.
        n_folds: Number of K-fold splits used to build those predictions.

    Attributes set by ``fit``:
        base_models_: One list per base model, holding its ``n_folds``
            fitted clones.
        meta_model_: Fitted clone of ``meta_model``.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _to_indexable(data):
        """Return ``data`` in a form that supports positional row indexing."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            # Plain `obj[index_array]` selects columns on a DataFrame and is
            # label-based on a Series, so switch to the underlying array.
            return data.to_numpy()
        if not hasattr(data, 'shape'):
            # Lists / tuples: give them array semantics.
            return np.asarray(data)
        # ndarrays and other array-likes with `.shape` are used unchanged.
        return data

    def fit(self, X, y):
        """Fit the per-fold base-model clones and then the meta-model.

        Args:
            X: Feature matrix (NumPy array, pandas DataFrame, or other
                row-indexable array-like).
            y: Target values aligned with the rows of ``X``.

        Returns:
            self, to allow call chaining.
        """
        X = self._to_indexable(X)
        y = self._to_indexable(y)

        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)

        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        # The fixed seed makes the split deterministic, so compute it once
        # and reuse the same folds for every base model.
        splits = list(kfold.split(X, y))

        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in splits:
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                # Each row is predicted only by a clone that never saw it.
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict targets for ``X`` using the fitted stack.

        Args:
            X: Feature matrix with the same columns used in ``fit``.

        Returns:
            The meta-model's predictions, one per row of ``X``.
        """
        X = self._to_indexable(X)
        # One meta-feature column per base model: the mean prediction of
        # that model's per-fold clones.
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
# --- First-level (base) learners --------------------------------------------
# Regularised linear models.
ridge = Ridge(alpha=10.0, random_state=42)
lasso = Lasso(
    alpha=0.0005,
    max_iter=10000,
    random_state=42,
)
elastic_net = ElasticNet(
    alpha=0.0005,
    l1_ratio=0.9,
    max_iter=10000,
    random_state=42,
)

# Kernel ridge regression with a degree-2 polynomial kernel.
kridge = KernelRidge(
    alpha=0.6,
    kernel='polynomial',
    degree=2,
    coef0=2.5,
)

# Gradient boosting with Huber loss.
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

# --- Second-level (meta) learner --------------------------------------------
meta_model = Lasso(
    alpha=0.0005,
    max_iter=10000,
    random_state=42,
)

# --- Stacked ensemble: 10-fold out-of-fold predictions feed the meta-model ---
stacked_model = StackingAveragedModels(
    base_models=(ridge, lasso, elastic_net, kridge, gbr),
    meta_model=meta_model,
    n_folds=10,
)
# Work on plain NumPy arrays for fitting, prediction and cross-validation.
X_train_array = X_train.values
X_test_array = X_test.values

# Fit the ensemble on the full training set and score it on that same data.
stacked_model.fit(X_train_array, y_train)
y_train_pred = stacked_model.predict(X_train_array)
final_train_rmsle = rmsle(y_train, y_train_pred)
final_train_mae = mean_absolute_error(y_train, y_train_pred)

# Cross-validated score for the 'CV_RMSLE_Mean' column. cross_val_score works
# on clones, so the model fitted above is left untouched. Each outer fold
# refits the entire stack, which makes this the most expensive step here.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
cv_neg_mse = cross_val_score(
    stacked_model,
    X_train_array,
    y_train,
    scoring='neg_mean_squared_error',
    cv=cv_splitter,
)
# Scores are negated MSE per fold; convert each to an RMSE, then average.
cv_fold_rmsle = np.sqrt(-cv_neg_mse)
final_cv_rmsle = cv_fold_rmsle.mean()

print(f"\n--- Evaluation on Training Data (Log-Transformed) ---")
print(f"Stacked Ensemble Training RMSLE: {final_train_rmsle:.4f}")
print(f"Stacked Ensemble Training MAE: {final_train_mae:.4f}")
print(f"Stacked Ensemble CV RMSLE: {final_cv_rmsle:.4f} (+/- {cv_fold_rmsle.std():.4f})")

# Predict the test set, then apply expm1 to undo the log1p-style target
# transform. NOTE(review): assumes the target was log1p-transformed during
# preprocessing -- confirm.
y_test_pred_log = stacked_model.predict(X_test_array)
y_test_pred_price = np.expm1(y_test_pred_log)

# Diagnostic plot 1: residuals against predictions, with a zero reference line.
residuals = y_train - y_train_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_train_pred, residuals, c='green', marker='o', alpha=0.5)
plt.hlines(y=0, xmin=y_train_pred.min(), xmax=y_train_pred.max(), color='red')
plt.title('Residual Plot (Stacked Ensemble)')
plt.xlabel('Predicted Log(SalePrice)')
plt.ylabel('Residuals')
plt.show()

# Diagnostic plot 2: distribution of the residuals.
plt.figure(figsize=(10, 6))
sns.histplot(residuals, bins=50, kde=True, color='purple')
plt.title('Prediction Error Distribution (Stacked Ensemble)')
plt.xlabel('Residual (Actual Log - Predicted Log)')
plt.show()

# Write the test-set predictions, keyed by the original test Ids.
stacked_results = pd.DataFrame({'Id': X_test_ID, 'SalePrice_Predicted': y_test_pred_price})
stacked_results.to_csv('stacked_ensemble_predictions.csv', index=False)

# Store the metrics for model comparison. The CV column gets the real
# cross-validated score, not the training score.
save_results(
    model_name='Stacked Ensemble (Lit.)',
    rmsle_cv=final_cv_rmsle,
    rmsle_train=final_train_rmsle,
    mae_train=final_train_mae
)
print("\nStacked Ensemble Model implementation complete. Results saved.")