# In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
def save_results(model_name, rmsle_cv, rmsle_train, mae_train,
                 results_path='model_comparison_results.csv'):
    """Record one model's metrics in the model-comparison CSV.

    The CSV holds one row per model. If a row for ``model_name`` already
    exists it is replaced, so re-running a model never creates duplicates.
    The new row is always appended at the end.

    Args:
        model_name: Label stored in the ``Model`` column; also the key used
            to find and replace a previous row.
        rmsle_cv: Value stored in the ``CV_RMSLE_Mean`` column.
        rmsle_train: Value stored in the ``Train_RMSLE`` column.
        mae_train: Value stored in the ``Train_MAE`` column.
        results_path: CSV file to read and rewrite. Defaults to
            ``'model_comparison_results.csv'`` in the working directory.
    """
    columns = ['Model', 'CV_RMSLE_Mean', 'Train_RMSLE', 'Train_MAE']
    try:
        results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing file and a zero-byte file both mean "no results yet".
        results_df = pd.DataFrame(columns=columns)

    new_result = pd.DataFrame({
        'Model': [model_name],
        'CV_RMSLE_Mean': [rmsle_cv],
        'Train_RMSLE': [rmsle_train],
        'Train_MAE': [mae_train],
    })

    # Drop any earlier row for this model; a no-op when there is none.
    results_df = results_df[results_df['Model'] != model_name]

    if results_df.empty:
        # Avoid concatenating onto an empty frame: recent pandas deprecates
        # it and the empty object-dtype columns can leak into the result.
        results_df = new_result
    else:
        results_df = pd.concat([results_df, new_result], ignore_index=True)
    results_df.to_csv(results_path, index=False)

# Load the processed feature tables, the training target, and the 'Id' column
# of test.csv (used further down to label the exported predictions).
try:
    X_train = pd.read_csv('X_train_processed.csv')
    X_test = pd.read_csv('X_test_processed.csv')
    y_train = np.load('y_train_processed.npy')
    X_test_ID = pd.read_csv('test.csv')['Id']
except FileNotFoundError as err:
    print("Error: Processed data files not found. Ensure 'main_data_preprocessing.ipynb' was run first.")
    # Any of the four files above can trigger this handler, so name the one
    # that was actually missing.
    print(f"Missing file: {err.filename}")
    # exit() is a helper injected by the `site` module for interactive use and
    # would also end with status 0; raise SystemExit with a failure status.
    raise SystemExit(1)

def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y``.

    No logarithm is applied here, so the result is a root mean squared
    *log* error only when both arguments are already on a log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
print("--- Implementing Lasso Regression (Literature Approach: Regularization) ---")
print("Focus: The Lasso model, coupled with log-transformed target variable and features, provides interpretability and strong feature selection.")

# --- Hyperparameter search -------------------------------------------------
# One seeded, shuffled 10-fold splitter is shared by the grid search and by
# the cross_val_score call below.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
lasso = Lasso(random_state=42, max_iter=10000)
param_grid = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01]
}
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=kfolds,
                           n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# GridSearchCV is left at its default refit=True, so best_estimator_ has
# already been refitted on the full training set; no second fit is needed.
best_lasso = grid_search.best_estimator_
best_alpha = grid_search.best_params_['alpha']
print(f"\nBest Hyperparameters found: {grid_search.best_params_}")

# --- Evaluation on the training data ---------------------------------------
y_train_pred = best_lasso.predict(X_train)
cv_scores = cross_val_score(best_lasso, X_train, y_train,
                            scoring='neg_mean_squared_error', cv=kfolds)
# The scorer returns negated MSE, so flip the sign before taking the root.
cv_rmse_scores = np.sqrt(-cv_scores)
final_cv_rmsle = cv_rmse_scores.mean()
final_train_rmsle = rmsle(y_train, y_train_pred)
final_train_mae = mean_absolute_error(y_train, y_train_pred)

print("\n--- Evaluation on Training Data (Log-Transformed) ---")
print(f"Lasso CV Mean RMSE: {final_cv_rmsle:.4f} (Std: {cv_rmse_scores.std():.4f})")
print(f"Lasso Training RMSLE: {final_train_rmsle:.4f}")
print(f"Lasso Training MAE: {final_train_mae:.4f}")

# --- Feature selection summary ---------------------------------------------
coefficients = pd.Series(best_lasso.coef_, index=X_train.columns)
nonzero_coefs = coefficients[coefficients != 0]

print(f"\nFeatures selected by Lasso (non-zero coefficients): {len(nonzero_coefs)} / {len(coefficients)}")
if nonzero_coefs.empty:
    # Plotting an empty selection would fail; report it instead.
    print("No non-zero coefficients to plot.")
else:
    # Rank by magnitude so that strongly negative coefficients are not left
    # out of the "top" features, then order by signed value for display.
    top_coefs = nonzero_coefs.sort_values(key=abs, ascending=False).head(15)
    plt.figure(figsize=(10, 8))
    top_coefs.sort_values(ascending=False).plot(kind='barh', title='Top 15 Non-Zero Feature Coefficients (Lasso)')
    plt.show()

# --- Residual plot (residual = actual - predicted, on the training set) ----
plt.figure(figsize=(10, 6))
plt.scatter(y_train_pred, y_train - y_train_pred, c='red', marker='o', alpha=0.5)
plt.hlines(y=0, xmin=y_train_pred.min(), xmax=y_train_pred.max(), color='blue')
plt.title('Residual Plot (Lasso Regression)')
plt.xlabel('Predicted Log(SalePrice)')
plt.ylabel('Residuals')
plt.show()

# --- Test-set predictions --------------------------------------------------
y_test_pred_log = best_lasso.predict(X_test)
# expm1 inverts a log1p transform -- presumably the one applied to the target
# during preprocessing; confirm against the preprocessing notebook.
y_test_pred_price = np.expm1(y_test_pred_log)
lasso_results = pd.DataFrame({'Id': X_test_ID, 'SalePrice_Predicted': y_test_pred_price})
lasso_results.to_csv('lasso_regression_predictions.csv', index=False)


save_results(
    model_name='Lasso (Lit.)',
    rmsle_cv=final_cv_rmsle,
    rmsle_train=final_train_rmsle,
    mae_train=final_train_mae
)
print("\nLasso Regression Model implementation complete. Results saved.")