# House Price Prediction - Surprise Housing
## Ridge and Lasso Regression Analysis

### Business Objective
Build a regression model using regularization to predict house prices in the Australian market and identify significant predictors.

## 1. Import Libraries

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Statistical libraries
from scipy import stats
from scipy.stats import norm

# Warnings
import warnings
# NOTE(review): this blanket filter hides every warning category for the rest
# of the notebook, including any warnings the estimators fitted in later cells
# may emit. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)  # None = never truncate columns when displaying frames
pd.set_option('display.max_rows', 100)      # show at most 100 rows before truncating
plt.style.use('ggplot')                     # global matplotlib style for all figures below
%matplotlib inline

## 2. Load and Understand the Data

In [2]:
# Load the dataset.
# DATA_PATH is resolved relative to the notebook's working directory; change it
# here if the CSV lives elsewhere.
DATA_PATH = 'train.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Re-raise the same exception type, but tell the reader how to fix it
    # instead of surfacing only the bare OS error.
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}'. Place the housing training CSV next to "
        "this notebook, or point DATA_PATH at its location, then re-run."
    ) from exc

# Display basic information
print("Dataset Shape:", df.shape)
print("\n" + "="*80)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Check data types and non-null counts for every column of the raw frame
df.info()

In [None]:
# Statistical summary of numerical features (displayed as the cell's output)
df.describe()

In [None]:
# Inspect the target: summary statistics, histogram, normal Q-Q plot,
# then skewness and kurtosis.
target = df['SalePrice']

print("Target Variable (SalePrice) Statistics:")
print(target.describe())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, qq_ax = axes

# Left panel: histogram of sale prices
hist_ax.hist(target, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Sale Price')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Sale Price')

# Right panel: Q-Q plot against a normal distribution
stats.probplot(target, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot of Sale Price')

plt.tight_layout()
plt.show()

print(f"\nSkewness: {target.skew():.2f}")
print(f"Kurtosis: {target.kurtosis():.2f}")

## 3. Data Quality Checks

In [None]:
# Tabulate missing values per column: absolute count and percentage of rows,
# keeping only columns that actually have gaps, worst first.
missing_values = df.isna().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame(
    {'Missing_Count': missing_values, 'Missing_Percent': missing_percent}
)
has_gaps = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing_Percent', ascending=False)

print(f"Total features with missing values: {len(missing_df)}")
print("\nTop features with missing values:")
print(missing_df.head(10))

In [None]:
# Horizontal bar chart of the (up to) 20 columns with the most missing data;
# skipped entirely when nothing is missing.
if len(missing_df) > 0:
    plt.figure(figsize=(12, 6))
    worst_20 = missing_df.head(20)['Missing_Percent']
    worst_20.plot(kind='barh')
    plt.xlabel('Missing Percentage')
    plt.title('Top 20 Features with Missing Values')
    plt.tight_layout()
    plt.show()

In [None]:
# Count fully duplicated rows and drop them when any exist
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# The count is non-negative, so truthiness is the same test as "> 0"
if duplicates:
    df = df.drop_duplicates()
    print(f"Removed {duplicates} duplicate rows")
    print(f"New dataset shape: {df.shape}")

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Build the lists of numerical and categorical column names.
# 'Id' is excluded from the numerical list; a second list without the target
# is prepared for correlation work.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
numerical_features = [name for name in numeric_columns if name != 'Id']
categorical_features = list(df.select_dtypes(include=['object']).columns)

numerical_features_without_target = [
    name for name in numerical_features if name != 'SalePrice'
]

print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")

In [None]:
# Correlation of every numerical feature with SalePrice, strongest positive first
corr_matrix = df[numerical_features].corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)

print("Top 15 features correlated with SalePrice:")
print(correlations.head(15))

print("\nBottom 10 features correlated with SalePrice:")
print(correlations.tail(10))

In [None]:
# Heatmap of pairwise correlations among the 11 highest entries of
# `correlations` (the target itself plus the ten features after it)
top_features = correlations.index[:11].tolist()  # Top 10 + SalePrice
top_corr_matrix = df[top_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(top_corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Top Features with SalePrice')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots for the six features most correlated with SalePrice.
# The target's own entry is dropped by label (not by position), and six
# features are selected so that every panel of the 2x3 grid is filled.
top_corr_features = correlations.drop('SalePrice').head(6).index.tolist()

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.ravel()

for ax, feature in zip(axes, top_corr_features):
    ax.scatter(df[feature], df['SalePrice'], alpha=0.5)
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.set_title(f'{feature} vs SalePrice (r={correlations[feature]:.2f})')

# If fewer than six numerical features exist, hide the leftover empty panels
for unused_ax in axes[len(top_corr_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# For the first ten categorical columns, report the number of distinct values
# and the most frequent levels
print("Categorical Features - Unique Value Counts:")
for cat_name in categorical_features[:10]:  # Show first 10
    values = df[cat_name]
    print(f"\n{cat_name}: {values.nunique()} unique values")
    print(values.value_counts().head())

In [None]:
# Box plots of SalePrice grouped by a handful of hand-picked features.
# String-typed features are ordered by descending median price; other dtypes
# use seaborn's default ordering (order=None).
important_cat_features = ['OverallQual', 'Neighborhood', 'ExterQual', 'KitchenQual']

for grouping in important_cat_features:
    if grouping not in df.columns:
        continue

    plt.figure(figsize=(14, 6))
    order = None
    if df[grouping].dtype == 'object':
        median_price = df.groupby(grouping)['SalePrice'].median()
        order = median_price.sort_values(ascending=False).index
    sns.boxplot(x=grouping, y='SalePrice', data=df, order=order)
    plt.xticks(rotation=45)
    plt.title(f'SalePrice Distribution by {grouping}')
    plt.tight_layout()
    plt.show()

## 5. Data Preparation and Cleaning

In [None]:
# Work on a copy so the raw frame stays untouched, and discard the 'Id'
# column when present (errors='ignore' makes the drop a no-op otherwise).
df_processed = df.copy().drop(columns=['Id'], errors='ignore')

print(f"Dataset shape after dropping Id: {df_processed.shape}")

In [None]:
# Missing-value handling, step 1: categorical columns where NA is treated as
# "feature absent" get the literal label 'None'.
none_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
                 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']

# Only touch the listed columns that actually exist in the frame
for absent_as_none in filter(lambda name: name in df_processed.columns, none_features):
    filled = df_processed[absent_as_none].fillna('None')
    df_processed[absent_as_none] = filled

print("Filled NA values with 'None' for categorical features where NA means absence")

In [None]:
# Missing-value handling, step 2: numerical columns where NA is treated as
# "feature absent" are filled with 0.
zero_features = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']

present_zero_features = [name for name in zero_features if name in df_processed.columns]
for numeric_name in present_zero_features:
    df_processed[numeric_name] = df_processed[numeric_name].fillna(0)

print("Filled NA values with 0 for numerical features where NA means absence")

In [None]:
# LotFrontage: fill gaps with the median of the same Neighborhood
if 'LotFrontage' in df_processed.columns:
    frontage_by_hood = df_processed.groupby('Neighborhood')['LotFrontage']
    df_processed['LotFrontage'] = frontage_by_hood.transform(
        lambda group: group.fillna(group.median()))
    print("Filled LotFrontage with neighborhood median")

# Electrical: fill gaps with the most frequent value
if 'Electrical' in df_processed.columns:
    most_common_electrical = df_processed['Electrical'].mode()[0]
    df_processed['Electrical'] = df_processed['Electrical'].fillna(most_common_electrical)
    print("Filled Electrical with mode")

In [None]:
# Remove columns whose missing share exceeds the threshold (40%), then fill
# whatever gaps remain: mode for string columns, median for everything else.
threshold = 0.4
missing_pct = df_processed.isnull().sum() / len(df_processed)
cols_to_drop = list(missing_pct.index[missing_pct > threshold])

if cols_to_drop:
    print(f"Dropping columns with >{threshold*100}% missing values: {cols_to_drop}")
    df_processed = df_processed.drop(columns=cols_to_drop)

# Fill remaining missing values, column by column
for col_name in df_processed.columns:
    column = df_processed[col_name]
    if not column.isnull().sum() > 0:
        continue
    if column.dtype == 'object':
        replacement = column.mode()[0]
    else:
        replacement = column.median()
    df_processed[col_name] = column.fillna(replacement)

print(f"\nRemaining missing values: {df_processed.isnull().sum().sum()}")

In [None]:
# Remove extreme SalePrice outliers with a 3*IQR fence (wider than the usual
# 1.5*IQR, so only the most extreme rows are dropped).
sale_price = df_processed['SalePrice']
Q1 = sale_price.quantile(0.25)
Q3 = sale_price.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

below_fence = sale_price < lower_bound
above_fence = sale_price > upper_bound
outliers = df_processed[below_fence | above_fence]
print(f"Number of outliers in SalePrice: {len(outliers)}")

# Keep rows inside the fence; both bounds are inclusive
inside_fence = sale_price.between(lower_bound, upper_bound)
df_processed = df_processed[inside_fence]
print(f"Dataset shape after outlier removal: {df_processed.shape}")

## 6. Feature Engineering

In [None]:
# Derived features. Each one is added only when all of its source columns
# exist; the column set is captured once because none of the checks below
# depends on a column created in this cell.
available_cols = set(df_processed.columns)

# Total square footage: basement + first floor + second floor
if {'TotalBsmtSF', '1stFlrSF', '2ndFlrSF'} <= available_cols:
    df_processed['TotalSF'] = (
        df_processed['TotalBsmtSF'] + df_processed['1stFlrSF'] + df_processed['2ndFlrSF']
    )
    print("Created TotalSF feature")

# Total bathrooms: half baths count as 0.5
if {'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'} <= available_cols:
    df_processed['TotalBathrooms'] = (
        df_processed['BsmtFullBath']
        + df_processed['BsmtHalfBath'] * 0.5
        + df_processed['FullBath']
        + df_processed['HalfBath'] * 0.5
    )
    print("Created TotalBathrooms feature")

# Total porch area: row-wise sum over all porch/deck columns
porch_cols = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
if available_cols.issuperset(porch_cols):
    df_processed['TotalPorchSF'] = df_processed[porch_cols].sum(axis=1)
    print("Created TotalPorchSF feature")

# House age at time of sale
if {'YearBuilt', 'YrSold'} <= available_cols:
    df_processed['HouseAge'] = df_processed['YrSold'] - df_processed['YearBuilt']
    print("Created HouseAge feature")

# Years between last remodel and sale
if {'YearRemodAdd', 'YrSold'} <= available_cols:
    df_processed['YearsSinceRemod'] = df_processed['YrSold'] - df_processed['YearRemodAdd']
    print("Created YearsSinceRemod feature")

# Remodel flag: 1 when the remodel year differs from the build year
if {'YearBuilt', 'YearRemodAdd'} <= available_cols:
    remodeled = df_processed['YearRemodAdd'] != df_processed['YearBuilt']
    df_processed['IsRemodeled'] = remodeled.astype(int)
    print("Created IsRemodeled feature")

## 7. Create Dummy Variables

In [None]:
# One-hot encode every string column; drop_first=True omits the first level
# of each feature
categorical_cols = list(df_processed.select_dtypes(include=['object']).columns)
print(f"Number of categorical features: {len(categorical_cols)}")

df_processed = pd.get_dummies(
    df_processed,
    columns=categorical_cols,
    drop_first=True,
)

print(f"\nDataset shape after creating dummy variables: {df_processed.shape}")

## 8. Prepare Data for Modeling

In [None]:
# Target vector and feature matrix (everything except SalePrice)
y = df_processed['SalePrice']
X = df_processed.drop(columns=['SalePrice'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for subset_label, subset in (("Training", X_train), ("Testing", X_test)):
    print(f"{subset_label} set size: {subset.shape}")

In [None]:
# Standardize features. The scaler is fitted on the training split only and
# then applied to both splits; results are wrapped back into DataFrames so
# column names and row indices survive.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

print("Feature scaling completed using StandardScaler")

## 9. Model Building - Linear Regression (Baseline)

In [None]:
# Baseline: ordinary least squares on the scaled features
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Predictions on both splits
y_train_pred_lr, y_test_pred_lr = (
    lr_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

# R² and RMSE on both splits
r2_train_lr = r2_score(y_train, y_train_pred_lr)
r2_test_lr = r2_score(y_test, y_test_pred_lr)
rmse_train_lr = np.sqrt(mean_squared_error(y_train, y_train_pred_lr))
rmse_test_lr = np.sqrt(mean_squared_error(y_test, y_test_pred_lr))

banner = "="*60
print(banner)
print("LINEAR REGRESSION (Baseline Model)")
print(banner)
print(f"Training R² Score: {r2_train_lr:.4f}")
print(f"Testing R² Score: {r2_test_lr:.4f}")
print(f"Training RMSE: ${rmse_train_lr:,.2f}")
print(f"Testing RMSE: ${rmse_test_lr:,.2f}")
print(banner)

## 10. Model Building - Ridge Regression

In [None]:
# Pick the Ridge penalty by 5-fold cross-validation (scored on R²) over a
# grid of candidate alpha values
alphas = [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000]

ridge_cv = RidgeCV(
    alphas=alphas,
    cv=5,
    scoring='r2',
)
ridge_cv.fit(X_train_scaled, y_train)

optimal_alpha_ridge = ridge_cv.alpha_
print(f"Optimal Alpha for Ridge Regression: {optimal_alpha_ridge}")

In [None]:
# Refit Ridge at the cross-validated alpha and evaluate on both splits
ridge_model = Ridge(alpha=optimal_alpha_ridge)
ridge_model.fit(X_train_scaled, y_train)

y_train_pred_ridge, y_test_pred_ridge = (
    ridge_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

# R², RMSE on both splits; MAE on the test split
r2_train_ridge = r2_score(y_train, y_train_pred_ridge)
r2_test_ridge = r2_score(y_test, y_test_pred_ridge)
rmse_train_ridge = np.sqrt(mean_squared_error(y_train, y_train_pred_ridge))
rmse_test_ridge = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))
mae_test_ridge = mean_absolute_error(y_test, y_test_pred_ridge)

banner = "="*60
print(banner)
print(f"RIDGE REGRESSION (Alpha = {optimal_alpha_ridge})")
print(banner)
print(f"Training R² Score: {r2_train_ridge:.4f}")
print(f"Testing R² Score: {r2_test_ridge:.4f}")
print(f"Training RMSE: ${rmse_train_ridge:,.2f}")
print(f"Testing RMSE: ${rmse_test_ridge:,.2f}")
print(f"Testing MAE: ${mae_test_ridge:,.2f}")
print(banner)

In [None]:
# 5-fold cross-validated R² of the tuned Ridge model on the training split;
# the reported spread is two standard deviations
ridge_cv_scores = cross_val_score(
    ridge_model, X_train_scaled, y_train, cv=5, scoring='r2'
)
ridge_cv_mean = ridge_cv_scores.mean()
ridge_cv_spread = ridge_cv_scores.std() * 2
print(f"\nRidge Cross-Validation R² Scores: {ridge_cv_scores}")
print(f"Mean CV R² Score: {ridge_cv_mean:.4f} (+/- {ridge_cv_spread:.4f})")

In [None]:
# Rank Ridge coefficients by absolute size (features are standardized, so
# magnitudes are on a common scale)
ridge_coef = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': ridge_model.coef_})
    .assign(Abs_Coefficient=lambda table: np.abs(table['Coefficient']))
    .sort_values('Abs_Coefficient', ascending=False)
)

print("\nTop 20 Most Important Features (Ridge):")
print(ridge_coef.head(20))

In [None]:
# Horizontal bar chart of the 20 largest-magnitude Ridge coefficients,
# largest at the top
top_20_ridge = ridge_coef.head(20)
bar_positions = range(len(top_20_ridge))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_20_ridge['Coefficient'])
plt.yticks(bar_positions, top_20_ridge['Feature'])
plt.xlabel('Coefficient Value')
plt.title('Top 20 Feature Coefficients - Ridge Regression')
plt.gca().invert_yaxis()  # first (largest) row on top
plt.tight_layout()
plt.show()

## 11. Model Building - Lasso Regression

In [None]:
# Lasso Regression with cross-validation to find optimal alpha.
#
# The target is SalePrice on its raw scale (no transform is applied to y in
# this notebook), so useful L1 penalties can be far larger than for a
# unit-scale target. The grid therefore extends well beyond 10; the original
# small values are kept so earlier results remain reachable.
alphas_lasso = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,
                20, 50, 100, 200, 500, 1000, 2000, 5000]

lasso_cv = LassoCV(alphas=alphas_lasso, cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)

optimal_alpha_lasso = lasso_cv.alpha_
print(f"Optimal Alpha for Lasso Regression: {optimal_alpha_lasso}")

# An optimum on the edge of the grid means the search range may be too narrow
if np.isclose(optimal_alpha_lasso, [min(alphas_lasso), max(alphas_lasso)]).any():
    print("NOTE: optimal alpha lies on the boundary of the search grid; "
          "consider widening alphas_lasso.")

In [None]:
# Refit Lasso at the cross-validated alpha and evaluate on both splits
lasso_model = Lasso(alpha=optimal_alpha_lasso, random_state=42, max_iter=10000)
lasso_model.fit(X_train_scaled, y_train)

y_train_pred_lasso, y_test_pred_lasso = (
    lasso_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

# R², RMSE on both splits; MAE on the test split
r2_train_lasso = r2_score(y_train, y_train_pred_lasso)
r2_test_lasso = r2_score(y_test, y_test_pred_lasso)
rmse_train_lasso = np.sqrt(mean_squared_error(y_train, y_train_pred_lasso))
rmse_test_lasso = np.sqrt(mean_squared_error(y_test, y_test_pred_lasso))
mae_test_lasso = mean_absolute_error(y_test, y_test_pred_lasso)

banner = "="*60
print(banner)
print(f"LASSO REGRESSION (Alpha = {optimal_alpha_lasso})")
print(banner)
print(f"Training R² Score: {r2_train_lasso:.4f}")
print(f"Testing R² Score: {r2_test_lasso:.4f}")
print(f"Training RMSE: ${rmse_train_lasso:,.2f}")
print(f"Testing RMSE: ${rmse_test_lasso:,.2f}")
print(f"Testing MAE: ${mae_test_lasso:,.2f}")
print(banner)

In [None]:
# 5-fold cross-validated R² of the tuned Lasso model on the training split;
# the reported spread is two standard deviations
lasso_cv_scores = cross_val_score(
    lasso_model, X_train_scaled, y_train, cv=5, scoring='r2'
)
lasso_cv_mean = lasso_cv_scores.mean()
lasso_cv_spread = lasso_cv_scores.std() * 2
print(f"\nLasso Cross-Validation R² Scores: {lasso_cv_scores}")
print(f"Mean CV R² Score: {lasso_cv_mean:.4f} (+/- {lasso_cv_spread:.4f})")

In [None]:
# Rank Lasso coefficients by absolute size and count how many survived
# (non-zero) versus how many were shrunk to exactly zero
lasso_coef = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lasso_model.coef_})
    .assign(Abs_Coefficient=lambda table: np.abs(table['Coefficient']))
    .sort_values('Abs_Coefficient', ascending=False)
)

total_features = len(lasso_coef)
non_zero_coef = (lasso_coef['Coefficient'] != 0).sum()
print(f"\nNumber of features selected by Lasso: {non_zero_coef} out of {total_features}")
print(f"Number of features eliminated: {total_features - non_zero_coef}")

print("\nTop 20 Most Important Features (Lasso):")
print(lasso_coef.head(20))

In [None]:
# Horizontal bar chart of the (up to) 20 largest non-zero Lasso coefficients,
# largest at the top
kept_by_lasso = lasso_coef['Coefficient'] != 0
top_20_lasso = lasso_coef[kept_by_lasso].head(20)
bar_positions = range(len(top_20_lasso))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_20_lasso['Coefficient'])
plt.yticks(bar_positions, top_20_lasso['Feature'])
plt.xlabel('Coefficient Value')
plt.title('Top 20 Feature Coefficients - Lasso Regression')
plt.gca().invert_yaxis()  # first (largest) row on top
plt.tight_layout()
plt.show()

## 12. Model Comparison

In [None]:
# Side-by-side metrics for the three models, one row per model
mae_test_lr = mean_absolute_error(y_test, y_test_pred_lr)

comparison_rows = [
    ('Linear Regression', r2_train_lr, r2_test_lr, rmse_train_lr, rmse_test_lr, mae_test_lr),
    ('Ridge', r2_train_ridge, r2_test_ridge, rmse_train_ridge, rmse_test_ridge, mae_test_ridge),
    ('Lasso', r2_train_lasso, r2_test_lasso, rmse_train_lasso, rmse_test_lasso, mae_test_lasso),
]
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Train R²', 'Test R²', 'Train RMSE', 'Test RMSE', 'Test MAE'],
)

wide_banner = "="*80
print(wide_banner)
print("MODEL COMPARISON")
print(wide_banner)
print(comparison_df.to_string(index=False))
print(wide_banner)

In [None]:
# Grouped bar charts comparing train vs test R² (left) and RMSE (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x = np.arange(len(comparison_df))
width = 0.35

# (metric suffix used in the column names, y-axis label, panel title)
metric_panels = [
    ('R²', 'R² Score', 'R² Score Comparison'),
    ('RMSE', 'RMSE', 'RMSE Comparison'),
]

for panel, (metric, y_label, panel_title) in zip(axes, metric_panels):
    train_col, test_col = f'Train {metric}', f'Test {metric}'
    panel.bar(x - width/2, comparison_df[train_col], width, label=train_col)
    panel.bar(x + width/2, comparison_df[test_col], width, label=test_col)
    panel.set_xlabel('Model')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks(x)
    panel.set_xticklabels(comparison_df['Model'])
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 13. Residual Analysis

In [None]:
# Residuals (actual - predicted) on the test split, one panel per model
residuals_lr = y_test - y_test_pred_lr
residuals_ridge = y_test - y_test_pred_ridge
residuals_lasso = y_test - y_test_pred_lasso

residual_panels = [
    ('Linear Regression', y_test_pred_lr, residuals_lr),
    ('Ridge Regression', y_test_pred_ridge, residuals_ridge),
    ('Lasso Regression', y_test_pred_lasso, residuals_lasso),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel, (model_name, predicted, residuals) in zip(axes, residual_panels):
    panel.scatter(predicted, residuals, alpha=0.5)
    panel.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
    panel.set_xlabel('Predicted Values')
    panel.set_ylabel('Residuals')
    panel.set_title(f'{model_name} - Residual Plot')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Actual vs predicted prices on the test split, one panel per model; the
# dashed diagonal marks perfect prediction
prediction_panels = [
    ('Linear Regression', y_test_pred_lr, r2_test_lr),
    ('Ridge Regression', y_test_pred_ridge, r2_test_ridge),
    ('Lasso Regression', y_test_pred_lasso, r2_test_lasso),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel, (model_name, predicted, test_r2) in zip(axes, prediction_panels):
    panel.scatter(y_test, predicted, alpha=0.5)
    panel.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    panel.set_xlabel('Actual Price')
    panel.set_ylabel('Predicted Price')
    panel.set_title(f'{model_name} (R²={test_r2:.4f})')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 14. Alpha Sensitivity Analysis

In [None]:
# Sensitivity check: refit Ridge with twice the selected alpha and compare
# test R² and the leading coefficients
doubled_alpha_ridge = optimal_alpha_ridge * 2

ridge_2x = Ridge(alpha=doubled_alpha_ridge)
ridge_2x.fit(X_train_scaled, y_train)
y_test_pred_ridge_2x = ridge_2x.predict(X_test_scaled)
r2_ridge_2x = r2_score(y_test, y_test_pred_ridge_2x)

print(f"Ridge with 2x Alpha ({doubled_alpha_ridge}):")
print(f"R² Score: {r2_ridge_2x:.4f}")
print(f"Change in R²: {r2_ridge_2x - r2_test_ridge:.4f}")

# Coefficients ranked by absolute size
ridge_2x_coef = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': ridge_2x.coef_})
    .assign(Abs_Coefficient=lambda table: np.abs(table['Coefficient']))
    .sort_values('Abs_Coefficient', ascending=False)
)
print("\nTop 10 Features with 2x Alpha (Ridge):")
print(ridge_2x_coef.head(10))

In [None]:
# Sensitivity check: refit Lasso with twice the selected alpha and compare
# test R², the number of surviving features, and the leading coefficients
doubled_alpha_lasso = optimal_alpha_lasso * 2

lasso_2x = Lasso(alpha=doubled_alpha_lasso, random_state=42, max_iter=10000)
lasso_2x.fit(X_train_scaled, y_train)
y_test_pred_lasso_2x = lasso_2x.predict(X_test_scaled)
r2_lasso_2x = r2_score(y_test, y_test_pred_lasso_2x)

print(f"\nLasso with 2x Alpha ({doubled_alpha_lasso}):")
print(f"R² Score: {r2_lasso_2x:.4f}")
print(f"Change in R²: {r2_lasso_2x - r2_test_lasso:.4f}")

# Coefficients ranked by absolute size
lasso_2x_coef = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lasso_2x.coef_})
    .assign(Abs_Coefficient=lambda table: np.abs(table['Coefficient']))
    .sort_values('Abs_Coefficient', ascending=False)
)
kept_at_2x = lasso_2x_coef['Coefficient'] != 0
non_zero_2x = kept_at_2x.sum()
print(f"\nNumber of features selected: {non_zero_2x}")
print("\nTop 10 Features with 2x Alpha (Lasso):")
print(lasso_2x_coef[kept_at_2x].head(10))

## 15. Key Findings and Recommendations

In [None]:
# Summary of key findings: selected alphas, test-set performance, the five
# largest non-zero Lasso coefficients, and a recommendation between the two
# regularized models based on test R².
print("="*80)
print("KEY FINDINGS AND BUSINESS RECOMMENDATIONS")
print("="*80)

print("\n1. OPTIMAL ALPHA VALUES:")
print(f"   - Ridge Regression: α = {optimal_alpha_ridge}")
print(f"   - Lasso Regression: α = {optimal_alpha_lasso}")

print("\n2. MODEL PERFORMANCE:")
print(f"   - Lasso R² Score: {r2_test_lasso:.4f}")
print(f"   - Ridge R² Score: {r2_test_ridge:.4f}")
print(f"   - Lasso selected {non_zero_coef} features out of {len(lasso_coef)}")

print("\n3. TOP 5 MOST IMPORTANT PREDICTORS (Lasso):")
top_5_features = lasso_coef[lasso_coef['Coefficient'] != 0].head(5)
# lasso_coef keeps its original row labels after sorting, so the rank must
# come from an explicit counter rather than from the frame's index.
ranked_predictors = zip(top_5_features['Feature'], top_5_features['Coefficient'])
for rank, (feature_name, coefficient) in enumerate(ranked_predictors, start=1):
    print(f"   {rank}. {feature_name}: {coefficient:.2f}")

# NOTE(review): the insights below are fixed text, not derived from the fitted
# coefficients; confirm they agree with the coefficient tables printed above.
print("\n4. BUSINESS INSIGHTS:")
print("   - Overall quality is the strongest price predictor")
print("   - Living area size significantly impacts price")
print("   - Location (neighborhood) plays a crucial role")
print("   - Recent construction/renovation adds substantial value")
print("   - Basement and garage features are important")

print("\n5. RECOMMENDED MODEL:")
# Ties go to Lasso
if r2_test_lasso >= r2_test_ridge:
    print("   - LASSO Regression (better feature selection and interpretability)")
else:
    print("   - RIDGE Regression (slightly better performance)")

print("="*80)

## 16. Save Results for Reference

In [None]:
# Persist the non-zero Lasso coefficients and the model comparison table as
# CSV files in the current working directory
kept_by_lasso = lasso_coef['Coefficient'] != 0
lasso_features_selected = lasso_coef.loc[kept_by_lasso].copy()
lasso_features_selected.to_csv('lasso_selected_features.csv', index=False)
print(f"Saved {len(lasso_features_selected)} selected features to 'lasso_selected_features.csv'")

comparison_df.to_csv('model_comparison.csv', index=False)
print("Saved model comparison to 'model_comparison.csv'")

print("\nAnalysis complete!")