# In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
import time
import matplotlib.pyplot as plt

# Read the raw Ames housing table from disk
file_path = '../data/AmesData.csv'
ames_df = pd.read_csv(file_path)

# --- Preprocessing -----------------------------------------------------------
# NOTE(review): every imputer and scaler in this section is fit on the complete
# table; no train/test split has been made at this point. Confirm that letting
# held-out rows influence these statistics is acceptable for this analysis.

# Swap the literal string 'nan' in MasVnrArea for 0 (real NaN values are not
# strings, so they fall through to the median imputation below).
ames_df = ames_df.replace(to_replace={'MasVnrArea': 'nan'}, value=0)

# Median-impute the numeric columns that contain at least one missing value
numeric_part = ames_df.select_dtypes(include=[np.number])
numeric_has_gap = numeric_part.isnull().any()
num_cols_with_missing = numeric_has_gap[numeric_has_gap].index.tolist()
num_imputer = SimpleImputer(strategy='median')
ames_df[num_cols_with_missing] = num_imputer.fit_transform(ames_df[num_cols_with_missing])

# Mode-impute the non-numeric columns that contain at least one missing value
non_numeric_part = ames_df.select_dtypes(exclude=[np.number])
non_numeric_has_gap = non_numeric_part.isnull().any()
cat_cols_with_missing = non_numeric_has_gap[non_numeric_has_gap].index.tolist()
cat_imputer = SimpleImputer(strategy='most_frequent')
ames_df[cat_cols_with_missing] = cat_imputer.fit_transform(ames_df[cat_cols_with_missing])

# Represent boolean columns as 0/1 integers
bool_cols = list(ames_df.select_dtypes(include=[bool]).columns)
ames_df[bool_cols] = ames_df[bool_cols].astype(int)

# Discard rows whose above-ground living area exceeds 4000
oversized_rows = ames_df.index[ames_df['GrLivArea'] > 4000]
ames_df = ames_df.drop(index=oversized_rows)

# One-hot encode the remaining categorical columns (first level dropped)
ames_df = pd.get_dummies(ames_df, drop_first=True)

# Standardize every numeric feature except the target
numeric_columns = ames_df.select_dtypes(include=[np.number]).columns
num_features = numeric_columns.drop('SalePrice')
scaler = StandardScaler()
ames_df[num_features] = scaler.fit_transform(ames_df[num_features])

# Standardize the target with its own scaler so predictions can later be
# mapped back to the original price units via inverse_transform.
saleprice_scaler = StandardScaler()
sale_price_column = ames_df[['SalePrice']].values  # shape (n_rows, 1)
ames_df['SalePrice'] = saleprice_scaler.fit_transform(sale_price_column)

# Target column on one side, every other column on the other
y = ames_df['SalePrice']
X = ames_df.drop(columns='SalePrice')

# Unrestricted PCA, used only to inspect how variance accumulates
pca = PCA().fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

# Cumulative explained variance as a function of the number of components
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

# Project the features onto the leading principal components.
# `pca` is rebound here to the reduced model.
n_components = 20
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
split_parts = train_test_split(X_pca, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Define training and evaluation function
def train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit one model, print its test metrics, and return its raw predictions.

    The model is fit on ``X_train``/``y_train`` and the fit is timed. Both the
    predictions for ``X_test`` and ``y_test`` are passed through the
    module-level ``saleprice_scaler.inverse_transform`` before MAE, MSE, RMSE
    and R^2 are computed, so the printed metrics refer to the un-scaled target.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label used in the printed report.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target; must expose ``.values`` (e.g. a pandas Series).

    Returns:
        The output of ``model.predict(X_test)``, still in scaled target units.
    """
    fit_started = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - fit_started

    y_pred = model.predict(X_test)

    def _unscale(scaled_values):
        # The scaler expects a 2-D column, hence the reshape.
        return saleprice_scaler.inverse_transform(scaled_values.reshape(-1, 1))

    actual_usd = _unscale(y_test.values)
    predicted_usd = _unscale(y_pred)

    # Compute every metric before printing anything.
    report_lines = [
        f"\nModel: {model_name}",
        f"MAE (in USD): ${mean_absolute_error(actual_usd, predicted_usd)}",
        f"MSE: {mean_squared_error(actual_usd, predicted_usd)}",
        f"RMSE: {root_mean_squared_error(actual_usd, predicted_usd)}",
        f"R^2: {r2_score(actual_usd, predicted_usd)}",
        f"Training Time: {training_time}s",
    ]
    for line in report_lines:
        print(line)

    return y_pred

# Function to train and evaluate multiple models
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Fit and report on a fixed set of regressors with default settings.

    Each estimator is handed to ``train_and_evaluate_model`` in the order
    listed below.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        dict mapping each model's display name to the predictions returned by
        ``train_and_evaluate_model`` for that model.
    """
    # All estimators are instantiated up front, then evaluated in this order.
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Lasso Regression", Lasso()),
        ("Ridge Regression", Ridge()),
        ("Elastic Net", ElasticNet()),
        ("Random Forest", RandomForestRegressor()),
        ("Support Vector Regression", SVR()),
        ("Artificial Neural Network", MLPRegressor(max_iter=1000)),
        ("XGBoost", XGBRegressor()),
        ("Gradient Boosting", GradientBoostingRegressor()),
    ]

    return {
        label: train_and_evaluate_model(estimator, label, X_train, X_test, y_train, y_test)
        for label, estimator in candidates
    }

# Run the baseline (default-parameter) comparison; keep each model's predictions
predictions = train_and_evaluate_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Function to plot actual vs predicted values
def plot_actual_vs_predicted(y_test, y_pred, model_name):
    """Scatter predicted against actual target values for one model.

    Both inputs are mapped back through the module-level
    ``saleprice_scaler.inverse_transform`` before plotting. A dashed red
    diagonal spanning the range of the actual values marks where perfect
    predictions would fall.

    Args:
        y_test: True scaled targets; must expose ``.values``.
        y_pred: Scaled predictions as an array supporting ``reshape``.
        model_name: Label appended to the plot title.
    """
    actual_usd = saleprice_scaler.inverse_transform(y_test.values.reshape(-1, 1))
    predicted_usd = saleprice_scaler.inverse_transform(y_pred.reshape(-1, 1))

    # End points of the identity line, taken from the actual values only.
    lowest, highest = actual_usd.min(), actual_usd.max()

    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual_usd, predicted_usd, alpha=0.3)
    ax.plot([lowest, highest], [lowest, highest], 'r--', linewidth=2)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(f'Actual vs Predicted: {model_name}')
    ax.grid(True)
    plt.show()

# One actual-vs-predicted figure per baseline model, in evaluation order
for model_name, y_pred in predictions.items():
    plot_actual_vs_predicted(y_test=y_test, y_pred=y_pred, model_name=model_name)

# Hyperparameter tuning function
def hyperparameter_tuning(model, param_grid, X_train, y_train, n_iter=200, cv=5, random_state=42):
    """Run a randomized, cross-validated search and return the best estimator.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to candidate values (or to a
            distribution object exposing ``rvs``), passed to
            ``RandomizedSearchCV`` as ``param_distributions``.
        X_train: Training features.
        y_train: Training target.
        n_iter: Maximum number of parameter settings to sample. Capped at the
            grid size when the search space is a finite grid.
        cv: Number of cross-validation folds.
        random_state: Seed for the parameter sampling.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    import math  # local import so the function does not depend on file-level imports

    # When every entry is a finite sequence the search space is a finite grid.
    # Asking for more samples than the grid holds only makes scikit-learn warn
    # and truncate, so do the truncation up front. Mappings that contain a
    # distribution (an object with `rvs`) are left untouched.
    if isinstance(param_grid, dict) and all(
        not hasattr(candidates, "rvs") for candidates in param_grid.values()
    ):
        grid_size = math.prod(len(candidates) for candidates in param_grid.values())
        if 0 < grid_size < n_iter:
            n_iter = grid_size

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,  # use all available cores
    )
    search.fit(X_train, y_train)
    print(f"Best parameters for {model.__class__.__name__}: {search.best_params_}")
    return search.best_estimator_

# Define parameter grids for hyperparameter tuning
# Candidate values per model, keyed by the model's display name.
# Entries are added in the order in which the models are tuned.
param_grids = {}

param_grids["Random Forest"] = dict(
    n_estimators=list(range(100, 501, 100)),
    max_features=['sqrt', 'log2'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

# Thirteen log-spaced regularization strengths from 1e-6 to 1e6
param_grids["Lasso Regression"] = dict(
    alpha=np.logspace(-6, 6, 13),
)

param_grids["Ridge Regression"] = dict(
    alpha=[0.1, 1, 10, 100],
)

param_grids["Elastic Net"] = dict(
    alpha=np.logspace(-6, 6, 13),
    l1_ratio=[0.1, 0.5, 0.7, 1.0],
)

param_grids["XGBoost"] = dict(
    n_estimators=list(range(100, 501, 100)),
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

param_grids["Gradient Boosting"] = dict(
    n_estimators=list(range(100, 501, 100)),
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    subsample=[0.6, 0.8, 1.0],
)

param_grids["Support Vector Regression"] = dict(
    C=[0.001, 0.01, 0.1, 1],
    epsilon=[0.01, 0.1, 0.5],
    kernel=['linear', 'poly', 'rbf'],
)

param_grids["Artificial Neural Network"] = dict(
    hidden_layer_sizes=[(100, 100, 100), (200, 200, 200)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.05, 0.10, 1],
    learning_rate=['constant', 'adaptive'],
)

# Hyperparameter tuning and evaluation for all models

# Fresh, unfitted estimators keyed by the same display names as `param_grids`.
# The `models` dict built inside train_and_evaluate_models() is local to that
# function, so it has to be defined at module scope here; without this the
# lookup below raises NameError.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Elastic Net": ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Regression": SVR(),
    "Artificial Neural Network": MLPRegressor(max_iter=1000),
    "XGBoost": XGBRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
}

for model_name, param_grid in param_grids.items():
    print(f"\nHyperparameter tuning for {model_name}")
    model = models[model_name]
    # Search the grid with cross-validation on the training split only.
    best_model = hyperparameter_tuning(model, param_grid, X_train, y_train)
    # Re-fit the selected estimator on the training split, print its test
    # metrics and fit time, then plot its predictions against the actual values.
    y_pred_best = train_and_evaluate_model(best_model, f"Optimized {model_name}", X_train, X_test, y_train, y_test)
    plot_actual_vs_predicted(y_test, y_pred_best, f"Optimized {model_name}")
