In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

# Load and split data
# Path to the input CSV; it is relative, so it resolves against the current
# working directory of the process running this script.
file_path = '../data/AmesData.csv'
def load_and_split_data(file_path, test_size=0.2, random_state=42):
    """Read a CSV file and split its rows into train and test frames.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    full_frame = pd.read_csv(file_path)
    train_part, test_part = train_test_split(
        full_frame,
        test_size=test_size,
        random_state=random_state,
    )
    return train_part, test_part

# Materialize the two splits using the function's default test size and seed.
train_df, test_df = load_and_split_data(file_path=file_path)

# Preprocess data
def preprocess_data(df):
    """Clean, impute, trim and one-hot encode a housing DataFrame.

    Steps, in order:
      1. Replace the literal string ``'nan'`` in ``MasVnrArea`` with 0.
      2. Median-impute every numeric column that contains missing values.
      3. Mode-impute every non-numeric column that contains missing values.
      4. Remove rows whose ``GrLivArea`` is greater than 4000.
      5. One-hot encode with ``pd.get_dummies(drop_first=True)``.

    The imputers are fitted on ``df`` itself, and the dummy columns depend on
    the categories present in ``df``; two frames processed by separate calls
    can therefore end up with different statistics and different columns.

    Args:
        df: Input DataFrame; must contain a ``GrLivArea`` column. The caller's
            object is not modified.

    Returns:
        A new, fully preprocessed DataFrame.
    """
    # Non-inplace replace hands back a new frame, so every later write lands
    # on our own object instead of the caller's (which may be a slice).
    df = df.replace({'MasVnrArea': 'nan'}, 0)

    numeric_part = df.select_dtypes(include=[np.number])
    num_cols_with_missing = numeric_part.columns[numeric_part.isnull().any()].tolist()
    # SimpleImputer rejects a zero-column input, so only impute when needed.
    if num_cols_with_missing:
        num_imputer = SimpleImputer(strategy='median')
        df[num_cols_with_missing] = num_imputer.fit_transform(df[num_cols_with_missing])

    other_part = df.select_dtypes(exclude=[np.number])
    cat_cols_with_missing = other_part.columns[other_part.isnull().any()].tolist()
    if cat_cols_with_missing:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[cat_cols_with_missing] = cat_imputer.fit_transform(df[cat_cols_with_missing])

    # Filter by row mask rather than by index label, so a duplicated index
    # label cannot take non-outlier rows down with it.
    df = df.loc[~(df['GrLivArea'] > 4000)]

    return pd.get_dummies(df, drop_first=True)

# Preprocess each split, then align the test columns to the training columns.
# preprocess_data one-hot encodes each frame on its own, so a category that
# appears in only one split would give the two frames different feature sets
# and make the later pca.transform(X_test) call fail. Columns missing from the
# test frame are added as all-zero; columns unseen in training are dropped.
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# Split each frame into its feature matrix and its 'SalePrice' target
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice']
X_test, y_test = test_df.drop(columns=['SalePrice']), test_df['SalePrice']

# Standardize the target: the scaler is fitted on the training target only and
# then reused, unchanged, for the test target.
saleprice_scaler = StandardScaler()
y_train = saleprice_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test = saleprice_scaler.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# Apply PCA and RFE
def apply_pca_rfe(X_train, y_train, n_components=50, n_features_to_select=20):
    """Fit PCA followed by random-forest RFE on the training data.

    Args:
        X_train: Training feature matrix (2-D, all numeric).
        y_train: Training target values.
        n_components: Requested number of principal components. An integer
            request is capped at ``min(n_samples, n_features)``, the most PCA
            can produce; other values are passed to PCA untouched.
        n_features_to_select: Number of principal components RFE should keep;
            capped at the number of components actually produced.

    Returns:
        ``(pca, rfe, X_train_rfe)``: the fitted PCA, the fitted RFE selector
        and the reduced training matrix.
    """
    # Mirror the clamp already applied to n_features_to_select so that small
    # inputs do not make PCA raise on an oversized integer request.
    if isinstance(n_components, (int, np.integer)):
        n_components = min(n_components, *np.shape(X_train))

    # Seeded like the other stochastic steps so repeated runs agree even when
    # PCA picks its randomized solver.
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train)

    print(f"Shape of X_train_pca: {X_train_pca.shape}")

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    n_features_to_select = min(n_features_to_select, X_train_pca.shape[1])
    print(f"Number of features to select: {n_features_to_select}")

    rfe = RFE(estimator=rf_model, n_features_to_select=n_features_to_select)
    X_train_rfe = rfe.fit_transform(X_train_pca, y_train)

    print(f"Shape of X_train_rfe: {X_train_rfe.shape}")

    return pca, rfe, X_train_rfe

# Fit the PCA -> RFE reduction on the training features only ...
pca, rfe, X_train_rfe = apply_pca_rfe(
    X_train,
    y_train,
    n_components=50,
    n_features_to_select=20,
)
# ... then push the test features through the already-fitted transformers.
X_test_pca = pca.transform(X_test)
X_test_rfe = rfe.transform(X_test_pca)

# Train and evaluate models
def train_evaluate_model(model, X_train, y_train, X_test, y_test, saleprice_scaler, model_name):
    """Fit a model, report its test error in original units and plot the fit.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix.
        y_train: Scaled training target.
        X_test: Test feature matrix.
        y_test: Scaled test target (array-like).
        saleprice_scaler: Fitted scaler whose ``inverse_transform`` maps the
            scaled target back to its original units.
        model_name: Label used in the printed report and the plot title.

    Returns:
        The same ``model`` object, now fitted.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # a wall-clock adjustment during training.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    # np.asarray lets predictions/targets arrive as lists or pandas objects
    # without breaking the reshape to the scaler's (n, 1) layout.
    y_pred_original = saleprice_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).flatten()
    y_test_original = saleprice_scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1)).flatten()

    mae = mean_absolute_error(y_test_original, y_pred_original)
    mse = mean_squared_error(y_test_original, y_pred_original)
    print(f"{model_name} Performance:")
    print(f"Mean Absolute Error (MAE): ${mae:.2f}")
    print(f"Mean Squared Error (MSE): ${mse:.2f}")
    print(f"Training Time: {training_time:.2f} seconds")

    # The y = x reference line must span both the actual and the predicted
    # ranges; otherwise it can stop short of part of the scatter.
    axis_low = min(y_test_original.min(), y_pred_original.min())
    axis_high = max(y_test_original.max(), y_pred_original.max())

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test_original, y_pred_original, color='blue', label='Data Points')
    plt.plot([axis_low, axis_high], [axis_low, axis_high], 'r--', lw=2, label='Ideal Fit')
    plt.xlabel('Actual SalePrice')
    plt.ylabel('Predicted SalePrice')
    plt.title(f'Actual vs Predicted SalePrice \n {model_name} with RFE')
    plt.legend()
    plt.show()
    return model

# Initialize models
# Candidate regressors, keyed by the display name used in reports and plots.
# Every estimator with internal randomness is seeded with 42, matching the
# data split, the RFE forest and the hyperparameter search, so that repeated
# runs produce the same scores.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic Net Regression': ElasticNet(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'XGBoost Regressor': XGBRegressor(random_state=42),
    'Artificial Neural Networks': MLPRegressor(max_iter=1000, random_state=42),
}

# Train and evaluate each model
# Baseline pass: fit every candidate with its constructor settings. The
# estimator objects stored in `models` are fitted in place.
for model_name, model in models.items():
    train_evaluate_model(
        model=model,
        X_train=X_train_rfe,
        y_train=y_train,
        X_test=X_test_rfe,
        y_test=y_test,
        saleprice_scaler=saleprice_scaler,
        model_name=model_name,
    )

# Hyperparameter tuning
def hyperparameter_tuning(model, param_grid, X_train, y_train, search_type='random', n_iter=100):
    """Search ``param_grid`` with 5-fold CV and return the best parameter set.

    Args:
        model: Estimator to tune.
        param_grid: Mapping of parameter names to candidate values.
        X_train: Training feature matrix.
        y_train: Training target.
        search_type: ``'random'`` runs ``RandomizedSearchCV``; any other value
            runs an exhaustive ``GridSearchCV``.
        n_iter: Number of sampled candidates (randomized search only).

    Returns:
        The ``best_params_`` dict of the fitted search.
    """
    # Settings common to both search flavours.
    shared_options = {'cv': 5, 'scoring': 'neg_mean_squared_error', 'n_jobs': -1}

    if search_type != 'random':
        search = GridSearchCV(model, param_grid, **shared_options)
    else:
        search = RandomizedSearchCV(
            model,
            param_distributions=param_grid,
            n_iter=n_iter,
            random_state=42,
            **shared_options,
        )

    tic = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - tic

    winners = search.best_params_
    print(f"Best Parameters: {winners}")
    print(f"Hyperparameter Tuning Time: {elapsed:.2f} seconds")
    return winners

# Define hyperparameter grids
# Search spaces, keyed by the same display names as the `models` dict so each
# grid can be paired with its estimator by name.
param_grids = {
    'Random Forest Regressor': {
        'n_estimators': [100, 200, 300, 400, 500],
        # 1.0 (use all features) replaces the former 'auto' option, which
        # meant the same thing for regressors but was removed from
        # scikit-learn and is rejected by current releases.
        'max_features': [1.0, 'sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    },
    'Gradient Boosting Regressor': {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.8, 1.0],
    },
    'Support Vector Regressor': {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.5, 1.0],
        'kernel': ['linear', 'poly', 'rbf'],
    },
    'XGBoost Regressor': {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    },
    'Artificial Neural Networks': {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
    },
}

# Hyperparameter tuning for each model
# Tuning pass: for each model that has a grid, search it, write the winning
# parameters onto the shared estimator object, then refit and re-evaluate.
for model_name in param_grids:
    param_grid = param_grids[model_name]
    model = models[model_name]
    best_params = hyperparameter_tuning(model, param_grid, X_train_rfe, y_train)
    model.set_params(**best_params)
    train_evaluate_model(
        model=model,
        X_train=X_train_rfe,
        y_train=y_train,
        X_test=X_test_rfe,
        y_test=y_test,
        saleprice_scaler=saleprice_scaler,
        model_name=f"Optimized {model_name}",
    )
