In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from math import sqrt
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from enum import Enum
from itertools import product

In [5]:
# Load the training and test splits from the local datasets/ directory.
TRAIN_CSV_PATH = 'datasets/train.csv'
TEST_CSV_PATH = 'datasets/test.csv'

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [6]:
# NaN solver: compare strategies for handling missing values.
# NOTE: this cell is large and a candidate for refactoring.

# Strategies defined for handling NaNs:
# 1. fill with zero or a 'vanilla' default value
# 2. fill with the median
# 3. fill with the mean
# 4. drop
# 5. interpolate

class NaNSolution(Enum):
    """Strategies for handling missing values (NaNs) in a column.

    The integer values are the "(1 - 5)" handler numbers mentioned in the
    progress and result messages printed by this notebook.
    """
    # Fill NaNs with zero or another neutral 'vanilla' value.
    DEFAULT = 1
    # Fill NaNs with the median.
    MEDIAN = 2
    # Fill NaNs with the mean (average).
    MEAN = 3
    # Drop the NaNs.
    DROP = 4
    # Fill NaNs by interpolation.
    INTERPOLATE = 5
    
# Refactor
def update_best_model(model_name, new_rmse, new_score, flag_LotFrontage):
    """Record a candidate as the best result if it beats the lowest RMSE so far.

    Args:
        model_name: Label stored in the global ``best_regressor``.
        new_rmse: Candidate RMSE, compared against the global ``least_rmse``.
        new_score: Score stored in the global ``best_score``.
        flag_LotFrontage: NaN-handling flag stored in the global
            ``best_flag_LotFrontage``.

    The four module-level globals are overwritten only when ``new_rmse`` is
    strictly lower than ``least_rmse``; otherwise nothing changes.
    """
    global least_rmse, best_score, best_regressor, best_flag_LotFrontage

    # Guard clause: ties and worse candidates leave the current best untouched.
    # Written as `not (a < b)` so a NaN RMSE is also rejected.
    if not (new_rmse < least_rmse):
        return

    least_rmse, best_score = new_rmse, new_score
    best_regressor, best_flag_LotFrontage = model_name, flag_LotFrontage

# Create permutations
permutations = product(NaNSolution, repeat=1)  # Adjust 'repeat' as needed

best_score = 0
# remember best RSME result :
least_rmse  = 9999999

# best score NaN handler configuration :
best_flag_LotFrontage = NaNSolution.DEFAULT
best_flag_MasVnrType = NaNSolution.DEFAULT
best_regressor = "None"

# ====================================================================
# Iterate over permutations
for permutation in permutations:
    
    # Resetting the dataframes
    permute_train_df = train_df.copy()
    permute_test_df = test_df.copy()
    
    flag_LotFrontage = permutation

    # reload data each permutation :
    permute_train_df = train_df
    permute_test_df = test_df

    match flag_LotFrontage:
        case NaNSolution.DEFAULT:
        # default :
            permute_train_df['LotFrontage'] = permute_train_df['LotFrontage'].fillna(0)

        case NaNSolution.MEDIAN:
            permute_train_df['LotFrontage'] = permute_train_df['LotFrontage'].fillna(permute_train_df.median())

        case NaNSolution.MEAN:
            permute_train_df['LotFrontage'] = permute_train_df['LotFrontage'].fillna(permute_train_df.mean())

        case NaNSolution.DROP:
            permute_train_df['LotFrontage'] = permute_train_df['LotFrontage'].dropna()

        case NaNSolution.INTERPOLATE:
            permute_train_df['LotFrontage'] = permute_train_df['LotFrontage'].interpolate(method='linear', limit_direction='forward', axis=0)

    # -------------------------------
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median') # NaNs should already by filled by this point (at least for most significant columns), hopefully.

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)
    # -------------------------------
    # Separating target variable and predictors
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Removing columns with too many missing values (>50% missing)
    too_many_missing = [col for col in X.columns if X[col].isnull().sum() > X.shape[0] * 0.5]
    X.drop(too_many_missing, axis=1, inplace=True)

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)

      # -----------------------------------------------

    # Define and train models - ** TODO : still need to tune these systematically **
    model_linear = LinearRegression()
    model_ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5)
    model_lasso_cv = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10], cv=5, max_iter=100000)
    model_tree = DecisionTreeRegressor(random_state=0)
    model_forest = RandomForestRegressor(n_estimators=100, random_state=0)
    
    # Define the parameter distribution for each model
    param_dist_ridge = {'alpha': uniform(0.01, 100)}
    param_dist_lasso = {'alpha': uniform(0.0001, 10)}
    param_dist_tree = {'max_depth': [None] + list(range(1, 31)), 'min_samples_split': randint(2, 11)}
    param_dist_forest = {'n_estimators': randint(50, 200), 'max_depth': [None] + list(range(1, 31))}

    # Create a RandomizedSearchCV object for each model
    rand_ridge = RandomizedSearchCV(Ridge(), param_distributions=param_dist_ridge, n_iter=100, cv=5)
    rand_ridge.fit(X_train, y_train)
    rand_lasso = RandomizedSearchCV(Lasso(max_iter=100000), param_distributions=param_dist_lasso, n_iter=100, cv=5)
    rand_tree = RandomizedSearchCV(DecisionTreeRegressor(random_state=0), param_distributions=param_dist_tree, n_iter=100, cv=5)
    rand_forest = RandomizedSearchCV(RandomForestRegressor(random_state=0), param_distributions=param_dist_forest, n_iter=100, cv=5)

    # Fit models
    model_linear.fit(X_train, y_train)
    model_ridge_cv.fit(X_train, y_train)
    model_lasso_cv.fit(X_train, y_train)
    model_tree.fit(X_train, y_train)
    model_forest.fit(X_train, y_train)

    # Predictions and RMSE
    predictions_linear = model_linear.predict(X_valid)
    predictions_ridge = model_ridge_cv.predict(X_valid)
    predictions_lasso = model_lasso_cv.predict(X_valid)
    predictions_tree = model_tree.predict(X_valid)
    predictions_forest = model_forest.predict(X_valid)

    rmse_linear = sqrt(mean_squared_error(y_valid, predictions_linear))
    rmse_ridge = sqrt(mean_squared_error(y_valid, predictions_ridge))
    rmse_lasso = sqrt(mean_squared_error(y_valid, predictions_lasso))
    rmse_tree = sqrt(mean_squared_error(y_valid, predictions_tree))
    rmse_forest = sqrt(mean_squared_error(y_valid, predictions_forest))

    print(f'Linear Regression RMSE: {rmse_linear}')
    print(f'Ridge Regression RMSE: {rmse_ridge}')
    print(f'Lasso Regression RMSE: {rmse_lasso}')
    print(f'Decision Tree RMSE: {rmse_tree}')
    print(f'Random Forest RMSE: {rmse_forest}')

      # Lastly, compare with best score, save configuration for best score.

    new_high_score = False

    # Check and update for Linear model
    update_best_model("Linear", rmse_linear, model_linear.score(X_train, y_train), flag_LotFrontage)

    # Check and update for Ridge model
    update_best_model("Ridge", rmse_ridge, model_ridge_cv.score(X_train, y_train), flag_LotFrontage)

    # Check and update for Lasso model
    update_best_model("Lasso", rmse_lasso, model_lasso_cv.score(X_train, y_train), flag_LotFrontage)

    # Check and update for Decision Tree model
    update_best_model("Tree", rmse_tree, model_tree.score(X_train, y_train), flag_LotFrontage)

    # Check and update for Random Forest model
    update_best_model("Forest", rmse_forest, model_forest.score(X_train, y_train), flag_LotFrontage)

    if new_high_score == True:
        best_flag_LotFrontage = flag_LotFrontage

    print(f'Trying LotFrontage NaN Handler... (1 - 5) : {flag_LotFrontage}')

# print result
print(f'Best Model Score                     : {best_score}')
print(f'\n\nLowest RSME                      : {least_rmse}')
print(f'Best Regressor                       : {best_regressor}')
print(f'Best LotFrontage NaN Handler (1 - 5) : {best_flag_LotFrontage}')

Linear Regression RMSE: 59738.63561964673
Ridge Regression RMSE: 47251.178315544945
Lasso Regression RMSE: 56726.558615929185
Decision Tree RMSE: 52821.18414518933
Random Forest RMSE: 34580.02593951611
Trying LotFrontage NaN Handler... (1 - 5) : (<NaNSolution.DEFAULT: 1>,)
Linear Regression RMSE: 59738.63561964673
Ridge Regression RMSE: 47251.178315544945
Lasso Regression RMSE: 56726.558615929185
Decision Tree RMSE: 52821.18414518933
Random Forest RMSE: 34580.02593951611
Trying LotFrontage NaN Handler... (1 - 5) : (<NaNSolution.MEDIAN: 2>,)
Linear Regression RMSE: 59738.63561964673
Ridge Regression RMSE: 47251.178315544945
Lasso Regression RMSE: 56726.558615929185
Decision Tree RMSE: 52821.18414518933
Random Forest RMSE: 34580.02593951611
Trying LotFrontage NaN Handler... (1 - 5) : (<NaNSolution.MEAN: 3>,)
Linear Regression RMSE: 59738.63561964673
Ridge Regression RMSE: 47251.178315544945
Lasso Regression RMSE: 56726.558615929185
Decision Tree RMSE: 52821.18414518933
Random Forest RMSE

In [7]:
# Final model: the randomized-search wrapper around the random forest.
# NOTE(review): `rand_forest` and `preprocessor` come from the model-comparison
# cell above, and the choice is hard-coded regardless of which regressor had
# the lowest RMSE there - confirm this is intended. Fitting runs the full
# search (n_iter x cv forest fits), so expect this cell to be slow.
best_model = rand_forest

# Refit the preprocessing on every labelled row, then train on all of it.
y_full = train_df['SalePrice']
X_full = preprocessor.fit_transform(train_df.drop(columns='SalePrice'))
best_model.fit(X_full, y_full)

# Apply the fitted preprocessing to the test set and predict house prices.
test_preprocessed = preprocessor.transform(test_df)
test_predictions = best_model.predict(test_preprocessed)

# Submission frame: one row per test Id with its predicted SalePrice.
submission = test_df[['Id']].assign(SalePrice=test_predictions)

# Write the submission file (no index column).
submission.to_csv('house_prices_submission.csv', index=False)

KeyboardInterrupt: 