In [None]:
from enum import Enum
from itertools import product
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

import catboost
import xgboost as xgb
from catboost import CatBoostRegressor

In [None]:
# Read the training and test splits from the local datasets folder.
train_df, test_df = map(pd.read_csv, ('datasets/train.csv', 'datasets/test.csv'))

# Cell output: a quick look at the Fence column.
train_df['Fence']

#### Handle the more straightforward missing-value cases

In [None]:
# Handle the columns whose missing values have a straightforward treatment.
#
# Each fill is written back with an explicit assignment. The previous form,
# `train_df[col].fillna(value, inplace=True)`, mutates a column selection and
# does not reach `train_df` when pandas copy-on-write is active.

# Alley (Grvl / Pave): fill with the most frequent observed value.
most_common_alley = train_df['Alley'].mode()[0]
train_df['Alley'] = train_df['Alley'].fillna(most_common_alley)

# FireplaceQu: use a string sentinel for "missing".
# NOTE(review): assumes FireplaceQu holds string quality grades -- confirm
# against the data dictionary. Under that assumption a numeric 0 would make the
# column mixed str/int, which OneHotEncoder refuses to encode later on.
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna('None')

# Fence :
# 281 non-null vs.
# 1179 null -- drop
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_df = train_df.drop(columns=['Fence'], errors='ignore')

# PoolQC (7 non-null vs. 1453 null) and MiscFeature are deliberately NOT
# dropped here; the drops below were left disabled by the author.
#train_df.drop(columns=['PoolQC'], inplace=True)
#train_df.drop(columns=['MiscFeature'], inplace=True)


#### NaN-handling strategy search and model benchmarking

In [None]:
# NaN solver setup: enumerate the candidate NaN-handling strategies and
# initialise the "best so far" trackers updated by the benchmarking loop.

class NaNSolution(Enum):
    """Ways of handling missing values in a single column."""
    DEFAULT = 1      # set zero or a 'vanilla' value
    MEDIAN = 2       # fill with the column median
    MEAN = 3         # fill with the column average
    DROP = 4         # drop the rows with a missing value
    INTERPOLATE = 5  # interpolate between neighbouring values


# Every (LotFrontage strategy, MasVnrType strategy) pair.
# Materialised as a list: `itertools.product` returns a one-shot iterator, so
# re-running the benchmarking cell would otherwise iterate over nothing.
permutations = list(product(NaNSolution, repeat=2))

# Score of the best model so far, as reported by its `.score()` method.
best_score = 0
# Lowest validation RMSE seen so far; +inf so any real RMSE improves on it.
least_rmse = float('inf')

# NaN handler configuration that produced the best result so far.
best_flag_LotFrontage = NaNSolution.DEFAULT
best_flag_MasVnrType = NaNSolution.DEFAULT
best_regressor = "None"

In [None]:

# Iterate over permutations
for permutation in permutations:
    flag_LotFrontage, flag_MasVnrType = permutation

    # reload data each permutation :
    permute_train_df = train_df
    permute_test_df = test_df

    match flag_LotFrontage:
        case NaNSolution.DEFAULT:
        # default :
            permute_train_df = permute_train_df['LotFrontage'].fillna(0) 

        case NaNSolution.MEDIAN:
            permute_train_df = permute_train_df['LotFrontage'].fillna(permute_train_df.median())

        case NaNSolution.MEAN:
            permute_train_df = permute_train_df['LotFrontage'].fillna(permute_train_df.mean())

        case NaNSolution.DROP:
            permute_train_df = permute_train_df['LotFrontage'].dropna()

        case NaNSolution.INTERPOLATE:
            permute_train_df = permute_train_df['LotFrontage'].interpolate(method='linear', limit_direction='forward', axis=0)

    # ['BrkFace', nan, 'Stone', 'BrkCmn']
    # Since these are categorical, we'll use each case to be for filling in a different categorical value instead, since we can't clearly interpret categories in another way.  

    match flag_MasVnrType:
        case NaNSolution.DEFAULT:
        # default :
            permute_train_df = permute_train_df['MasVnrType'].fillna('BrkFace') 

        case NaNSolution.MEDIAN:
            permute_train_df = permute_train_df['MasVnrType'].fillna('Stone')

        case NaNSolution.MEAN:
            permute_train_df = permute_train_df['MasVnrType'].fillna('BrkCmn')

        case NaNSolution.DROP:
            permute_train_df = permute_train_df['MasVnrType'].dropna()
        
        case NaNSolution.INTERPOLATE:
        # this one doesn't really apply here.
            continue

    print(f'Trying LotFrontage NaN Handler... (1 - 5) : {flag_LotFrontage}')
    print(f'Trying MasVnrType NaN Handler... (1 - 5)  : {flag_MasVnrType}')


    # -------------------------------
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median') # NaNs should already by filled by this point (at least for most significant columns), hopefully.

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)
    # -------------------------------
    # Separating target variable and predictors
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Removing columns with too many missing values (>50% missing)
    too_many_missing = [col for col in X.columns if X[col].isnull().sum() > X.shape[0] * 0.5]
    X.drop(too_many_missing, axis=1, inplace=True)

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)

      # -----------------------------------------------

    # Define and train models - ** tune these systematically **
    model_linear = LinearRegression()

    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import uniform, randint
    import xgboost as xgb

    # Define the parameter distribution for each model
    param_dist_ridge = {'alphas': uniform(0.01, 100)}
    param_dist_lasso = {'alphas': uniform(0.0001, 10)}
    param_dist_tree = {'max_depth': [None] + list(range(1, 31)), 'min_samples_split': randint(2, 11)}
    param_dist_forest = {'n_estimators': randint(50, 200), 'max_depth': [None] + list(range(1, 31))}
    param_dist_xgboost = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': randint(50, 350),
    'gamma': uniform(0, 1),
    'subsample': [0.8, 0.9, 1.0],
    }

    param_dist_catboost = {
        'depth': [3, 4, 5, 6, 7, 8, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'n_estimators': randint(50, 200),
        'l2_leaf_reg': uniform(1, 10),
        'subsample': [0.8, 0.9, 1.0],
    }
    

    model_linear = LinearRegression()
    model_ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5)
    model_lasso_cv = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10], cv=5, max_iter=10)
    model_tree = DecisionTreeRegressor(random_state=0)
    model_forest = RandomForestRegressor(n_estimators=6, random_state=0)
    model_xgboost = xgb.XGBRegressor(random_state=0)
    model_catboost = CatBoostRegressor(random_seed=0, verbose=False)

    # Perform hyperparameter tuning with RandomizedSearchCV
    random_search_xgboost = RandomizedSearchCV(
        model_xgboost, param_distributions=param_dist_xgboost, n_iter=5, cv=5, random_state=0, n_jobs=-1)
    random_search_catboost = RandomizedSearchCV(
    model_catboost, param_distributions=param_dist_catboost, n_iter=150, cv=5, random_state=0, n_jobs=-1)


    # Fit models
    model_linear.fit(X_train, y_train)
    model_ridge_cv.fit(X_train, y_train)
    model_lasso_cv.fit(X_train, y_train)
    model_tree.fit(X_train, y_train)
    model_forest.fit(X_train, y_train)
    random_search_xgboost.fit(X_train, y_train)
    random_search_catboost.fit(X_train, y_train)


    # Predictions and RMSE
    predictions_linear = model_linear.predict(X_valid)
    predictions_ridge = model_ridge_cv.predict(X_valid)
    predictions_lasso = model_lasso_cv.predict(X_valid)
    predictions_tree = model_tree.predict(X_valid)
    predictions_forest = model_forest.predict(X_valid)
    predictions_xgboost = random_search_xgboost.predict(X_valid)
    predictions_catboost = random_search_catboost.predict(X_valid)

    rmse_linear = sqrt(mean_squared_error(y_valid, predictions_linear))
    rmse_ridge = sqrt(mean_squared_error(y_valid, predictions_ridge))
    rmse_lasso = sqrt(mean_squared_error(y_valid, predictions_lasso))
    rmse_tree = sqrt(mean_squared_error(y_valid, predictions_tree))
    rmse_forest = sqrt(mean_squared_error(y_valid, predictions_forest))
    rmse_xgboost = sqrt(mean_squared_error(y_valid, predictions_xgboost))
    rmse_catboost = sqrt(mean_squared_error(y_valid, predictions_catboost))


    print(f'Linear Regression RMSE: {rmse_linear}')
    print(f'Ridge Regression RMSE: {rmse_ridge}')
    print(f'Lasso Regression RMSE: {rmse_lasso}')
    print(f'Decision Tree RMSE: {rmse_tree}')
    print(f'Random Forest RMSE: {rmse_forest}')
    print(f'XGBoost RMSE: {rmse_xgboost}')
    print(f'CatBoost RMSE: {rmse_catboost}')
    # Lastly, compare with best score, save configuration for best score.

    new_high_score = False

    if rmse_linear < least_rmse:
        best_score = model_linear.score(X_train, y_train)
        least_rmse = rmse_linear
        best_regressor = "Linear"
        new_high_score = True

    if rmse_ridge  < least_rmse:
        best_score = model_ridge_cv.score(X_train, y_train)
        least_rmse = rmse_ridge
        best_regressor = "Ridge"
        new_high_score = True

    if rmse_lasso  < least_rmse:
        best_score = model_lasso_cv.score(X_train, y_train)
        least_rmse = rmse_lasso
        best_regressor = "Lasso"
        new_high_score = True

    if rmse_tree  < least_rmse:
        best_score = model_tree.score(X_train, y_train)
        least_rmse = rmse_tree
        best_regressor = "Tree"
        new_high_score = True


    if rmse_forest  < least_rmse:
        best_score = model_forest.score(X_train, y_train)
        least_rmse = rmse_forest
        best_regressor = "Forest"
        new_high_score = True

    if rmse_xgboost < least_rmse: 
        best_score = model_xgboost.score(X_train, y_train)
        least_rmse = rmse_xgboost
        best_regressor = "XGBoost (Hyper Parameter Tuned)"
        new_high_score = True

    if rmse_catboost < least_rmse: 
        best_score = model_catboost.score(X_train, y_train)
        least_rmse = rmse_catboost
        best_regressor = "CatBoost (Hyper Parameter Tuned)"
        new_high_score = True

    if new_high_score == True:
        best_flag_LotFrontage = flag_LotFrontage
    break



# print result
print(f'Best Model Score                     : {best_score}')
print(f'Lowest RSME                          : {least_rmse}')
print(f'Best Regressor                       : {best_regressor}')
print(f'Best LotFrontage NaN Handler (1 - 5) : {best_flag_LotFrontage}')

#### Testing Random Forest

In [None]:
# (disabled testing code)
# Stand-alone experiment: randomized hyper-parameter search for a Random Forest
# on the training data. Flip the guard to True to run it.
if False:
    permute_train_df = train_df
    permute_test_df = test_df

    # Separating target variable and predictors
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Removing columns with too many missing values (>50% missing)
    too_many_missing = [col for col in X.columns if X[col].isnull().sum() > X.shape[0] * 0.5]
    X = X.drop(columns=too_many_missing)

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations (one pass only; the earlier
    # duplicate pass was discarded before use).
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)

    # -----------------------------------------------

    # Define and train models - ** TODO : still need to tune these systematically **

    # Define the parameter distribution for the forest
    param_dist_forest = {'n_estimators': randint(10, 300), 'max_depth': [None] + list(range(1, 31))}
    # random_state makes the sampled parameter settings reproducible.
    rand_forest = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions=param_dist_forest, n_iter=55, cv=5, random_state=0)
    rand_forest.fit(X_train, y_train)
    predictions_forest = rand_forest.predict(X_valid)
    rmse_forest = sqrt(mean_squared_error(y_valid, predictions_forest))
    print(f'Forest Regression RMSE: {rmse_forest}')


#### Testing CatBoost

In [None]:
# testing just Cat Boost:
# Stand-alone experiment: CatBoost with a randomized hyper-parameter search,
# using iterative imputation for the numerical columns. Flip the guard to True
# to run it.
if False:
    # also trying iterative impute
    # The experimental flag has to be imported BEFORE IterativeImputer,
    # otherwise the IterativeImputer import raises ImportError.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    permute_train_df = train_df
    permute_test_df = test_df

    # Separating target variable and predictors
    X = permute_train_df.drop('SalePrice', axis=1)
    y = permute_train_df['SalePrice']

    # Removing columns with too many missing values (>50% missing)
    too_many_missing = [col for col in X.columns if X[col].isnull().sum() > X.shape[0] * 0.5]
    X = X.drop(columns=too_many_missing)

    # Lists of numerical and categorical columns
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

    # Preprocessing for numerical data: the iterative imputer is the point of
    # this experiment. Previously a second SimpleImputer pass replaced it
    # before it was ever used.
    numerical_transformer = IterativeImputer(max_iter=10, random_state=0)

    # Preprocessing for categorical data. IterativeImputer has no `strategy`
    # argument and works on numeric data only, so the string columns keep the
    # most-frequent SimpleImputer.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Applying the preprocessing transformations
    X_preprocessed = preprocessor.fit_transform(X)

    # Split the preprocessed data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X_preprocessed, y, train_size=0.8, test_size=0.2, random_state=0)

    # -----------------------------------------------

    # Define and train models - ** TODO : still need to tune these systematically **

    param_dist_catboost = {
        'depth': [3, 4, 5, 6, 7, 8, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'n_estimators': randint(50, 200),
        'l2_leaf_reg': uniform(1, 10),
        'subsample': [0.8, 0.9, 1.0],
    }
    model_catboost = CatBoostRegressor(random_seed=0, verbose=False)
    random_search_catboost = RandomizedSearchCV(
        model_catboost, param_distributions=param_dist_catboost, n_iter=200, cv=5, random_state=0, n_jobs=-1)
    random_search_catboost.fit(X_train, y_train)
    predictions_catboost = random_search_catboost.predict(X_valid)
    rmse_catboost = sqrt(mean_squared_error(y_valid, predictions_catboost))
    print(f'CatBoost RMSE: {rmse_catboost}')

#### Output CSV Submission File

In [None]:
# Train the final model and write the submission file.
#
# This cell reads state left behind by the benchmarking cell: `model_catboost`,
# `param_dist_catboost`, `X_train`, `y_train`, `preprocessor`,
# `permute_train_df` and `permute_test_df`.

# now, going to try catboost as best:
# A wider search (n_iter=500) than the one used during benchmarking.
random_search_catboost = RandomizedSearchCV(
    model_catboost, param_distributions=param_dist_catboost, n_iter=500, cv=5, random_state=0, n_jobs=-1)
random_search_catboost.fit(X_train, y_train)

best_model = random_search_catboost.best_estimator_

# Use dedicated names instead of rebinding the global train_df / test_df, so
# earlier cells still see the data they loaded if they are re-run.
final_train_df = permute_train_df
final_test_df = permute_test_df

# Re-train the model on the entire training dataset
X_full = preprocessor.fit_transform(final_train_df.drop('SalePrice', axis=1))  # Preprocess features
y_full = final_train_df['SalePrice']  # Target variable
best_model.fit(X_full, y_full)

# Preprocess the test dataset with the preprocessor fitted just above
test_preprocessed = preprocessor.transform(final_test_df)

# Predict house prices on the test dataset
test_predictions = best_model.predict(test_preprocessed)

# Create submission DataFrame
submission = pd.DataFrame({
    'Id': final_test_df['Id'],
    'SalePrice': test_predictions
})

# Sanity checks before writing: one prediction per test row, none missing.
assert len(submission) == len(final_test_df)
assert submission['SalePrice'].notna().all()

# Save the DataFrame to a CSV file for submission
submission.to_csv('house_prices_submission_cat_improved.csv', index=False)