In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, probplot
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, RobustScaler

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import KFold

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Dataset

In [None]:
# Read both CSV files from the working directory; the first column becomes the row index.
train, test = (pd.read_csv(csv_path, index_col=0) for csv_path in ('train.csv', 'test.csv'))

print(f"Train Set shape: {train.shape}") #(1460, 80)
print(f"Test set shape: {test.shape}") #(1459, 79)


In [None]:
# Manual grouping of the features by type. These lists are used below to route
# columns with missing values to the matching imputation strategy.
# NOTE(review): 'Bedroom' and 'Kitchen' may not match the CSV column names
# (possibly 'BedroomAbvGr' / 'KitchenAbvGr') - confirm against train.columns.
num_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'GarageCars', 'BsmtHalfBath',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'Bedroom', 'Kitchen', 'TotRmsAbvGrd', 'Fireplaces',
]
cat_nom_features = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'CentralAir', 'Electrical', 'Functional',
    'GarageType', 'GarageFinish', 'PavedDrive', 'MiscFeature',
    'SaleType', 'SaleCondition',
]
cat_ord_features = [
    'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
]

# Features not assigned to any list above:
# 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'GarageCars', 'MoSold', 'YrSold'

# Total number of features covered by the three lists.
print(len(num_features) + len(cat_nom_features) + len(cat_ord_features))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per feature.
train.describe(include='all').T

# Data Cleansing/Preprocessing

## Train Set

In [None]:
# Per-column NaN counts for the train set, restricted to columns that have at
# least one NaN and ordered from most to least missing.
missing = (
    train.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
missing_df = missing.reset_index()
missing_df.columns = ['Feature', 'MissingCount']

# Bar chart of the counts, with each bar annotated by its value.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=missing_df, x='Feature', y='MissingCount', ax=ax)
ax.bar_label(ax.containers[0], fontsize=10)

ax.set_title("Missing Value Distribution in Train Set")
ax.set_xlabel('Feature/Column')
ax.set_ylabel('Count')
plt.xticks(rotation=90)
ax.grid(axis='y')
fig.tight_layout()
plt.show()

### Handle Missing Category Data

In [None]:
# Columns with NaN in the train set (the index of `missing`), split by categorical type.
cat_nom_missing = missing.index[missing.index.isin(cat_nom_features)].tolist()
cat_ord_missing = missing.index[missing.index.isin(cat_ord_features)].tolist()

print("====================Category Nominal====================")
print(cat_nom_missing)
print("====================Category Ordinal====================")
print(cat_ord_missing)

According to `data_description.txt`, NA/None is a valid value (meaning the house does not have that feature) for the following nominal/ordinal features:  
'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'

In [None]:
# Categorical features for which a missing value is meaningful
# ("the house has no such feature") rather than unknown.
valid_none_features = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
n_valid_none = len(valid_none_features)
print('Valid None/NA features based on `data_description.txt`', valid_none_features, '\n=====\nTotal none category:', n_valid_none)

In [None]:
# Categorical feature -> numeric features that should be 0 when the
# categorical feature is 'None' (i.e. the house lacks that element).
none_to_zero_map = dict(
    MasVnrType=['MasVnrArea'],
    BsmtFinType1=['BsmtFinSF1'],
    BsmtFinType2=['BsmtFinSF2'],
    BsmtQual=['TotalBsmtSF', 'BsmtUnfSF'],
    FireplaceQu=['Fireplaces'],
    GarageType=['GarageCars', 'GarageArea'],
    PoolQC=['PoolArea'],
    MiscFeature=['MiscVal'],
)

# Inverse lookup: numeric feature -> the categorical feature it depends on.
zero_to_none_map = dict(
    (numeric_name, categorical_name)
    for categorical_name, numeric_names in none_to_zero_map.items()
    for numeric_name in numeric_names
)

In [None]:
# NaN count per "valid None" feature in the train set.
train[valid_none_features].isna().sum()

In [None]:
# Replace NaN with the explicit category 'None' for every feature where a
# missing value means "not present".
# The result is assigned back to the column instead of calling
# `frame[col].fillna(..., inplace=True)`: the in-place call operates on a
# temporary selection and does not update the frame under pandas copy-on-write.
# Each frame is checked for the column on its own, so a feature missing from
# one frame does not raise a KeyError.
for frame in (train, test):
    for col in valid_none_features:
        if col in frame.columns:
            frame[col] = frame[col].fillna('None')


In [None]:
# Fill numerical features with 0 when they are missing and the related
# nominal/ordinal feature has the valid value 'None' (the element does not exist).
for nominal_col, numeric_cols in none_to_zero_map.items():
    for numeric_col in numeric_cols:
        print(numeric_col)
        for set_name, frame in (("Train Set", train), ("Test Set", test)):
            mask = frame[numeric_col].isnull() & frame[nominal_col].eq('None')
            n_filled = mask.sum()
            if n_filled > 0:
                frame.loc[mask, numeric_col] = 0
                print(f"[{set_name}] Filled {n_filled} missing in '{numeric_col}' with 0 based on '{nominal_col}' = 'None'")
        print("======================================")

In [None]:
# Reverse repair: when a categorical feature is missing but its related numeric
# feature is 0, the element does not exist, so set the categorical value to 'None'.
# The previous version iterated `none_to_zero_map` with the roles swapped and
# tested `isnull() & (== 'None')` on the same column, which can never be true,
# so it never filled anything. `zero_to_none_map` gives the intended
# numeric -> categorical direction.
for numeric_col, nominal_col in zero_to_none_map.items():
    print(nominal_col)
    for set_name, frame in (("Train Set", train), ("Test Set", test)):
        mask = frame[nominal_col].isnull() & (frame[numeric_col] == 0)
        if mask.any():
            frame.loc[mask, nominal_col] = 'None'
            print(f"[{set_name}] Filled {mask.sum()} missing in '{nominal_col}' with 'None' based on '{numeric_col}' = 0")
    print("======================================")

In [None]:
### GarageYrBuilt
# If GarageType is missing or 'None' the house has no garage: use 0.
# Otherwise a missing GarageYrBlt is assumed to equal the year the house was built.
# GarageType is tested against both NaN and the 'None' placeholder, because the
# missing values were replaced by 'None' earlier; testing only for NaN would
# never match and no-garage houses would receive YearBuilt instead of 0.
for frame in (train, test):
    no_garage = frame['GarageType'].isna() | (frame['GarageType'] == 'None')
    garage_year = frame['GarageYrBlt'].fillna(frame['YearBuilt'])
    frame['GarageYrBlt'] = garage_year.mask(no_garage, 0)

### Electrical
# Impute missing 'Electrical' with the most frequent value in the same
# neighbourhood; fall back to the overall most frequent value when the
# neighbourhood has no observed value at all.
neighborhood_mode = train.groupby(by=['Neighborhood'])['Electrical'].agg(
    lambda values: values.value_counts().index[0] if values.notna().any() else np.nan
)
global_mode = train['Electrical'].value_counts().index[0]
print(f"Missing value in 'Electrical' to fix: {train['Electrical'].isnull().sum()}")
electrical_fill = train['Neighborhood'].map(neighborhood_mode).fillna(global_mode)
train['Electrical'] = train['Electrical'].fillna(electrical_fill)
print(f"Missing value in 'Electrical' after fix: {train['Electrical'].isnull().sum()}")
print("###############")



### Handle Missing Numerical Data

In [None]:
# Numerical columns that have NaN in the train set (taken from the index of `missing`).
num_missing = missing.index[missing.index.isin(num_features)].tolist()

print("=======================Numerical========================")
print(num_missing)

In [None]:
# Impute each numerical column with the median of the row's neighbourhood.
# When the neighbourhood median is unusable (NaN because every value in the
# neighbourhood is missing, or exactly 0) the global median is used instead.
# The previous `== None` comparison could not detect a NaN median, so NaN
# would have been written back as the "imputed" value.
for col in num_missing:
    neighborhood_median = train.groupby(by=['Neighborhood'])[col].median()
    global_median = train[col].median()
    print(f"Missing value in {col} to fix: {train[col].isnull().sum()}")

    usable_median = neighborhood_median.where(
        neighborhood_median.notna() & (neighborhood_median != 0),
        global_median,
    )
    train[col] = train[col].fillna(train['Neighborhood'].map(usable_median))

    print(f"Missing value in {col} after fix: {train[col].isnull().sum()}")
    print("###############")



In [None]:
# Total number of NaN cells left in the train set after imputation.
train.isna().sum().sum()

## Test Set

In [None]:
# Per-column NaN counts for the test set, restricted to columns that have at
# least one NaN and ordered from most to least missing.
missing = test.isna().sum()
missing = missing[missing > 0].sort_values(ascending=False)
missing_df = missing.reset_index()
missing_df.columns = ['Feature', 'MissingCount']

fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=missing_df, x='Feature', y='MissingCount', ax=ax)

# Annotate the bars only when something was drawn (no containers if nothing is missing).
if ax.containers:
    ax.bar_label(ax.containers[0], fontsize=10)

# The data plotted here is the test set; the title previously said "Train Set".
ax.set_title("Missing Value Distribution in Test Set")
ax.set_xlabel('Feature/Column')
ax.set_ylabel('Count')
plt.xticks(rotation=90)
ax.grid(axis='y')
fig.tight_layout()
plt.show()

### Handle Missing Category Data

In [None]:
# Columns with NaN in the test set (the index of `missing`), split by categorical type.
cat_nom_missing = missing.index[missing.index.isin(cat_nom_features)].tolist()
cat_ord_missing = missing.index[missing.index.isin(cat_ord_features)].tolist()

print("====================Category Nominal====================")
print(cat_nom_missing)
print("====================Category Ordinal====================")
print(cat_ord_missing)

In [None]:
# Impute missing nominal values in the test set using statistics learned from
# the train set only: the most frequent value within the same neighbourhood,
# falling back to the overall most frequent value.
# The fallback also covers neighbourhoods that have no observed value in train
# or do not occur in train at all; those cases previously raised
# IndexError / KeyError.
for col in cat_nom_missing:
    neighborhood_mode = train.groupby(by=['Neighborhood'])[col].agg(
        lambda values: values.value_counts().index[0] if values.notna().any() else np.nan
    )
    global_mode = train[col].value_counts().index[0]

    print(f"Missing value in '{col}' to fix: {test[col].isnull().sum()}")
    fill_values = test['Neighborhood'].map(neighborhood_mode).fillna(global_mode)
    test[col] = test[col].fillna(fill_values)
    print(f"Missing value in '{col}' after fix: {test[col].isnull().sum()}")
    print("###############")

In [None]:
# Show the test rows whose KitchenQual is missing, together with the two
# features used below to choose a replacement value.
test[test['KitchenQual'].isnull()][['Neighborhood', 'OverallQual', 'KitchenQual']]
# test[test['OverallQual'] == 5][['Neighborhood', 'OverallQual', 'KitchenQual']].head()

In [None]:
# Impute missing KitchenQual in the test set with the most frequent train value
# for the same (Neighborhood, OverallQual) pair.
# Every missing row is handled (previously only the first one was), and a pair
# that never occurs in train falls back to the overall most frequent value
# instead of raising a KeyError.
kitchen_missing = test['KitchenQual'].isnull()
if kitchen_missing.any():
    neigh_overall_mode = train.groupby(['Neighborhood', 'OverallQual'])['KitchenQual'].agg(
        lambda values: values.value_counts().index[0] if values.notna().any() else np.nan
    )
    kitchen_global_mode = train['KitchenQual'].value_counts().index[0]

    missing_pairs = test.loc[kitchen_missing, ['Neighborhood', 'OverallQual']].drop_duplicates()
    for neigh_missing, overallqual_missing in missing_pairs.itertuples(index=False, name=None):
        input_mode = neigh_overall_mode.get((neigh_missing, overallqual_missing), kitchen_global_mode)
        if pd.isna(input_mode):
            input_mode = kitchen_global_mode
        mask = (
            test['KitchenQual'].isnull()
            & (test['Neighborhood'] == neigh_missing)
            & (test['OverallQual'] == overallqual_missing)
        )
        test.loc[mask, 'KitchenQual'] = input_mode

### Handle Missing Numerical Data

In [None]:
# Numerical columns that have NaN in the test set (taken from the index of `missing`).
num_missing = missing.index[missing.index.isin(num_features)].tolist()

print("=======================Numerical========================")
print(num_missing)

In [None]:
# Impute missing numerical values in the test set with medians learned from the
# train set: the neighbourhood median, or the global median when the
# neighbourhood median is unusable (NaN or exactly 0) or the neighbourhood does
# not occur in train.
# The previous `== None` comparison could not detect a NaN median, and an
# unseen neighbourhood raised a KeyError.
for col in num_missing:
    neighborhood_median = train.groupby(by=['Neighborhood'])[col].median()
    global_median = train[col].median()
    print(f"Missing value in {col} to fix: {test[col].isnull().sum()}")

    usable_median = neighborhood_median.where(
        neighborhood_median.notna() & (neighborhood_median != 0),
        global_median,
    )
    fill_values = test['Neighborhood'].map(usable_median).fillna(global_median)
    test[col] = test[col].fillna(fill_values)

    print(f"Missing value in {col} after fix: {test[col].isnull().sum()}")
    print("###############")



In [None]:
# Total number of NaN cells left in the test set after imputation.
test.isnull().sum().sum()

In [None]:
# Allowed values per nominal feature. The lists are later passed to
# OneHotEncoder(categories=...) for the nominal columns used by the model.
category_nominal_dictionary = {
    'MSSubClass': [
        20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190
    ],
    'MSZoning': ['A', 'C (all)', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'], # 'C'=>'C (all)'
    'Street': ['Grvl', 'Pave'],
    'Alley': ['Grvl', 'Pave', 'None'],
    'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
    'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
    'Utilities': ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
    'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
    'LandSlope': ['Gtl', 'Mod', 'Sev'],
    'Neighborhood': [
        'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards',
        'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NoRidge', 'NPkVill', 'NridgHt',
        'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'
    ],
    'Condition1': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
    'Condition2': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'],
    'BldgType': ['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsI', 'TwnhsE'], # Twnhs category should not exist, based on data_description.txt
    'HouseStyle': [
        '1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'
    ],
    'RoofStyle': ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'],
    'RoofMatl': [
        'ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'
    ],
    'Exterior1st': [
        'AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc',
        'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'
    ],
    # Spellings in this list that differ from Exterior1st:
    # 'WdShing'=>'Wd Shng', 'CemntBd'=>'CmentBd', 'BrkComm'=>'Brk Cmn'
    'Exterior2nd': [
        'AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd', 'HdBoard', 'ImStucc',
        'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng'
    ],
    'MasVnrType': ['BrkCmn', 'BrkFace', 'CBlock', 'None', 'Stone'],
    'Foundation': ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'],
    'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', 'None'],
    'Heating': ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
    'CentralAir': ['N', 'Y'],
    'Electrical': ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'],
    'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
    'GarageType': ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'None'],
    'GarageFinish': ['Fin', 'RFn', 'Unf', 'None'],
    'PavedDrive': ['Y', 'P', 'N'],
    'Fence': ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'None'],
    'MiscFeature': ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'None'],
    'SaleType': ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'],
    'SaleCondition': ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']
}

# Ordered levels per ordinal feature, listed from lowest to highest; 'None'
# (feature absent) is the lowest level where it applies.
# Each feature appears exactly once ('FireplaceQu' was previously listed twice).
category_ordinal_dictionary = {
    'OverallQual': list(range(1, 11)),
    'OverallCond': list(range(1, 11)),
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PoolQC': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
}

In [None]:
# Nominal features: plain (unordered) pandas categoricals, inferred per frame.
for col in cat_nom_features:
    for frame in (train, test):
        frame[col] = frame[col].astype('category')

# Ordinal features: ordered categoricals that share one explicit level order
# between train and test.
for col, order in category_ordinal_dictionary.items():
    cat_type = pd.CategoricalDtype(categories=order, ordered=True)
    for frame in (train, test):
        frame[col] = frame[col].astype(cat_type)


# Exploring

In [None]:
# Column dtypes and non-null counts after cleaning and the categorical casts.
train.info()

In [None]:
# Density histogram (with KDE) of the target, overlaid with a normal curve that
# has the same mean and standard deviation, to show how far SalePrice departs
# from normality.
sale_price = train['SalePrice']

fig, ax = plt.subplots(figsize=(10, 7))
sns.histplot(sale_price, kde=True, bins='auto', edgecolor='black', label='SalePrice Distribution', stat='density', ax=ax)

mean, std = sale_price.mean(), sale_price.std()
xmin, xmax = ax.get_xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
ax.plot(x, p, 'r', linewidth=2, label='Normal Distribution')

ax.grid(True, linestyle='-')
ax.set_title(f"Distribution of SalePrice | Skewness: {round(train['SalePrice'].skew(), 2)} | Kurtosis: {round(train['SalePrice'].kurtosis(), 2)}")
ax.set_xlabel('SalePrice')
ax.set_ylabel('Frequency/Density')
ax.legend()
plt.show()

In [None]:
# Quartiles and the 1.5 * IQR fences of the target.
Q1, Q3 = (train['SalePrice'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

plt.figure(figsize=(10, 6))
ax = sns.boxplot(y=train['SalePrice'], flierprops=dict(markerfacecolor='red', marker='o'))

# One entry per reference line:
# (level, colour, line style, legend label, annotation text, vertical alignment)
reference_lines = [
    (Q1, 'red', '--', f"Q1 = {Q1:,.0f}", f' Q1: {Q1:,.0f}', 'bottom'),
    (Q3, 'blue', '--', f"Q3 = {Q3:,.0f}", f' Q3: {Q3:,.0f}', 'top'),
    (lower_bound, 'purple', ':', f"Lower Bound = {lower_bound:,.0f}", f' Lower: {lower_bound:,.0f}', 'bottom'),
    (upper_bound, 'orange', ':', f"Upper Bound = {upper_bound:,.0f}", f' Upper: {upper_bound:,.0f}', 'top'),
]

for level, colour, line_style, legend_label, _, _ in reference_lines:
    plt.axhline(level, color=colour, linestyle=line_style, label=legend_label)

for level, colour, _, _, annotation, valign in reference_lines:
    plt.text(0.02, level, annotation, color=colour, va=valign)

plt.title('Boxplot of SalePrice', fontsize=14)
plt.ylabel('Sale Price', fontsize=12)

plt.show()

In [None]:
# Redefine num_features as every column of `train` that currently has a numeric
# dtype (this replaces the hand-written list defined earlier).
num_features = train.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Pairwise correlation between all numeric columns, drawn as an annotated heatmap.
num_corr = train[num_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
heatmap_style = dict(annot=True, fmt=".2f", vmin=-1, vmax=1, cmap="coolwarm",
                     linecolor='black', linewidths=0.3, square=True)
sns.heatmap(num_corr, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix')

In [None]:
# Single-column heatmap: correlation of every numeric feature with SalePrice,
# ordered from strongest positive downwards.
target_corr = num_corr[['SalePrice']].sort_values(by='SalePrice', ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(target_corr, annot=True, cmap="coolwarm", fmt=".2f", linecolor='black', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Zoomed on Dependant Variable')

In [None]:
# Keep the numeric features whose absolute correlation with SalePrice is at
# least 0.3 (SalePrice itself is included).
# The absolute value is applied to the correlation used for the threshold; the
# previous version applied abs() to the frame being indexed but not to the
# mask, so a feature with correlation <= -0.3 was silently dropped.
strongly_correlated = num_corr['SalePrice'].abs() >= 0.3
interest_features = num_corr.index[strongly_correlated].tolist()
candidate_corr_matrix = num_corr.loc[interest_features, interest_features]
print(interest_features)

In [None]:
# Heatmap restricted to the candidate features, rows ordered by their
# correlation with SalePrice (highest first).
candidate_sorted = candidate_corr_matrix.sort_values(by="SalePrice", ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(candidate_sorted, annot=True, fmt=".2f", vmin=-1, vmax=1, cmap="coolwarm",
            linecolor='black', linewidths=0.3, square=True, ax=ax)
ax.set_title('Correlation Matrix')

GrLivArea (0.71 x SalePrice)   
GarageArea (0.62 x SalePrice)  
TotalBsmtSF (0.61 x SalePrice)  
1stFlrSF (0.61 x SalePrice) x removed  
FullBath (0.56 x SalePrice) x removed  
TotRmsAbvGrd (0.53 x SalePrice) x removed  
MasVnrArea (0.47 x SalePrice)  
Fireplaces (0.47 x SalePrice) x removed  
BsmtFinSF1 (0.39 x SalePrice) x removed  
LotFrontage (0.35 x SalePrice)  
WoodDeckSF (0.32 x SalePrice)  
2ndFlrSF (0.32 x SalePrice) x removed  
OpenPorchSF (0.32 x SalePrice)  
  
GrLivArea x TotRmsAbvGrd (0.83) (x)  
GrLivArea x FullBath (0.63) (x)  
GrLivArea x 2ndFlrSF (0.69)  
GrLivArea x 1stFlrSF (0.57)  
TotalBsmtSF x 1stFlrSF (0.82) (x)  
TotalBsmtSF x BsmtFinSF1 (0.52) (x)  


In [None]:
# Hand-picked subset of the correlated features; this overrides the list
# computed from the correlation threshold.
interest_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'MasVnrArea', 'LotFrontage', 'WoodDeckSF', 'OpenPorchSF']

In [None]:
temp = pd.DataFrame()
pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed_cols = [f"{col}_transformed" for col in interest_features]
train[interest_features] = pt.fit_transform(train[interest_features])
temp[transformed_cols] = pt.fit_transform(train[interest_features])
test[interest_features] = pt.transform(test[interest_features])

In [None]:
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(15, 30))
for i, col in enumerate(interest_features):
    # Scatter before
    axes[i, 0].scatter(train[col], train['SalePrice'], alpha=0.5, color='blue', marker='x')
    axes[i, 0].set_title(f"{col} vs SalePrice (Before)")
    axes[i, 0].set_xlabel(col)
    axes[i, 0].set_ylabel('SalePrice')
    
    # Scatter after
    axes[i, 1].scatter(temp[f"{col}_transformed"], train['SalePrice'], alpha=0.5, color='red', marker='x')
    axes[i, 1].set_title(f"{col}_transformed vs SalePrice (After)")
    axes[i, 1].set_xlabel(f"{col}_transformed")
    axes[i, 1].set_ylabel('SalePrice')

plt.tight_layout()
plt.show()

In [None]:
# Numeric features kept for modelling; this replaces the previous num_features list.
num_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF']

In [None]:
# Count plot for every categorical feature, two plots per row, bars ordered by
# frequency and annotated with their counts.
# The grid now has only as many rows as are needed for two columns (it
# previously allocated one row per feature and left half the figure empty), and
# the `orient='y'` argument, which contradicted `x=col`, is removed.
cat_plot_features = cat_nom_features + cat_ord_features
n_plot_rows = (len(cat_plot_features) + 1) // 2

plt.figure(figsize=(20, n_plot_rows * 3))
for idx, col in enumerate(cat_plot_features, 1):
    plt.subplot(n_plot_rows, 2, idx)
    ax = sns.countplot(data=train, x=col, order=train[col].value_counts().index)
    ax.bar_label(ax.containers[0])
    plt.title(f"{col} Distribution")
    plt.xlabel(f"{col}")
    plt.ylabel(f"Count")
    plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Categorical features chosen for a closer look against SalePrice.
cat_features = ['MSSubClass', 'Neighborhood', 'HouseStyle', 'GarageFinish', 'OverallQual', 'BsmtFinType1', 'HeatingQC']


In [None]:
# Box plot of SalePrice per level of each candidate categorical feature,
# two plots per row.
# The grid has only as many rows as needed (previously one row per feature, so
# half of the figure stayed empty). The stray `plt.Figure(...)` call inside the
# loop was removed: it created an unused Figure object on every iteration.
n_box_rows = (len(cat_features) + 1) // 2

plt.figure(figsize=(12, n_box_rows * 5))
for idx, col in enumerate(cat_features, 1):
    plt.subplot(n_box_rows, 2, idx)
    sns.boxplot(data=train, x=col, y='SalePrice')
    plt.title(f"SalePrice Vs. {col}")
    plt.xlabel(col)
    plt.ylabel('SalePrice')
    plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Categorical features kept for modelling; this replaces the previous cat_features list.
cat_features = ['Neighborhood', 'GarageFinish', 'OverallQual']

# Model Building

In [None]:
# Model inputs: independent copies of the selected columns, so the feature
# engineering below does not modify `train` / `test`. The target is SalePrice.
model_columns = num_features + cat_features
X_train = train[model_columns].copy(deep=True)
y_train = train['SalePrice']
X_test = test[model_columns].copy(deep=True)

print(f"[Train]: {X_train.shape}, {y_train.shape}")
print(f"[Test]: {X_test.shape}")

In [None]:
# Engineered features, computed identically for the train and test sets.
#
# Garage area handling: the category value for a basement garage is 'Basment'
# (see category_nominal_dictionary['GarageType']). The previous code compared
# against 'Basement', which never matches, and used the same "!=" condition for
# both totals, so basement garages were counted above ground and every other
# garage was added to the basement total.
#   - AbvGrdTotalSF counts the garage area only when it is NOT a basement garage.
#   - BsmtTotalSF   counts the garage area only when it IS a basement garage.
#
# NOTE(review): several inputs (e.g. GrLivArea, GarageArea, TotalBsmtSF) appear
# to have been power-transformed earlier and are summed here with untransformed
# columns - confirm this mix is intended.
for frame, X in ((train, X_train), (test, X_test)):
    is_basement_garage = frame['GarageType'] == 'Basment'
    garage_area = frame['GarageArea'].fillna(0)
    above_ground_garage = garage_area.mask(is_basement_garage, 0)
    basement_garage = garage_area.where(is_basement_garage, 0)

    X['AbvGrdTotalSF'] = (frame['LotFrontage'] + frame['LotArea'] +
                          frame['MasVnrArea'] + frame['1stFlrSF'] +
                          frame['2ndFlrSF'] + frame['GrLivArea'] +
                          frame['TotRmsAbvGrd'] + above_ground_garage +
                          frame['WoodDeckSF'] + frame['OpenPorchSF'] +
                          frame['EnclosedPorch'] + frame['3SsnPorch'] +
                          frame['ScreenPorch'] + frame['PoolArea'])

    X['BsmtTotalSF'] = (frame['BsmtFinSF1'] + frame['BsmtFinSF2'] +
                        frame['BsmtUnfSF'] + frame['TotalBsmtSF'] +
                        basement_garage)

    X['PorchTotalSF'] = (frame['WoodDeckSF'] + frame['OpenPorchSF'] +
                         frame['EnclosedPorch'] + frame['3SsnPorch'] +
                         frame['ScreenPorch'])

    # Half baths count as 0.5 of a full bath.
    X['BathTotal'] = ((frame['BsmtFullBath'] + (0.5 * frame['BsmtHalfBath'])) +
                      (frame['FullBath'] + (0.5 * frame['HalfBath'])))

# Binary presence flags: 1 when the source area is greater than 0, else 0.
presence_flags = {
    'HasMasVnr': 'MasVnrArea',
    'HasWoodDeck': 'WoodDeckSF',
    'HasOpenPorch': 'OpenPorchSF',
    'HasEnclosedPorch': 'EnclosedPorch',
    'Has3SsnPorch': '3SsnPorch',
    'HasScreenPorch': 'ScreenPorch',
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'Has2ndFlr': '2ndFlrSF',
}
for flag_col, source_col in presence_flags.items():
    X_train[flag_col] = (train[source_col] > 0).astype(int)
    X_test[flag_col] = (test[source_col] > 0).astype(int)
    # Register the flag's categories so it can be one-hot encoded later.
    category_nominal_dictionary[flag_col] = [0, 1]

# Register the new columns in the feature lists; the membership check keeps the
# lists free of duplicates if this cell is run more than once.
new_numeric = ['AbvGrdTotalSF', 'BsmtTotalSF', 'PorchTotalSF', 'BathTotal']
num_features.extend(col for col in new_numeric if col not in num_features)
cat_nom_features.extend(col for col in presence_flags if col not in cat_nom_features)

In [None]:
# Shapes of the model matrices after the engineered columns were added.
print(f"[Train]: {X_train.shape}, {y_train.shape}")
print(f"[Test]: {X_test.shape}")

In [None]:
# One-hot encode the nominal columns present in X_train, using the fixed
# category lists so train and test get identical dummy columns. A value outside
# those lists raises an error (handle_unknown='error').
nom_cat = [col for col in X_train.columns if col in cat_nom_features]
ohe = OneHotEncoder(categories=[category_nominal_dictionary[col] for col in nom_cat],
                    handle_unknown='error')

# The encoder is left at its default sparse output and converted to a dense
# array here. The `sparse=False` argument used previously was removed from
# scikit-learn (its replacement, `sparse_output`, does not exist in older
# releases), so this form works regardless of the installed version.
X_tr = ohe.fit_transform(X_train[nom_cat]).toarray()
X_te = ohe.transform(X_test[nom_cat]).toarray()

dummy_cols = ohe.get_feature_names_out(nom_cat)
X_tr_df = pd.DataFrame(X_tr, columns=dummy_cols, index=X_train.index)
X_te_df = pd.DataFrame(X_te, columns=dummy_cols, index=X_test.index)

# Swap the original nominal columns for their dummy columns.
X_train = pd.concat([X_train.drop(columns=nom_cat), X_tr_df], axis=1)
X_test  = pd.concat([X_test.drop(columns=nom_cat), X_te_df],  axis=1)

In [None]:
# Encode the ordinal columns present in X_train as integers that follow the
# level order in category_ordinal_dictionary; unseen values are encoded as -1.
ord_cat = [col for col in X_train.columns if col in cat_ord_features]
ordinal_levels = [category_ordinal_dictionary[col] for col in ord_cat]

oe = OrdinalEncoder(
    categories=ordinal_levels,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Fit on train only, then apply the same mapping to test.
X_train[ord_cat] = oe.fit_transform(X_train[ord_cat])
X_test[ord_cat] = oe.transform(X_test[ord_cat])

In [None]:
# Scale the numeric features with RobustScaler; the scaler is fitted on the
# train set and the same fit is applied to the test set.
num_scaler = RobustScaler()
num_scaler.fit(X_train[num_features])

X_train[num_features] = num_scaler.transform(X_train[num_features])
X_test[num_features] = num_scaler.transform(X_test[num_features])

In [None]:
# Shapes of the model matrices after encoding and scaling.
print(f"[Train]: {X_train.shape}, {y_train.shape}")
print(f"[Test]: {X_test.shape}")

In [None]:
# Baseline comparison: fit each model with default hyper-parameters and record
# its RMSE on the training data.
# Note that this is an in-sample score (the model is evaluated on the rows it
# was fitted on), not an estimate of generalisation error.
models = {
    'ElasticNet (default)': ElasticNet(random_state=42),
    'RandomForest (default)': RandomForestRegressor(random_state=42),
    'XGBoost (default)': XGBRegressor(random_state=42)
}

# Collect one record per model and build the frame once, instead of growing a
# DataFrame with pd.concat inside the loop (which also started from an empty frame).
baseline_records = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
    baseline_records.append({'Model': name, 'RMSE Train': rmse_train})

# Show results
results = (
    pd.DataFrame(baseline_records, columns=['Model', 'RMSE Train'])
    .sort_values(by='RMSE Train', ascending=True)
    .reset_index(drop=True)
)
print(results)

In [None]:
# Move the row index into a regular column so the submission cells can read
# test['Id']. Guarded so that running this cell again does not reset the index
# a second time (which would add a stray 'index' column, then fail).
if 'Id' not in test.columns:
    test = test.reset_index()

## RandomForest

In [None]:
# Define model and parameter search space
rf_model = RandomForestRegressor(random_state=42)

# `max_features=None` means "use all features". It replaces the former 'auto'
# option, which had the same meaning for regressors but is rejected by
# scikit-learn 1.3 and later.
# NOTE(review): confirm the installed scikit-optimize accepts None inside Categorical.
rf_search_space = {
    'n_estimators': Integer(100, 10000),
    'max_depth': Integer(3, 30),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Categorical([None, 'sqrt', 'log2'])
}

# Bayesian optimization with cross-validation:
# 50 candidate settings, each scored by 5-fold shuffled CV on (negative) RMSE.
rf_opt = BayesSearchCV(
    estimator=rf_model,
    search_spaces=rf_search_space,
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_iter=50,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rf_opt.fit(X_train, y_train)
print("Training Complete", rf_opt.best_params_)

# Predict the test set with the tuned search object and write the submission file.
preds = rf_opt.predict(X_test)
y_pred = pd.DataFrame({'Id': test['Id'],'SalePrice': preds})
y_pred.to_csv('submission_rf.csv', index=False)


In [None]:
# Feature importances of the tuned random forest, labelled with the column names.
importances_rf = rf_opt.best_estimator_.feature_importances_
features_rf = pd.Series(importances_rf, index=X_train.columns)
# Keep the 20 most important features, matching the variable name and the plot
# title (30 were selected previously).
top_20_rf = features_rf.sort_values(ascending=False).head(20)

# Display top 20
top_20_rf.plot(kind='barh', title='Top 20 Features - Random Forest', figsize=(8,6))
plt.gca().invert_yaxis()  # most important feature at the top
plt.xlabel("Importance")
plt.show()

In [None]:
# Append the tuned random forest's training RMSE to the results table.
rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_opt.predict(X_train)))
rf_result_row = pd.DataFrame(
    {'Model': 'RandomForest (skopt)', 'RMSE Train': rf_train_rmse},
    index=[0],
)
results = pd.concat([results, rf_result_row])
results

## XGBoost

In [None]:
# Base estimator and hyper-parameter search space for XGBoost.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

xgb_search_space = dict(
    n_estimators=Integer(100, 1000),
    max_depth=Integer(3, 10),
    learning_rate=Real(0.01, 0.3, prior='log-uniform'),
    subsample=Real(0.5, 1.0),
    colsample_bytree=Real(0.5, 1.0),
)

# Bayesian search: 50 candidate settings, each scored by 5-fold shuffled CV on
# (negative) RMSE.
xgb_cv = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_opt = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=xgb_search_space,
    scoring='neg_root_mean_squared_error',
    cv=xgb_cv,
    n_iter=50,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

xgb_opt.fit(X_train, y_train)
print("Training Complete", xgb_opt.best_params_)

# Predict the test set and write the submission file.
preds = xgb_opt.predict(X_test)
y_pred = pd.DataFrame({'Id': test['Id'], 'SalePrice': preds})
y_pred.to_csv('submission_xgb.csv', index=False)


In [None]:
# Feature importances of the best XGBoost estimator found by the search,
# labelled with the column names of X_train.
importances_xgb = xgb_opt.best_estimator_.feature_importances_
features_xgb = pd.Series(importances_xgb, index=X_train.columns)
top_20_xgb = features_xgb.sort_values(ascending=False).iloc[:20]

# Horizontal bar chart of the top 20, most important feature at the top.
importance_ax = top_20_xgb.plot(kind='barh', title='Top 20 Features - XGBoost', figsize=(8,6))
importance_ax.invert_yaxis()
importance_ax.set_xlabel("Importance")
plt.show()


In [None]:
# Append the tuned XGBoost model's training RMSE, then re-rank the table
# (lowest RMSE first) with a fresh 0..n-1 index.
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_opt.predict(X_train)))
xgb_result_row = pd.DataFrame(
    {'Model': 'XGBoost (skopt)', 'RMSE Train': xgb_train_rmse},
    index=[0],
)
results = (
    pd.concat([results, xgb_result_row])
    .sort_values(by='RMSE Train', ascending=True)
    .reset_index(drop=True)
)
results

## Ensemble

### Simple Average

In [None]:
# Simple average ensemble: the unweighted mean of the tuned random forest and
# XGBoost predictions.

# Training RMSE of the averaged predictions.
preds_rf = rf_opt.predict(X_train)
preds_xgb = xgb_opt.predict(X_train)
results = pd.concat([results, pd.DataFrame({'Model': 'Simple Average Ensemble', 'RMSE Train': np.sqrt(mean_squared_error(y_train, (preds_rf + preds_xgb) / 2))}, index=[0])])

# Test-set predictions for the submission file.
preds_rf = rf_opt.predict(X_test)
preds_xgb = xgb_opt.predict(X_test)

# Two models are averaged, so divide by 2. The previous divisor of 3 scaled
# every submitted price down by a third and did not match the training score above.
preds_ensemble = (preds_rf + preds_xgb) / 2

submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': preds_ensemble
})
submission.to_csv('submission_simple_average.csv', index=False)

### Weighted Average

In [None]:
# Weighted average ensemble: 60% random forest, 40% XGBoost.
w_rf, w_xgb = 0.6, 0.4

# Training RMSE of the blended predictions, appended to the results table.
preds_rf, preds_xgb = rf_opt.predict(X_train), xgb_opt.predict(X_train)
preds_ensemble = (w_rf * preds_rf) + (w_xgb * preds_xgb)
weighted_train_rmse = np.sqrt(mean_squared_error(y_train, preds_ensemble))
weighted_result_row = pd.DataFrame(
    {'Model': 'Weighted Average Ensemble (6, 4)', 'RMSE Train': weighted_train_rmse},
    index=[0],
)
results = pd.concat([results, weighted_result_row])

# Blend the test-set predictions with the same weights and write the submission file.
preds_rf, preds_xgb = rf_opt.predict(X_test), xgb_opt.predict(X_test)
preds_ensemble = (w_rf * preds_rf) + (w_xgb * preds_xgb)

submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': preds_ensemble})
submission.to_csv('submission_weighted_average_6_4.csv', index=False)

In [None]:
# Final ranking of all models by training RMSE (lowest first), re-indexed from 0.
results = results.sort_values(by='RMSE Train', ascending=True).reset_index(drop=True)
results