In [None]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from scipy.stats import skew, norm, probplot
import time
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import Ridge, HuberRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.cluster import KMeans
import catboost as cb
from xgboost import XGBRegressor
from mlxtend.regressor import StackingCVRegressor

In [None]:
#import data
# Read the competition's train and test CSV files into two DataFrames.
# The relative paths point at a "../input/..." directory next to the notebook.
house_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
house_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
#info of train dataset
# Prints the column names, non-null counts and dtypes of the training frame.
house_df.info()

In [None]:
#head of train dataset
# A bare last expression uses the notebook's rich table display; print() on a
# DataFrame produces a wrapped plain-text dump that is hard to read when
# display.max_columns is unlimited.
house_df.head()

In [None]:
#info of test dataset
# Prints the column names, non-null counts and dtypes of the test frame.
house_test.info()

In [None]:
#head of test dataset
# A bare last expression uses the notebook's rich table display; print() on a
# DataFrame produces a wrapped plain-text dump that is hard to read when
# display.max_columns is unlimited.
house_test.head()

In [None]:
#independent and dependent variables
# The target is taken before re-indexing, so y_house keeps the frame's original
# row index while both feature frames become keyed by 'Id'.
y_house = house_df['SalePrice']
house_df = house_df.drop(columns='SalePrice').set_index('Id')
house_test = house_test.set_index('Id')

In [None]:
#find out columns with missing value
# Count nulls per feature in train and test in one vectorised pass, then keep
# only the features that have at least one null in either frame.
train_nulls = house_df.isnull().sum()
test_nulls = house_test[house_df.columns].isnull().sum()
null_df = pd.DataFrame({'Feature': house_df.columns,
                        'Null': train_nulls.values,
                        'Test Null': test_nulls.values})
null_df = null_df[(null_df['Null'] != 0) | (null_df['Test Null'] != 0)].reset_index(drop=True)
# 'Feature' deliberately stays a regular column: the bar plot below addresses it
# with x='Feature'. (The earlier `null_df.set_index('Feature')` call discarded
# its result and therefore did nothing.)
null_df['Total Null'] = null_df['Null'] + null_df['Test Null']
print("-------------------------")
print("Total columns with null:")
print(len(null_df))
print("-------------------------")
print("Total null values:")
print(null_df['Total Null'].sum(axis=0))
print("-------------------------")
# Show the ten features with the most missing values.
sns.set_palette(sns.color_palette("pastel"))
sns.barplot(data=null_df.sort_values(by='Total Null',ascending = False).head(10),
            x='Feature',y='Total Null')
plt.xticks(rotation = 70)
plt.title("Total Nulls in Feature")
plt.show()

In [None]:
# Stack train and test rows into one exploration frame with a fresh 0..n-1 index.
data = pd.concat([house_df,house_test],axis=0).reset_index(drop=True)
data.shape

In [None]:
# Bar chart of the fraction of missing values per column, sorted ascending.
# isna().mean() yields a 0-1 ratio per column, not a percentage.
data.isna().mean().sort_values().plot(
kind='bar', figsize=(20,10), title='Percentage of missing values',
ylabel='Ratio of missing values per feature')

Dealing with missing values

First, we will do some exploration on all features

In [None]:
#combine train and test data
# Same concatenation as the earlier cell: rebuilds `data` from the current
# house_df and house_test with a fresh 0..n-1 index.
data = pd.concat([house_df,house_test],axis=0).reset_index(drop=True)
data.shape

In [None]:
#Check all the object variables
# List the sorted distinct values of every object-dtype column in `data`.
col_1 = data.select_dtypes(include=['object']).columns
for col in col_1:
    print(col, 'unique values :', data[col].sort_values().unique())

Some features contain values that differ from what they should be (possibly typos). We will correct them.

In [None]:
# Normalise a few category labels in the combined exploration frame.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: that chained in-place form can operate on a temporary copy
# and silently leave `data` unchanged (and is deprecated in recent pandas).
# NOTE(review): only `data` is modified here; house_df and house_test keep the
# original labels.
data['MSZoning'] = data['MSZoning'].replace('C (all)', 'C')
data['Neighborhood'] = data['Neighborhood'].replace('NAmes', 'Names')
data['BldgType'] = data['BldgType'].replace({'2fmCon': '2FmCon', 'Twnhs': 'TwnhsI'})

In [None]:
#Check the data again
# Re-list the sorted distinct values of every object-dtype column in `data`
# to confirm the label replacements above.
col_1 = data.select_dtypes(include=['object']).columns
for col in col_1:
    print(col, 'unique values :', data[col].sort_values().unique())

1. MSZoning

In [None]:
#Dealing with MSZoning variables
# Show the test rows whose MSZoning is missing, with their neighborhood.
null = house_test[house_test['MSZoning'].isnull()][["Neighborhood","MSZoning"]]
display(null)
# Compare the MSZoning distribution in the IDOTRR and Mitchel neighborhoods,
# using the combined train+test frame.
plot_data = pd.concat([data[data['Neighborhood'] == 'IDOTRR'],data[data['Neighborhood'] == 'Mitchel']],
                      axis = 0)
sns.histplot(data = plot_data, x ='MSZoning', hue ='Neighborhood',multiple="dodge", shrink=.9)
plt.title("Distribution of MSZoning Classification")
plt.show()

MSZoning classification usually depends on the Neighborhood, so we will impute each missing value with the mode of its neighborhood.

In [None]:
# Fill missing MSZoning in the test set with a fixed zoning class per neighborhood.
for hood, zoning_fill in [('IDOTRR', 'RM'), ('Mitchel', 'RL')]:
    missing_zoning = house_test['MSZoning'].isnull() & (house_test['Neighborhood'] == hood)
    house_test.loc[missing_zoning, 'MSZoning'] = zoning_fill

2. LotFrontage

   We could expect that LotFrontage somewhat has correlation with LotArea. Hence we will use
   LinearRegression to impute the missing values. We also manually filter out the outliers from the 
   data.

In [None]:
# Keep rows with a known LotFrontage and drop the manually chosen outliers
# (LotFrontage above 150 or LotArea above 20000) before plotting/fitting.
data_1 = data[(~data['LotFrontage'].isnull()) & (data['LotFrontage'] <= 150) &
              (data['LotArea'] <= 20000)]
# Scatter plot with a fitted regression line drawn in black.
sns.lmplot(data=data_1,x="LotArea",y="LotFrontage", line_kws={'color': 'black'})
plt.ylabel("LotFrontage")
plt.xlabel("LotArea")
plt.title(" Scatterplot of LotArea vs LotFrontage")
plt.show()

We could see that there is a linear relationship between LotArea and LotFrontage. Thus we can use linear regression to impute missing value of LotFrontage

In [None]:
# Fit LotFrontage ~ LotArea on the outlier-filtered rows of the combined frame.
LotA_LotF = LinearRegression()
LotA_LotF_X = data_1['LotArea'].values.reshape(-1, 1)
LotA_LotF_y = data_1['LotFrontage'].values
LotA_LotF.fit(LotA_LotF_X,LotA_LotF_y)
# Fill missing LotFrontage in both frames with the regression prediction.
# The filled column is assigned back rather than using fillna(..., inplace=True)
# on a column selection, which can act on a temporary copy and leave the frame
# untouched.
for table in [house_df, house_test]:
    predicted_frontage = LotA_LotF.intercept_ + table['LotArea'] * LotA_LotF.coef_[0]
    table['LotFrontage'] = table['LotFrontage'].fillna(predicted_frontage)

3. Alley

   Data description says NA means no alley access

In [None]:
# Missing Alley becomes the explicit category "None".
# Assign back instead of chained fillna(..., inplace=True), which can act on a
# temporary copy of the column.
for table in [house_df,house_test]:
    table['Alley'] = table['Alley'].fillna("None")

4. Utilities

   

In [None]:
# Frequency of each Utilities value in the training set.
house_df['Utilities'].value_counts()

In [None]:
# Frequency of each Utilities value in the test set.
house_test['Utilities'].value_counts()

Since only one row uses NoSeWa, we will fill the missing values in the test set with AllPub.
We will also drop the NoSeWa row from our training dataset, since that value is not found in the test set and would contribute to overfitting if left in.

In [None]:
#fill all utilities missing values on test dataset with AllPub
# Assign back instead of chained fillna(..., inplace=True), which can act on a
# temporary copy of the column.
house_test['Utilities'] = house_test['Utilities'].fillna("AllPub")

#drop train dataset row where Utility has NoSeWa value
# y_house is not touched here; the later index-aligned merges with y_house are
# what exclude the dropped row's target.
house_df.drop(house_df[house_df['Utilities'] == 'NoSeWa'].index, inplace = True)

5. Exterior features

   Exterior1st : Exterior covering on house
   
   Exterior2nd : Exterior covering on house (if more than one material)

   There are more than 10 types of materials used in both the metrics. We will see the barplot of 
   these features.

In [None]:
# For each exterior column, plot the relative frequency of its five most
# common values in the combined frame.
for metrics in ['Exterior1st','Exterior2nd']:
    table = data[metrics].value_counts(normalize=True).head()
    sns.barplot(x=table.index,y=table.values)
    plt.title("Distribution plot of "+metrics)
    plt.show()
    print("\n")

For Exterior1st and Exterior2nd, the mode is VinylSd. Hence, we will replace missing values with VinylSd

In [None]:
# Fill missing exterior materials in the test set with the most frequent value
# of the same column in the combined frame.
for exterior_col in ('Exterior1st', 'Exterior2nd'):
    most_common = data[exterior_col].mode()[0]
    house_test[exterior_col] = house_test[exterior_col].fillna(most_common)

6. MasVnrType 

   Data description says NA means no Masonry veneer. However, there is one data in test set with area 
   but missing type.

In [None]:
# Test rows that have a masonry veneer area recorded but no veneer type.
house_test[(house_test['MasVnrType'].isnull()) & 
           (house_test['MasVnrArea'].notnull())][['MasVnrType','MasVnrArea']]

In [None]:
# Relative frequency of the five most common MasVnrType values (combined frame).
table = data['MasVnrType'].value_counts(normalize=True).head()
sns.barplot(x=table.index,y=table.values)
plt.title("Distribution plot of MasVnrType")
plt.show()
print("\n")

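Since around 60% of our data do not have masonry veneer (value None), None will be used to fill the remaining null values. Row 2611 is the exception: it has a veneer area but no type, so it is set explicitly (to BrkFace in the code below) before the general fill.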

In [None]:
# Row Id 2611 is set explicitly before the general fill below.
# Use .loc with (row label, column) instead of chained indexing
# house_test['MasVnrType'][2611] = ..., which can write to a temporary copy.
house_test.loc[2611, 'MasVnrType'] = "BrkFace"
# Remaining gaps: the type gets the most frequent MasVnrType of the combined
# frame, the area gets 0.
masvnr_mode = data['MasVnrType'].mode()[0]
for table in [house_test, house_df]:
    table['MasVnrType'] = table['MasVnrType'].fillna(masvnr_mode)
    table['MasVnrArea'] = table['MasVnrArea'].fillna(0)

7. Basement Metrics 

   Data description says BsmtFinType1 measures the Type 1 finished square feet of basement.
   However, we can see a few rows in the test data set that have basement metrics but 0 square feet.

In [None]:
# Report rows where BsmtFinType1 is recorded but another basement descriptor
# is missing.
for basement_metrics_cols in ['BsmtExposure','BsmtCond','BsmtQual']:
    inconsistent_rows = data[data[basement_metrics_cols].isnull() & data['BsmtFinType1'].notnull()]
    if len(inconsistent_rows) == 0:
        continue
    print("\nPresent with BsmtFinType1 but " +  basement_metrics_cols + " undetected")
    display(inconsistent_rows)

In [None]:
# For rows that do have a BsmtFinType1, fill the other missing basement
# descriptors with fixed values:
#   - BsmtExposure -> "No" (train and test)
#   - BsmtCond and BsmtQual -> "TA" (Typical), test set only
basement_fills = [
    (house_df, {'BsmtExposure': 'No'}),
    (house_test, {'BsmtExposure': 'No', 'BsmtCond': 'TA', 'BsmtQual': 'TA'}),
]
for table, fill_values in basement_fills:
    has_fin_type = table['BsmtFinType1'].notnull()
    for bsmt_col, fill_value in fill_values.items():
        table.loc[table[bsmt_col].isnull() & has_fin_type, bsmt_col] = fill_value

There is one test row with missing square-feet values. Let's check that row.

In [None]:
# Test rows with a missing BsmtFinSF1 value.
house_test[house_test['BsmtFinSF1'].isnull()]

This test row does not have a basement. Hence, its square-feet metrics should be filled in with 0.

In [None]:
# Set the basement square-feet columns of test row Id 2121 to 0.
# .loc with (row label, columns) replaces the chained form
# house_test[col][2121] = 0, which can write to a temporary copy.
house_test.loc[2121, ['TotalBsmtSF','BsmtUnfSF','BsmtFinSF2','BsmtFinSF1']] = 0

There are two test rows with missing basement bathroom values. Let's check them out first too.

In [None]:
# Test rows with a missing BsmtFullBath value.
house_test[house_test['BsmtFullBath'].isnull()]

These two test rows do not have a basement. Hence, their basement bathroom counts should also be filled in with 0.

In [None]:
# Set the basement bathroom counts of test rows Id 2121 and 2189 to 0.
# .loc with (row labels, columns) replaces the chained form
# house_test[col][label] = 0, which can write to a temporary copy.
house_test.loc[[2121, 2189], ['BsmtFullBath','BsmtHalfBath']] = 0

The remaining rows are assumed to have no basement, so their missing basement values are filled with None.

In [None]:
# Fill every remaining null in any column whose name contains 'Bsmt' with "None".
for table in [house_df,house_test]:
    bsmt_columns = table.columns[table.columns.str.contains('Bsmt')]
    table[bsmt_columns] = table[bsmt_columns].fillna("None")

8. Electrical, Functional and Kitchen Quality 

   We will see the distribution of each value in these features.

In [None]:
# Relative frequency of every value of each column in the combined frame.
for metrics in ['Electrical','Functional','KitchenQual']:
    table = data[metrics].value_counts(normalize=True)
    sns.barplot(x=table.index,y=table.values)
    plt.title("Distribution plot of "+ metrics)
    plt.show()
    print("\n")

These three metrics will be filled with the mode values.

In [None]:
# Fill the missing values with fixed categories (train: Electrical; test:
# Functional and KitchenQual). Assign back instead of chained
# fillna(..., inplace=True), which can act on a temporary copy of the column.
house_df['Electrical'] = house_df['Electrical'].fillna('SBrkr')
house_test['Functional'] = house_test['Functional'].fillna('Typ')
house_test['KitchenQual'] = house_test['KitchenQual'].fillna('TA')

9. Garage 

In [None]:
# Rows of the combined frame with a missing GarageCars value.
data[data['GarageCars'].isnull()]

Similarly, this test row does not have a garage, so we fill GarageArea and GarageCars with 0.

In [None]:
# Missing garage size in the test set becomes 0. Assign back instead of chained
# fillna(..., inplace=True), which can act on a temporary copy of the column.
for garage_col in ['GarageCars', 'GarageArea']:
    house_test[garage_col] = house_test[garage_col].fillna(0)

10. SaleType

In [None]:
# Show the rows with a missing SaleType, then the relative frequency of each
# SaleType value in the combined frame.
display(data[data['SaleType'].isnull()])
table = data['SaleType'].value_counts(normalize=True)
sns.barplot(x=table.index,y=table.values)
plt.title("Distribution plot of SaleType")
plt.show()

For the SaleType column, we will impute the missing data with the mode

In [None]:
# Fill missing SaleType in the test set with 'WD'. Assign back instead of
# chained fillna(..., inplace=True), which can act on a temporary copy.
house_test['SaleType'] = house_test['SaleType'].fillna('WD')

Check all the missing value again

In [None]:
# Recount nulls per feature in train and test after the imputations above,
# keeping only features that still have at least one null in either frame.
train_nulls = house_df.isnull().sum()
test_nulls = house_test[house_df.columns].isnull().sum()
null_df = pd.DataFrame({'Feature': house_df.columns,
                        'Null': train_nulls.values,
                        'Test Null': test_nulls.values})
null_df = null_df[(null_df['Null'] != 0) | (null_df['Test Null'] != 0)].reset_index(drop=True)
# 'Feature' deliberately stays a regular column: the bar plot below addresses it
# with x='Feature'. (The earlier `null_df.set_index('Feature')` call discarded
# its result and therefore did nothing.)
null_df['Total Null'] = null_df['Null'] + null_df['Test Null']
print("-------------------------")
print("Total columns with null:")
print(len(null_df))
print("-------------------------")
print("Total null values:")
print(null_df['Total Null'].sum(axis=0))
print("-------------------------")
# Show the ten features with the most missing values.
sns.set_palette(sns.color_palette("pastel"))
sns.barplot(data=null_df.sort_values(by='Total Null',ascending = False).head(10), x='Feature',y='Total Null')
plt.xticks(rotation = 70)
plt.title("Total Nulls in Feature")
plt.show()

We do not have anything extra to infer these missing columns. Hence, we will treat them as "None" which is not having those items.

In [None]:
# GarageYrBlt is numeric, so its gaps become 0 before the blanket string fill.
# Assign back instead of chained fillna(..., inplace=True), which can act on a
# temporary copy of the column.
for table in [house_df, house_test]:
    table['GarageYrBlt'] = table['GarageYrBlt'].fillna(0)
# Every remaining null in either frame becomes the string "None".
house_df = house_df.fillna("None")
house_test = house_test.fillna("None")

In [None]:
#Check the missing values again 
# Total number of nulls remaining across both frames.
print(house_df.isnull().sum().sum() + house_test.isnull().sum().sum())

In [None]:
# Shift the training index down by one. y_house was taken before 'Id' became
# the index, so it presumably still carries the 0-based row index from
# read_csv; the shift lines house_df up with it for the later index-on-index
# merges. (TODO confirm the CSV's Id values are 1..n.)
# NOTE: this cell is not idempotent - re-running it shifts the index again.
house_df.index = house_df.index - 1

Feature Engineering

In this section, we will look for and modify features so we can perform modelling properly

In [None]:
#Checking y variable
#Distribution plot
# Histogram/density of the target with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept because it accepts fit=norm directly.
sns.distplot(y_house , fit=norm);

# Fitted mean and standard deviation of the target.
(mu, sigma) = norm.fit(y_house)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# non-raw form triggers an invalid-escape warning on newer interpreters. The
# resulting text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# QQ-plot
fig = plt.figure()
res = probplot(y_house, plot=plt)
plt.show()

The first plot is a distribution plot where we compare the distribution of our target variable with a normal distribution.
We can see that our data is right-skewed.

The Q-Q plot compares the quantiles of our target feature against the quantiles of a normal distribution.
We can also easily see the skewness in the target feature there.

We will transform our y variable

In [None]:
#Transform using log
# Natural log of the target; later cells use np.exp to get prices back.
# NOTE: not idempotent - re-running this cell takes the log a second time.
y_house = np.log(y_house)

In [None]:
#See the distribution and QQ-plot again
# Same plots as before, now on the log-transformed target.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept because it accepts fit=norm directly.
sns.distplot(y_house , fit=norm);
(mu, sigma) = norm.fit(y_house)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# non-raw form triggers an invalid-escape warning on newer interpreters. The
# resulting text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# QQ-plot
fig = plt.figure()
res = probplot(y_house, plot=plt)
plt.show()

We can now see the distribution plot is much closer to a normal distribution.

The Q-Q plot also shows that the quantiles of our target feature and the quantiles of a normal distribution are much closer now.

Feature creation

In this short section we will construct some new (important) features from existing features that can be fed into our model later on. There are many ways to increase our data, one of them is through creating combinations or ratio from the most relevant variables from the raw data.

I will add only a few extra features related to square-feet as I think the size of a house will be the main factor of its price.

We also transform some features that are supposed to be categorical but are stored as numerical because their values consist of numbers.

In [None]:
#Find the total square feet
# TotalSF = basement + first floor + second floor, added to both frames.
for table in [house_df, house_test]:
    table['TotalSF'] = table['TotalBsmtSF'] + table['1stFlrSF'] + table['2ndFlrSF']

In [None]:
#change some datatype to string
# These columns hold codes rather than quantities, so store them as strings.
code_like_columns = ['MSSubClass', 'YrSold', 'MoSold']
for table in [house_df,house_test]:
    for code_col in code_like_columns:
        table[code_col] = table[code_col].astype(str)

Feature Encoding Round 1 (Ordinal)

Many machine learning models prefer or can only work with numerical values. It is common practice to transform the categorical values into numerical.

There are many ways to transform the features, one of which is ordinal encoding. We use this method whenever a feature has a natural order (A is better than B) so that we retain the information about that order.

In [None]:
# Shared level orderings: a larger number means a better/more finished level.
qual_dict = {'None': 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
bsmt_fin_dict = {'None': 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}

# Column -> {label: ordinal code}. Labels missing from a mapping become NaN,
# as with any Series.map call.
ordinal_maps = {
    "ExterQual": qual_dict,
    "ExterCond": qual_dict,
    "BsmtQual": qual_dict,
    "BsmtCond": qual_dict,
    "PoolQC": qual_dict,
    "HeatingQC": qual_dict,
    "KitchenQual": qual_dict,
    "FireplaceQu": qual_dict,
    "GarageQual": qual_dict,
    "GarageCond": qual_dict,
    "BsmtExposure": {'None': 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    "BsmtFinType1": bsmt_fin_dict,
    "BsmtFinType2": bsmt_fin_dict,
    "Functional": {'None': 0, "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
                   "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8},
    "GarageFinish": {'None': 0, "Unf": 1, "RFn": 2, "Fin": 3},
    "Fence": {'None': 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4},
    "CentralAir": {'N': 0, "Y": 1},
    "PavedDrive": {'N': 0, "P": 1, "Y": 2},
    "Street": {'Grvl': 0, "Pave": 1},
    "Alley": {'None': 0, "Grvl": 1, "Pave": 2},
    "LandSlope": {'Gtl': 0, "Mod": 1, "Sev": 2},
    "LotShape": {'Reg': 0, "IR1": 1, "IR2": 2, "IR3": 3},
}

for table in [house_df,house_test]:
    for ordinal_col, level_codes in ordinal_maps.items():
        table[ordinal_col] = table[ordinal_col].map(level_codes)

# Columns treated as already encoded (the mapped ones plus the two Overall* ratings).
modified_cols = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual',
                 'FireplaceQu','GarageQual','GarageCond','BsmtExposure','BsmtFinType1',
                 'BsmtFinType2', 'Functional','GarageFinish','Fence','Street','Alley','LandSlope',
                 'PavedDrive' ,'CentralAir','PoolQC','OverallQual','OverallCond','LotShape']

# Remaining object-dtype columns of the training frame still need encoding.
object_cols = [name for name, dtype in house_df.dtypes.items()
               if dtype == 'object' and name not in modified_cols]

We can still go further to simplify our features. When the feature is highly skewed, we can group some values into "Others" to reduce the number of columns when we use one-hot encoding later on.

So, we will plot the distributions of the features and see how we should simplify them.

In [None]:
# Training features joined to the target on the index; SalePrice is converted
# back from log scale so the tables and plots show actual prices.
data = house_df.merge(y_house, left_index=True, right_index=True)
data['SalePrice'] = np.exp(data['SalePrice'])

for col in object_cols:
    n_levels = data[col].nunique()
    if n_levels <= 1:
        # A single-valued column has nothing to compare.
        continue
    print("\nSummary statistics and graph for "+ col)
    display(data.groupby(col)['SalePrice'].describe())
    fig, (count_ax, swarm_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    sns.countplot(data = data, x=col, ax= count_ax)
    count_ax.title.set_text("Count plot of " + col)
    sns.swarmplot(data=data,x=col,y='SalePrice', ax= swarm_ax)
    swarm_ax.title.set_text("Swarm plot of " + col +" versus Sale Price")
    if n_levels >= 15:
        # Many categories: tilt the tick labels so they stay readable.
        count_ax.tick_params('x',labelrotation=70)
        swarm_ax.tick_params('x',labelrotation=70)
    fig.tight_layout()
    plt.show()

We can see that many of the features are highly skewed and some feature value counts are very low.
Hence, we will just group them as "Others". For features that only have two value, we will also just do the manual one-hot encoding here.

Those that have more than two unique values will be one-hot encoded below.

In [None]:
# Values kept as their own category; everything else is folded into "Others".
cond_1_keep = ['Norm','Feedr','Artery']
roof_style_keep = ['Gable','Hip']
foundation_keep = ['PConc','CBlock','BrkTil']
garage_keep = ['Attchd','Detchd','BuiltIn']
sale_keep = ['WD','New','COD']
sale_cond_keep = ['Normal','Abnorml','Partial']
peak_months = ['5','6','7']
lot_config_keep = ['Inside','Corner','CulDSac']
unfinished_style = ['1.5Unf','2.5Unf']
exter_remove = ['AsphShn','BrkComm','CBlock','ImStucc','Stone']

# Column -> the single value encoded as 1 (every other value becomes 0).
binary_flag_values = {
    'LandContour': 'Lvl',
    'Condition2': 'Norm',
    'RoofMatl': 'CompShg',
    'Heating': 'GasA',
    'Electrical': 'SBrkr',
}
# Column -> list of values to keep; the rest become "Others".
keep_lists = {
    'Condition1': cond_1_keep,
    'RoofStyle': roof_style_keep,
    'Foundation': foundation_keep,
    'GarageType': garage_keep,
    'SaleType': sale_keep,
    'SaleCondition': sale_cond_keep,
    'LotConfig': lot_config_keep,
}

for table in [house_df,house_test]:
    # Manual one-hot for the two-level features. The flags are stored as int
    # rather than left as object-dtype columns holding 0/1.
    for flag_col, kept_value in binary_flag_values.items():
        table[flag_col] = (table[flag_col] == kept_value).astype(int)

    for grouped_col, kept_values in keep_lists.items():
        table.loc[~table[grouped_col].isin(kept_values), grouped_col] = "Others"

    # For the exterior columns the listed rare materials are the ones folded away.
    for exterior_col in ['Exterior1st', 'Exterior2nd']:
        table.loc[table[exterior_col].isin(exter_remove), exterior_col] = "Others"

    # MoSold was converted to string earlier, hence the string month list.
    table['PeakMonths'] = table['MoSold'].isin(peak_months).astype(float)

    # 1 when the house style is one of the unfinished styles. The previous
    # version negated this mask, so the flag was 1 for the *finished* styles.
    table['Unfinished'] = table['HouseStyle'].isin(unfinished_style).astype(float)
    table['IsSplit'] = table['HouseStyle'].isin(['SFoyer','SLvl']).astype(float)
    # Both flags are derived before HouseStyle is replaced by its level code.
    table["HouseStyle"] = table["HouseStyle"].map(
        {'SFoyer': 0, "SLvl": 0, "1Story": 1, "1.5Fin": 2, "1.5Unf": 2, "2Story": 3,
         "2.5Fin": 4, "2.5Unf": 4})

    # In-place so the frame referenced by `table` itself is changed.
    table.drop('Utilities', axis = 1 , inplace = True)


# These columns are now numeric (or dropped), so they no longer need one-hot encoding.
modified_cols_round_2 = ['HouseStyle','LandContour','Condition2','RoofMatl','Heating',
                         'Electrical','Utilities']
object_cols = [x for x in object_cols if x not in modified_cols_round_2]

Feature Clustering

Before we go on to one-hot encode our categorical features. We can see that some of the features still have a lot of unique values.

This will cause our final training data to have a lot of columns, as each unique value will be encoded into one extra column. So we need to simplify these features using clusters.

To do that, we will use an unsupervised learning method which is K-Means to identify suitable clusters.

For neighborhoods, I intend to group them into 5 clusters and subclasses I will group them into 4 clusters.

To do that, we try to provide K-Means with as many information regarding the feature that we want to cluster as possible. We will use .describe() to include the various statistics regarding the feature and feed it into the model.

In [None]:
# Per-neighborhood summary statistics of SalePrice (the describe() columns),
# used as the clustering input below.
neighborhood = data.groupby(['Neighborhood'])['SalePrice'].describe()
display(neighborhood.head())

In [None]:
# Cluster the neighborhoods into 5 groups on their price statistics; the fixed
# random_state keeps the assignment reproducible.
neighborhood_cluster = KMeans(n_clusters=5, random_state = 927)
neighborhood_cluster.fit(neighborhood)

In [None]:
# One row per neighborhood: its name, mean sale price and assigned cluster label.
neigh_rows = zip(list(neighborhood.index),
                 list(neighborhood.loc[:,'mean']),
                 list(neighborhood_cluster.labels_))
neigh_cluster_table = pd.DataFrame(
    neigh_rows, columns = ['Neighborhood', 'MeanSalePrice', 'Neighborhood Cluster'])
# List the members of each cluster.
neigh_members = neigh_cluster_table.groupby('Neighborhood Cluster')['Neighborhood'].unique()
for i in range(len(neigh_members)):
    print("Cluster " + str(i))
    print(neigh_members[i])
sns.scatterplot(data = neigh_cluster_table, x='Neighborhood',y = 'MeanSalePrice', hue='Neighborhood Cluster',palette=sns.color_palette("Set2",5))
plt.xticks(rotation=70)
plt.show()

In [None]:
# Per-MSSubClass summary statistics of SalePrice (the describe() columns),
# used as the clustering input below.
subclass = data.groupby(['MSSubClass'])['SalePrice'].describe()
display(subclass.head())

In [None]:
# Cluster the subclasses into 4 groups on their price statistics; the fixed
# random_state keeps the assignment reproducible.
subclass_cluster = KMeans(n_clusters=4, random_state = 927)
subclass_cluster.fit(subclass)

In [None]:
# One row per MSSubClass: its code, mean sale price and assigned cluster label.
mssub_rows = zip(list(subclass.index),
                 list(subclass.loc[:,'mean']),
                 list(subclass_cluster.labels_))
mssub_cluster_table = pd.DataFrame(
    mssub_rows, columns = ['MSSubClass','MeanSalePrice','MSSubClass Cluster'])
# List the members of each cluster.
mssub_members = mssub_cluster_table.groupby('MSSubClass Cluster')['MSSubClass'].unique()
for i in range(len(mssub_members)):
    print("Cluster " + str(i))
    print(mssub_members[i])
sns.scatterplot(data = mssub_cluster_table, x='MSSubClass',y = 'MeanSalePrice', hue='MSSubClass Cluster',palette=sns.color_palette("Set2",4))
plt.xticks(rotation=70)
plt.show()

In [None]:
# The mean price was only needed for plotting; keep just the key and the
# cluster label so the tables can be merged as lookups.
mssub_cluster_table = mssub_cluster_table.drop(columns='MeanSalePrice')
neigh_cluster_table = neigh_cluster_table.drop(columns='MeanSalePrice')

In [None]:
# Attach both cluster labels to the training frame, then drop the raw columns
# they replace. The index is turned into the 'Id' column for the merges and
# restored afterwards.
house_df = (
    house_df.reset_index()
    .merge(mssub_cluster_table, how='left', on='MSSubClass')
    .merge(neigh_cluster_table, how='left', on='Neighborhood')
    .set_index('Id')
    .drop(['MSSubClass', 'Neighborhood'], axis=1)
)

In [None]:
# Attach both cluster labels to the test frame, then drop the raw columns they
# replace. With how='left', a test value that is absent from a cluster table
# ends up with a missing cluster label.
house_test = (
    house_test.reset_index()
    .merge(mssub_cluster_table, how='left', on='MSSubClass')
    .merge(neigh_cluster_table, how='left', on='Neighborhood')
    .set_index('Id')
    .drop(['MSSubClass', 'Neighborhood'], axis=1)
)

After merging the clusters into our training and test dataset, we check again the remaining categorical variables that we want to one-hot encode.

In [None]:
# Record the two raw columns as handled.
# NOTE: appends in place, so re-running this cell adds duplicate entries.
modified_cols.append('MSSubClass')
modified_cols.append('Neighborhood')

In [None]:
# Swap the raw columns for their cluster labels in the list of columns to
# one-hot encode.
# NOTE: not idempotent - on a second run the remove() calls raise ValueError
# because the raw names are already gone.
object_cols.append('MSSubClass Cluster')
object_cols.append('Neighborhood Cluster')
object_cols.remove('MSSubClass')
object_cols.remove('Neighborhood')

Next, we perform one-hot encoding to the remaining categorical variables

In [None]:
# One Hot Encoding for Other Columns
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(house_df[object_cols]))
OH_cols.index = house_df.index
OH_cols.columns = OH_encoder.get_feature_names(object_cols)
house_df = house_df.drop(object_cols, axis=1)
house_df = pd.concat([house_df, OH_cols], axis=1)

OH_cols = pd.DataFrame(OH_encoder.transform(house_test[object_cols]))
OH_cols.index = house_test.index
OH_cols.columns = OH_encoder.get_feature_names(object_cols)
house_test = house_test.drop(object_cols, axis=1)
house_test = pd.concat([house_test, OH_cols], axis=1)

Feature Transformation (Skewed Features)

We should also take care of the skewness of the features in our dataset. We use skew() from the scipy.stats module to identify which columns are skewed.

Any skewness greater than 0.5 is considered at least slightly skewed, hence we will log-transform every feature whose skewness is greater than that.

In [None]:
# Skewness is only measured on columns that are neither one-hot indicators nor
# ordinal/categorical encodings.
not_continuous = list(OH_cols.columns) + modified_cols + object_cols
continuous_cols = house_df.columns[~house_df.columns.isin(not_continuous)]
skewness = house_df[continuous_cols].apply(lambda x: skew(x.dropna().astype(float)))
# Names of the columns whose skewness exceeds 0.5.
skewed = skewness[skewness > 0.5].index

# log1p (log(1 + x)) so that zero values stay finite; the same columns are
# transformed in both frames.
house_df[skewed] = np.log1p(house_df[skewed])
house_test[skewed] = np.log1p(house_test[skewed])

Feature Scaling

While log-transformation took care of the skewness in the features, we also need to scale the features to standardize their range.

Of the many scaling choices such as MinMaxScaler, StandardScaler, we picked RobustScaler.

The reasoning behind this is because we have seen that our data seems to be quite skewed and it will tend to have more outliers than a normal dataset. Using a RobustScaler can deal with that easily as it uses statistics that are insensitive to outliers to scale the data.

A robust scaler subtracts the median and divides by the interquartile range, both of which are not affected by outliers.

In [None]:
# Scale each remaining continuous column with its own RobustScaler, fitted on
# the training frame and then applied to the test frame.
unscaled_cols = list(OH_cols.columns) + modified_cols + object_cols
for col in house_df.columns:
    if col in unscaled_cols:
        continue
    scaler = RobustScaler()
    house_df[col] = scaler.fit_transform(house_df[[col]])
    house_test[col] = scaler.transform(house_test[[col]])

Feature Selection

Feature selection is a simple way to remove redundant and irrelevant features from our dataset, some of which contribute close to nothing.
Removing the irrelevant features can improve learning accuracy and greatly reduces the computation time.

By removing redundant data, we can reduce the chance of our model overfitting to the data too.

There are some ways to perform features selection and some of which we surely studied before such as the Pearson’s Correlation and Analysis of Variance (ANOVA). In this notebook, we will utilize the mutual info regression to estimate the dependency of the variables with our target variable.

Mutual information is a non-negative value and it shows the dependency between the variables. Meaning a mutual information of 0 will be saying that both of the features are completely independent. Hence, it is a safe bet for us to remove them off. Note the other name of mutual information is information gain (you may have heard it before).

Mutual information measures the amount of information one can obtain from one random variable given another. Source : Data Mining: Practical Machine Learning Tools and Techniques, 4th edition, 2016.

In [None]:
# Join features and (log) target on the index so both cover the same rows.
data = pd.merge(left = house_df, right = y_house , left_index= True, right_index = True)
mi_features = data.drop('SalePrice', axis = 1)
# mutual_info_regression is a randomised estimator; seeding it keeps the scores,
# and therefore the features dropped and selected below, the same on every run.
# 927 is the seed already used for KMeans in this notebook.
mi = mutual_info_regression(X = mi_features, y = data['SalePrice'], random_state = 927)
# Pair each score with the column it was computed for (taken from the feature
# matrix itself, not from a frame that still contains the target).
mi_df = pd.DataFrame({'Feature': mi_features.columns, 'Mutual Info': mi})
mi_df = mi_df.sort_values('Mutual Info',ascending=False)

In [None]:
# Features whose estimated mutual information with the target is exactly zero.
zero_mi_mask = mi_df['Mutual Info'].abs() == 0
low_mi_df = mi_df[zero_mi_mask]
filter_feature = sorted(low_mi_df['Feature'].tolist())
print("Number of low correlated features dropped: " + str(len(filter_feature)))
# Drop them from both frames so train and test keep identical columns.
house_df = house_df.drop(columns=filter_feature)
house_test = house_test.drop(columns=filter_feature)

Polynomial and Interaction Features

Another part of feature creation ! In this part, we create new polynomial and interaction features from the high mutual information features to derive new combinations that might be useful to our model later on.

Polynomial features can allow our linear models to grasp on the non-linearity of the features and we can also see if there is some new interesting relationships between the features themselves by introducing interaction features.

We can actually generate polynomial and interaction features from all of our features (quite large) and further cherry pick the good features. There may be hidden interesting relationship to be uncovered there but I am quite satisfied with only using the highly depended features.

To read more about interaction features: https://stattrek.com/multiple-regression/interaction.aspx

In [None]:
# The 20 features with the highest mutual information (mi_df is sorted descending).
top_mi_list = mi_df['Feature'].head(20).tolist()
top_mi_subset = house_df[top_mi_list]
index_copy = top_mi_subset.index

# Degree-2 interaction terms only (no squared terms).
poly = PolynomialFeatures(2, interaction_only=True)
poly_matrix = poly.fit_transform(top_mi_subset)
poly_features = pd.DataFrame(poly_matrix, columns=poly.get_feature_names_out(top_mi_list))
# Skip the leading bias column and the 20 original features; keep the pairwise products.
n_leading_terms = len(top_mi_list) + 1
poly_features = poly_features.iloc[:, n_leading_terms:]
# Restore the training index so rows align with y_house.
poly_features.index = index_copy
poly_and_price = pd.concat([y_house,poly_features],axis=1).dropna()
# Rank by absolute correlation with the target; position 0 is SalePrice itself,
# so positions 1-20 are the 20 strongest interaction terms.
corr_with_price = poly_and_price.corr()['SalePrice'].abs().sort_values(ascending=False)
top_20_poly = corr_with_price[1:21]

In [None]:
# Append the 20 selected interaction columns to the training frame.
house_df = pd.concat([house_df,poly_features[top_20_poly.index]],axis=1)

In [None]:
# Build the same interaction terms for the test frame with the already fitted
# transformer, then append the 20 columns selected on the training data.
top_mi_subset = house_test[top_mi_list]
index_copy = top_mi_subset.index
poly_matrix = poly.transform(top_mi_subset)
poly_features = pd.DataFrame(poly_matrix, columns=poly.get_feature_names_out(top_mi_list))
# Skip the leading bias column and the original features.
n_leading_terms = len(top_mi_list) + 1
poly_features = poly_features.iloc[:, n_leading_terms:]
poly_features.index = index_copy
house_test = pd.concat([house_test,poly_features[top_20_poly.index]],axis=1)

In [None]:
# Names of the 20 interaction features that were added.
top_20_poly.index

Outlier Identification

Outliers: the one thing that statistics textbooks like to assume are normal.

Too bad they usually are not. A bad outlier case increases the variance in our model and reduces its power to fit the data. Outliers cause regression models (especially linear ones) to learn a skewed relationship that leans towards the outlier.

Isolation Forest, much like its name suggests, works by isolating individual points. It randomly samples data on randomly selected features and portrays them in a binary decision tree structure. An outlier needs fewer splits in the forest to be isolated. Conversely, a data point that is not an outlier requires many more splits to be isolated.

In [None]:
from sklearn.ensemble import IsolationForest

# Flag multivariate outliers in the training features with an Isolation Forest.
iso_forest = IsolationForest(random_state=0)

# Label every training row and index the labels by house_df itself.
# (The previous version used `data.index`, but `data` is only created in a later
# cell, so this cell failed -- or silently used a stale index -- on a fresh run.)
house_df_without_outlier = pd.Series(iso_forest.fit_predict(house_df), index=house_df.index)

# Keep only the rows labelled 1 (the value this notebook treats as "not an outlier").
house_df = house_df.loc[house_df_without_outlier.index[house_df_without_outlier == 1], :]

Another way to identify outliers is to use standardized residuals from a linear model. Standardized residuals make abnormal residuals easy to spot because they are expressed in standard-deviation units. Anything larger than 3 standard deviations in absolute value is usually considered an outlier.

In [None]:
# Fit a plain linear regression on the training data and collect its residuals,
# which the next cells standardize to detect outliers.
# NOTE(review): the merge joins on index labels; confirm y_house shares house_df's index.
data = pd.merge(left=house_df, right=y_house, left_index=True, right_index=True)
linear = LinearRegression()
Y = data['SalePrice']
X = data.drop(['SalePrice'], axis=1)
linear.fit(X, Y)
Y_hat = linear.predict(X)
residuals = Y - Y_hat
y_vs_yhat_df = pd.DataFrame(
    {'y': Y.values, 'yhat': Y_hat, 'residuals': residuals.values},
    index=data.index,
)

r2 = r2_score(Y, Y_hat)
print("About " + str(round(r2 * 100,2)) + 
      "% of variation in the Sale Price can be explained by the model.")

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while keywords work on old and new versions alike.
sns.scatterplot(x=Y, y=Y_hat)
# Reference diagonal (y == yhat) over the hard-coded range 10.5-13.5
# (presumably the range of the log-transformed price -- TODO confirm).
sns.lineplot(x=np.linspace(10.5,13.5), y=np.linspace(10.5,13.5), color='black', linewidth=2.5)
plt.show()

In [None]:
# Express every residual in standard-deviation units.
standard_residuals = residuals.sub(residuals.mean()).div(residuals.std())

# Rows whose standardized residual is more than 3 SD away from zero.
outliers = data.loc[standard_residuals.abs() > 3]

# Record the flag next to y / yhat: 1 for outliers, 0 for everything else.
flagged = y_vs_yhat_df.index.isin(outliers.index)
y_vs_yhat_df.loc[flagged, 'Outlier'] = 1
y_vs_yhat_df.loc[~flagged, 'Outlier'] = 0

In [None]:
# Actual vs. predicted values, coloured by the 'Outlier' flag.
sns.scatterplot(data=y_vs_yhat_df, x='y', y='yhat', hue='Outlier', palette=['blue','red'])
# Reference diagonal (y == yhat). x/y are passed by keyword because recent
# seaborn releases no longer accept them positionally.
sns.lineplot(x=np.linspace(10.5,13.5), y=np.linspace(10.5,13.5), color='black', linewidth=2.5)
plt.show()

In [None]:
# Keep only the training rows that were not flagged as residual outliers.
non_outlier_index = y_vs_yhat_df.index[y_vs_yhat_df['Outlier'] == 0]
house_df = house_df.loc[non_outlier_index]

In [None]:
# Columns that hold a single distinct value in the test set are removed from
# both the training and the test frames.
constant_test_cols = list(house_test.columns[house_test.nunique() == 1])
house_df = house_df.drop(columns=constant_test_cols)
house_test = house_test.drop(columns=constant_test_cols)

Modelling

For this part, we will be using Ridge, XGB, Catboost, SVR, Huber and a Stacked regression.

The predictions of the models will later be averaged (an ensemble model), and we will also implement a stacked regressor at the same time.

Stacked regressor is a type of Level 1 ensemble model that generalizes the predictions made by different models to get the final output. 

In [None]:
# Assemble the full training set (features + target).
# NOTE(review): the merge joins on index labels; confirm y_house shares house_df's index.
data = pd.merge(left=house_df, right=y_house, left_index=True, right_index=True)
train_y = data['SalePrice']
train_X = data.drop(['SalePrice'], axis=1)

# Hold out 20% of the rows as a development test set. The seed is fixed so that
# the split -- and every score reported on it -- is reproducible across runs.
dev_train, dev_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=0)
dev_train_y = dev_train['SalePrice']
dev_train_X = dev_train.drop(['SalePrice'], axis=1)
dev_test_y = dev_test['SalePrice']
dev_test_X = dev_test.drop(['SalePrice'], axis=1)

In [None]:
# Base learners with fixed, hand-set hyperparameters.
ridgemodel = Ridge(alpha=26)

xgbmodel = XGBRegressor(
    alpha=3,
    colsample_bytree=0.5,
    reg_lambda=3,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=10000,
    subsample=0.65,
)

svrmodel = SVR(C=8, epsilon=0.00005, gamma=0.0008)

hubermodel = HuberRegressor(
    alpha=30,
    epsilon=3,
    fit_intercept=True,
    max_iter=2000,
)

cbmodel = cb.CatBoostRegressor(
    loss_function='RMSE',
    colsample_bylevel=0.3,
    depth=2,
    l2_leaf_reg=20,
    learning_rate=0.005,
    n_estimators=15000,
    subsample=0.3,
    verbose=False,
)

# Stacked regressor: the five base learners feed a CatBoost meta-regressor,
# which also receives the original features (use_features_in_secondary=True).
stackmodel = StackingCVRegressor(
    regressors=(ridgemodel, xgbmodel, svrmodel, hubermodel, cbmodel),
    meta_regressor=cbmodel,
    use_features_in_secondary=True,
)

We will first fit the models on the development training set and evaluate them on the development test set to get a quick overview of model performance.

In [None]:
# Fit every model on the development training split only, so that the
# development test split stays unseen and its scores are a fair estimate.
# (The previous version fitted on the full train_X / train_y, which contains
# the development test rows, and fitted the stacked model twice.)
start = time.time()
print("Recording Modelling Time")
for model in [ridgemodel,hubermodel,cbmodel,svrmodel,xgbmodel,stackmodel]:
    if model is stackmodel:
        # The stacked regressor is given plain numpy arrays, as in the original cell.
        model.fit(np.array(dev_train_X), np.array(dev_train_y))
    else:
        model.fit(dev_train_X, dev_train_y)
end = time.time()
print("Time Elapsed: " + str(round((end - start)/60,0)) +"minutes.")

In [None]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Computed as sqrt(MSE) instead of mean_squared_error(..., squared=False),
    because the `squared` argument is deprecated in recent scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _ensemble_predict(models, X):
    """Return the unweighted average of the predictions of `models` on X."""
    return sum(model.predict(X) for model in models) / len(models)


# The labels say RMSLE; that is only accurate if SalePrice was log-transformed
# earlier in the notebook -- TODO confirm.
all_models = [ridgemodel,hubermodel,cbmodel,svrmodel,xgbmodel,stackmodel]

print("-----------------------------")
print("Overview of model performance")
print("-----------------------------")
for i in all_models:
    print("\n")
    print(i)
    print("RMSLE of Development train set: ")
    print(_rmse(dev_train_y, i.predict(dev_train_X)))
    print("RMSLE of Development test set: ")
    print(_rmse(dev_test_y, i.predict(dev_test_X)))
    print("\n")
print("-----------------------------")
# The ensemble score is computed on the development train split so that it
# matches its label (it was previously computed on the full training data).
print("RMSLE of Development train set using ensemble model: ")
fit = _ensemble_predict(all_models, dev_train_X)
print(_rmse(dev_train_y, fit))
# Held-out score of the same averaged ensemble.
print("RMSLE of Development test set using ensemble model: ")
print(_rmse(dev_test_y, _ensemble_predict(all_models, dev_test_X)))
print("-----------------------------")

This time we fit the models with all the data.

In [None]:
# Refit every model on the complete training data.
# Each model is fitted exactly once; the previous version fitted the stacked
# regressor twice (DataFrame, then numpy arrays) and discarded the first fit.
start = time.time()
print("Recording Modelling Time")
for model in [ridgemodel,hubermodel,cbmodel,svrmodel,xgbmodel,stackmodel]:
    if model is stackmodel:
        # The stacked regressor is given plain numpy arrays, as in the original cell.
        model.fit(np.array(train_X), np.array(train_y))
    else:
        model.fit(train_X, train_y)
end = time.time()
print("Time Elapsed: " + str(round((end - start)/60,0)) +"minutes.")

References

https://www.kaggle.com/code/limyenwee/stacked-ensemble-models-top-3-on-leaderboard by **YEN WEE LIM**