# Housing Sale Price Prediction : Advanced Regression Assignment

## Steps
1. Reading and Understanding the data
2. Data Inspection and Missing Value Treatment
3. Data Analysis and Visualization
4. Preparing the data for Modelling
    - Dummy Encoding
    - Train - Test split
    - Rescaling
5. Training the Model
6. Regularization and Tuning
7. Residual Analysis
8. Predictions and Evaluations on the test set

### Step 1: Reading and understanding the data

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the data
# In the columns listed below the literal string "NA" is an explicitly defined category,
# not a missing value. Parsing them through a `str` converter keeps "NA" as text instead
# of letting pandas turn it into NaN.
DATA_URL = 'https://ml-course3-upgrad.s3.amazonaws.com/Assignment_+Advanced+Regression/train.csv'
cols_custom_read = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
custom_converters = dict.fromkeys(cols_custom_read, str)

housing_data = pd.read_csv(DATA_URL, converters=custom_converters)
housing_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Understand the shape of the data
housing_data.shape

(1460, 81)

#### Observations
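- The dataset has only 1460 rows, which is small relative to the number of columns
- There are 81 columns: the `Id` identifier, the target `SalePrice`, and 79 candidate independent variables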

In [5]:
# Understand the data types, presence of missing values
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Step 2: Data Inspection and Missing Value Treatment

In [6]:
continuous_vars = []
categorical_vars = []

##### MSSubClass

In [7]:
housing_data['MSSubClass'].value_counts()

20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: MSSubClass, dtype: int64

In [8]:
print(housing_data['MSSubClass'].dtypes)

int64


In [9]:
housing_data['MSSubClass'].isna().sum()
# No Missing value

0

In [10]:
#Since this is described as Categorical Variable in Data dictionary, convert to Object. Avoiding to map them to actual description otherwise it will be too verbose
housing_data['MSSubClass'] = housing_data['MSSubClass'].astype("object")
categorical_vars.append('MSSubClass')

##### MSZoning

In [11]:
housing_data['MSZoning'].value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

- `C (all)` is not listed in the data dictionary, but let's assume it is a valid category and proceed

In [12]:
print(housing_data['MSZoning'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [13]:
housing_data['MSZoning'].isna().sum()
# No Missing value

0

In [14]:
categorical_vars.append('MSZoning')

##### LotFrontage

In [15]:
housing_data['LotFrontage'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])

## Looks like data is skewed towards max

count    1201.000000
mean       70.049958
std        24.284752
min        21.000000
25%        59.000000
50%        69.000000
75%        80.000000
90%        96.000000
99%       141.000000
100%      313.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [16]:
print(housing_data['LotFrontage'].dtypes)
# Continuous variable and dtype is float so no need to convert

float64


In [17]:
housing_data['LotFrontage'].isna().sum()
# There are missing values

259

In [18]:
# Impute missing LotFrontage with the median: the mean would be heavily skewed by the
# outliers in this column (99th percentile is 141 while the max is 313).
median_lot_frontage = housing_data['LotFrontage'].median()
# Assign the filled Series back to the column. `fillna(..., inplace=True)` on a column
# selection is chained assignment: pandas warns about it and, with copy-on-write enabled,
# it only modifies a temporary object and leaves housing_data unchanged.
housing_data['LotFrontage'] = housing_data['LotFrontage'].fillna(median_lot_frontage)
continuous_vars.append('LotFrontage')

##### LotArea

In [19]:
housing_data['LotArea'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])
## Looks like data is skewed towards max

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
90%       14381.700000
99%       37567.640000
100%     215245.000000
max      215245.000000
Name: LotArea, dtype: float64

In [20]:
print(housing_data['LotArea'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [21]:
housing_data['LotArea'].isna().sum()
# No Missing value

0

In [22]:
continuous_vars.append('LotArea')

##### Street

In [23]:
housing_data['Street'].value_counts()

Pave    1454
Grvl       6
Name: Street, dtype: int64

In [24]:
print(housing_data['Street'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [25]:
housing_data['Street'].isna().sum()
# No Missing value

0

In [26]:
categorical_vars.append('Street')

##### Alley

In [27]:
housing_data['Alley'].value_counts()

NA      1369
Grvl      50
Pave      41
Name: Alley, dtype: int64

In [28]:
print(housing_data['Alley'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [29]:
housing_data['Alley'].isna().sum()
# No Missing Values

0

In [30]:
# Map NA to NoAlley for better description (an exact-match replace; other values are untouched)
housing_data['Alley'] = housing_data['Alley'].replace('NA', 'NoAlley')
categorical_vars.append('Alley')
# value_counts() must be the last expression of the cell, otherwise the notebook
# evaluates it and throws the result away without displaying it.
housing_data['Alley'].value_counts()

##### LotShape

In [31]:
housing_data['LotShape'].value_counts()

Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64

In [32]:
print(housing_data['LotShape'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [33]:
housing_data['LotShape'].isna().sum()
# No Missing Values

0

In [34]:
categorical_vars.append('LotShape')

##### LandContour

In [35]:
housing_data['LandContour'].value_counts()

Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64

In [36]:
print(housing_data['LandContour'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [37]:
housing_data['LandContour'].isna().sum()
# No Missing Values

0

In [38]:
categorical_vars.append('LandContour')

##### Utilities

In [39]:
housing_data['Utilities'].value_counts()

AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64

In [40]:
print(housing_data['Utilities'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [41]:
housing_data['Utilities'].isna().sum()
# No Missing Values

0

In [42]:
categorical_vars.append('Utilities')

##### LotConfig

In [43]:
housing_data['LotConfig'].value_counts()

Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64

In [44]:
print(housing_data['LotConfig'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [45]:
housing_data['LotConfig'].isna().sum()
# No Missing Values

0

In [46]:
categorical_vars.append('LotConfig')

##### LandSlope

In [47]:
housing_data['LandSlope'].value_counts()

Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64

In [48]:
print(housing_data['LandSlope'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [49]:
housing_data['LandSlope'].isna().sum()
# No Missing Values

0

In [50]:
categorical_vars.append('LandSlope')

##### Neighborhood

In [51]:
housing_data['Neighborhood'].value_counts()

NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
SWISU       25
StoneBr     25
Blmngtn     17
MeadowV     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64

In [52]:
print(housing_data['Neighborhood'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [53]:
housing_data['Neighborhood'].isna().sum()
# No Missing Values

0

In [54]:
categorical_vars.append('Neighborhood')

##### Condition1

In [55]:
housing_data['Condition1'].value_counts()

Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: Condition1, dtype: int64

In [56]:
print(housing_data['Condition1'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [57]:
housing_data['Condition1'].isna().sum()
# No Missing Values

0

In [58]:
categorical_vars.append('Condition1')

##### Condition2

In [59]:
housing_data['Condition2'].value_counts()

Norm      1445
Feedr        6
RRNn         2
Artery       2
PosN         2
PosA         1
RRAe         1
RRAn         1
Name: Condition2, dtype: int64

In [60]:
print(housing_data['Condition2'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [61]:
housing_data['Condition2'].isna().sum()
# No Missing Values

0

In [62]:
categorical_vars.append('Condition2')

##### BldgType

In [63]:
housing_data['BldgType'].value_counts()

1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: BldgType, dtype: int64

In [64]:
print(housing_data['BldgType'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [65]:
housing_data['BldgType'].isna().sum()
# No Missing Values

0

In [66]:
categorical_vars.append('BldgType')

##### HouseStyle

In [67]:
housing_data['HouseStyle'].value_counts()

1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64

In [68]:
print(housing_data['HouseStyle'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [69]:
housing_data['HouseStyle'].isna().sum()
# No Missing Values

0

In [70]:
categorical_vars.append('HouseStyle')

##### OverallQual

In [71]:
housing_data['OverallQual'].value_counts()

5     397
6     374
7     319
8     168
4     116
9      43
3      20
10     18
2       3
1       2
Name: OverallQual, dtype: int64

In [72]:
print(housing_data['OverallQual'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert

int64


In [73]:
housing_data['OverallQual'].isna().sum()
# No Missing Values

0

In [74]:
# OverallQual is an ordinal categorical variable, so encode it as integer labels
overall_qual_ratings = housing_data['OverallQual']
housing_data['OverallQual'] = LabelEncoder().fit(overall_qual_ratings).transform(overall_qual_ratings)


##### OverallCond

In [75]:
housing_data['OverallCond'].value_counts()

5    821
6    252
7    205
8     72
4     57
3     25
9     22
2      5
1      1
Name: OverallCond, dtype: int64

In [76]:
print(housing_data['OverallCond'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert

int64


In [77]:
housing_data['OverallCond'].isna().sum()
# No Missing Values

0

In [78]:
# OverallCond is an ordinal categorical variable, so encode it as integer labels
overall_cond_ratings = housing_data['OverallCond']
housing_data['OverallCond'] = LabelEncoder().fit(overall_cond_ratings).transform(overall_cond_ratings)

##### YearBuilt

In [79]:
housing_data['YearBuilt'].value_counts()

2006    67
2005    64
2004    54
2007    49
2003    45
        ..
1906     1
1911     1
1913     1
1917     1
1872     1
Name: YearBuilt, Length: 112, dtype: int64

In [80]:
print(housing_data['YearBuilt'].dtypes)
# Treated as an ordinal variable; dtype is already int, so no dtype conversion is needed
# (the values themselves are re-encoded in the next cell)

int64


In [81]:
# Encode YearBuilt as the number of years since the earliest YearBuilt in the data.
# LabelEncoder (intended for target labels) would replace each year by its rank among the
# distinct years present; with 112 distinct years spread over a longer span, that squeezes
# out the gaps between sparsely populated years. Subtracting the minimum keeps a 0-based
# integer encoding while preserving the real distance in years between houses.
# Re-running the cell is harmless: the minimum is 0 after the first run.
housing_data['YearBuilt'] = housing_data['YearBuilt'] - housing_data['YearBuilt'].min()

In [82]:
housing_data['YearBuilt'].isna().sum()
# No Missing Values

0

##### YearRemodAdd

In [83]:
housing_data['YearRemodAdd'].value_counts()

1950    178
2006     97
2007     76
2005     73
2004     62
       ... 
2010      6
1983      5
1952      5
1986      5
1951      4
Name: YearRemodAdd, Length: 61, dtype: int64

In [84]:
print(housing_data['YearRemodAdd'].dtypes)
# Treated as an ordinal variable; dtype is already int, so no dtype conversion is needed
# (the values themselves are re-encoded in the next cell)

int64


In [85]:
# Encode YearRemodAdd as the number of years since the earliest YearRemodAdd in the data.
# LabelEncoder (intended for target labels) encodes each year by its rank among the distinct
# years present, which only equals the true year offset when no year is missing from the
# range. Subtracting the minimum gives a 0-based integer encoding that always preserves the
# spacing in years. NOTE(review): with 61 distinct values this likely coincides with the
# previous encoding on this data - confirm if exact reproducibility of old results matters.
housing_data['YearRemodAdd'] = housing_data['YearRemodAdd'] - housing_data['YearRemodAdd'].min()

In [86]:
housing_data['YearRemodAdd'].isna().sum()
# No Missing Values

0

##### RoofStyle

In [87]:
housing_data['RoofStyle'].value_counts()

Gable      1141
Hip         286
Flat         13
Gambrel      11
Mansard       7
Shed          2
Name: RoofStyle, dtype: int64

In [88]:
print(housing_data['RoofStyle'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [89]:
housing_data['RoofStyle'].isna().sum()
# No Missing Values

0

##### RoofMatl

In [90]:
housing_data['RoofMatl'].value_counts()

CompShg    1434
Tar&Grv      11
WdShngl       6
WdShake       5
Membran       1
ClyTile       1
Metal         1
Roll          1
Name: RoofMatl, dtype: int64

In [91]:
print(housing_data['RoofMatl'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [92]:
housing_data['RoofMatl'].isna().sum()
# No Missing Values

0

##### Exterior1st

In [93]:
housing_data['Exterior1st'].value_counts()

VinylSd    515
HdBoard    222
MetalSd    220
Wd Sdng    206
Plywood    108
CemntBd     61
BrkFace     50
WdShing     26
Stucco      25
AsbShng     20
Stone        2
BrkComm      2
CBlock       1
AsphShn      1
ImStucc      1
Name: Exterior1st, dtype: int64

In [94]:
print(housing_data['Exterior1st'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [95]:
housing_data['Exterior1st'].isna().sum()
# No Missing Values

0

##### Exterior2nd

In [96]:
housing_data['Exterior2nd'].value_counts()

VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
CBlock       1
Other        1
Name: Exterior2nd, dtype: int64

In [97]:
print(housing_data['Exterior2nd'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [98]:
housing_data['Exterior2nd'].isna().sum()
# No Missing Values

0

##### MasVnrType

In [99]:
housing_data['MasVnrType'].value_counts()

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

In [100]:
print(housing_data['MasVnrType'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [101]:
housing_data['MasVnrType'].isna().sum()

8

In [102]:
# Impute the missing MasVnrType values with the mode (most frequent category).
mode_mas_vnr_type = housing_data['MasVnrType'].mode()[0]
print(f"Mode: {mode_mas_vnr_type}")
# Assign the filled Series back to the column. `fillna(..., inplace=True)` on a column
# selection is chained assignment: pandas warns about it and, with copy-on-write enabled,
# it only modifies a temporary object and leaves housing_data unchanged.
housing_data['MasVnrType'] = housing_data['MasVnrType'].fillna(mode_mas_vnr_type)

Mode: None


##### MasVnrArea

In [103]:
housing_data['MasVnrArea'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])
## Data is skewed towards right

count    1452.000000
mean      103.685262
std       181.066207
min         0.000000
25%         0.000000
50%         0.000000
75%       166.000000
90%       335.000000
99%       791.920000
100%     1600.000000
max      1600.000000
Name: MasVnrArea, dtype: float64

In [104]:
print(housing_data['MasVnrArea'].dtypes)
# Continuous variable and dtype is float, so no need to convert

float64


In [105]:
housing_data['MasVnrArea'].isna().sum()

8

In [106]:
# MasVnrArea and MasVnrType both have 8 missing values; assuming these are the same rows
# (not verified in this notebook - TODO confirm), the imputed area has to be consistent with
# the imputed type. The mode used for MasVnrType is 'None', so the corresponding area is 0.
imputed_mas_vnr_area = 0
# Assign the filled Series back to the column. `fillna(..., inplace=True)` on a column
# selection is chained assignment: pandas warns about it and, with copy-on-write enabled,
# it only modifies a temporary object and leaves housing_data unchanged.
housing_data['MasVnrArea'] = housing_data['MasVnrArea'].fillna(imputed_mas_vnr_area)

##### ExterQual

In [107]:
housing_data['ExterQual'].value_counts()

TA    906
Gd    488
Ex     52
Fa     14
Name: ExterQual, dtype: int64

In [108]:
print(housing_data['ExterQual'].dtypes)
# ExterQual is Ordinal Categorical variable, so it should be label encoded 

object


In [109]:
housing_data['ExterQual'].isna().sum()

0

In [110]:
# Ordinal-encode ExterQual: Po -> 0, Fa -> 1, TA -> 2, Gd -> 3, Ex -> 4
exter_qual_scale = {'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4}
exter_qual_encoded = housing_data['ExterQual'].map(exter_qual_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert exter_qual_encoded.notna().all(), "ExterQual contains values outside the mapping (or is already encoded)"
housing_data['ExterQual'] = exter_qual_encoded
housing_data['ExterQual'].value_counts()

2    906
3    488
4     52
1     14
Name: ExterQual, dtype: int64

##### ExterCond

In [111]:
housing_data['ExterCond'].value_counts()

TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: ExterCond, dtype: int64

In [112]:
print(housing_data['ExterCond'].dtypes)
# ExterCond is Ordinal Categorical variable, so it should be label encoded 

object


In [113]:
housing_data['ExterCond'].isna().sum()

0

In [114]:
# Ordinal-encode ExterCond: Po -> 0, Fa -> 1, TA -> 2, Gd -> 3, Ex -> 4
exter_cond_scale = {'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4}
exter_cond_encoded = housing_data['ExterCond'].map(exter_cond_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert exter_cond_encoded.notna().all(), "ExterCond contains values outside the mapping (or is already encoded)"
housing_data['ExterCond'] = exter_cond_encoded
housing_data['ExterCond'].value_counts()

2    1282
3     146
1      28
4       3
0       1
Name: ExterCond, dtype: int64

##### Foundation

In [115]:
housing_data['Foundation'].value_counts()

PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: Foundation, dtype: int64

In [116]:
print(housing_data['Foundation'].dtypes)
# Categorical variable and dtype is object so no need to convert
categorical_vars.append('Foundation')

object


In [117]:
housing_data['Foundation'].isna().sum()

0

##### BsmtQual

In [118]:
housing_data['BsmtQual'].value_counts()

TA    649
Gd    618
Ex    121
NA     37
Fa     35
Name: BsmtQual, dtype: int64

In [119]:
print(housing_data['BsmtQual'].dtypes)
# BsmtQual is Ordinal Categorical variable, so it should be label encoded 

object


In [120]:
housing_data['BsmtQual'].isna().sum()

0

In [121]:
# Ordinal-encode BsmtQual: NA -> 0, Po -> 1, Fa -> 2, TA -> 3, Gd -> 4, Ex -> 5
# ('NA' is a literal category string here because the column was read with a str converter)
bsmt_qual_scale = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
bsmt_qual_encoded = housing_data['BsmtQual'].map(bsmt_qual_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert bsmt_qual_encoded.notna().all(), "BsmtQual contains values outside the mapping (or is already encoded)"
housing_data['BsmtQual'] = bsmt_qual_encoded
housing_data['BsmtQual'].value_counts()

3    649
4    618
5    121
0     37
2     35
Name: BsmtQual, dtype: int64

##### BsmtCond

In [122]:
housing_data['BsmtCond'].value_counts()

TA    1311
Gd      65
Fa      45
NA      37
Po       2
Name: BsmtCond, dtype: int64

In [123]:
print(housing_data['BsmtCond'].dtypes)
# BsmtCond is Ordinal Categorical variable, so it should be label encoded 

object


In [124]:
housing_data['BsmtCond'].isna().sum()

0

In [125]:
# Ordinal-encode BsmtCond: NA -> 0, Po -> 1, Fa -> 2, TA -> 3, Gd -> 4, Ex -> 5
# ('NA' is a literal category string here because the column was read with a str converter)
bsmt_cond_scale = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
bsmt_cond_encoded = housing_data['BsmtCond'].map(bsmt_cond_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert bsmt_cond_encoded.notna().all(), "BsmtCond contains values outside the mapping (or is already encoded)"
housing_data['BsmtCond'] = bsmt_cond_encoded
housing_data['BsmtCond'].value_counts()

3    1311
4      65
2      45
0      37
1       2
Name: BsmtCond, dtype: int64

##### BsmtExposure

In [126]:
housing_data['BsmtExposure'].value_counts()

No    953
Av    221
Gd    134
Mn    114
NA     38
Name: BsmtExposure, dtype: int64

In [127]:
print(housing_data['BsmtExposure'].dtypes)
# BsmtExposure is Ordinal Categorical variable, so it should be label encoded 

object


In [128]:
housing_data['BsmtExposure'].isna().sum()

0

In [129]:
# Ordinal-encode BsmtExposure: NA -> 0, No -> 1, Mn -> 2, Av -> 3, Gd -> 4
# ('NA' is a literal category string here because the column was read with a str converter)
bsmt_exposure_scale = {'NA':0,'No':1,'Mn':2,'Av':3,'Gd':4}
bsmt_exposure_encoded = housing_data['BsmtExposure'].map(bsmt_exposure_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert bsmt_exposure_encoded.notna().all(), "BsmtExposure contains values outside the mapping (or is already encoded)"
housing_data['BsmtExposure'] = bsmt_exposure_encoded
housing_data['BsmtExposure'].value_counts()

1    953
3    221
4    134
2    114
0     38
Name: BsmtExposure, dtype: int64

##### BsmtFinType1

In [130]:
housing_data['BsmtFinType1'].value_counts()

Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
NA      37
Name: BsmtFinType1, dtype: int64

In [131]:
print(housing_data['BsmtFinType1'].dtypes)
# BsmtFinType1 is Ordinal Categorical variable, so label encode
# NOTE(review): the column is registered in continuous_vars although it is described as
# ordinal categorical, and the other ordinal columns encoded in this section (e.g. BsmtQual,
# BsmtCond) are not added to either list - confirm this matches how continuous_vars is used later.
continuous_vars.append('BsmtFinType1')

object


In [132]:
housing_data['BsmtFinType1'].isna().sum()

0

In [133]:
# Ordinal-encode BsmtFinType1: NA -> 0, Unf -> 1, LwQ -> 2, Rec -> 3, BLQ -> 4, ALQ -> 5, GLQ -> 6
# ('NA' is a literal category string here because the column was read with a str converter)
bsmt_fin_type1_scale = {'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
bsmt_fin_type1_encoded = housing_data['BsmtFinType1'].map(bsmt_fin_type1_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert bsmt_fin_type1_encoded.notna().all(), "BsmtFinType1 contains values outside the mapping (or is already encoded)"
housing_data['BsmtFinType1'] = bsmt_fin_type1_encoded
housing_data['BsmtFinType1'].value_counts()

1    430
6    418
5    220
4    148
3    133
2     74
0     37
Name: BsmtFinType1, dtype: int64

##### BsmtFinSF1

In [134]:
housing_data['BsmtFinSF1'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean      443.639726
std       456.098091
min         0.000000
25%         0.000000
50%       383.500000
75%       712.250000
90%      1065.500000
99%      1572.410000
100%     5644.000000
max      5644.000000
Name: BsmtFinSF1, dtype: float64

In [135]:
print(housing_data['BsmtFinSF1'].dtypes)
# Continuous variable and dtype is int so no need to convert
continuous_vars.append('BsmtFinSF1')

int64


In [136]:
housing_data['BsmtFinSF1'].isna().sum()
# There are no missing values

0

##### BsmtFinType2

In [137]:
housing_data['BsmtFinType2'].value_counts()

Unf    1256
Rec      54
LwQ      46
NA       38
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

In [138]:
print(housing_data['BsmtFinType2'].dtypes)
# BsmtFinType2 is Ordinal Categorical variable, so label encode
# NOTE(review): the column is registered in continuous_vars although it is described as
# ordinal categorical, and the other ordinal columns encoded in this section (e.g. BsmtQual,
# BsmtCond) are not added to either list - confirm this matches how continuous_vars is used later.
continuous_vars.append('BsmtFinType2')

object


In [139]:
housing_data['BsmtFinType2'].isna().sum()

0

In [140]:
# Ordinal-encode BsmtFinType2: NA -> 0, Unf -> 1, LwQ -> 2, Rec -> 3, BLQ -> 4, ALQ -> 5, GLQ -> 6
# ('NA' is a literal category string here because the column was read with a str converter)
bsmt_fin_type2_scale = {'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
bsmt_fin_type2_encoded = housing_data['BsmtFinType2'].map(bsmt_fin_type2_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert bsmt_fin_type2_encoded.notna().all(), "BsmtFinType2 contains values outside the mapping (or is already encoded)"
housing_data['BsmtFinType2'] = bsmt_fin_type2_encoded
housing_data['BsmtFinType2'].value_counts()

1    1256
3      54
2      46
0      38
4      33
5      19
6      14
Name: BsmtFinType2, dtype: int64

##### BsmtFinSF2

In [141]:
housing_data['BsmtFinSF2'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean       46.549315
std       161.319273
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%       117.200000
99%       830.380000
100%     1474.000000
max      1474.000000
Name: BsmtFinSF2, dtype: float64

In [142]:
print(housing_data['BsmtFinSF2'].dtypes)
# Continuous variable and dtype is int so no need to convert
continuous_vars.append('BsmtFinSF2')

int64


In [143]:
housing_data['BsmtFinSF2'].isna().sum()
# There are no missing values

0

##### BsmtUnfSF

In [144]:
housing_data['BsmtUnfSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean      567.240411
std       441.866955
min         0.000000
25%       223.000000
50%       477.500000
75%       808.000000
90%      1232.000000
99%      1797.050000
100%     2336.000000
max      2336.000000
Name: BsmtUnfSF, dtype: float64

In [145]:
print(housing_data['BsmtUnfSF'].dtypes)
# Continuous variable and dtype is int so no need to convert
continuous_vars.append('BsmtUnfSF')

int64


In [146]:
housing_data['BsmtUnfSF'].isna().sum()
# There are no missing values

0

##### TotalBsmtSF

In [147]:
housing_data['TotalBsmtSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean     1057.429452
std       438.705324
min         0.000000
25%       795.750000
50%       991.500000
75%      1298.250000
90%      1602.200000
99%      2155.050000
100%     6110.000000
max      6110.000000
Name: TotalBsmtSF, dtype: float64

In [148]:
print(housing_data['TotalBsmtSF'].dtypes)
# Continuous variable and dtype is int so no need to convert
continuous_vars.append('TotalBsmtSF')

int64


In [149]:
housing_data['TotalBsmtSF'].isna().sum()
# There are no missing values

0

##### Heating

In [150]:
housing_data['Heating'].value_counts()

GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: Heating, dtype: int64

In [151]:
print(housing_data['Heating'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [152]:
housing_data['Heating'].isna().sum()
#No Missing Values

0

##### HeatingQC

In [153]:
housing_data['HeatingQC'].value_counts()

Ex    741
TA    428
Gd    241
Fa     49
Po      1
Name: HeatingQC, dtype: int64

In [154]:
print(housing_data['HeatingQC'].dtypes)
# HeatingQC is Ordinal Categorical variable, so it should be label encoded

object


In [155]:
housing_data['HeatingQC'].isna().sum()

0

In [156]:
# Ordinal-encode HeatingQC: Po -> 0, Fa -> 1, TA -> 2, Gd -> 3, Ex -> 4
heating_qc_scale = {'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4}
heating_qc_encoded = housing_data['HeatingQC'].map(heating_qc_scale)
# Series.map() silently returns NaN for any value that is not a key of the dict - which
# also happens when this cell is re-run on the already-encoded column. Check the result
# before overwriting the column so the data cannot be wiped out unnoticed.
assert heating_qc_encoded.notna().all(), "HeatingQC contains values outside the mapping (or is already encoded)"
housing_data['HeatingQC'] = heating_qc_encoded
housing_data['HeatingQC'].value_counts()

4    741
2    428
3    241
1     49
0      1
Name: HeatingQC, dtype: int64

##### CentralAir

In [157]:
housing_data['CentralAir'].value_counts()

Y    1365
N      95
Name: CentralAir, dtype: int64

In [158]:
print(housing_data['CentralAir'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [159]:
housing_data['CentralAir'].isna().sum()
#No Missing Values

0

##### Electrical

In [160]:
housing_data['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In [161]:
print(housing_data['Electrical'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [162]:
housing_data['Electrical'].isna().sum()

1

In [163]:
# One missing value: impute it with the mode (most frequent category).
mode_electrical = housing_data['Electrical'].mode()[0]
print(f"Mode of Electrical: {mode_electrical}")
# Assign the filled Series back to the column. `fillna(..., inplace=True)` on a column
# selection is chained assignment: pandas warns about it and, with copy-on-write enabled,
# it only modifies a temporary object and leaves housing_data unchanged.
housing_data['Electrical'] = housing_data['Electrical'].fillna(mode_electrical)


Mode of Electrical: SBrkr


##### 1stFlrSF

In [164]:
housing_data['1stFlrSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])
#data is little skewed towards right

count    1460.000000
mean     1162.626712
std       386.587738
min       334.000000
25%       882.000000
50%      1087.000000
75%      1391.250000
90%      1680.000000
99%      2219.460000
100%     4692.000000
max      4692.000000
Name: 1stFlrSF, dtype: float64

In [165]:
print(housing_data['1stFlrSF'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [166]:
housing_data['1stFlrSF'].isna().sum()
# There are no missing values

0

##### 2ndFlrSF

In [167]:
housing_data['2ndFlrSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean      346.992466
std       436.528436
min         0.000000
25%         0.000000
50%         0.000000
75%       728.000000
90%       954.200000
99%      1418.920000
100%     2065.000000
max      2065.000000
Name: 2ndFlrSF, dtype: float64

In [168]:
print(housing_data['2ndFlrSF'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [169]:
housing_data['2ndFlrSF'].isna().sum()
# There are no missing values

0

##### LowQualFinSF

In [170]:
housing_data['LowQualFinSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])

#data is heavily skewed towards right

count    1460.000000
mean        5.844521
std        48.623081
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%         0.000000
99%       360.000000
100%      572.000000
max       572.000000
Name: LowQualFinSF, dtype: float64

In [171]:
print(housing_data['LowQualFinSF'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [172]:
housing_data['LowQualFinSF'].isna().sum()
# There are no missing values

0

##### GrLivArea

In [173]:
housing_data['GrLivArea'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean     1515.463699
std       525.480383
min       334.000000
25%      1129.500000
50%      1464.000000
75%      1776.750000
90%      2158.300000
99%      3123.480000
100%     5642.000000
max      5642.000000
Name: GrLivArea, dtype: float64

In [174]:
print(housing_data['GrLivArea'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [175]:
housing_data['GrLivArea'].isna().sum()
# There are no missing values

0

##### BsmtFullBath

In [176]:
housing_data['BsmtFullBath'].value_counts()


0    856
1    588
2     15
3      1
Name: BsmtFullBath, dtype: int64

In [177]:
print(housing_data['BsmtFullBath'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [178]:
housing_data['BsmtFullBath'].isna().sum()
# There are no missing values

0

##### BsmtHalfBath

In [179]:
housing_data['BsmtHalfBath'].value_counts()


0    1378
1      80
2       2
Name: BsmtHalfBath, dtype: int64

In [180]:
print(housing_data['BsmtHalfBath'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [181]:
housing_data['BsmtHalfBath'].isna().sum()
# There are no missing values

0

##### FullBath

In [182]:
housing_data['FullBath'].value_counts()


2    768
1    650
3     33
0      9
Name: FullBath, dtype: int64

In [183]:
print(housing_data['FullBath'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [184]:
housing_data['FullBath'].isna().sum()
# There are no missing values

0

##### HalfBath

In [185]:
housing_data['HalfBath'].value_counts()


0    913
1    535
2     12
Name: HalfBath, dtype: int64

In [186]:
print(housing_data['HalfBath'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [187]:
housing_data['HalfBath'].isna().sum()
# There are no missing values

0

##### BedroomAbvGr

In [188]:
housing_data['BedroomAbvGr'].value_counts()


3    804
2    358
4    213
1     50
5     21
6      7
0      6
8      1
Name: BedroomAbvGr, dtype: int64

In [189]:
print(housing_data['BedroomAbvGr'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [190]:
housing_data['BedroomAbvGr'].isna().sum()
# There are no missing values

0

##### KitchenAbvGr

In [191]:
housing_data['KitchenAbvGr'].value_counts()


1    1392
2      65
3       2
0       1
Name: KitchenAbvGr, dtype: int64

In [192]:
print(housing_data['KitchenAbvGr'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [193]:
housing_data['KitchenAbvGr'].isna().sum()
# There are no missing values

0

##### KitchenQual

In [194]:
housing_data['KitchenQual'].value_counts()

TA    735
Gd    586
Ex    100
Fa     39
Name: KitchenQual, dtype: int64

In [195]:
print(housing_data['KitchenQual'].dtypes)
# KitchenQual is Ordinal Categorical variable, so it should be label encoded

object


In [196]:
housing_data['KitchenQual'].isna().sum()

0

In [197]:
# Label Encode KitchenQual
housing_data['KitchenQual'] = housing_data['KitchenQual'].map({'Po':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
housing_data['KitchenQual'].value_counts()

2    735
3    586
4    100
1     39
Name: KitchenQual, dtype: int64

##### TotRmsAbvGrd

In [199]:
housing_data['TotRmsAbvGrd'].value_counts()


6     402
7     329
5     275
8     187
4      97
9      75
10     47
11     18
3      17
12     11
14      1
2       1
Name: TotRmsAbvGrd, dtype: int64

In [200]:
print(housing_data['TotRmsAbvGrd'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [201]:
housing_data['TotRmsAbvGrd'].isna().sum()
# There are no missing values

0

##### Functional

In [202]:
housing_data['Functional'].value_counts()

Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: Functional, dtype: int64

In [203]:
print(housing_data['Functional'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [204]:
housing_data['Functional'].isna().sum()
# No Missing Values

0

##### Fireplaces

In [205]:
housing_data['Fireplaces'].value_counts()


0    690
1    650
2    115
3      5
Name: Fireplaces, dtype: int64

In [206]:
print(housing_data['Fireplaces'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. Also it is already label encoded so no encoding needed

int64


In [207]:
housing_data['Fireplaces'].isna().sum()
# There are no missing values

0

##### FireplaceQu

In [208]:
housing_data['FireplaceQu'].value_counts()

NA    690
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [244]:
print(housing_data['FireplaceQu'].dtypes)
# FireplaceQu is Ordinal Categorical variable, so it should be label encoded 

int64


In [210]:
housing_data['FireplaceQu'].isna().sum()

0

In [211]:
# Label Encode FireplaceQu
housing_data['FireplaceQu'] = housing_data['FireplaceQu'].map({'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
housing_data['FireplaceQu'].value_counts()

0    690
4    380
3    313
2     33
5     24
1     20
Name: FireplaceQu, dtype: int64

##### GarageType

In [212]:
housing_data['GarageType'].value_counts()

Attchd     870
Detchd     387
BuiltIn     88
NA          81
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

In [213]:
print(housing_data['GarageType'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [214]:
housing_data['GarageType'].isna().sum()
# No Missing Values

0

In [216]:
# Replace NA values to NoGarage for better readability
housing_data['GarageType'] = housing_data['GarageType'].apply(lambda x : 'NoGarage' if x == 'NA' else x)
housing_data['GarageType'].value_counts()

Attchd      870
Detchd      387
BuiltIn      88
NoGarage     81
Basment      19
CarPort       9
2Types        6
Name: GarageType, dtype: int64

##### GarageYrBlt

In [223]:
housing_data['GarageYrBlt'].value_counts()

2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1908.0     1
1927.0     1
1933.0     1
1900.0     1
1906.0     1
Name: GarageYrBlt, Length: 97, dtype: int64

In [227]:
print(housing_data['GarageYrBlt'].dtypes)
# Ordinal Categorical variable and dtype is int/float, so no change needed

int64


In [225]:
housing_data['GarageYrBlt'].isna().sum()
# 81 missing values: these correspond to houses without a garage (handled by the encoding below)

81

In [226]:
housing_data['GarageYrBlt'] = LabelEncoder().fit_transform(housing_data['GarageYrBlt'])
# This will also encode Missing values (No Garage in this case) which is desired

##### GarageFinish

In [228]:
housing_data['GarageFinish'].value_counts()

Unf    605
RFn    422
Fin    352
NA      81
Name: GarageFinish, dtype: int64

In [229]:
print(housing_data['GarageFinish'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [230]:
housing_data['GarageFinish'].isna().sum()
# No Missing Values

0

In [231]:
# Replace NA values to NoGarage for better readability
housing_data['GarageFinish'] = housing_data['GarageFinish'].apply(lambda x : 'NoGarage' if x == 'NA' else x)
housing_data['GarageFinish'].value_counts()

Unf         605
RFn         422
Fin         352
NoGarage     81
Name: GarageFinish, dtype: int64

##### GarageCars

In [234]:
housing_data['GarageCars'].value_counts()


2    824
1    369
3    181
0     81
4      5
Name: GarageCars, dtype: int64

In [235]:
print(housing_data['GarageCars'].dtypes)
# Ordinal Categorical variable and dtype is int so no need to convert. It is already label encoded

int64


In [236]:
housing_data['GarageCars'].isna().sum()
# There are no missing values

0

##### GarageArea

In [237]:
housing_data['GarageArea'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean      472.980137
std       213.804841
min         0.000000
25%       334.500000
50%       480.000000
75%       576.000000
90%       757.100000
99%      1002.790000
100%     1418.000000
max      1418.000000
Name: GarageArea, dtype: float64

In [238]:
print(housing_data['GarageArea'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [239]:
housing_data['GarageArea'].isna().sum()
# There are no missing values

0

##### GarageQual

In [240]:
housing_data['GarageQual'].value_counts()

TA    1311
NA      81
Fa      48
Gd      14
Po       3
Ex       3
Name: GarageQual, dtype: int64

In [241]:
print(housing_data['GarageQual'].dtypes)
# GarageQual is Ordinal Categorical variable, so it should be label encoded 

object


In [242]:
housing_data['GarageQual'].isna().sum()

0

In [243]:
# Label Encode GarageQual
housing_data['GarageQual'] = housing_data['GarageQual'].map({'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
housing_data['GarageQual'].value_counts()

3    1311
0      81
2      48
4      14
5       3
1       3
Name: GarageQual, dtype: int64

##### GarageCond

In [245]:
housing_data['GarageCond'].value_counts()

TA    1326
NA      81
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64

In [246]:
print(housing_data['GarageCond'].dtypes)
# GarageCond is Ordinal Categorical variable, so it should be label encoded

object


In [247]:
housing_data['GarageCond'].isna().sum()

0

In [248]:
# Label Encode GarageCond
housing_data['GarageCond'] = housing_data['GarageCond'].map({'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
housing_data['GarageCond'].value_counts()

3    1326
0      81
2      35
4       9
1       7
5       2
Name: GarageCond, dtype: int64

##### PavedDrive

In [249]:
housing_data['PavedDrive'].value_counts()

Y    1340
N      90
P      30
Name: PavedDrive, dtype: int64

In [250]:
print(housing_data['PavedDrive'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [251]:
housing_data['PavedDrive'].isna().sum()
# No Missing Values

0

##### WoodDeckSF

In [252]:
housing_data['WoodDeckSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean       94.244521
std       125.338794
min         0.000000
25%         0.000000
50%         0.000000
75%       168.000000
90%       262.000000
99%       505.460000
100%      857.000000
max       857.000000
Name: WoodDeckSF, dtype: float64

In [253]:
print(housing_data['WoodDeckSF'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [254]:
housing_data['WoodDeckSF'].isna().sum()
# There are no missing values

0

##### OpenPorchSF

In [255]:
housing_data['OpenPorchSF'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean       46.660274
std        66.256028
min         0.000000
25%         0.000000
50%        25.000000
75%        68.000000
90%       130.000000
99%       285.820000
100%      547.000000
max       547.000000
Name: OpenPorchSF, dtype: float64

In [256]:
print(housing_data['OpenPorchSF'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [257]:
housing_data['OpenPorchSF'].isna().sum()
# There are no missing values

0

##### EnclosedPorch

In [258]:
housing_data['EnclosedPorch'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean       21.954110
std        61.119149
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%       112.000000
99%       261.050000
100%      552.000000
max       552.000000
Name: EnclosedPorch, dtype: float64

In [259]:
print(housing_data['EnclosedPorch'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [260]:
housing_data['EnclosedPorch'].isna().sum()
# There are no missing values

0

##### 3SsnPorch

In [261]:
housing_data['3SsnPorch'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean        3.409589
std        29.317331
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%         0.000000
99%       168.000000
100%      508.000000
max       508.000000
Name: 3SsnPorch, dtype: float64

In [262]:
print(housing_data['3SsnPorch'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [263]:
housing_data['3SsnPorch'].isna().sum()
# There are no missing values

0

##### ScreenPorch

In [264]:
housing_data['ScreenPorch'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean       15.060959
std        55.757415
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%         0.000000
99%       268.050000
100%      480.000000
max       480.000000
Name: ScreenPorch, dtype: float64

In [265]:
print(housing_data['ScreenPorch'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [266]:
housing_data['ScreenPorch'].isna().sum()
# There are no missing values

0

##### PoolArea

In [267]:
housing_data['PoolArea'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])


count    1460.000000
mean        2.758904
std        40.177307
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
90%         0.000000
99%         0.000000
100%      738.000000
max       738.000000
Name: PoolArea, dtype: float64

In [268]:
print(housing_data['PoolArea'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [269]:
housing_data['PoolArea'].isna().sum()
# There are no missing values

0

##### PoolQC

In [270]:
housing_data['PoolQC'].value_counts()

NA    1453
Gd       3
Fa       2
Ex       2
Name: PoolQC, dtype: int64

In [271]:
print(housing_data['PoolQC'].dtypes)
# PoolQC is Ordinal Categorical variable, so it should be label encoded 

object


In [273]:
housing_data['PoolQC'].isna().sum()

0

In [274]:
# Label Encode PoolQC
housing_data['PoolQC'] = housing_data['PoolQC'].map({'NA':0,'Fa':1,'TA':2,'Gd':3,'Ex':4})
housing_data['PoolQC'].value_counts()

0    1453
3       3
4       2
1       2
Name: PoolQC, dtype: int64

##### Fence

In [275]:
housing_data['Fence'].value_counts()

NA       1179
MnPrv     157
GdPrv      59
GdWo       54
MnWw       11
Name: Fence, dtype: int64

In [276]:
print(housing_data['Fence'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [277]:
housing_data['Fence'].isna().sum()
# No Missing Values

0

In [280]:
#Replace NA with NoFence for better readability
housing_data['Fence'] = housing_data['Fence'].apply(lambda x : 'NoFence' if x == 'NA' else x)
housing_data['Fence'].value_counts()

NoFence    1179
MnPrv       157
GdPrv        59
GdWo         54
MnWw         11
Name: Fence, dtype: int64

##### MiscFeature

In [281]:
housing_data['MiscFeature'].value_counts()

NA      1406
Shed      49
Othr       2
Gar2       2
TenC       1
Name: MiscFeature, dtype: int64

In [282]:
print(housing_data['MiscFeature'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [283]:
housing_data['MiscFeature'].isna().sum()
# No Missing Values

0

In [284]:
#Replace NA with NoMisc for better readability
housing_data['MiscFeature'] = housing_data['MiscFeature'].apply(lambda x : 'NoMisc' if x == 'NA' else x)
housing_data['MiscFeature'].value_counts()

NoMisc    1406
Shed        49
Othr         2
Gar2         2
TenC         1
Name: MiscFeature, dtype: int64

##### MiscVal

In [285]:
housing_data['MiscVal'].describe(percentiles=[0.25,0.5,0.75,0.9,0.99,1])
# Data is heavily skewed by nature: only a few houses have a Misc feature


count     1460.000000
mean        43.489041
std        496.123024
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
90%          0.000000
99%        700.000000
100%     15500.000000
max      15500.000000
Name: MiscVal, dtype: float64

In [286]:
print(housing_data['MiscVal'].dtypes)
# Continuous variable and dtype is int so no need to convert

int64


In [287]:
housing_data['MiscVal'].isna().sum()
# There are no missing values

0

##### MoSold

In [289]:
housing_data['MoSold'].value_counts()

6     253
7     234
5     204
4     141
8     122
3     106
10     89
11     79
9      63
12     59
1      58
2      52
Name: MoSold, dtype: int64

In [294]:
print(housing_data['MoSold'].dtypes)
# Categorical variable and dtype is int, so convert to object

object


In [291]:
housing_data['MoSold'].isna().sum()
# No Missing Values

0

In [293]:
#Convert the data type to object
housing_data['MoSold'] = housing_data['MoSold'].astype('object')

##### YrSold

In [296]:
housing_data['YrSold'].value_counts()

2009    338
2007    329
2006    314
2008    304
2010    175
Name: YrSold, dtype: int64

In [297]:
print(housing_data['YrSold'].dtypes)
# Ordinal Categorical variable and dtype is int/float, so no change needed

int64


In [298]:
housing_data['YrSold'].isna().sum()
# No Missing Values

0

In [299]:
housing_data['YrSold'] = LabelEncoder().fit_transform(housing_data['YrSold'])
housing_data['YrSold'].value_counts()

##### SaleType

In [301]:
housing_data['SaleType'].value_counts()

WD       1267
New       122
COD        43
ConLD       9
ConLI       5
ConLw       5
CWD         4
Oth         3
Con         2
Name: SaleType, dtype: int64

In [302]:
print(housing_data['SaleType'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [303]:
housing_data['SaleType'].isna().sum()
# No Missing Values

0

##### SaleCondition

In [304]:
housing_data['SaleCondition'].value_counts()

Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64

In [305]:
print(housing_data['SaleCondition'].dtypes)
# Categorical variable and dtype is object so no need to convert

object


In [306]:
housing_data['SaleCondition'].isna().sum()
# No Missing Values

0

### Step 3: Data Analysis and Visualization

In [None]:
categorical_vars = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2',\
                    'BldgType','HouseStyle','YearBuilt','YearRemodAdd','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType',\
                    'ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir', \
                    'Electrical','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','KitchenQual','TotRmsAbvGrd','Functional',\
                    'Fireplaces','FireplaceQu','GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageQual','GarageCond','PavedDrive','PoolQC','Fence',\
                    'MiscFeature','MoSold','YrSold','SaleType','SaleCondition']
continuous_vars = ['LotFrontage','LotArea','OverallQual','OverallCond','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',\
                    'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

In [None]:
# Scatter SalePrice against the continuous predictors, one pairplot row per batch of columns.
# sns.pairplot is a figure-level function that always creates its own figure, so the former
# plt.figure(figsize=(20,10)) call only produced an extra empty figure and has been removed.
batch_size = 10
for i in range(0,len(continuous_vars),batch_size):
    sns.pairplot(data=housing_data,x_vars=continuous_vars[i:i+batch_size],y_vars='SalePrice')
plt.show()

#### Inferences
1. LotFrontage appears to have only a small influence on SalePrice, but the relationship looks linear
2. LotArea does not seem to have much influence on the target SalePrice
3. MasVnrArea, BsmtFinSF1, BsmtFinSF2 and BsmtUnfSF seem to have little effect on SalePrice
4. If we ignore the value 0 for TotalBsmtSF, 1stFlrSF and 2ndFlrSF, they seem to have a good influence on the target. A 0 just indicates that the particular part (basement, 1st floor or 2nd floor) is not present in the house
5. LowQualFinSF, WoodDeckSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea and MiscVal do not seem to have much influence on the target; these variables contain a lot of zeros, which are spread widely across different sale prices
6. GrLivArea and GarageArea seem to have a good linear influence on SalePrice


In [None]:
# Box plots of SalePrice against every categorical predictor on a 5-column grid.
# The number of rows is derived from the number of variables, so zip() can never silently
# drop a variable because the grid is too small.
n_cols = 5
n_rows = max(1, -(-len(categorical_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows,n_cols,figsize=(30,40), sharey=True)
flat_axes = axes.flatten()
for name, ax in zip(categorical_vars, flat_axes):
    sns.boxplot(y='SalePrice', x= name, data=housing_data, orient='v', ax=ax)
# Hide the leftover panels of the grid that did not receive a plot
for ax in flat_axes[len(categorical_vars):]:
    ax.set_visible(False)
plt.show()


In [None]:
# TODO : Write more inferences and find more columns to drop

#### Inferences
1. MSSubClass seems to have some influence on the target variable.
2. Houses in the Floating Village Residential and Residential Low Density MSZoning categories tend to have a higher selling price.
3. Houses with paved road access tend to have a higher SalePrice than those with gravel access.
4. Houses with no alley access tend to have a higher SalePrice.
5. LotShape also has some influence on the target variable, but it is not very significant.
6. LandContour has some influence on the target variable: SalePrice is higher for houses on Hillside and Depression.
7. Utilities is heavily skewed towards AllPub, with just one data point for NoSeWa. Hence it is not an indicator for SalePrice.
8. Higher influence predictors:
    - Neighborhood
    - Condition1
    - Condition2
    - HouseStyle
    - Overall Quality
    - Overall Condition
    - Roof Material
    - Exterior Covering Materials (both 1 and 2)
    - Exterior Quality
    - Exterior Condition
    - Masonry Veneer Type
9. Comparatively lower influence predictors:
    - LotConfig
    - LandSlope
    - Building Type
    - Roof Style
10. YearBuilt and YearRemodAdd don't seem to have a good influence on the target: although more data is skewed towards the recent years, they have a wide range of SalePrice.
11. Although recent values of GarageYrBlt have some high SalePrice values, it is not a great influence on SalePrice.
12. The variation of SalePrice w.r.t. month sold or year sold is not significant enough.



In [None]:
# Drop columns based on EDA
cols_to_be_dropped = ['Utilities', 'LowQualFinSF','EnclosedPorch','3SsnPorch','ScreenPorch', 'PoolArea', 'MiscVal','MoSold','YrSold']
# cols_to_be_dropped.extend(['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF'])
housing_data.drop(cols_to_be_dropped,axis=1,inplace=True)
print(f"Number of columns after dropping {housing_data.columns.size}")

In [None]:
# Remove the dropped cols from continuous and categorical list
categorical_vars = [var for var in categorical_vars if var not in cols_to_be_dropped]
continuous_vars =  [var for var in continuous_vars if var not in cols_to_be_dropped]

### Step 4: Preparing the data for Modelling

#### Encoding

In [None]:
# Dummy encoding for all categorical variables.
# Values are cast to str first so that numeric category codes are encoded like any other category
# and appear in the dummy column names as "<column>_<code>" (for example MSSubClass_60).
# dtype is pinned to uint8 (the default before pandas 2.0): pandas >= 2.0 returns bool dummies, and a
# frame mixing bool and float columns becomes an object array in `.values`, which statsmodels OLS
# and the VIF computation used later cannot handle.
dummy_encoded_values = pd.get_dummies(data=housing_data[categorical_vars].astype(str),drop_first=True,dtype=np.uint8)
categorical_vars_encoded = list(dummy_encoded_values.columns)

# Add the new encoded cols to original dataframe and drop the source columns
housing_data = pd.concat([housing_data,dummy_encoded_values],axis=1)
housing_data.drop(categorical_vars,axis=1,inplace=True)

In [None]:
print(f"Number of columns after dummyEncoding {housing_data.columns.size}")

#### Train test split

In [None]:
df_train, df_test = train_test_split(housing_data,train_size=0.7,random_state=100)
df_train.columns

#### Scaling the features using StandardScaler

In [None]:
scaler = StandardScaler()
df_train[continuous_vars] = scaler.fit_transform(df_train[continuous_vars])
df_test[continuous_vars] = scaler.transform(df_test[continuous_vars])
df_train[continuous_vars]

### Step 5: Model Building

#### Utilities

In [None]:
# Adjusted R2 helper
def adjusted_r2(X,r2_score):
    """Return the adjusted R-squared of a fit on the feature frame ``X``.

    X        : frame whose length is the number of observations and whose
               ``columns`` are the predictors used in the fit.
    r2_score : plain R-squared value of that fit.
    """
    sample_count = len(X)
    predictor_count = len(X.columns)
    # Scale the unexplained share by (n - 1) / (n - p - 1) to penalise the predictor count
    unexplained = 1 - r2_score
    return 1 - unexplained * (sample_count - 1) / (sample_count - predictor_count - 1)

In [None]:
# Intercept helper
def add_constant(X_train):
    """Return ``X_train`` passed through statsmodels' ``sm.add_constant``."""
    with_intercept = sm.add_constant(X_train)
    return with_intercept

In [None]:
## Fit a statsmodels OLS model and return its summary together with the fitted results
def build_stats_model(X_train,y_train):
    """Fit an OLS regression of ``y_train`` on ``X_train``.

    The design matrix is ``X_train`` run through ``add_constant``.
    Returns a ``(summary, fitted_results)`` tuple.
    """
    design_matrix = add_constant(X_train)
    fitted = sm.OLS(y_train, design_matrix).fit()
    return fitted.summary(), fitted

In [None]:
# Variance Inflation Factor table
def compute_vif(X_train):
    """Return a frame with one row per column of ``X_train``.

    Columns of the result: ``Features`` (column name) and ``VIF`` (variance
    inflation factor rounded to 2 decimals), sorted by ``VIF`` descending.
    """
    vif_table = pd.DataFrame()
    vif_table['Features'] = X_train.columns

    raw_vif = []
    for col_idx in range(X_train.shape[1]):
        raw_vif.append(variance_inflation_factor(X_train.values, col_idx))
    vif_table['VIF'] = raw_vif

    # Keep two decimals for readability
    vif_table['VIF'] = vif_table['VIF'].round(2)
    return vif_table.sort_values(by='VIF', ascending=False)

In [None]:
def get_model(model_type,alpha=1.0):
    """Create an unfitted regularised linear model.

    model_type : 'Ridge' or 'Lasso'.
    alpha      : regularisation strength handed to the estimator.

    Raises ValueError for any other ``model_type``.
    """
    # Reject unsupported types up front
    if model_type not in ('Ridge', 'Lasso'):
        raise ValueError(f"Invalid Model Type {model_type}")

    estimator_cls = Ridge if model_type == 'Ridge' else Lasso
    return estimator_cls(alpha=alpha)
    

In [None]:
def build_with_regularization(X_train,y_train,X_test,y_test,model_type,alpha_range,scoring='neg_root_mean_squared_error',folds=5):
    """Tune ``alpha`` for a Ridge/Lasso model with a cross-validated grid search and score the winner.

    Parameters
    ----------
    X_train, y_train : training features / target used for the grid search and the final fit.
    X_test, y_test   : hold-out features / target, used only to compute the test R2.
    model_type       : 'Ridge' or 'Lasso' (anything else makes ``get_model`` raise ValueError).
    alpha_range      : candidate alpha values for the grid.
    scoring          : scoring passed to GridSearchCV.
    folds            : ``cv`` argument passed to GridSearchCV.

    Returns
    -------
    dict with keys ``model`` (estimator fitted on the full training data with the best alpha),
    ``best_alpha``, ``best_score`` (best mean CV score for ``scoring``), ``train_r2_score``
    and ``test_r2_score``.
    """
    alpha_grid = [{'alpha': alpha_range}]
    grid_search_cv = GridSearchCV(get_model(model_type),param_grid= alpha_grid,scoring=scoring,cv=folds,n_jobs=-1,return_train_score=True)
    grid_search_cv.fit(X_train,y_train)
    best_alpha = grid_search_cv.best_params_['alpha']

    # GridSearchCV runs with its default refit=True, so the best configuration has already been
    # refitted on the whole training set; reuse it instead of fitting an identical model again.
    model = grid_search_cv.best_estimator_

    y_train_pred = model.predict(X_train)
    train_r2_score = r2_score(y_true=y_train,y_pred=y_train_pred)
    y_test_pred = model.predict(X_test)
    test_r2_score =  r2_score(y_true=y_test,y_pred=y_test_pred)
    return {'model': model, 'best_alpha': best_alpha, 'best_score': grid_search_cv.best_score_, 'train_r2_score': train_r2_score,'test_r2_score': test_r2_score}

#### Step 5.1: Model Building Without Regularization

In [None]:
# Splitting the training data into X and y
y_train = df_train.pop('SalePrice')
X_train = df_train
y_test = df_test.pop('SalePrice')
X_test = df_test

In [None]:
# Create Linear Regression Model
lm = LinearRegression()
lm.fit(X_train,y_train)

# Running RFE with output number of values as 50
output_var_count = 50
rfe = RFE(lm,n_features_to_select=output_var_count)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
resulting_rfe_cols = X_train.columns[rfe.support_]
resulting_rfe_cols

In [None]:
X_train.columns[~rfe.support_]

In [None]:
# Keeping only the columns from RFE
X_train_rfe = X_train[resulting_rfe_cols]
X_test_rfe = X_test[resulting_rfe_cols]

In [None]:
# Create Linear Regression Model
lm = LinearRegression()
lm.fit(X_train_rfe,y_train)
y_train_pred = lm.predict(X_train_rfe)
y_test_pred = lm.predict(X_test_rfe)
train_r2_score  = r2_score(y_true=y_train,y_pred=y_train_pred)
train_adj_r2_scopre = adjusted_r2(X_train_rfe,train_r2_score)
test_r2_score  = r2_score(y_true=y_test,y_pred=y_test_pred)
test_adj_r2_score = adjusted_r2(X_test_rfe,test_r2_score)
print(f"R2 score of training {train_r2_score}")
print(f"Adj R2 score of training {train_adj_r2_scopre}")
print(f"R2 score of test {test_r2_score}")
print(f"Adj R2 score of test {test_adj_r2_score}")

##### Inference
- Even though the R2 and Adj R2 scores for the training set seem decent, the model failed miserably on the test data. Let's look at the statistics to understand the significance of the variables

In [None]:
# Get the list of independent variables with a high p-value and/or a high VIF
def get_correlated_unreliable_ind_vars(X_train,y_train):
    """Fit OLS on ``X_train`` and list the features flagged by p-value (> 0.05) or VIF (> 5).

    Returns a 5-tuple:
      * flagged features frame with columns ``Features``, ``P>|t|`` and ``VIF``, sorted by
        p-value then VIF, both descending (a value is NaN where that criterion did not flag the row);
      * one-column (``Value``) frame parsed from the OLS summary header table (callers read
        R-squared and Adj. R-squared from its first two rows);
      * the full coefficient table parsed from the OLS summary;
      * the statsmodels summary object;
      * the complete VIF table from ``compute_vif``.
    """
    # Local import: pandas >= 2.1 deprecates passing literal HTML strings to read_html
    from io import StringIO

    # Build stats model and get summary
    (results_summary,_) = build_stats_model(X_train,y_train)

    # Read the metrics from the header table of the OLS summary
    results_as_html_0 = results_summary.tables[0].as_html()
    summary_df_0 = pd.read_html(StringIO(results_as_html_0),index_col=2)[0][[3]]
    summary_df_0.index.name = 'Metric'
    summary_df_0 = summary_df_0.rename({3:'Value'},axis=1)

    # Read the coefficient / p-value table from the OLS summary
    results_as_html_1 = results_summary.tables[1].as_html()
    summary_df_1 = pd.read_html(StringIO(results_as_html_1), header=0, index_col=0)[0]
    # The intercept row ('const') is not a column of X_train: if it were ranked first, the caller's
    # X_train.drop([...]) would fail with a KeyError, so leave it out of the candidates.
    candidate_rows = summary_df_1.drop('const',errors='ignore')
    summary_filtered_df = candidate_rows[candidate_rows['P>|t|'] > 0.05].sort_values('P>|t|',ascending=False)

    # Compute VIF
    vif = compute_vif(X_train)
    vif_filtered_df = vif[vif['VIF'] > 5]

    # Outer-merge both frames to combine every variable which needs attention
    corr_unr_ind_vars = summary_filtered_df.merge(vif_filtered_df,how='outer',left_index=True,right_on=['Features'])[['Features','P>|t|','VIF']]
    corr_unr_ind_vars = corr_unr_ind_vars.reset_index(drop=True)
    corr_unr_ind_vars = corr_unr_ind_vars.sort_values(by=['P>|t|','VIF'],ascending=False)
    return (corr_unr_ind_vars,summary_df_0,summary_df_1,results_summary,vif)
    

In [None]:
# Step-wise removal of features based on stats model summary statistics (p-value and VIF)
def recursive_feature_removal(X_train,y_train,iter=1):
    """Drop the top-ranked flagged feature round after round until nothing is flagged.

    Every round calls ``get_correlated_unreliable_ind_vars`` on the current feature frame
    and records a dict with keys ``iter``, ``var_to_drop``, ``vif``, ``p_value``,
    ``r2_score``, ``adj_r2_score`` and ``X_train`` (the frame evaluated in that round).
    The final entry has ``var_to_drop`` set to None and holds the last feature frame.

    ``iter`` is the number assigned to the first recorded round.
    Returns the list of per-round dicts, in order.
    """
    history = []
    current_X = X_train
    round_no = iter
    while True:
        flagged, metrics, _, _, _ = get_correlated_unreliable_ind_vars(current_X, y_train)
        finished = flagged.empty
        if finished:
            worst = {'Features': None,'VIF': None,'P>|t|': None}
        else:
            worst = flagged.iloc[0]

        history.append({
            'iter': round_no,
            'var_to_drop': worst['Features'],
            'vif': worst['VIF'],
            'p_value': worst['P>|t|'],
            'r2_score': metrics.iloc[0]['Value'],
            'adj_r2_score': metrics.iloc[1]['Value'],
            'X_train': current_X
        })
        if finished:
            return history

        # Continue with the flagged feature removed
        current_X = current_X.drop([worst['Features']], axis=1)
        round_no += 1


In [None]:
# Get the list of independent variables with a high p-value (VIF is not considered here)
def get_unreliable_ind_vars(X_train,y_train):
    """Fit OLS on ``X_train`` and list the features whose p-value is above 0.05.

    Returns a 4-tuple:
      * the coefficient-table rows with ``P>|t|`` > 0.05, sorted by p-value descending, with an
        extra ``Features`` column holding the feature name;
      * one-column (``Value``) frame parsed from the OLS summary header table (callers read
        R-squared and Adj. R-squared from its first two rows);
      * the coefficient table parsed from the OLS summary, without the ``const`` row;
      * the statsmodels summary object.
    """
    # Local import: pandas >= 2.1 deprecates passing literal HTML strings to read_html
    from io import StringIO

    # Build stats model and get summary
    (results_summary,_) = build_stats_model(X_train,y_train)

    # Read the metrics from the header table of the OLS summary
    results_as_html_0 = results_summary.tables[0].as_html()
    summary_df_0 = pd.read_html(StringIO(results_as_html_0),index_col=2)[0][[3]]
    summary_df_0.index.name = 'Metric'
    summary_df_0 = summary_df_0.rename({3:'Value'},axis=1)

    # Read the coefficient / p-value table from the OLS summary
    results_as_html_1 = results_summary.tables[1].as_html()
    summary_df_1 = pd.read_html(StringIO(results_as_html_1), header=0, index_col=0)[0]
    # The intercept is never a removable feature; errors='ignore' keeps this working even when
    # the summary has no 'const' row.
    summary_df_1 = summary_df_1.drop('const',errors='ignore')
    summary_filtered_df = summary_df_1[summary_df_1['P>|t|'] > 0.05].sort_values('P>|t|',ascending=False)

    # Expose the feature name (the index) as a regular column for the callers
    corr_unr_ind_vars = summary_filtered_df.assign(Features=summary_filtered_df.index)
    corr_unr_ind_vars = corr_unr_ind_vars.sort_values(by=['P>|t|'],ascending=False)
    return (corr_unr_ind_vars,summary_df_0,summary_df_1,results_summary)
    

In [None]:
# Step-wise removal of features based on the p-values from the stats model summary
def recursive_feature_removal_only_p(X_train,y_train,iter=1):
    """Drop the feature with the highest flagged p-value round after round until none is flagged.

    Every round calls ``get_unreliable_ind_vars`` on the current feature frame and records a
    dict with keys ``iter``, ``var_to_drop``, ``p_value``, ``r2_score``, ``adj_r2_score`` and
    ``X_train`` (the frame evaluated in that round). The final entry has ``var_to_drop`` set
    to None and holds the last feature frame.

    ``iter`` is the number assigned to the first recorded round.
    Returns the list of per-round dicts, in order.
    """
    history = []
    current_X = X_train
    round_no = iter
    while True:
        flagged, metrics, _, _ = get_unreliable_ind_vars(current_X, y_train)
        finished = flagged.empty
        if finished:
            worst = {'Features': None,'P>|t|': None}
        else:
            worst = flagged.iloc[0]

        history.append({
            'iter': round_no,
            'var_to_drop': worst['Features'],
            'p_value': worst['P>|t|'],
            'r2_score': metrics.iloc[0]['Value'],
            'adj_r2_score': metrics.iloc[1]['Value'],
            'X_train': current_X
        })
        if finished:
            return history

        # Continue with the flagged feature removed
        current_X = current_X.drop([worst['Features']], axis=1)
        round_no += 1


In [None]:
# Run the p-value based elimination starting from X_train_rfe
rfr_result = recursive_feature_removal_only_p(X_train_rfe,y_train)
# Every record also stores the complete 'X_train' frame of that iteration; leave it out of
# the displayed table so the per-iteration metrics stay readable (rfr_result is unchanged).
pd.DataFrame(rfr_result).drop(columns='X_train').head(50)

In [None]:
# Take the feature frame recorded by the last elimination pass and
# select the same columns from the test features.
X_train_final = rfr_result[-1]['X_train']
X_test_final = X_test_rfe.loc[:, X_train_final.columns]

In [None]:
# NOTE(review): this reassigns X_train_final to the full X_train_rfe frame, replacing the
# selection made from rfr_result in the preceding cell - confirm which one is intended.
X_train_final = X_train_rfe
X_test_final = X_test_rfe.loc[:, X_train_final.columns]

#### Step 5.2: Model Building With Ridge

In [None]:
print(f"Building Model Using Ridge ")
# Candidate alphas: 100 evenly spaced values over [0, 1]
# NOTE(review): the grid includes alpha = 0 (no regularization) - confirm this is intended.
ridge_alphas = list(np.linspace(start=0, stop=1, num=100))
result = build_with_regularization(
    X_train_final, y_train,
    X_test_final, y_test,
    'Ridge', ridge_alphas,
)
# Report the selected alpha together with the train / test scores
print(
    f"Best Alpha: {result['best_alpha']}",
    f"Best Score: {result['best_score']}",
    f"Training R2 Score: {result['train_r2_score']}",
    f"Test R2 Score: {result['test_r2_score']}",
    sep="\n",
)

In [None]:
# Number of features in the final training frame
X_train_final.shape[1]

#### Step 5.3: Model Building With Lasso

In [None]:
print(f"Building Model Using Lasso ")
# Candidate alphas: 1, 2, ..., 999.
# alpha = 0 is deliberately excluded: scikit-learn advises against using Lasso without
# regularization (the coordinate-descent solver may not converge); use LinearRegression
# for the unregularized case instead.
lasso_alphas = list(np.arange(1,1000,1))
result = build_with_regularization(X_train_final,y_train,X_test_final,y_test,'Lasso',lasso_alphas)
# Report the selected alpha together with the train / test scores
print(f"Best Alpha: {result['best_alpha']}")
print(f"Best Score: {result['best_score']}")
print(f"Training R2 Score: {result['train_r2_score']}")
print(f"Test R2 Score: {result['test_r2_score']}")