# Advanced Linear Regression Assignment for Company : **Surprise Housing**

In [247]:
### Importing required Libraries ####
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import MinMaxScaler , PolynomialFeatures
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor  #VIF
from sklearn .metrics import r2_score , mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression , Ridge , Lasso
import warnings
warnings.filterwarnings('ignore')

In [248]:
#Reading the housing data file from GOOGLE DRIVE (Used Google Colab)
#Checking the first 5 rows in the data file
# Read the dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

housing=pd.read_csv('/content/drive/MyDrive/train.csv')
housing.head()

Mounted at /content/drive


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


**READING & UNDERSTANDING THE DATA **

In [249]:
# Checking the Information about the file
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [250]:
# Checking the Shape of the file
housing.shape

(1460, 81)

In [251]:
# Data imputing / Dropping  for missing values got - 1)Alley 2) Pool QC 3)Fireplace Qu 4)MiscFeature 5)Lotfrontage
# Dropping ID as it donot add any information -
# Only 91 rows have data for Alley - hence dropping
# Only 770 rows have data for FireplaceQu -  hence dropping
# Only 7 records avialbel  for PoolQC - hence dropping
# Only 54 records avilable for MiscFeature - hence dropping
# Only 281 records available Fence  - Hence Dropping
# MiscVal col has 1408 records with val 0 out of 1460 recs - hence dropping
# PoolArea col has 1453 records as 0 - hence dropping
# ScreenPorch has 1344 records as 0 - hence dropping
# 3SsnPorch has 1463 records as 0  - hence dropping
# EnclosedPorch has 1200+ records with val 0 - hence dropping
#BsmtFinSF2  has almost 1300 records with val 0 - Hence dropping
#LowQualFinSF has almost 1300 recs with val 0 - hence dropping
#BsmtHalfBath has 1200+ records woth val 0 - hence dropping
# Street have 1400+ records with value 'Pave' - hence dropping
# Utilities Column has all rows with value AllPub - Hence Dropping
# CentralAir colm has around 80 rows with value 'N' - Hence Dropping

housing.drop(['Id','Alley','FireplaceQu' , 'PoolQC','MiscFeature', 'Fence',
'MiscVal' , 'PoolArea', 'ScreenPorch','3SsnPorch','EnclosedPorch' , 'BsmtFinSF2',
'LowQualFinSF','BsmtHalfBath','Street','Utilities','CentralAir'],axis=1 , inplace=True)

housing.shape

(1460, 64)

In [252]:
# Check for columns having NA values
housing.columns[housing.isnull().any()]

Index(['LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual',
       'GarageCond'],
      dtype='object')

In [253]:
# Replace NA with 0 for continuous variables  LotFrortage , MasVnrArea , GarageYrBlt
housing['LotFrontage'] = housing['LotFrontage'].replace(np.nan, 0)
housing['MasVnrArea'] = housing['MasVnrArea'].replace(np.nan, 0)
housing['GarageYrBlt'] = housing['GarageYrBlt'].replace(np.nan, 0)

In [254]:
# Converting non binary Categorical variabes to original values
#______________________________________________________
# COL 'MSSubClass' mapping
# 20	1-STORY 1946 & NEWER ALL STYLES
# 30	1-STORY 1945 & OLDER
# 40	1-STORY W/FINISHED ATTIC ALL AGES
# 45	1-1/2 STORY - UNFINISHED ALL AGES
# 50	1-1/2 STORY FINISHED ALL AGES
# 60	2-STORY 1946 & NEWER
# 70	2-STORY 1945 & OLDER
# 75	2-1/2 STORY ALL AGES
# 80	SPLIT OR MULTI-LEVEL
# 85	SPLIT FOYER
# 90	DUPLEX - ALL STYLES AND AGES
# 120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
# 150	1-1/2 STORY PUD - ALL AGES
# 160	2-STORY PUD - 1946 & NEWER
# 180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
# 190	2 FAMILY CONVERSION - ALL STYLES AND AGES

# and DROP COL 'MSSubClass'
housing['MSSubClass_new'] = housing['MSSubClass'].map({20:"1-STORY 1946 & NEWER ALL STYLES" , 30:"1-STORY 1945 & OLDER",
                40:"1-STORY W/FINISHED ATTIC ALL AGES", 45:"1-1/2 STORY - UNFINISHED ALL AGES", 50:"1-1/2 STORY FINISHED ALL AGES" ,
                60:"2-STORY 1946 & NEWER" , 70:"2-STORY 1945 & OLDER" ,75:"2-1/2 STORY ALL AGES",80:"SPLIT OR MULTI-LEVEL" ,
                85:"SPLIT FOYER" , 90:"DUPLEX - ALL STYLES AND AGES", 120:"1-STORY PUD - 1946 & NEWER", 150:"1-1/2 STORY PUD - ALL AGES",
                160:"2-STORY PUD - 1946 & NEWER" , 180:"PUD - MULTILEVEL - INCL SPLIT LEV/FOYER", 190:"2 FAMILY CONVERSION - ALL STYLES AND AGES"})
housing.drop('MSSubClass',axis=1,inplace=True)
#______________________________________________________
# COL 'MSZoning' mapping
      # A	Agriculture
      # C	Commercial
      # FV	Floating Village Residential
      # I	Industrial
      # RH	Residential High Density
      # RL	Residential Low Density
      # RP	Residential Low Density Park
      # RM	Residential Medium Density

# and DROP COL 'MSZoning'
housing['MSZoning_new'] = housing['MSZoning'].map({'A':"Agriculture" , 'C':"Commercial",
        'FV':"Floating Village Residential", 'I':"Industrial", 'RH':"Residential High Density" ,
        'RL':"Residential Low Density" , 'RP':"Residential Low Density Park" ,'RM':"Residential Medium Density"})
housing.drop('MSZoning',axis=1,inplace=True)
#______________________________________________________

#OverallQual: Rates the overall material and finish of the house
# 10	Very Excellent
# 9	Excellent
# 8	Very Good
# 7	Good
# 6	Above Average
# 5	Average
# 4	Below Average
# 3	Fair
# 2	Poor
# 1	Very Poor
# and DROP COL 'OverallQual'

housing['OverallQual_new'] = housing['OverallQual'].map({10:"Very Excellent" , 9:"Excellent", 8:"Very Good", 7:"Good", 6:"Above Average" ,
                                                         5:"Average" , 4:"Below Average" , 3:"Fair" , 2:"Poor" , 1:"Very Poor"})
housing.drop('OverallQual',axis=1,inplace=True)
#_________________________________________________________________________
#OverallCond: Rates the overall condition of the house
# 10	Very Excellent
# 9	Excellent
# 8	Very Good
# 7	Good
# 6	Above Average
# 5	Average
# 4	Below Average
# 3	Fair
# 2	Poor
# 1	Very Poor
# and DROP COL 'OverallCond'
housing['OverallCond_new'] = housing['OverallCond'].map({10:"Very Excellent" , 9:"Excellent", 8:"Very Good", 7:"Good", 6:"Above Average" ,
                                                         5:"Average" , 4:"Below Average" , 3:"Fair" , 2:"Poor" , 1:"Very Poor"})
housing.drop('OverallCond',axis=1,inplace=True)
#_____________________________________________________________________________
# MoSold : Month Sold
# Replace the month number with names
housing['MonthSold'] = housing['MoSold'].map({12:"Dec" , 11:"Nov", 10:"Oct", 9:"Sep", 8:"Aug", 7:"Jul", 6:"Jun" ,
                                                         5:"May" , 4:"Apr" , 3:"Mar" , 2:"Feb" , 1:"Jan"})
housing.drop('MoSold',axis=1,inplace=True)

#_____________________________________________________________________________
# LandSlope : to the original values
# Replace the month number with names
housing['LandSlope_new'] = housing['LandSlope'].map({'Gtl':"GentleSlope" , 'Mod':"MOderateSlope", 'Sev':"SevereSlope"})
housing.drop('LandSlope',axis=1,inplace=True)

#_____________________________________________________________________________
# Functional : to the original values
# Replace the month number with names
housing['Functional_new'] = housing['Functional'].map({'Typ':"TypicalFunctionality" , 'Min1':"MinorDeductions1",
                            'Min2':"MinorDeductions2",'Mod':"Moderate Deductions",'Maj1':"MajorDeductions1",
                            'Maj2':"MajorDeductions1",'Sev':"Severely Damaged",'Sal':"Salvage only"})
housing.drop('Functional',axis=1,inplace=True)


In [255]:
# Checking the statistical data for the dataframe
housing.describe()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,57.623288,10516.828082,1971.267808,1984.865753,103.117123,443.639726,567.240411,1057.429452,1162.626712,346.992466,...,1.046575,6.517808,0.613014,1868.739726,1.767123,472.980137,94.244521,46.660274,2007.815753,180921.19589
std,34.664304,9981.264932,30.202904,20.645407,180.731373,456.098091,441.866955,438.705324,386.587738,436.528436,...,0.220338,1.625393,0.644666,453.697295,0.747315,213.804841,125.338794,66.256028,1.328095,79442.502883
min,0.0,1300.0,1872.0,1950.0,0.0,0.0,0.0,0.0,334.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,34900.0
25%,42.0,7553.5,1954.0,1967.0,0.0,0.0,223.0,795.75,882.0,0.0,...,1.0,5.0,0.0,1958.0,1.0,334.5,0.0,0.0,2007.0,129975.0
50%,63.0,9478.5,1973.0,1994.0,0.0,383.5,477.5,991.5,1087.0,0.0,...,1.0,6.0,1.0,1977.0,2.0,480.0,0.0,25.0,2008.0,163000.0
75%,79.0,11601.5,2000.0,2004.0,164.25,712.25,808.0,1298.25,1391.25,728.0,...,1.0,7.0,1.0,2001.0,2.0,576.0,168.0,68.0,2009.0,214000.0
max,313.0,215245.0,2010.0,2010.0,1600.0,5644.0,2336.0,6110.0,4692.0,2065.0,...,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,2010.0,755000.0


In [256]:
housing.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,...,SaleType,SaleCondition,SalePrice,MSSubClass_new,MSZoning_new,OverallQual_new,OverallCond_new,MonthSold,LandSlope_new,Functional_new
0,65.0,8450,Reg,Lvl,Inside,CollgCr,Norm,Norm,1Fam,2Story,...,WD,Normal,208500,2-STORY 1946 & NEWER,Residential Low Density,Good,Average,Feb,GentleSlope,TypicalFunctionality
1,80.0,9600,Reg,Lvl,FR2,Veenker,Feedr,Norm,1Fam,1Story,...,WD,Normal,181500,1-STORY 1946 & NEWER ALL STYLES,Residential Low Density,Above Average,Very Good,May,GentleSlope,TypicalFunctionality
2,68.0,11250,IR1,Lvl,Inside,CollgCr,Norm,Norm,1Fam,2Story,...,WD,Normal,223500,2-STORY 1946 & NEWER,Residential Low Density,Good,Average,Sep,GentleSlope,TypicalFunctionality
3,60.0,9550,IR1,Lvl,Corner,Crawfor,Norm,Norm,1Fam,2Story,...,WD,Abnorml,140000,2-STORY 1945 & OLDER,Residential Low Density,Good,Average,Feb,GentleSlope,TypicalFunctionality
4,84.0,14260,IR1,Lvl,FR2,NoRidge,Norm,Norm,1Fam,2Story,...,WD,Normal,250000,2-STORY 1946 & NEWER,Residential Low Density,Very Good,Average,Dec,GentleSlope,TypicalFunctionality


In [257]:
# Checking the Information about the file
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 64 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   LotFrontage      1460 non-null   float64
 1   LotArea          1460 non-null   int64  
 2   LotShape         1460 non-null   object 
 3   LandContour      1460 non-null   object 
 4   LotConfig        1460 non-null   object 
 5   Neighborhood     1460 non-null   object 
 6   Condition1       1460 non-null   object 
 7   Condition2       1460 non-null   object 
 8   BldgType         1460 non-null   object 
 9   HouseStyle       1460 non-null   object 
 10  YearBuilt        1460 non-null   int64  
 11  YearRemodAdd     1460 non-null   int64  
 12  RoofStyle        1460 non-null   object 
 13  RoofMatl         1460 non-null   object 
 14  Exterior1st      1460 non-null   object 
 15  Exterior2nd      1460 non-null   object 
 16  MasVnrType       1452 non-null   object 
 17  MasVnrArea    

# Adding Dummies for Categorical Variables

In [258]:
#_______________________________________________________#
# Get the dummies for MSSubClass_new and store it in a variable for MSSubClass_new
dummies_MSSubClass = pd.get_dummies(housing['MSSubClass_new'],drop_first= True)
dummies_MSSubClass.head()

Unnamed: 0,1-1/2 STORY FINISHED ALL AGES,1-STORY 1945 & OLDER,1-STORY 1946 & NEWER ALL STYLES,1-STORY PUD - 1946 & NEWER,1-STORY W/FINISHED ATTIC ALL AGES,2 FAMILY CONVERSION - ALL STYLES AND AGES,2-1/2 STORY ALL AGES,2-STORY 1945 & OLDER,2-STORY 1946 & NEWER,2-STORY PUD - 1946 & NEWER,DUPLEX - ALL STYLES AND AGES,PUD - MULTILEVEL - INCL SPLIT LEV/FOYER,SPLIT FOYER,SPLIT OR MULTI-LEVEL
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [259]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_MSSubClass],axis=1)
#drop the 'MSSubClass_new' column as we have the dummy variable now
housing = housing.drop('MSSubClass_new',axis = 1)

In [260]:
#_______________________________________________________#
# Get the dummies for MonthSold and store it in a variable for MonthSold
dummies_MonthSold = pd.get_dummies(housing['MonthSold'],drop_first= True)
dummies_MonthSold.head()

Unnamed: 0,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep
0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0


In [261]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_MonthSold],axis=1)
#drop the 'MonthSold' column as we have the dummy variable now
housing = housing.drop('MonthSold',axis = 1)

In [262]:
#_______________________________________________________#
# Get the dummies for MSZoning and store it in a variable for MSZoning
dummies_MSZoning = pd.get_dummies(housing['MSZoning_new'],drop_first= True)
dummies_MSZoning.head()

Unnamed: 0,Residential High Density,Residential Low Density,Residential Medium Density
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [263]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_MSZoning],axis=1)
#drop the 'MSZoning' column as we have the dummy variable now
housing = housing.drop('MSZoning_new',axis = 1)

In [264]:
#_______________________________________________________#
# Get the dummies for LotShape and store it in a variable for LotShape
dummies_LotShape = pd.get_dummies(housing['LotShape'],drop_first= True)
dummies_LotShape.head()

Unnamed: 0,IR2,IR3,Reg
0,0,0,1
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0


In [265]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_LotShape],axis=1)
#drop the 'LotShape' column as we have the dummy variable now
housing = housing.drop('LotShape',axis = 1)

In [266]:
#_______________________________________________________#
# Get the dummies for LandContour and store it in a variable for LandContour
dummies_LandContour = pd.get_dummies(housing['LandContour'],drop_first= True)
dummies_LandContour.head()

Unnamed: 0,HLS,Low,Lvl
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [267]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_LandContour],axis=1)
#drop the 'LandContour' column as we have the dummy variable now
housing = housing.drop('LandContour',axis = 1)

In [268]:
#_______________________________________________________#
# Get the dummies for LotConfig and store it in a variable for LotConfig
dummies_LotConfig = pd.get_dummies(housing['LotConfig'],drop_first= True)
dummies_LotConfig.head()

Unnamed: 0,CulDSac,FR2,FR3,Inside
0,0,0,0,1
1,0,1,0,0
2,0,0,0,1
3,0,0,0,0
4,0,1,0,0


In [269]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_LotConfig],axis=1)
#drop the 'LotConfig' column as we have the dummy variable now
housing = housing.drop('LotConfig',axis = 1)

In [270]:
#_______________________________________________________#
# Get the dummies for seasons and store it in a variable for seasons
dummies_LandSlope_new = pd.get_dummies(housing['LandSlope_new'],drop_first= True)
dummies_LandSlope_new.head()

Unnamed: 0,MOderateSlope,SevereSlope
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [271]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_LandSlope_new],axis=1)
#drop the 'LandSlope_new' column as we have the dummy variable now
housing = housing.drop('LandSlope_new',axis = 1)

In [272]:
#_______________________________________________________#
# Get the dummies for Neighborhood and store it in a variable for Neighborhood
dummies_Neighborhood = pd.get_dummies(housing['Neighborhood'],drop_first= True)
dummies_Neighborhood.head()

Unnamed: 0,Blueste,BrDale,BrkSide,ClearCr,CollgCr,Crawfor,Edwards,Gilbert,IDOTRR,MeadowV,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [273]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Neighborhood],axis=1)
#drop the 'Neighborhood' column as we have the dummy variable now
housing = housing.drop('Neighborhood',axis = 1)

In [274]:
#_______________________________________________________#
# Get the dummies for Condition1 and store it in a variable for Condition1
dummies_Condition1 = pd.get_dummies(housing['Condition1'],drop_first= True)
dummies_Condition1.head()

Unnamed: 0,Feedr,Norm,PosA,PosN,RRAe,RRAn,RRNe,RRNn
0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0


In [275]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Condition1],axis=1)
#drop the 'Condition1' column as we have the dummy variable now
housing = housing.drop('Condition1',axis = 1)

In [276]:
#_______________________________________________________#
# Get the dummies for Condition2 and store it in a variable for Condition2
dummies_Condition2 = pd.get_dummies(housing['Condition2'],drop_first= True)
dummies_Condition2.head()

Unnamed: 0,Feedr,Norm,PosA,PosN,RRAe,RRAn,RRNn
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0


In [277]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Condition2],axis=1)
#drop the 'Condition2' column as we have the dummy variable now
housing = housing.drop('Condition2',axis = 1)

In [278]:
#_______________________________________________________#
# Get the dummies for BldgType and store it in a variable for BldgType
dummies_BldgType = pd.get_dummies(housing['BldgType'],drop_first= True)
dummies_BldgType.head()

Unnamed: 0,2fmCon,Duplex,Twnhs,TwnhsE
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [279]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BldgType],axis=1)
#drop the 'BldgType' column as we have the dummy variable now
housing = housing.drop('BldgType',axis = 1)

In [280]:
#_______________________________________________________#
# Get the dummies for HouseStyle and store it in a variable for HouseStyle
dummies_HouseStyle = pd.get_dummies(housing['HouseStyle'],drop_first= True)
dummies_HouseStyle.head()

Unnamed: 0,1.5Unf,1Story,2.5Fin,2.5Unf,2Story,SFoyer,SLvl
0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0


In [281]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_HouseStyle],axis=1)
#drop the 'HouseStyle' column as we have the dummy variable now
housing = housing.drop('HouseStyle',axis = 1)

In [282]:
#_______________________________________________________#
# Get the dummies for OverallQual_new and store it in a variable for OverallQual_new
dummies_OverallQual_new = pd.get_dummies(housing['OverallQual_new'],drop_first= True)
dummies_OverallQual_new.head()

Unnamed: 0,Average,Below Average,Excellent,Fair,Good,Poor,Very Excellent,Very Good,Very Poor
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0


In [283]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_OverallQual_new],axis=1)
#drop the 'OverallQual_new' column as we have the dummy variable now
housing = housing.drop('OverallQual_new',axis = 1)

In [284]:
#_______________________________________________________#
# Get the dummies for OverallCond_new and store it in a variable for OverallCond_new
dummies_OverallCond_new = pd.get_dummies(housing['OverallCond_new'],drop_first= True)
dummies_OverallCond_new.head()

Unnamed: 0,Average,Below Average,Excellent,Fair,Good,Poor,Very Good,Very Poor
0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0


In [285]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_OverallCond_new],axis=1)
#drop the 'OverallCond_new' column as we have the dummy variable now
housing = housing.drop('OverallCond_new',axis = 1)

In [286]:
#_______________________________________________________#
# Get the dummies for RoofStyle and store it in a variable for RoofStyle
dummies_RoofStyle = pd.get_dummies(housing['RoofStyle'],drop_first= True)
dummies_RoofStyle.head()

Unnamed: 0,Gable,Gambrel,Hip,Mansard,Shed
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [287]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_RoofStyle],axis=1)
#drop the 'RoofStyle' column as we have the dummy variable now
housing = housing.drop('RoofStyle',axis = 1)

In [288]:
#_______________________________________________________#
# Get the dummies for RoofMatl and store it in a variable for RoofMatl
dummies_RoofMatl = pd.get_dummies(housing['RoofMatl'],drop_first= True)
dummies_RoofMatl.head()

Unnamed: 0,CompShg,Membran,Metal,Roll,Tar&Grv,WdShake,WdShngl
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [289]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_RoofMatl],axis=1)
#drop the 'RoofMatl' column as we have the dummy variable now
housing = housing.drop('RoofMatl',axis = 1)

In [290]:
#_______________________________________________________#
# Get the dummies for Exterior1st and store it in a variable for Exterior1st
dummies_Exterior1st = pd.get_dummies(housing['Exterior1st'],drop_first= True)
dummies_Exterior1st.head()

Unnamed: 0,AsphShn,BrkComm,BrkFace,CBlock,CemntBd,HdBoard,ImStucc,MetalSd,Plywood,Stone,Stucco,VinylSd,Wd Sdng,WdShing
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [291]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Exterior1st],axis=1)
#drop the 'Exterior1st' column as we have the dummy variable now
housing = housing.drop('Exterior1st',axis = 1)

In [292]:
#_______________________________________________________#
# Get the dummies for Exterior2nd and store it in a variable for Exterior2nd
dummies_Exterior2nd = pd.get_dummies(housing['Exterior2nd'],drop_first= True)
dummies_Exterior2nd.head()

Unnamed: 0,AsphShn,Brk Cmn,BrkFace,CBlock,CmentBd,HdBoard,ImStucc,MetalSd,Other,Plywood,Stone,Stucco,VinylSd,Wd Sdng,Wd Shng
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [293]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Exterior2nd],axis=1)
#drop the 'Exterior2nd' column as we have the dummy variable now
housing = housing.drop('Exterior2nd',axis = 1)

In [294]:
#_______________________________________________________#
# Get the dummies for MasVnrType and store it in a variable for MasVnrType
dummies_MasVnrType = pd.get_dummies(housing['MasVnrType'],drop_first= True)
dummies_MasVnrType.head()

Unnamed: 0,BrkFace,None,Stone
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0


In [295]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_MasVnrType],axis=1)
#drop the 'MasVnrType' column as we have the dummy variable now
housing = housing.drop('MasVnrType',axis = 1)

In [296]:
#_______________________________________________________#
# Get the dummies for ExterQual and store it in a variable for ExterQual
dummies_ExterQual = pd.get_dummies(housing['ExterQual'],drop_first= True)
dummies_ExterQual.head()

Unnamed: 0,Fa,Gd,TA
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0


In [297]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_ExterQual],axis=1)
#drop the 'ExterQual' column as we have the dummy variable now
housing = housing.drop('ExterQual',axis = 1)

In [298]:
#_______________________________________________________#
# Get the dummies for ExterCond and store it in a variable for ExterCond
dummies_ExterCond = pd.get_dummies(housing['ExterCond'],drop_first= True)
dummies_ExterCond.head()

Unnamed: 0,Fa,Gd,Po,TA
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [299]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_ExterCond],axis=1)
#drop the 'ExterCond' column as we have the dummy variable now
housing = housing.drop('ExterCond',axis = 1)

In [300]:
#_______________________________________________________#
# Get the dummies for Foundation and store it in a variable for Foundation
dummies_Foundation = pd.get_dummies(housing['Foundation'],drop_first= True)
dummies_Foundation.head()

Unnamed: 0,CBlock,PConc,Slab,Stone,Wood
0,0,1,0,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,0
4,0,1,0,0,0


In [301]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Foundation],axis=1)
#drop the 'Foundation' column as we have the dummy variable now
housing = housing.drop('Foundation',axis = 1)

In [302]:
#_______________________________________________________#
# Get the dummies for BsmtQual and store it in a variable for BsmtQual
dummies_BsmtQual = pd.get_dummies(housing['BsmtQual'],drop_first= True)
dummies_BsmtQual.head()

Unnamed: 0,Fa,Gd,TA
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,1,0


In [303]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BsmtQual],axis=1)
#drop the 'BsmtQual' column as we have the dummy variable now
housing = housing.drop('BsmtQual',axis = 1)

In [304]:
#_______________________________________________________#
# Get the dummies for BsmtCond and store it in a variable for BsmtCond
dummies_BsmtCond = pd.get_dummies(housing['BsmtCond'],drop_first= True)
dummies_BsmtCond.head()

Unnamed: 0,Gd,Po,TA
0,0,0,1
1,0,0,1
2,0,0,1
3,1,0,0
4,0,0,1


In [305]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BsmtCond],axis=1)
#drop the 'BsmtCond' column as we have the dummy variable now
housing = housing.drop('BsmtCond',axis = 1)

In [306]:
#_______________________________________________________#
# Get the dummies for BsmtExposure and store it in a variable for BsmtExposure
dummies_BsmtExposure = pd.get_dummies(housing['BsmtExposure'],drop_first= True)
dummies_BsmtExposure.head()

Unnamed: 0,Gd,Mn,No
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,0,0


In [307]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BsmtExposure],axis=1)
#drop the 'BsmtExposure' column as we have the dummy variable now
housing = housing.drop('BsmtExposure',axis = 1)

In [308]:
#_______________________________________________________#
# Get the dummies for BsmtFinType1 and store it in a variable for BsmtFinType1
dummies_BsmtFinType1 = pd.get_dummies(housing['BsmtFinType1'],drop_first= True)
dummies_BsmtFinType1.head()

Unnamed: 0,BLQ,GLQ,LwQ,Rec,Unf
0,0,1,0,0,0
1,0,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,0
4,0,1,0,0,0


In [309]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BsmtFinType1],axis=1)
#drop the 'BsmtFinType1' column as we have the dummy variable now
housing = housing.drop('BsmtFinType1',axis = 1)

In [310]:
#_______________________________________________________#
# Get the dummies for HeatingQC and store it in a variable for HeatingQC
dummies_HeatingQC = pd.get_dummies(housing['HeatingQC'],drop_first= True)
dummies_HeatingQC.head()

Unnamed: 0,Fa,Gd,Po,TA
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,1,0,0
4,0,0,0,0


In [311]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_HeatingQC],axis=1)
#drop the 'HeatingQC' column as we have the dummy variable now
housing = housing.drop('HeatingQC',axis = 1)

In [312]:
#_______________________________________________________#
# Get the dummies for Electrical and store it in a variable for Electrical
dummies_Electrical = pd.get_dummies(housing['Electrical'],drop_first= True)
dummies_Electrical.head()

Unnamed: 0,FuseF,FuseP,Mix,SBrkr
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [313]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Electrical],axis=1)
#drop the 'Electrical' column as we have the dummy variable now
housing = housing.drop('Electrical',axis = 1)

In [314]:
#_______________________________________________________#
# Get the dummies for KitchenQual and store it in a variable for KitchenQual
dummies_KitchenQual = pd.get_dummies(housing['KitchenQual'],drop_first= True)
dummies_KitchenQual.head()

Unnamed: 0,Fa,Gd,TA
0,0,1,0
1,0,0,1
2,0,1,0
3,0,1,0
4,0,1,0


In [315]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_KitchenQual],axis=1)
#drop the 'KitchenQual' column as we have the dummy variable now
housing = housing.drop('KitchenQual',axis = 1)

In [316]:
#_______________________________________________________#
# Get the dummies for Functional and store it in a variable for Functional
dummies_Functional = pd.get_dummies(housing['Functional_new'],drop_first= True)
dummies_Functional.head()

Unnamed: 0,MinorDeductions1,MinorDeductions2,Moderate Deductions,Severely Damaged,TypicalFunctionality
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [317]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Functional],axis=1)
#drop the 'Functional' column as we have the dummy variable now
housing = housing.drop('Functional_new',axis = 1)

In [318]:
#_______________________________________________________#
# Get the dummies for GarageType and store it in a variable for GarageType
dummies_GarageType = pd.get_dummies(housing['GarageType'],drop_first= True)
dummies_GarageType.head()

Unnamed: 0,Attchd,Basment,BuiltIn,CarPort,Detchd
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,0,0,0,0,1
4,1,0,0,0,0


In [319]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_GarageType],axis=1)
#drop the 'GarageType' column as we have the dummy variable now
housing = housing.drop('GarageType',axis = 1)

In [320]:
#_______________________________________________________#
# Get the dummies for GarageFinish and store it in a variable for GarageFinish
dummies_GarageFinish = pd.get_dummies(housing['GarageFinish'],drop_first= True)
dummies_GarageFinish.head()

Unnamed: 0,RFn,Unf
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0


In [321]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_GarageFinish],axis=1)
#drop the 'GarageFinish' column as we have the dummy variable now
housing = housing.drop('GarageFinish',axis = 1)

In [322]:
#_______________________________________________________#
# Get the dummies for GarageQual and store it in a variable for GarageQual
dummies_GarageQual = pd.get_dummies(housing['GarageQual'],drop_first= True)
dummies_GarageQual.head()

Unnamed: 0,Fa,Gd,Po,TA
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [323]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_GarageQual],axis=1)
#drop the 'GarageQual' column as we have the dummy variable now
housing = housing.drop('GarageQual',axis = 1)

In [324]:
#_______________________________________________________#
# Get the dummies for GarageCond and store it in a variable for GarageCond
dummies_GarageCond = pd.get_dummies(housing['GarageCond'],drop_first= True)
dummies_GarageCond.head()

Unnamed: 0,Fa,Gd,Po,TA
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [325]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_GarageCond],axis=1)
#drop the 'GarageCond' column as we have the dummy variable now
housing = housing.drop('GarageCond',axis = 1)

In [326]:
#_______________________________________________________#
# Get the dummies for PavedDrive and store it in a variable for PavedDrive
dummies_PavedDrive = pd.get_dummies(housing['PavedDrive'],drop_first= True)
dummies_PavedDrive.head()

Unnamed: 0,P,Y
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [327]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_PavedDrive],axis=1)
#drop the 'PavedDrive' column as we have the dummy variable now
housing = housing.drop('PavedDrive',axis = 1)

In [328]:
#_______________________________________________________#
# Get the dummies for SaleType and store it in a variable for SaleType
dummies_SaleType = pd.get_dummies(housing['SaleType'],drop_first= True)
dummies_SaleType.head()

Unnamed: 0,CWD,Con,ConLD,ConLI,ConLw,New,Oth,WD
0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1


In [329]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_SaleType],axis=1)
#drop the 'SaleType' column as we have the dummy variable now
housing = housing.drop('SaleType',axis = 1)

In [330]:
#_______________________________________________________#
# Get the dummies for SaleCondition and store it in a variable for SaleCondition
dummies_SaleCondition = pd.get_dummies(housing['SaleCondition'],drop_first= True)
dummies_SaleCondition.head()

Unnamed: 0,AdjLand,Alloca,Family,Normal,Partial
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,0,0
4,0,0,0,1,0


In [331]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_SaleCondition],axis=1)
#drop the 'SaleCondition' column as we have the dummy variable now
housing = housing.drop('SaleCondition',axis = 1)

In [332]:
#_______________________________________________________#
# Get the dummies for BsmtFinType2 and store it in a variable for BsmtFinType2
dummies_BsmtFinType2 = pd.get_dummies(housing['BsmtFinType2'],drop_first= True)
dummies_BsmtFinType2.head()

Unnamed: 0,BLQ,GLQ,LwQ,Rec,Unf
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [333]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_BsmtFinType2],axis=1)
#drop the 'BsmtFinType2' column as we have the dummy variable now
housing = housing.drop('BsmtFinType2',axis = 1)

In [334]:
#_______________________________________________________#
# Get the dummies for BsmtFinType2 and store it in a variable for Heating
dummies_Heating = pd.get_dummies(housing['Heating'],drop_first= True)
dummies_Heating.head()

Unnamed: 0,GasA,GasW,Grav,OthW,Wall
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [335]:
# Concat the dummy variables with the original dataframe
housing = pd.concat([housing,dummies_Heating],axis=1)
#drop the 'Heating' column as we have the dummy variable now
housing = housing.drop('Heating',axis = 1)

In [336]:
# Checking the Information about the file
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 258 entries, LotFrontage to Wall
dtypes: float64(3), int64(22), uint8(233)
memory usage: 617.5 KB


# Split the data into Train & Test set & perform scaling on continuous variables

In [337]:
# Train , test split of Data in ratio of 70 : 30
df_train , df_test = train_test_split(housing,train_size = .7 , random_state = 100)
print(df_train.shape)
print(df_test.shape)

(1021, 258)
(439, 258)


In [338]:
# Rescaling  numeric variables
# Instantiate an object
scaler = MinMaxScaler()
# Create list of only numeric variables which we want to rescale
num_vars = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
            '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','FullBath','HalfBath',
            'BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt',
            'GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','SalePrice','YrSold','YearBuilt','YearRemodAdd']
# Fit on data (Train Set)
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
# Fit on data (Test set)
df_test[num_vars] = scaler.fit_transform(df_test[num_vars])
df_train.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,BLQ,GLQ,LwQ,Rec,Unf,GasA,GasW,Grav,OthW,Wall
318,0.28754,0.039403,0.876812,0.716667,0.16,0.174876,0.15411,0.220458,0.231481,0.616949,...,0,0,0,0,1,1,0,0,0,0
239,0.166134,0.033981,0.528986,0.0,0.0,0.016655,0.274401,0.120295,0.098611,0.333656,...,0,0,0,0,1,1,0,0,0,0
986,0.188498,0.017931,0.275362,0.883333,0.0,0.0,0.20762,0.079378,0.145602,0.307022,...,0,0,0,0,1,1,0,0,0,0
1416,0.191693,0.046139,0.094203,0.0,0.0,0.0,0.33262,0.127169,0.202315,0.505569,...,0,0,0,0,1,1,0,0,0,0
390,0.159744,0.032409,0.202899,0.0,0.0,0.0427,0.098031,0.140917,0.136343,0.19661,...,1,0,0,0,0,1,0,0,0,0


In [339]:
df_train.describe()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,BLQ,GLQ,LwQ,Rec,Unf,GasA,GasW,Grav,OthW,Wall
count,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,...,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0,1021.0
mean,0.184049,0.042162,0.719719,0.583072,0.065306,0.079406,0.241329,0.173813,0.184401,0.166364,...,0.022527,0.012733,0.029383,0.03526,0.855044,0.977473,0.012733,0.003918,0.000979,0.003918
std,0.10996,0.048221,0.219718,0.343416,0.117088,0.082409,0.192097,0.075139,0.092106,0.210895,...,0.148462,0.112173,0.16896,0.184526,0.352229,0.148462,0.112173,0.0625,0.031296,0.0625
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.13738,0.027937,0.594203,0.283333,0.0,0.0,0.092466,0.129787,0.116667,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,0.191693,0.037555,0.731884,0.733333,0.0,0.069454,0.197774,0.162684,0.165278,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,0.249201,0.046943,0.927536,0.9,0.09875,0.126152,0.345034,0.215057,0.243056,0.352542,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [340]:
# Sale Price in y (for train set)
y_train = df_train.pop('SalePrice')
X_train = df_train
X_train.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,BLQ,GLQ,LwQ,Rec,Unf,GasA,GasW,Grav,OthW,Wall
318,0.28754,0.039403,0.876812,0.716667,0.16,0.174876,0.15411,0.220458,0.231481,0.616949,...,0,0,0,0,1,1,0,0,0,0
239,0.166134,0.033981,0.528986,0.0,0.0,0.016655,0.274401,0.120295,0.098611,0.333656,...,0,0,0,0,1,1,0,0,0,0
986,0.188498,0.017931,0.275362,0.883333,0.0,0.0,0.20762,0.079378,0.145602,0.307022,...,0,0,0,0,1,1,0,0,0,0
1416,0.191693,0.046139,0.094203,0.0,0.0,0.0,0.33262,0.127169,0.202315,0.505569,...,0,0,0,0,1,1,0,0,0,0
390,0.159744,0.032409,0.202899,0.0,0.0,0.0427,0.098031,0.140917,0.136343,0.19661,...,1,0,0,0,0,1,0,0,0,0


In [341]:
# Sale Price in y (for train set)
y_test = df_test.pop('SalePrice')
X_test = df_test
X_test.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,BLQ,GLQ,LwQ,Rec,Unf,GasA,GasW,Grav,OthW,Wall
1436,0.191693,0.048827,0.716418,0.35,0.0,0.281536,0.128165,0.342314,0.242009,0.0,...,0,0,0,0,1,1,0,0,0,0
57,0.284345,0.065599,0.962687,0.9,0.0,0.0,0.444444,0.340729,0.240183,0.541221,...,0,0,0,0,1,1,0,0,0,0
780,0.201278,0.041693,0.895522,0.766667,0.04126,0.0,0.639276,0.490095,0.419635,0.0,...,0,0,0,0,1,1,0,0,0,0
382,0.252396,0.05038,0.977612,0.933333,0.0,0.0,0.485271,0.372029,0.276256,0.539962,...,0,0,0,0,1,1,0,0,0,0
1170,0.242812,0.054407,0.761194,0.45,0.0,0.238574,0.296641,0.434231,0.357991,0.0,...,0,0,0,0,1,1,0,0,0,0


In [342]:
# Building model using Reccursing feature elemination
# Running Linear regression with number of variables as 35
lm = LinearRegression()
lm.fit(X_train,y_train)
# Reducing variables using RFE & rebuilding the model
rfe = RFE(lm, n_features_to_select = 35, step = 1)   # Running Reccursive feature elemination
rfe = rfe.fit(X_train,y_train)
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('LotFrontage', False, 67),
 ('LotArea', True, 1),
 ('YearBuilt', True, 1),
 ('YearRemodAdd', False, 27),
 ('MasVnrArea', False, 46),
 ('BsmtFinSF1', True, 1),
 ('BsmtUnfSF', False, 45),
 ('TotalBsmtSF', True, 1),
 ('1stFlrSF', True, 1),
 ('2ndFlrSF', True, 1),
 ('GrLivArea', True, 1),
 ('BsmtFullBath', False, 60),
 ('FullBath', False, 69),
 ('HalfBath', False, 179),
 ('BedroomAbvGr', True, 1),
 ('KitchenAbvGr', True, 1),
 ('TotRmsAbvGrd', False, 66),
 ('Fireplaces', False, 106),
 ('GarageYrBlt', False, 108),
 ('GarageCars', False, 55),
 ('GarageArea', True, 1),
 ('WoodDeckSF', False, 221),
 ('OpenPorchSF', False, 82),
 ('YrSold', False, 192),
 ('1-1/2 STORY FINISHED ALL AGES', False, 93),
 ('1-STORY 1945 & OLDER', False, 190),
 ('1-STORY 1946 & NEWER ALL STYLES', False, 96),
 ('1-STORY PUD - 1946 & NEWER', False, 64),
 ('1-STORY W/FINISHED ATTIC ALL AGES', False, 215),
 ('2 FAMILY CONVERSION - ALL STYLES AND AGES', False, 11),
 ('2-1/2 STORY ALL AGES', False, 84),
 ('2-STORY 1945 & O

In [343]:
# Below columns were picked by RFE
col = X_train.columns[rfe.support_]
col

Index(['LotArea', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageArea',
       'SevereSlope', 'PosA', 'PosN', 'RRAe', 'Excellent', 'Very Excellent',
       'Very Good', 'Excellent', 'Fair', 'Shed', 'CompShg', 'Membran', 'Metal',
       'Roll', 'Tar&Grv', 'WdShake', 'WdShngl', 'CBlock', 'Stone', 'Wood',
       'Po', 'Severely Damaged', 'Con', 'Partial', 'OthW'],
      dtype='object')

In [344]:
# Building the model
X_train_rfe = X_train[col]       # Using the selected columns given by RFE
# Adding a constant
X_train_rfe = sm.add_constant(X_train_rfe)
# Running the Linear regression model
lm = sm.OLS(y_train,X_train_rfe).fit()

# Print the Statsmodel summary  of  the Linear regression model built
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.914
Method:                 Least Squares   F-statistic:                     232.7
Date:                Wed, 02 Aug 2023   Prob (F-statistic):               0.00
Time:                        11:34:26   Log-Likelihood:                 2085.5
No. Observations:                1021   AIC:                            -4075.
Df Residuals:                     973   BIC:                            -3838.
Df Model:                          47                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -1.1687      0.038  

In [345]:
# Running for test set
# Reducing variables using RFE & rebuilding the model
lm1 = LinearRegression()
lm1.fit(X_test,y_test)
rfe = RFE(lm1, n_features_to_select = 35, step = 1)   # Running Reccursive feature elemination
rfe = rfe.fit(X_test,y_test)
list(zip(X_test.columns,rfe.support_,rfe.ranking_))

[('LotFrontage', False, 148),
 ('LotArea', True, 1),
 ('YearBuilt', False, 124),
 ('YearRemodAdd', False, 152),
 ('MasVnrArea', False, 72),
 ('BsmtFinSF1', True, 1),
 ('BsmtUnfSF', False, 145),
 ('TotalBsmtSF', False, 168),
 ('1stFlrSF', True, 1),
 ('2ndFlrSF', False, 139),
 ('GrLivArea', True, 1),
 ('BsmtFullBath', False, 160),
 ('FullBath', False, 74),
 ('HalfBath', False, 103),
 ('BedroomAbvGr', False, 157),
 ('KitchenAbvGr', False, 181),
 ('TotRmsAbvGrd', False, 71),
 ('Fireplaces', False, 110),
 ('GarageYrBlt', False, 190),
 ('GarageCars', False, 205),
 ('GarageArea', False, 47),
 ('WoodDeckSF', False, 218),
 ('OpenPorchSF', False, 211),
 ('YrSold', False, 182),
 ('1-1/2 STORY FINISHED ALL AGES', False, 6),
 ('1-STORY 1945 & OLDER', False, 16),
 ('1-STORY 1946 & NEWER ALL STYLES', False, 15),
 ('1-STORY PUD - 1946 & NEWER', False, 9),
 ('1-STORY W/FINISHED ATTIC ALL AGES', False, 4),
 ('2 FAMILY CONVERSION - ALL STYLES AND AGES', True, 1),
 ('2-1/2 STORY ALL AGES', True, 1),
 ('2-

In [346]:
# Below columns were picked by RFE
col = X_test.columns[rfe.support_]
col

Index(['LotArea', 'BsmtFinSF1', '1stFlrSF', 'GrLivArea',
       '2 FAMILY CONVERSION - ALL STYLES AND AGES', '2-1/2 STORY ALL AGES',
       '2-STORY 1946 & NEWER', 'DUPLEX - ALL STYLES AND AGES', 'SPLIT FOYER',
       'FR3', 'SevereSlope', 'NridgHt', 'StoneBr', 'PosA', 'Duplex',
       'Excellent', 'Poor', 'Mansard', 'Tar&Grv', 'WdShake', 'WdShngl',
       'Other', 'Stone', 'Po', 'Mix', 'Fa', 'Gd', 'Po', 'TA', 'Fa', 'Gd', 'Po',
       'TA', 'Partial', 'GasA'],
      dtype='object')

In [347]:
# Building the model
X_test_rfe = X_test[col]       # Using the selected columns given by RFE
# Adding a constant
X_test_rfe = sm.add_constant(X_test_rfe)
# Running the Linear regression model
lm = sm.OLS(y_test,X_test_rfe).fit()

# Print the Statsmodel summary  of  the Linear regression model built
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.889
Method:                 Least Squares   F-statistic:                     66.10
Date:                Wed, 02 Aug 2023   Prob (F-statistic):          8.73e-163
Time:                        11:34:29   Log-Likelihood:                 752.56
No. Observations:                 439   AIC:                            -1395.
Df Residuals:                     384   BIC:                            -1170.
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------

# Building Linear Regression Model

In [348]:
# Build Linear Regression model for Train set & review the R2 score
reg = LinearRegression()
reg.fit(X_train,y_train)

In [349]:
#Print INtercept & coefficient values
print(reg.intercept_)
print(reg.coef_)
print(reg.feature_names_in_)

-1.136610023661783
[ 1.26613374e-02  1.99180589e-01  8.93581340e-02  7.76560754e-03
  3.10337760e-02  3.96293052e-02 -3.84473798e-02  3.11618779e-01
  2.34330142e-01  1.67174386e-01  1.54859417e-01  9.03018915e-03
  9.01427221e-03 -4.31904249e-03 -2.00594185e-02 -3.56014010e-02
 -2.05885938e-02  8.54620596e-03  9.86168498e-03  1.15679084e-02
  2.64855711e-02  2.28755053e-03  1.30092301e-02 -1.07410090e-03
 -1.65545435e-03  7.33519027e-03 -5.73211211e-04 -1.73434953e-03
  5.23481774e-03 -5.34362060e+10 -9.44297677e-03  4.92313298e-03
  6.05746827e-03 -6.56843185e-04 -3.53380851e+10  2.04992294e-03
 -9.02128220e-03 -4.95624542e-03 -4.69017029e-03  3.19719315e-04
 -2.75826454e-03  6.73389435e-03  5.91516495e-03 -1.43241882e-03
  4.52756882e-04  6.44302368e-03 -1.13952160e-03 -3.13282013e-03
 -9.48905945e-05  2.23588943e-02  1.76677704e-02  1.45988464e-02
  2.11772323e-03  1.02334023e-02  1.02567673e-03 -1.18279457e-03
 -7.03859329e-03  3.97682190e-03  1.18548870e-02 -9.96971130e-03
  7.65

In [350]:
# Do prediction  for Training set
y_train_pred = reg.predict(X_train)
y_train_pred

array([0.39037061, 0.11142469, 0.10484815, ..., 0.41086078, 0.09521986,
       0.32393385])

In [351]:
# Build Linear Regression model for Test set & review the R2 score
reg.fit(X_test,y_test)

In [352]:
#Print Intercept & coefficient values
print(reg.intercept_)
print(reg.coef_)

-200376811210.22238
[-1.48071201e-03  2.75753637e-01 -1.10904631e-02 -2.04474807e-02
  1.94115791e-02  4.03986921e-02 -5.04591088e-02  1.68488983e-02
  4.16835856e-01  1.69970251e-01 -3.14299980e-01 -1.78272951e-02
  6.01273082e-02  2.65970076e-02 -1.98380604e-02 -7.63156904e-04
  6.24165807e-02  1.06853705e-02  4.09539046e-03  8.20414268e-03
  2.35002672e-02  2.31810652e-02  8.95552957e-03  6.96871523e-03
  5.40512452e+11  5.40512452e+11  5.40512452e+11  9.55027087e+11
  5.40512452e+11  5.40512452e+11  5.40512452e+11  5.40512452e+11
  5.40512452e+11  9.55027087e+11  4.66700011e+11  9.55027087e+11
  5.40512452e+11  5.40512452e+11 -7.93457031e-03 -2.75268555e-02
 -2.80265808e-02 -1.05285645e-03 -1.36413574e-02  1.52587891e-05
  7.01904297e-03 -6.77490234e-03 -1.85852051e-02 -1.13067627e-02
 -9.73510742e-03 -2.39791870e-02 -9.93347168e-03  9.14001465e-03
  1.62963867e-02 -4.51288223e-02 -2.38037109e-03 -8.94165039e-03
 -3.03916931e-02 -1.58691406e-02  1.11846924e-02 -4.14276123e-03
 -8.9

In [353]:
# Do prediction  for Test set
y_test_pred = reg.predict(X_test)
y_test_pred

array([ 0.12963867,  0.26245117,  0.19824219,  0.28051758,  0.18115234,
        0.33337402,  0.30603027,  0.42675781,  0.07336426,  0.24151611,
        0.31256104,  0.22485352,  0.21557617,  0.20465088,  0.12390137,
        0.10760498,  0.21276855,  0.09710693,  0.15600586,  0.13751221,
        0.24829102,  0.11920166,  0.15588379,  0.14733887,  0.24926758,
        0.19055176, -0.00415039,  0.12237549,  0.28723145,  0.19604492,
        0.19470215,  0.31286621,  0.33361816,  0.10266113,  0.12902832,
        0.10388184,  0.16455078,  0.07531738,  0.15734863,  0.16741943,
        0.63806152,  0.31188965,  0.13525391,  0.0177002 ,  0.26489258,
       -0.00247192,  0.38787842,  0.16589355,  0.20117188,  0.14733887,
        0.29321289,  0.15246582,  0.49865723,  0.34692383,  0.22290039,
        0.08581543,  0.03622437,  0.25976562,  0.19311523,  0.7331543 ,
        0.14758301,  0.29125977,  0.2019043 ,  0.27587891,  0.22192383,
        0.31726074,  0.14013672,  0.17456055,  0.31384277,  0.17

In [354]:
# Computing the R2 , RSS , MSE , RMSE values - Training & test set & putting in a metric table
metric = []
# R2 score for train set
r2_train_lr = r2_score(y_train,y_train_pred)
print(r2_train_lr)
metric.append(r2_train_lr)
# R2 score for test set
r2_test_lr = r2_score(y_test,y_test_pred)
print(r2_test_lr)
metric.append(r2_test_lr)
# RSS  for train set
RSS_train_lr = np.sum(np.square(y_train - y_train_pred))
print(RSS_train_lr)
metric.append(RSS_train_lr)
# RSS  for test set
RSS_test_lr = np.sum(np.square(y_test - y_test_pred))
print(RSS_test_lr)
metric.append(RSS_test_lr)
# MSE  for train set
MSE_train_lr = mean_squared_error(y_train,y_train_pred)
print(MSE_train_lr)
metric.append(MSE_train_lr)
# MSE  for train set
MSE_test_lr = mean_squared_error(y_test,y_test_pred)
print(MSE_test_lr)
metric.append(MSE_test_lr)
# RMSE  for train set
RMSE_train_lr = MSE_train_lr**0.5
print(RMSE_train_lr)
metric.append(RMSE_train_lr)
# RMSE  for test set
RMSE_test_lr = MSE_test_lr**0.5
print(RMSE_test_lr)
metric.append(RMSE_test_lr)

0.9552118053411991
0.9590772069514153
0.5512200041630403
0.351268485850443
0.0005398824722458768
0.0008001560042151322
0.02323537114499953
0.028287028903989406


# ****Applying RIDGE Regularisation*****

In [355]:
#Applying Ridge regression by varying hyperparameter Lamda
# List of Lamda / Alpha to tune
params = {'alpha': [0.0001 , 0.005 , 0.001, 0.05 , 0.01, 0.1, 0.2,
                    0.3 , 0.4 , 0.5, 0.6 , 0.7 , 0.8 , 0.9 , 1.0,
                    10.0, 50, 100, 500, 1000]}

ridge = Ridge()

# Cross Validation with 5 folds
folds = 5
model_cv = GridSearchCV(estimator=ridge,
                        param_grid = params,
                        scoring = 'neg_mean_absolute_error',
                        cv = folds,
                        return_train_score = True,
                        verbose = 1)
model_cv.fit(X_train_rfe,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [356]:
# Printing the best hyperparameter
# This will return us the highest negative mean absolute error
print(model_cv.best_params_)

{'alpha': 0.05}


In [357]:
# Fitting the Ridge model for Lamda/Alpha = 0.05
alpha_best = 0.05
ridge = Ridge(alpha = alpha_best)
ridge.fit(X_train_rfe,y_train)
print(ridge.coef_)
print(ridge.feature_names_in_)

[ 0.00000000e+00  1.87772911e-01  9.34049938e-02  1.46984066e-01
  1.36369414e-01  3.47120122e-01  1.56653518e-01  1.20941251e-01
 -4.23208117e-02 -9.89138491e-02  4.24649825e-02 -2.50596221e-02
  2.59355700e-02 -1.09674271e-01  2.47826205e-03 -6.66208106e-01
 -3.67545683e-02 -1.50776446e-01  6.33222026e-02  2.24379504e-02
  1.98558325e-01  5.27689474e-02  2.35840065e-02  6.33222026e-02
  2.24379504e-02 -1.25708732e-04 -4.15168595e-02  1.10881273e-01
  7.56514602e-01  8.37488887e-01  7.82168498e-01  7.16091694e-01
  7.57991172e-01  7.11421557e-01  8.31015478e-01 -1.70240065e-02
 -1.70240065e-02 -9.87882083e-03 -1.44547495e-02  4.48626724e-02
  4.04827762e-03  3.67816226e-02 -4.39770001e-02  0.00000000e+00
 -2.15611973e-02  3.50763102e-02 -1.74923508e-02  5.64751433e-03
 -1.34032944e-01  5.16774775e-02  3.03671941e-02 -8.40015454e-02]
['const' 'LotArea' 'YearBuilt' 'BsmtFinSF1' 'TotalBsmtSF' '1stFlrSF'
 '2ndFlrSF' 'GrLivArea' 'BedroomAbvGr' 'KitchenAbvGr' 'GarageArea'
 'SevereSlope' 'Po

In [358]:
# Predict target variable using Ridge
y_pred_ridge_train = ridge.predict(X_train_rfe)

In [359]:
# Fitting the Ridge model for Lamda/Alpha = 0.05
ridge.fit(X_test_rfe,y_test)
print(ridge.coef_)
print(ridge.feature_names_in_)

[ 0.00000000e+00  2.91172576e-01  7.95841477e-02  1.23480478e-01
  2.88293065e-01 -2.40112446e-02 -4.86784071e-02  3.34286758e-02
 -1.51059415e-02  4.84945250e-03 -9.67206789e-02 -1.89558149e-01
  8.40110318e-02  1.03330312e-01 -1.16426084e-01  0.00000000e+00
 -1.51059415e-02  8.31096102e-02  1.21987944e-02  4.77138040e-02
 -4.29080626e-02 -2.31373304e-02  3.01897656e-03  5.39901835e-02
  2.15743390e-02 -6.65365952e-02  0.00000000e+00 -6.43690979e-02
  1.85998534e-03 -5.82849221e-02 -1.79486590e-02 -2.49594036e-02
  0.00000000e+00 -2.49594036e-02  7.62591462e-02 -2.49594036e-02
 -4.85574634e-02 -1.99575967e-02 -1.31188347e-02  8.40804651e-04
 -1.66172398e-02 -1.20633384e-01  1.34117686e-01 -3.86259611e-02
 -5.46492203e-03 -2.36802241e-04  1.87625652e-02  2.16187755e-02
 -5.93761588e-03 -7.12513617e-03 -1.30064663e-01  1.62263944e-01
 -1.79486590e-02 -2.49594036e-02  0.00000000e+00 -2.49594036e-02
  7.62591462e-02 -4.66065685e-02 -8.49624328e-03 -8.96704916e-03
  1.74340179e-02 -7.05316

In [360]:

y_pred_ridge_test = ridge.predict(X_test_rfe)

In [361]:
# Calculating the metrics R2 , RSS , MSE , RMSE for this model
metric2 = []
# R2 score for train set
r2_train_ridge = r2_score(y_train,y_pred_ridge_train)
print(r2_train_ridge)
metric2.append(r2_train_ridge)
# R2 score for test set
r2_test_ridge = r2_score(y_test,y_pred_ridge_test)
print(r2_test_ridge)
metric2.append(r2_test_ridge)
# RSS  for train set
RSS_train_ridge = np.sum(np.square(y_train - y_pred_ridge_train))
print(RSS_train_ridge)
metric2.append(RSS_train_ridge)
# RSS  for test set
RSS_test_ridge = np.sum(np.square(y_test - y_pred_ridge_test))
print(RSS_test_ridge)
metric2.append(RSS_test_ridge)
# MSE  for train set
MSE_train_ridge = mean_squared_error(y_train,y_pred_ridge_train)
print(MSE_train_ridge)
metric2.append(MSE_train_ridge)
# MSE  for train set
MSE_test_ridge = mean_squared_error(y_test,y_pred_ridge_test)
print(MSE_test_ridge)
metric2.append(MSE_test_ridge)
# RMSE  for train set
RMSE_train_ridge = MSE_train_ridge**0.5
print(RMSE_train_ridge)
metric2.append(RMSE_train_ridge)
# RMSE  for test set
RMSE_test_ridge = MSE_test_ridge**0.5
print(RMSE_test_ridge)
metric2.append(RMSE_test_ridge)

0.9087908059678913
0.9021876936918221
1.122535362210836
0.8395903156859843
0.0010994469757207013
0.00191250641386329
0.033157909700713965
0.04373221254251024


# **Applying Lasso Regression**

In [362]:
lasso = Lasso()

model_cv = GridSearchCV(estimator=lasso,
                        param_grid = params,
                        scoring = 'neg_mean_absolute_error',
                        cv = folds,
                        return_train_score = True,
                        verbose = 1)
model_cv.fit(X_train_rfe,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [363]:
# Printing the best hyperparameter
# This will return us the highest negative mean absolute error
print(model_cv.best_params_)

{'alpha': 0.0001}


In [364]:
# Fitting the Lasso model for Lamda/Alpha = 0.0001
alpha_best = 0.0001
lasso = Lasso(alpha = alpha_best)
lasso.fit(X_train_rfe,y_train)
print(lasso.coef_)
print(lasso.feature_names_in_)

[ 0.          0.08014726  0.10863485  0.08648661  0.03972327  0.2418928
  0.08579405  0.22977871 -0.01111587 -0.09072828  0.05879831  0.03657862
  0.01707967 -0.         -0.         -0.4729862  -0.02732496 -0.
  0.12755472  0.03230769  0.1697691   0.06115604  0.0214015   0.01142528
  0.         -0.         -0.03295608  0.          0.10429586  0.0822981
  0.01567663  0.          0.06823103  0.06188971  0.20346384 -0.
 -0.         -0.00742484  0.          0.          0.00153336  0.
 -0.          0.         -0.          0.          0.          0.
 -0.          0.00677632  0.02293945 -0.        ]
['const' 'LotArea' 'YearBuilt' 'BsmtFinSF1' 'TotalBsmtSF' '1stFlrSF'
 '2ndFlrSF' 'GrLivArea' 'BedroomAbvGr' 'KitchenAbvGr' 'GarageArea'
 'SevereSlope' 'PosA' 'PosA' 'PosN' 'PosN' 'RRAe' 'RRAe' 'Excellent'
 'Excellent' 'Very Excellent' 'Very Good' 'Very Good' 'Excellent'
 'Excellent' 'Fair' 'Fair' 'Shed' 'CompShg' 'Membran' 'Metal' 'Roll'
 'Tar&Grv' 'WdShake' 'WdShngl' 'CBlock' 'CBlock' 'CBlock' 'S

In [365]:
# Predict target variable using Lasso
y_pred_lasso_train = lasso.predict(X_train_rfe)

In [366]:
# Fitting the Lasso model for Lamda/Alpha = 0.0001
lasso.fit(X_test_rfe,y_test)
print(lasso.coef_)

[ 0.00000000e+00  2.50784931e-01  7.87299771e-02  1.10332785e-01
  2.94693404e-01 -1.74908798e-02 -1.31274871e-02  3.19287588e-02
 -2.10026672e-02  0.00000000e+00 -5.34302732e-02 -1.55924328e-01
  8.29093537e-02  9.91670053e-02 -8.44128837e-02  0.00000000e+00
 -2.59779017e-03  8.26213684e-02  2.19007243e-02  0.00000000e+00
 -7.05557614e-04 -0.00000000e+00 -0.00000000e+00  1.60358669e-02
  1.02211714e-01 -1.56604923e-02  0.00000000e+00 -3.05116132e-02
  5.20050584e-03 -4.66156579e-02 -3.04096602e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -5.38438947e-02 -1.66788659e-02 -1.61647011e-02 -0.00000000e+00
 -3.08222378e-02  0.00000000e+00  1.79627994e-04 -2.75748810e-03
  0.00000000e+00  0.00000000e+00  3.02200631e-02  4.22191190e-02
 -1.12592777e-02 -4.17544867e-03 -0.00000000e+00  2.33965511e-02
 -2.28107739e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -4.88256714e-02  0.00000000e+00 -1.07755519e-02
  2.26509793e-02 -1.24758

In [367]:
# Predict target variable using Lasso
y_pred_lasso_test = lasso.predict(X_test_rfe)

In [368]:
# Calculating the metrics R2 , RSS , MSE , RMSE for this model
metric3 = []
# R2 score for train set
r2_train_lasso = r2_score(y_train,y_pred_lasso_train)
print(r2_train_lasso)
metric3.append(r2_train_lasso)
# R2 score for test set
r2_test_lasso = r2_score(y_test,y_pred_lasso_test)
print(r2_test_lasso)
metric3.append(r2_test_lasso)
# RSS  for train set
RSS_train_lasso = np.sum(np.square(y_train - y_pred_lasso_train))
print(RSS_train_lasso)
metric3.append(RSS_train_lasso)
# RSS  for test set
RSS_test_lasso = np.sum(np.square(y_test - y_pred_lasso_test))
print(RSS_test_lasso)
metric3.append(RSS_test_lasso)
# MSE  for train set
MSE_train_lasso = mean_squared_error(y_train,y_pred_lasso_train)
print(MSE_train_lasso)
metric3.append(MSE_train_lasso)
# MSE  for train set
MSE_test_lasso = mean_squared_error(y_test,y_pred_lasso_test)
print(MSE_test_lasso)
metric3.append(MSE_test_lasso)
# RMSE  for train set
RMSE_train_lasso = MSE_train_lasso**0.5
print(RMSE_train_lasso)
metric3.append(RMSE_train_lasso)
# RMSE  for test set
RMSE_test_lasso = MSE_test_lasso**0.5
print(RMSE_test_lasso)
metric3.append(RMSE_test_lasso)

0.8432992239239697
0.8937517201666982
1.9285573597909695
0.9120020800376145
0.0018888906560146617
0.002077453485279304
0.04346136969786688
0.045579090439359406


In [369]:
# Creating a table with all the metrics values
lr_table = {'Metric': ['R2 Score(Train)','R2 Score(Test)','RSS(Train)', 'RSS(Test)','MSE(Train)','MSE(Test)' , 'RMSE(Train)', 'RMSE(Test)'],
           'Linear Regression': metric}
lr_metric = pd.DataFrame(lr_table ,columns = ['Metric','Linear Regression'])
ridge_metric = pd.Series(metric2, name = 'Ridge Regression')
lasso_metric = pd.Series(metric3, name = 'Lasso Regression')

final_metric = pd.concat([lr_metric , ridge_metric , lasso_metric],axis = 1)
print(final_metric)

            Metric  Linear Regression  Ridge Regression  Lasso Regression
0  R2 Score(Train)           0.955212          0.908791          0.843299
1   R2 Score(Test)           0.959077          0.902188          0.893752
2       RSS(Train)           0.551220          1.122535          1.928557
3        RSS(Test)           0.351268          0.839590          0.912002
4       MSE(Train)           0.000540          0.001099          0.001889
5        MSE(Test)           0.000800          0.001913          0.002077
6      RMSE(Train)           0.023235          0.033158          0.043461
7       RMSE(Test)           0.028287          0.043732          0.045579
