In [130]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt, sklearn as sk, statsmodels.api as sm

In [131]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [132]:
main_data = pd.read_csv('train.csv')

# EDA

In [133]:
main_data.shape

(1460, 81)

In [134]:
main_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [135]:
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Columns with very low numbers of Data

These are the columns that have significant number of their data as Not-Applicable. 
Thus, these varibales would hardly impact the result of the model. 

### Total rows (1460)

Showing Non-Null values here
1. Alley (91)
2. FireplaceQu (770)
3. PoolQC (7)
4. Fence (281)
5. MiscFeature (54)


But on second thoughts, these could be the varibles that account for the difference. Since, these are the things (variables) that add up to the cost in one way or other.

Trying to preserve the data as much as possible, as the missing values can be deduced in most of the cases.


Duplicating DataDrame


In [136]:
df = main_data

Replacing nan values 
1. With 0 in int / float datatype
2. With 2023 in case of missing years
3. With No in case of Objects to depict the absence of field / facility

In [137]:
df.LotFrontage = df['LotFrontage'].fillna(0) # Area -> 0
df.MasVnrArea = df['MasVnrArea'].fillna(0)   # Area -> 0
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(2023)

df['BsmtQual'] = df['BsmtQual'].fillna('No')    #Quality rating -> No
df['BsmtCond'] = df['BsmtCond'].fillna('No')    #Condition rating -> No
df['BsmtExposure'] = df['BsmtExposure'].fillna('No')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('No')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No')
df['Electrical'] = df['Electrical'].fillna('No')  
df['FireplaceQu'] = df['FireplaceQu'].fillna('No') 
df['GarageType'] = df['GarageType'].fillna('No') 
df['GarageFinish'] = df['GarageFinish'].fillna('No')
df['GarageQual'] = df['GarageQual'].fillna('No') 
df['GarageCond'] = df['GarageCond'].fillna('No') 
df['PoolQC'] = df['PoolQC'].fillna('No') 
df['Fence'] = df['Fence'].fillna('No') 
df['MiscFeature'] = df['MiscFeature'].fillna('No') 
df['MasVnrType'] = df['MasVnrType'].fillna('No')  
df['Alley'] = df['Alley'].fillna('No')  

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [139]:
df['Age'] = df['YrSold'] - df['YearBuilt']


In [140]:
df.Age.head()

0     5
1    31
2     7
3    91
4     8
Name: Age, dtype: int64

In [141]:
drop1 = ['YrSold','YearBuilt']
df = df.drop(drop1, axis = 1)

In [142]:
scaler = MinMaxScaler()

In [143]:
# To drop BsmtUnfSF BsmtFinSF1 2ndFlrSF 1stFlrSF
to_be_dropped = ['Id','BsmtUnfSF','BsmtFinSF1','2ndFlrSF','1stFlrSF']
df = df.drop(to_be_dropped, axis = 1)

In [144]:
to_be_scaled = ['LotFrontage','LotArea','OverallQual','OverallCond','MasVnrArea','TotalBsmtSF','GrLivArea', 'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','Age']

df[to_be_scaled] = scaler.fit_transform(df[to_be_scaled])

# Word for the evaluator: Doubt with unimplemented piece of code.

to_be_scaled = ['LotFrontage','LotArea','OverallQual','OverallCond','MasVnrArea','TotalBsmtSF','GrLivArea', 'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','Age']

df[to_be_scaled] = scaler.fit_transform(df[to_be_scaled])

### After this code cell, I tried the following code to get dummies for the rest of the columns in a single go. The code is as follows:

df_s = df.drop(to_be_scaled, axis = 1)
dummies = df_s.columns

for i in dummies:
    status = pd.get_dummies(df.i, drop_first = True)
    df = pd.concat([df,status], axis = 1)

df = df.drop(dummies, axis = 1)

### But my laptop kept freezing everytime I ran this cell. I suspect a infinite loop here but am unable to figure out the exact reason. So, I've done it for a few columns mannualy and not for all due to lack of time.

In [145]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,Age
0,60,RL,0.207668,0.03342,Pave,No,Reg,Lvl,AllPub,Inside,...,0.0,No,No,No,0.0,0.090909,WD,Normal,208500,0.036765
1,20,RL,0.255591,0.038795,Pave,No,Reg,Lvl,AllPub,FR2,...,0.0,No,No,No,0.0,0.363636,WD,Normal,181500,0.227941
2,60,RL,0.217252,0.046507,Pave,No,IR1,Lvl,AllPub,Inside,...,0.0,No,No,No,0.0,0.727273,WD,Normal,223500,0.051471
3,70,RL,0.191693,0.038561,Pave,No,IR1,Lvl,AllPub,Corner,...,0.0,No,No,No,0.0,0.090909,WD,Abnorml,140000,0.669118
4,60,RL,0.268371,0.060576,Pave,No,IR1,Lvl,AllPub,FR2,...,0.0,No,No,No,0.0,1.0,WD,Normal,250000,0.058824


In [146]:
df_s = df.drop(to_be_scaled, axis = 1)

dummies = df_s.columns

dummies.drop('SalePrice')

#for i in dummies:
    #status = pd.get_dummies(df[i], drop_first = True)
    #df = pd.concat([df_s,status], axis = 1)

df = df.drop(dummies, axis = 1)
df = pd.concat([df,main_data.SalePrice], axis = 1)

df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,TotalBsmtSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,Age,SalePrice
0,0.207668,0.03342,0.666667,0.5,0.1225,0.140098,0.259231,0.38646,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.036765,208500
1,0.255591,0.038795,0.555556,0.875,0.0,0.206547,0.17483,0.324401,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.227941,181500
2,0.217252,0.046507,0.666667,0.5,0.10125,0.150573,0.273549,0.428773,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.051471,223500
3,0.191693,0.038561,0.666667,0.5,0.0,0.123732,0.26055,0.45275,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.669118,140000
4,0.268371,0.060576,0.777778,0.5,0.21875,0.187398,0.351168,0.589563,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.058824,250000


In [147]:
df.columns


Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'TotalBsmtSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'Age', 'SalePrice'],
      dtype='object')

In [148]:
y = df.SalePrice.values.reshape(-1,1)
df = df.drop('SalePrice',axis = 1)
X = df

In [149]:
y.shape

(1460, 1)

In [150]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 1)

In [151]:
reg = LinearRegression()
reg.fit(X_train,y_train)

In [152]:
y_pred = reg.predict(X_train)
y_pred

array([[137604.80283385],
       [ 72138.99539325],
       [194151.79173676],
       ...,
       [124539.09450445],
       [118122.35482651],
       [ 91414.67750185]])

In [153]:
r2_score(y_train,y_pred)

0.7792273468919211

In [166]:
y_test_pred = reg.predict(X_test)


In [155]:
r2_score(y_test,y_test_pred)

0.7956989794185897

In [156]:
rss = np.sum(np.square(y_train - y_pred))
print(rss)
mse = mean_squared_error(y_train, y_pred)
print(mse)
rmse = mse ** 0.5
print(rmse)

1570953840559.129
1344994726.5061035
36674.169745286716


In [157]:
lambdas = [0,0.001,0.01,0.1,1,10,100]

for i in lambdas:
    ridreg = Ridge(alpha = i)
    ridreg.fit(X_train,y_train)
    y_pred = ridreg.predict(X_train)
    print('For alpha = ' + str(i))
    print(r2_score(y_train,y_pred))
    print(ridreg.coef_)

For alpha = 0
0.7792273468919212
[[   4456.57267931  143522.76693193  173344.04236243   51608.14790064
    43812.32693954  132091.03283794  226663.98934177   59612.21556631
    39630.85815392    7326.51440493   20520.74971078   14049.10648196
    39782.39816695 -116889.74506303   -7288.68639348   -1703.95652754
   -66041.07372286]]
For alpha = 0.001
0.779227343983391
[[   4460.2474764   143471.19715445  173346.84715191   51603.02999653
    43818.2443011   132073.41093768  226639.06382973   59620.5557186
    39633.04928088    7331.83236963   20516.9301948    14047.72822972
    39780.56615045 -116838.87114842   -7281.32640502   -1703.54186308
   -66036.94117268]]
For alpha = 0.01
0.7792270576632041
[[   4493.2719163   143009.08209362  173371.91699717   51557.05284371
    43871.3485989   131915.05188276  226415.08679944   59695.42719995
    39652.71656858    7379.62856121   20482.6887412    14035.31771572
    39764.12749786 -116382.77815577   -7215.57900104   -1699.81333187
   -65999.8982

In [158]:
ridreg = Ridge(alpha = 0.00001)
ridreg.fit(X_train,y_train)
y_pred = ridreg.predict(X_train)
print('For alpha = ' + str(i))
print(r2_score(y_train,y_pred))

For alpha = 100
0.7792273468916302


In [159]:
 lambdas = [0,0.001,0.01,0.1,1,10,100]

for i in lambdas:
    las = Lasso(alpha = i)
    las.fit(X_train,y_train)
    y_pred = las.predict(X_train)
    print('For alpha = ' + str(i))
    print(r2_score(y_train,y_pred))
    print(las.coef_)

For alpha = 0
0.7792273468919212
[   4456.57267931  143522.76693193  173344.04236243   51608.14790064
   43812.32693954  132091.03283794  226663.98934177   59612.21556631
   39630.85815392    7326.51440493   20520.74971078   14049.10648196
   39782.39816695 -116889.74506303   -7288.68639348   -1703.95652754
  -66041.07372286]
For alpha = 0.001
0.7792273468914691
[   4456.5154812   143522.30979719  173344.12504842   51608.03970315
   43812.29116571  132090.92963089  226663.93974664   59612.26244749
   39630.81469158    7326.45628567   20520.57609788   14048.8348973
   39782.29677424 -116889.23984015   -7287.88487429   -1703.93366276
  -66040.98597586]
For alpha = 0.01
0.7792273468471678
[   4456.0254177   143518.33005604  173344.97192084   51607.05763148
   43812.02863189  132089.98215439  226663.41891253   59612.61108527
   39630.41636266    7325.92367867   20519.04958377   14046.38920636
   39781.38727258 -116884.69323026   -7280.65533995   -1703.72873222
  -66040.18624731]
For alpha 

  las.fit(X_train,y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [167]:
las = Lasso(alpha = 0.002)
las.fit(X_train,y_train)
y_pred = las.predict(X_train)
print('For alpha = ' + str(i))
print(r2_score(y_train,y_pred))
print(las.coef_)

For alpha = 100
0.7792273468901105
[   4456.45518591  143521.84962851  173344.21385132   51607.92860462
   43812.25841064  132090.83641905  226663.88932135   59612.30802327
   39630.77078342    7326.39691263   20520.39933608   14048.56308078
   39782.19401374 -116888.73509993   -7287.08277683   -1703.91092452
  -66040.89350349]


0.779227343983391 < 0.7792273468914691

Ridge             <  Lasso


In [163]:
las = Lasso(alpha = i)
las.fit(X_train,y_train)
y_pred = las.predict(X_test)
print('For alpha = ' + str(i))
print(r2_score(y_test,y_pred))
#print(las.coef_)

For alpha = 100
0.8108462055506377
