In [3]:
import numpy as np
import pandas as pd
df = pd.read_excel('HousePricePrediction.xlsx')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2914,160,RM,1936,Inside,Twnhs,7,1970,1970,CemntBd,0.0,546.0,
2915,2915,160,RM,1894,Inside,TwnhsE,5,1970,1970,CemntBd,0.0,546.0,
2916,2916,20,RL,20000,Inside,1Fam,7,1960,1996,VinylSd,0.0,1224.0,
2917,2917,85,RL,10441,Inside,1Fam,5,1992,1992,HdBoard,0.0,912.0,


In [4]:
df.shape

(2919, 13)

In [5]:
df.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotArea           int64
LotConfig        object
BldgType         object
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
Exterior1st      object
BsmtFinSF2      float64
TotalBsmtSF     float64
SalePrice       float64
dtype: object

In [6]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        1
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64

# Missing Values Implementation

In [7]:
df.shape

(2919, 13)

In [8]:
df = df.dropna(axis=0)

In [9]:
df.shape

(1460, 13)

In [10]:
df.isnull().sum()

Id              0
MSSubClass      0
MSZoning        0
LotArea         0
LotConfig       0
BldgType        0
OverallCond     0
YearBuilt       0
YearRemodAdd    0
Exterior1st     0
BsmtFinSF2      0
TotalBsmtSF     0
SalePrice       0
dtype: int64

# Using LabelEncoder

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
enc = LabelEncoder()

In [13]:
df_encoded = pd.DataFrame()

In [14]:
df_encoded['Id'] = df['Id']

In [15]:
df_encoded['MSSubClass'] = df['MSSubClass']

In [16]:
df_encoded['MSZoning'] = enc.fit_transform(df['MSZoning'])

In [17]:
df_encoded['LotArea'] = enc.fit_transform(df['LotArea'])

In [18]:
df_encoded['LotConfig'] = enc.fit_transform(df['LotConfig'])

In [19]:
df_encoded['BldgType'] = enc.fit_transform(df['BldgType'])

In [20]:
df_encoded['OverallCond'] = df['OverallCond']

In [21]:
df_encoded['YearBuilt'] = df['YearBuilt']

In [22]:
df_encoded['YearRemodAdd'] = df['YearRemodAdd']

In [23]:
df_encoded['BsmtFinSF2'] = df['BsmtFinSF2']

In [24]:
df_encoded['TotalBsmtSF'] = df['TotalBsmtSF']

In [25]:
df_encoded['SalePrice'] = df['SalePrice']

In [26]:
df_encoded.dtypes

Id                int64
MSSubClass        int64
MSZoning          int64
LotArea           int64
LotConfig         int64
BldgType          int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
BsmtFinSF2      float64
TotalBsmtSF     float64
SalePrice       float64
dtype: object

In [27]:
df_encoded.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'LotConfig', 'BldgType',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF2', 'TotalBsmtSF',
       'SalePrice'],
      dtype='object')

## Approach 1 : Brute-force approach-using all the features

In [28]:
X=df_encoded.drop(['Id','SalePrice',],axis=1)
y=df_encoded[['SalePrice']]

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [30]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)
reg.score(X_test, y_test)

0.6414582830016342

In [31]:
reg.coef_

array([[   388.77336136,  -2582.5586994 ,     66.99203892,
           724.6349998 , -12030.47388582,   5216.94302976,
           686.72018769,    802.58854187,    -16.51035018,
            68.34527297]])

In [32]:
reg.intercept_

array([-2909890.50720953])

## Approach 2 : using only the relevant attribute

In [33]:
df_encoded.corr()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
Id,1.0,0.011156,-0.006096,-0.004765,0.049756,0.021912,0.012609,-0.012713,-0.021998,-0.005968,-0.015415,-0.021917
MSSubClass,0.011156,1.0,0.0359,-0.334358,0.07591,0.746063,-0.059316,0.02785,0.040581,-0.065649,-0.238518,-0.084284
MSZoning,-0.006096,0.0359,1.0,-0.080135,-0.009895,0.00569,0.186951,-0.308908,-0.174728,0.028086,-0.087834,-0.166872
LotArea,-0.004765,-0.334358,-0.080135,1.0,-0.199341,-0.442006,-0.036993,0.110995,0.08686,0.083172,0.364715,0.454564
LotConfig,0.049756,0.07591,-0.009895,-0.199341,1.0,0.107229,-0.030788,0.013629,-0.005161,-0.010357,-0.040049,-0.067396
BldgType,0.021912,0.746063,0.00569,-0.442006,0.107229,1.0,-0.16204,0.217584,0.104855,-0.017376,-0.050033,-0.085591
OverallCond,0.012609,-0.059316,0.186951,-0.036993,-0.030788,-0.16204,1.0,-0.375983,0.073741,0.040229,-0.171098,-0.077856
YearBuilt,-0.012713,0.02785,-0.308908,0.110995,0.013629,0.217584,-0.375983,1.0,0.592855,-0.049107,0.391452,0.522897
YearRemodAdd,-0.021998,0.040581,-0.174728,0.08686,-0.005161,0.104855,0.073741,0.592855,1.0,-0.067759,0.291066,0.507101
BsmtFinSF2,-0.005968,-0.065649,0.028086,0.083172,-0.010357,-0.017376,0.040229,-0.049107,-0.067759,1.0,0.10481,-0.011378


In [39]:
df_encoded.corr()['SalePrice']

Id             -0.021917
MSSubClass     -0.084284
MSZoning       -0.166872
LotArea         0.454564
LotConfig      -0.067396
BldgType       -0.085591
OverallCond    -0.077856
YearBuilt       0.522897
YearRemodAdd    0.507101
BsmtFinSF2     -0.011378
TotalBsmtSF     0.613581
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [40]:
df_encoded.corr()['SalePrice'].sort_values()

MSZoning       -0.166872
BldgType       -0.085591
MSSubClass     -0.084284
OverallCond    -0.077856
LotConfig      -0.067396
Id             -0.021917
BsmtFinSF2     -0.011378
LotArea         0.454564
YearRemodAdd    0.507101
YearBuilt       0.522897
TotalBsmtSF     0.613581
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [41]:
X=df_encoded[['TotalBsmtSF','YearBuilt','YearRemodAdd','LotArea']]

In [42]:
y=df_encoded[['SalePrice']]

In [44]:
lr=LinearRegression()
lr.fit(X,y)
lr.score(X,y)

0.5824832455167268