In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
#Supress warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the housing data from the CSV file and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='melb_house_data.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [5]:
# Number of distinct values in each column of the raw data.
data.nunique()

Unnamed: 0       18396
Suburb             330
Address          18134
Rooms               11
Type                 3
Price             2470
Method               5
SellerG            305
Date                58
Distance           210
Postcode           205
Bedroom2            12
Bathroom             9
Car                 11
Landsize          1449
BuildingArea       613
YearBuilt          144
CouncilArea         33
Lattitude         7518
Longtitude        8168
Regionname           8
Propertycount      324
dtype: int64

In [6]:
# (rows, columns) of the raw data.
data.shape

(18396, 22)

In [7]:
# Keep only the columns used for modelling: the predictors plus the target 'Price'.
cols_to_use = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname',
    'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'BuildingArea', 'Price',
]
# Take an explicit copy: later cells assign into `dataset`, and writing to a
# bare column selection of `data` triggers pandas' SettingWithCopyWarning.
dataset = data[cols_to_use].copy()
dataset.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,2.0,1.0,1.0,202.0,,1480000.0
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,2.0,1.0,0.0,156.0,79.0,1035000.0
2,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,3.0,2.0,0.0,134.0,150.0,1465000.0
3,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,3.0,2.0,1.0,94.0,,850000.0
4,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra,3.0,1.0,2.0,120.0,142.0,1600000.0


In [8]:
# (rows, columns) after keeping only the selected columns.
dataset.shape

(18396, 15)

In [9]:
# Missing-value count for each selected column.
dataset.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           1
Propertycount        1
Distance             1
CouncilArea       6163
Bedroom2          3469
Bathroom          3471
Car               3576
Landsize          4793
BuildingArea     10634
Price                0
dtype: int64

In [10]:
# Replace missing values in these numeric columns with 0, then re-count the NaNs.
cols_tofill_zero = [
    'Propertycount',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
]
dataset[cols_tofill_zero] = dataset.loc[:, cols_tofill_zero].fillna(value=0)
dataset.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           1
Propertycount        0
Distance             0
CouncilArea       6163
Bedroom2             0
Bathroom             0
Car                  0
Landsize          4793
BuildingArea     10634
Price                0
dtype: int64

In [14]:
# Impute the two area columns with their own column means.
# NOTE(review): the means are taken over the whole dataset, before the
# train/test split further down, so test rows influence the imputed values.
dataset['Landsize'] = dataset['Landsize'].fillna(value=dataset['Landsize'].mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(value=dataset['BuildingArea'].mean())


In [15]:
# Re-check the missing-value counts after the zero and mean fills.
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          1
Propertycount       0
Distance            0
CouncilArea      6163
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price               0
dtype: int64

In [16]:
# Drop every row that still contains a missing value (at this point only
# 'Regionname' and 'CouncilArea' have any). Reassign instead of using
# inplace=True so the frame is not mutated behind an earlier reference.
dataset = dataset.dropna()
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [17]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns for a column with k levels by removing the first level.
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moreland,CouncilArea_Nillumbik,CouncilArea_Port Phillip,CouncilArea_Stonnington,CouncilArea_Unavailable,CouncilArea_Whitehorse,CouncilArea_Whittlesea,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges
0,2,4019.0,2.5,2.0,1.0,1.0,202.0,151.220219,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,4019.0,2.5,3.0,2.0,1.0,94.0,151.220219,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
# Separate the target ('Price') from the predictor columns.
y = dataset['Price']
x = dataset.drop(columns=['Price'])

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

In [20]:
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression with no regularization.
reg = LinearRegression()
reg.fit(train_x, train_y);

In [21]:
# R^2 of the unregularized linear model on the held-out test split.
reg.score(test_x,test_y)

-995506570.1891763

In [22]:
# R^2 of the same model on the training split, for comparison.
reg.score(train_x,train_y)

0.7100072899184681

In [23]:
# The plain linear regression above overfits: its R^2 is about 0.71 on the
# training set but hugely negative (about -9.96e8) on the test set, so it
# does not generalize to unseen data.

In [24]:
# Address the overfitting with Lasso regression (L1 regularization).
# NOTE(review): max_iter=100 with tol=0.1 is a loose stopping rule, and any
# convergence warning would be hidden by the blanket warnings filter set near
# the top of the notebook -- confirm the fit actually converges.
from sklearn import linear_model

lasso_reg = linear_model.Lasso(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
lasso_reg.fit(train_x, train_y)

In [25]:
# R^2 of the Lasso model on the held-out test split.
lasso_reg.score(test_x,test_y)

0.6703450094534258

In [26]:
# R^2 of the Lasso model on the training split, for comparison.
lasso_reg.score(train_x,train_y)

0.7059060292458545

In [27]:
# With L1 regularization (Lasso) the test R^2 rises to about 0.67, close to the training R^2 of about 0.71.

In [28]:
# Repeat the experiment with Ridge regression (L2 regularization).
from sklearn.linear_model import Ridge

ridge_reg = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
ridge_reg.fit(train_x, train_y)

In [29]:
# R^2 of the Ridge model on the held-out test split.
ridge_reg.score(test_x,test_y)

0.6612473627638111

In [30]:
# R^2 of the Ridge model on the training split, for comparison.
ridge_reg.score(train_x,train_y)

0.6767852914122192

In [31]:
# Ridge (L2 regularization) also fixes the test score: the test R^2 is about
# 0.66, close to its training R^2 of about 0.68.