In [1]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Suppress all warnings to keep the notebook output clean.
# NOTE(review): this filter is process-wide, so any warnings raised by the
# pandas / scikit-learn calls in later cells are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the housing data from the CSV in the working directory and preview
# the first five rows.
dataset = pd.read_csv(filepath_or_buffer='melb_data.csv')
dataset.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,03-12-2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,04-02-2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,04-03-2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,04-03-2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,04-06-2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [4]:
# Number of distinct suburbs; this is how many levels the Suburb column has
# before it is one-hot encoded further down.
dataset['Suburb'].nunique()

314

In [5]:
# (rows, columns) of the frame as loaded
dataset.shape

(13580, 21)

In [6]:
# List every column name, to choose the ones kept for modelling
dataset.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [7]:
# Columns kept for modelling: the target (Price) plus the numeric and
# categorical predictors. Address, Date, Postcode, YearBuilt, Lattitude and
# Longtitude are left out.
cols_to_use = [
    'Suburb', 'Rooms', 'Price', 'Type', 'Method', 'SellerG',
    'Distance', 'Bedroom2', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'CouncilArea',
    'Regionname', 'Propertycount',
]

# .copy() makes the selection an independent frame. Later cells assign columns
# on `dataset`; without the copy those assignments target a slice derived from
# the loaded frame, which is the SettingWithCopy situation in classic pandas.
dataset = dataset[cols_to_use].copy()
dataset.shape

(13580, 15)

In [8]:
# Missing values per column, to decide how each affected column is handled
dataset.isna().sum()

Suburb              0
Rooms               0
Price               0
Type                0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
CouncilArea      1369
Regionname          0
Propertycount       0
dtype: int64

In [9]:
# Treat a missing car-space count as zero spaces
dataset = dataset.fillna({'Car': 0})

In [10]:
# Re-check missing values after filling Car
dataset.isna().sum()

Suburb              0
Rooms               0
Price               0
Type                0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea     6450
CouncilArea      1369
Regionname          0
Propertycount       0
dtype: int64

In [11]:
# Impute missing BuildingArea values with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/test split
# made further down, so held-out rows contribute to the imputed value.
building_area_mean = dataset['BuildingArea'].mean()
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(building_area_mean)

In [12]:
# Re-check missing values after imputing BuildingArea
dataset.isna().sum()

Suburb              0
Rooms               0
Price               0
Type                0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea      1369
Regionname          0
Propertycount       0
dtype: int64

In [13]:
# Drop the rows that have no CouncilArea. Naming the column keeps the filter
# from silently discarding rows because of a gap in some other column.
dataset = dataset.dropna(subset=['CouncilArea'])

# Every other column was filled above; fail loudly if that ever stops being
# true, because the regressors fitted later cannot handle NaN.
assert not dataset.isna().any().any(), "unexpected missing values remain"
dataset.isna().sum()

Suburb           0
Rooms            0
Price            0
Type             0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [14]:
# (rows, columns) left after dropping rows with missing values
dataset.shape

(12211, 15)

In [15]:
# Spot-check a few cleaned rows. The fixed seed (the same value the train/test
# split uses) makes the preview identical on every Restart & Run All.
dataset.sample(3, random_state=2)

Unnamed: 0,Suburb,Rooms,Price,Type,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
8992,Yarraville,4,1000000,h,SP,Village,6.3,4,2,2.0,438,151.96765,Maribyrnong,Western Metropolitan,6543
7542,Box Hill,2,990000,t,S,Fletchers,13.1,2,1,1.0,257,151.96765,Whitehorse,Eastern Metropolitan,4605
3976,Melbourne,1,427000,u,S,Harcourts,2.8,1,1,0.0,0,151.96765,Melbourne,Northern Metropolitan,17496


In [16]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear with
# the intercept.
dataset = pd.get_dummies(dataset, drop_first=True)

# Seeded so the preview is reproducible from run to run
dataset.sample(3, random_state=2)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
9022,5,930000,23.2,5,2,2.0,992,198.0,5030,0,...,0,0,0,0,0,0,0,0,0,0
750,2,797500,13.0,2,1,2.0,321,151.96765,6795,0,...,0,0,0,0,0,0,0,1,0,0
2279,2,582500,7.7,2,1,1.0,0,151.96765,8989,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# (rows, columns) after one-hot encoding; the column count grows with the
# number of category levels
dataset.shape

(12211, 613)

In [18]:
# Separate the regression target (Price) from the predictor columns
y = dataset['Price']
X = dataset.drop(columns=['Price'])

In [19]:
from sklearn.model_selection import train_test_split 

In [20]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [21]:
# (rows, feature columns) in the training split
X_train.shape

(8547, 612)

In [22]:
# (rows, feature columns) in the held-out split
X_test.shape

(3664, 612)

In [23]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares with no regularization
model = LinearRegression()
model.fit(X_train, y_train);  # trailing ';' keeps the cell silent, as before


In [24]:
# R^2 on the held-out split. The recorded run printed about -2.4e11 (not -37%),
# i.e. the unregularized fit does not generalize to the test rows.
model.score(X_test, y_test)

-244720247911.3896

In [25]:
# R^2 on the training split, for comparison with the held-out score
model.score(X_train, y_train)

0.7120927410916368

In [26]:
# L1 (Lasso) regularization: the squared-error loss gains alpha times the sum
# of the absolute coefficient values, which can shrink some coefficients to
# exactly zero.
from sklearn import linear_model

lasso_model = linear_model.Lasso(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)
lasso_model

In [27]:
# R^2 of the Lasso model on the training split
lasso_model.score(X_train, y_train)

0.7074303619393785

In [28]:
# R^2 of the Lasso model on the held-out split
lasso_model.score(X_test, y_test)

0.6635361138790516

In [29]:
# L2 (Ridge) regularization: the squared-error loss gains alpha times the sum
# of the squared coefficient values, which shrinks coefficients toward zero
# without eliminating them.
# NOTE(review): max_iter and tol only matter for Ridge's iterative solvers;
# with the default solver='auto' they may be ignored here - confirm against
# the Ridge documentation.
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)
ridge_model

In [30]:
# R^2 of the Ridge model on the training split
ridge_model.score(X_train, y_train)

0.6767113841211123

In [31]:
# R^2 of the Ridge model on the held-out split
ridge_model.score(X_test, y_test)

0.6644072458301121