In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn diagnostics
# (e.g. chained-assignment or convergence warnings) — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the Melbourne housing dataset from the CSV file (path is relative to the working directory)
csv_path = 'Melbourne_housing_FULL.csv'
housing_df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check that the file parsed as expected
housing_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [6]:
# (rows, columns) of the frame.
# NOTE(review): the recorded output shows 15 columns, which matches the column
# selection made in the In [4] cell further down — this cell was executed after
# it, so a top-to-bottom re-run would report the full CSV width here instead.
housing_df.shape

(34857, 15)

In [7]:
# Per-column dtype and non-null count; shows which columns contain missing values.
# NOTE(review): like the shape cell, the recorded output reflects the frame
# after the In [4] column selection, i.e. the cells were run out of order.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Rooms          34857 non-null  int64  
 2   Type           34857 non-null  object 
 3   Method         34857 non-null  object 
 4   SellerG        34857 non-null  object 
 5   Regionname     34854 non-null  object 
 6   Propertycount  34854 non-null  float64
 7   Distance       34856 non-null  float64
 8   CouncilArea    34854 non-null  object 
 9   Bedroom2       26640 non-null  float64
 10  Bathroom       26631 non-null  float64
 11  Car            26129 non-null  float64
 12  Landsize       23047 non-null  float64
 13  BuildingArea   13742 non-null  float64
 14  Price          27247 non-null  float64
dtypes: float64(8), int64(1), object(6)
memory usage: 4.0+ MB


In [4]:
# Keep only the columns used for modelling (the predictors plus the 'Price' target).
# The explicit .copy() makes the result an independent frame: later cells assign
# into it and drop rows from it, and writing into a bare selection of another
# frame may raise pandas' chained-assignment warning (which the global warnings
# filter in this notebook would hide).

req_cols = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG',
    'Regionname', 'Propertycount', 'Distance', 'CouncilArea',
    'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
    'Price',
]
housing_df = housing_df[req_cols].copy()

In [5]:
# Checking for unique values
# Number of distinct values per column (missing values are not counted by default)
housing_df.nunique()

Unnamed: 0,0
Suburb,351
Rooms,12
Type,3
Method,9
SellerG,388
Regionname,8
Propertycount,342
Distance,215
CouncilArea,33
Bedroom2,15


In [8]:
# Checking for null values
# Count of missing values in each column
housing_df.isnull().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,3
Propertycount,3
Distance,1
CouncilArea,3
Bedroom2,8217


In [11]:
# Impute missing values in a single pass:
#   - the columns listed in cols_to_fill_zero get 0
#   - 'Landsize' and 'BuildingArea' get their own column mean
# NOTE(review): the means are computed over all rows, before the train/test
# split, so rows that end up in the test set contribute to the imputed values.

cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']

fill_values = {name: 0 for name in cols_to_fill_zero}
fill_values.update({name: housing_df[name].mean() for name in ('Landsize', 'BuildingArea')})

housing_df = housing_df.fillna(value=fill_values)

In [12]:
# Drop every row that still has a missing value in ANY column. After the
# imputation above that is mostly rows without a 'Price', plus the few rows
# lacking 'Regionname' / 'CouncilArea'.
housing_df = housing_df.dropna(axis=0, how='any')

In [13]:
# (rows, columns) remaining after rows with missing values were dropped
housing_df.shape

(27244, 15)

In [14]:
# Sanity check: every column should now report zero missing values
housing_df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,0
Propertycount,0
Distance,0
CouncilArea,0
Bedroom2,0


In [15]:
# One-hot encode the categorical columns; drop_first omits the first level of
# each encoded column so the resulting dummies are not perfectly collinear.

housing_df = pd.get_dummies(data=housing_df, drop_first=True)

In [16]:
# Separate the predictors (everything except 'Price') from the target column
X = housing_df.drop(columns='Price')
y = housing_df.loc[:, 'Price']

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [19]:
from sklearn.linear_model import LinearRegression
# Unregularised least-squares baseline that the penalised models are compared against
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [20]:
# R^2 of the plain linear model on the held-out test split
lin_reg.score(X_test, y_test)

0.6805949381552105

In [21]:
# R^2 of the plain linear model on the training split, to compare against the test score
lin_reg.score(X_train, y_train)

0.6777287226772643

In [22]:
# Fit an L1-regularised (Lasso) linear model on the training split.
# NOTE(review): max_iter=100 and tol=0.1 are looser than scikit-learn's defaults
# (1000 and 1e-4), so the solver may stop early — confirm this is intended.

from sklearn import linear_model
lasso_reg = linear_model.Lasso(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
lasso_reg.fit(X=X_train, y=y_train)

In [23]:
# R^2 of the Lasso model on the held-out test split
lasso_reg.score(X_test, y_test)

0.6843609939704707

In [24]:
# R^2 of the Lasso model on the training split, to compare against the test score
lasso_reg.score(X_train, y_train)

0.6733482624893063

In [25]:
# Fit an L2-regularised (Ridge) linear model on the training split.
# NOTE(review): depending on which solver Ridge selects, max_iter / tol may
# have no effect (direct solvers ignore them) — verify against the Ridge docs.

from sklearn.linear_model import Ridge
ridge_reg = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
ridge_reg.fit(X=X_train, y=y_train)

In [26]:
# R^2 of the Ridge model on the held-out test split
ridge_reg.score(X_test, y_test)

0.6767578574083843

In [27]:
# R^2 of the Ridge model on the training split, to compare against the test score
ridge_reg.score(X_train, y_train)

0.6619150188215528