In [1]:
# Imports: data handling, plotting, and scikit-learn preprocessing / model utilities
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this hides the warning rather than fixing its cause -- prefer taking an
# explicit .copy() of any sub-frame that is assigned into later.
pd.options.mode.chained_assignment = None # default='warn'
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the Melbourne housing dataset from the CSV file in the working directory
house_data = pd.read_csv(filepath_or_buffer="melb_data.csv")

In [3]:
# Preview the first five rows of the raw data
house_data.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# Dimensions of the raw dataset as (rows, columns)
house_data.shape

(13580, 21)

In [5]:
# Summary statistics of the numeric columns, transposed so each feature is one row
house_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,13580.0,2.937997,0.955748,1.0,2.0,3.0,3.0,10.0
Price,13580.0,1075684.0,639310.724296,85000.0,650000.0,903000.0,1330000.0,9000000.0
Distance,13580.0,10.13778,5.868725,0.0,6.1,9.2,13.0,48.1
Postcode,13580.0,3105.302,90.676964,3000.0,3044.0,3084.0,3148.0,3977.0
Bedroom2,13580.0,2.914728,0.965921,0.0,2.0,3.0,3.0,20.0
Bathroom,13580.0,1.534242,0.691712,0.0,1.0,1.0,2.0,8.0
Car,13518.0,1.610075,0.962634,0.0,1.0,2.0,2.0,10.0
Landsize,13580.0,558.4161,3990.669241,0.0,177.0,440.0,651.0,433014.0
BuildingArea,7130.0,151.9676,541.014538,0.0,93.0,126.0,174.0,44515.0
YearBuilt,8205.0,1964.684,37.273762,1196.0,1940.0,1970.0,1999.0,2018.0


In [6]:
# Column names as a plain Python list
list(house_data.columns)

['Suburb',
 'Address',
 'Rooms',
 'Type',
 'Price',
 'Method',
 'SellerG',
 'Date',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'CouncilArea',
 'Lattitude',
 'Longtitude',
 'Regionname',
 'Propertycount']

In [7]:
# Storage dtype of each column (object columns hold text)
house_data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [8]:
# Column-by-column overview: non-null counts and dtypes
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
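 18  Longtitude     13580 non-null  float64
 19  Regionname     13580 non-null  object 
 20  Propertycount  13580 non-null  float64
dtypes: float64(12), int64(1), object(8)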

In [9]:
# Count of missing values in every column of the raw data
house_data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [10]:
# Split the data into model inputs (selected feature columns) and the target ("Price").
# .copy() makes feature_data an independent DataFrame: later cells assign into its
# columns, and without the copy pandas flags those writes as chained assignment
# (SettingWithCopyWarning -- currently only silenced via pd.options.mode.chained_assignment).
feature_data = house_data[
    [
        "Suburb", "Rooms", "Type", "SellerG", "Distance", "Postcode", "YearBuilt",
        "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea",
    ]
].copy()
target_data = house_data["Price"]

In [11]:
# Preview the selected feature columns (NaNs are still present at this stage)
feature_data.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,SellerG,Distance,Postcode,YearBuilt,Bedroom2,Bathroom,Car,Landsize,BuildingArea
0,Abbotsford,2,h,Biggin,2.5,3067.0,,2.0,1.0,1.0,202.0,
1,Abbotsford,2,h,Biggin,2.5,3067.0,1900.0,2.0,1.0,0.0,156.0,79.0
2,Abbotsford,3,h,Biggin,2.5,3067.0,1900.0,3.0,2.0,0.0,134.0,150.0
3,Abbotsford,3,h,Biggin,2.5,3067.0,,3.0,2.0,1.0,94.0,
4,Abbotsford,4,h,Nelson,2.5,3067.0,2014.0,3.0,1.0,2.0,120.0,142.0


In [12]:
# Count of missing values per selected feature
feature_data.isna().sum()

Suburb             0
Rooms              0
Type               0
SellerG            0
Distance           0
Postcode           0
YearBuilt       5375
Bedroom2           0
Bathroom           0
Car               62
Landsize           0
BuildingArea    6450
dtype: int64

In [13]:
# Keep only the features that actually contain missing values
column_miss = feature_data.isna().sum().loc[lambda counts: counts != 0]
column_miss

YearBuilt       5375
Car               62
BuildingArea    6450
dtype: int64

In [14]:
# Number of rows for each property Type code
feature_data["Type"].value_counts()

h    9449
u    3017
t    1114
Name: Type, dtype: int64

In [15]:
# Missing values per feature among rows with Type "h"
feature_data[feature_data["Type"].eq("h")].isna().sum()

Suburb             0
Rooms              0
Type               0
SellerG            0
Distance           0
Postcode           0
YearBuilt       4041
Bedroom2           0
Bathroom           0
Car               60
Landsize           0
BuildingArea    4589
dtype: int64

In [16]:
# Missing values per feature among rows with Type "u"
feature_data[feature_data["Type"].eq("u")].isna().sum()

Suburb             0
Rooms              0
Type               0
SellerG            0
Distance           0
Postcode           0
YearBuilt        982
Bedroom2           0
Bathroom           0
Car                2
Landsize           0
BuildingArea    1433
dtype: int64

In [17]:
# Missing values per feature among rows with Type "t"
feature_data[feature_data["Type"].eq("t")].isna().sum()

Suburb            0
Rooms             0
Type              0
SellerG           0
Distance          0
Postcode          0
YearBuilt       352
Bedroom2          0
Bathroom          0
Car               0
Landsize          0
BuildingArea    428
dtype: int64

In [18]:
# "BuildingArea" values of the Type "t" rows, selected in one .loc call
# (row mask and column label together) instead of two chained lookups
feature_data.loc[feature_data["Type"].eq("t"), "BuildingArea"]

21         NaN
38       225.0
42       134.0
48        90.0
56       133.0
         ...  
13437    215.0
13485      NaN
13504    138.0
13521      0.0
13526    128.0
Name: BuildingArea, Length: 1114, dtype: float64

In [19]:
# Split the feature table into one subset per property Type code ("h", "u", "t")
# so that missing values can be examined and handled per type
data_h, data_u, data_t = (
    feature_data.loc[feature_data["Type"] == type_code] for type_code in ("h", "u", "t")
)

In [20]:
# Impute missing values with the column mean of the row's own property Type.
# SimpleImputer() uses its default strategy ("mean"); it is re-fitted on each Type
# subset, so a gap in an "h" row is filled with the "h" mean, a gap in a "u" row
# with the "u" mean, and so on. Columns without gaps in a subset come back unchanged.
imputer = SimpleImputer()
columns_to_impute = ["Car", "YearBuilt", "BuildingArea"]
for property_type in feature_data["Type"].unique():
    type_mask = feature_data["Type"] == property_type
    # One (n_rows, n_columns) block per type replaces the previous eight
    # copy-pasted single-column assignments of (n, 1) arrays.
    feature_data.loc[type_mask, columns_to_impute] = imputer.fit_transform(
        feature_data.loc[type_mask, columns_to_impute]
    )

In [21]:
# Re-count missing values per feature after imputation
feature_data.isna().sum()

Suburb          0
Rooms           0
Type            0
SellerG         0
Distance        0
Postcode        0
YearBuilt       0
Bedroom2        0
Bathroom        0
Car             0
Landsize        0
BuildingArea    0
dtype: int64

In [22]:
# Preview the features after imputation
feature_data.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,SellerG,Distance,Postcode,YearBuilt,Bedroom2,Bathroom,Car,Landsize,BuildingArea
0,Abbotsford,2,h,Biggin,2.5,3067.0,1954.081176,2.0,1.0,1.0,202.0,176.866248
1,Abbotsford,2,h,Biggin,2.5,3067.0,1900.0,2.0,1.0,0.0,156.0,79.0
2,Abbotsford,3,h,Biggin,2.5,3067.0,1900.0,3.0,2.0,0.0,134.0,150.0
3,Abbotsford,3,h,Biggin,2.5,3067.0,1954.081176,3.0,2.0,1.0,94.0,176.866248
4,Abbotsford,4,h,Nelson,2.5,3067.0,2014.0,3.0,1.0,2.0,120.0,142.0


In [23]:
# Round "YearBuilt" to whole numbers (0 decimal places) so that mean-imputed
# values are whole years
feature_data["YearBuilt"] = feature_data["YearBuilt"].round(decimals=0)

In [24]:
# Round "BuildingArea" to whole numbers (0 decimal places)
feature_data["BuildingArea"] = feature_data["BuildingArea"].round(decimals=0)

In [25]:
# Preview the features after rounding
feature_data.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,SellerG,Distance,Postcode,YearBuilt,Bedroom2,Bathroom,Car,Landsize,BuildingArea
0,Abbotsford,2,h,Biggin,2.5,3067.0,1954.0,2.0,1.0,1.0,202.0,177.0
1,Abbotsford,2,h,Biggin,2.5,3067.0,1900.0,2.0,1.0,0.0,156.0,79.0
2,Abbotsford,3,h,Biggin,2.5,3067.0,1900.0,3.0,2.0,0.0,134.0,150.0
3,Abbotsford,3,h,Biggin,2.5,3067.0,1954.0,3.0,2.0,1.0,94.0,177.0
4,Abbotsford,4,h,Nelson,2.5,3067.0,2014.0,3.0,1.0,2.0,120.0,142.0


In [26]:
# Convert the categorical text columns to numeric codes.
# All three columns are fitted in ONE call so that encoder.categories_ keeps the
# category list of every column. Re-fitting the same encoder column by column
# would retain only the last column's categories, making it impossible to
# inverse_transform or to encode new data consistently. Each column is still
# encoded independently, so the resulting codes are unchanged.
# NOTE(review): ordinal codes impose an arbitrary (sorted) order on nominal
# categories such as Suburb / SellerG -- confirm this is acceptable for the
# downstream model.
encoder = OrdinalEncoder()
categorical_columns = ["Suburb", "SellerG", "Type"]
encoded_values = encoder.fit_transform(feature_data[categorical_columns])
for column_position, column_name in enumerate(categorical_columns):
    # Whole-column assignment replaces the text column with its float codes
    feature_data[column_name] = encoded_values[:, column_position]

In [27]:
# Preview the features after the categorical columns were encoded as numbers
feature_data.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,SellerG,Distance,Postcode,YearBuilt,Bedroom2,Bathroom,Car,Landsize,BuildingArea
0,0.0,2,0.0,23.0,2.5,3067.0,1954.0,2.0,1.0,1.0,202.0,177.0
1,0.0,2,0.0,23.0,2.5,3067.0,1900.0,2.0,1.0,0.0,156.0,79.0
2,0.0,3,0.0,23.0,2.5,3067.0,1900.0,3.0,2.0,0.0,134.0,150.0
3,0.0,3,0.0,23.0,2.5,3067.0,1954.0,3.0,2.0,1.0,94.0,177.0
4,0.0,4,0.0,155.0,2.5,3067.0,2014.0,3.0,1.0,2.0,120.0,142.0


In [29]:
# Standardise every feature column with StandardScaler; the result is a NumPy array.
# NOTE(review): the scaler is fitted on the full dataset here -- if a train/test
# split follows, confirm whether it should be fitted on the training part only.
scaler = StandardScaler()
scaler.fit(feature_data)
scaled = scaler.transform(feature_data)
scaled

array([[-1.71982483, -0.98146337, -0.63200323, ..., -0.63584819,
        -0.08931566,  0.06219426],
       [-1.71982483, -0.98146337, -0.63200323, ..., -1.67699936,
        -0.10084297, -0.18720721],
       [-1.71982483,  0.06487613, -0.63200323, ..., -1.67699936,
        -0.10635603, -0.00651839],
       ...,
       [ 1.65763629,  0.06487613, -0.63200323, ...,  2.48760531,
        -0.03067672,  0.06219426],
       [ 1.65763629,  1.11121563, -0.63200323, ...,  3.52875648,
         0.0770786 ,  0.011296  ],
       [ 1.74622544,  1.11121563, -0.63200323, ..., -0.63584819,
        -0.04922066, -0.10322508]])

In [31]:
# Keep the column labels: the scaled data is a plain array, so the labels must be
# re-attached when the DataFrame is rebuilt.
cols = feature_data.columns

In [32]:
# Rebuild a labelled DataFrame from the scaled array.
# Passing the existing index keeps every row aligned with target_data; without it
# the frame silently gets a fresh 0..n-1 index, which would misalign features and
# target if rows were ever filtered or reordered earlier in the notebook.
feature_data = pd.DataFrame(scaled, columns=cols, index=feature_data.index)

In [34]:
# Preview the standardised features
feature_data.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,SellerG,Distance,Postcode,YearBuilt,Bedroom2,Bathroom,Car,Landsize,BuildingArea
0,-1.719825,-0.981463,-0.632003,-1.386938,-1.301485,-0.422415,-0.313374,-0.947035,-0.772376,-0.635848,-0.089316,0.062194
1,-1.719825,-0.981463,-0.632003,-1.386938,-1.301485,-0.422415,-2.094296,-0.947035,-0.772376,-1.676999,-0.100843,-0.187207
2,-1.719825,0.064876,-0.632003,-1.386938,-1.301485,-0.422415,-2.094296,0.088284,0.673367,-1.676999,-0.106356,-0.006518
3,-1.719825,0.064876,-0.632003,-1.386938,-1.301485,-0.422415,-0.313374,0.088284,0.673367,-0.635848,-0.11638,0.062194
4,-1.719825,1.111216,-0.632003,0.311181,-1.301485,-0.422415,1.665429,0.088284,-0.772376,0.405303,-0.109864,-0.026878


In [37]:
# Sanity check: features and target must have the same number of rows
feature_shape, target_shape = feature_data.shape, target_data.shape
print("size of feature and target data is: ", feature_shape, target_shape)

size of feature and target data is:  (13580, 12) (13580,)
