In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('Melbourne_housing_FULL.csv',low_memory=False)

In [3]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longtitude,Regionname,Propertycount,ParkingArea,Price
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/9/2016,2.5,3067.0,2.0,...,126.0,inf,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,Carport,
1,Airport West,154 Halsey Rd,3,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,...,303.0,225.0,2016.0,Moonee Valley City Council,-37.718,144.878,Western Metropolitan,3464.0,Detached Garage,840000.0
2,Albert Park,105 Kerferd Rd,2,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,...,120.0,82.0,1900.0,Port Phillip City Council,-37.8459,144.9555,Southern Metropolitan,3280.0,Attached Garage,1275000.0
3,Albert Park,85 Richardson St,2,h,S,Thomson,3/9/2016,3.3,3206.0,2.0,...,159.0,inf,,Port Phillip City Council,-37.845,144.9538,Southern Metropolitan,3280.0,Indoor,1455000.0
4,Alphington,30 Austin St,3,h,SN,McGrath,3/9/2016,6.4,3078.0,3.0,...,174.0,122.0,2003.0,Darebin City Council,-37.7818,145.0198,Northern Metropolitan,2211.0,Parkade,


In [4]:
data.shape

(34857, 22)

In [5]:
data.isnull().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom           8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21097
YearBuilt        19306
CouncilArea          3
Latitude          7976
Longtitude        7976
Regionname           0
Propertycount        3
ParkingArea          0
Price             7610
dtype: int64

In [6]:
data.dropna(inplace=True)

### Converting `categorical` data to `numerical` data so that it can be used by algorithms

#### One Hot Encoding
One-Hot-Encoding is used to create dummy variables to replace the categories in a categorical variable into features of each category and represent it using 1 or 0 based on the presence or absence of the categorical value in the record.

In [7]:
df_dummies = pd.get_dummies(data,columns=['ParkingArea'],prefix='PARK')

In [8]:
df_dummies.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Propertycount,Price,PARK_Attached Garage,PARK_Carport,PARK_Detached Garage,PARK_Indoor,PARK_Outdoor Stall,PARK_Parkade,PARK_Parking Pad,PARK_Underground
1,Airport West,154 Halsey Rd,3,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,...,3464.0,840000.0,0,0,1,0,0,0,0,0
2,Albert Park,105 Kerferd Rd,2,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,...,3280.0,1275000.0,1,0,0,0,0,0,0,0
5,Alphington,6 Smith St,4,h,S,Brace,3/9/2016,6.4,3078.0,3.0,...,2211.0,2000000.0,0,0,0,0,0,0,0,1
6,Alphington,5/6 Yarralea St,3,h,S,Jellis,3/9/2016,6.4,3078.0,3.0,...,2211.0,1110000.0,0,0,0,0,1,0,0,0
7,Altona,158 Queen St,3,h,VB,Greg,3/9/2016,13.8,3018.0,3.0,...,5301.0,520000.0,0,0,0,0,0,0,1,0


#### Sklearn Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# returns label encoded variable(s)
df_dummies['RegionId'] = labelencoder.fit_transform(df_dummies.Regionname)

In [10]:
df_dummies['RegionId'].head(10)

1     6
2     5
5     2
6     2
7     6
9     6
10    6
13    5
16    5
19    5
Name: RegionId, dtype: int32

In [11]:
df_dummies['RegionId'].value_counts()

5    2709
2    2613
6    2059
0     982
4     371
3      62
1      51
7      43
Name: RegionId, dtype: int64

In [12]:
df_dummies['RegionId'].unique() # All the unique values supplied

array([6, 5, 2, 0, 4, 3, 1, 7])

In [13]:
df_dummies['RegionId'].nunique()

8

In [14]:
from sklearn.preprocessing import OneHotEncoder
hotencoder = OneHotEncoder()
encoded = hotencoder.fit_transform(df_dummies.RegionId.values.reshape(-1,1)).toarray() 
# Returns a numpy array of one hot encoded variables
encoded

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [15]:
encoded.shape

(8890, 8)

In [16]:
# convert the array into a dataframe. Specifically, one hot encoded dataframe

df_encoded = pd.DataFrame(encoded, columns = ["RegionId_"+str(int(i)) for i in range(encoded.shape[1])])

In [17]:
df_encoded

Unnamed: 0,RegionId_0,RegionId_1,RegionId_2,RegionId_3,RegionId_4,RegionId_5,RegionId_6,RegionId_7
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
8885,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8886,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8887,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8888,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
df_dummies = pd.concat([df_dummies, df_encoded], axis=1)  # concats two dataframes
df_dummies

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,PARK_Underground,RegionId,RegionId_0,RegionId_1,RegionId_2,RegionId_3,RegionId_4,RegionId_5,RegionId_6,RegionId_7
0,,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Airport West,154 Halsey Rd,3.0,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,...,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Albert Park,105 Kerferd Rd,2.0,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,...,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,,,...,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34838,Caroline Springs,29 Edgbaston Pde,3.0,h,SP,Crane,30/09/2017,14.8,3023.0,3.0,...,0.0,6.0,,,,,,,,
34846,Keysborough,3 Cobain St,3.0,h,S,Area,30/09/2017,25.2,3173.0,3.0,...,0.0,4.0,,,,,,,,
34848,Maidstone,15 Ballarat Rd,2.0,h,SP,Biggin,30/09/2017,6.4,3012.0,2.0,...,0.0,6.0,,,,,,,,
34851,Noble Park,5 Blaby St,3.0,h,PI,C21,30/09/2017,22.7,3174.0,3.0,...,0.0,4.0,,,,,,,,


### Normalization and Scaling

#### Standard Scaler
StandardScaler normalizes the data using the formula (x-mean)/standard deviation

In [19]:
#Scales the data. Essentially returns the z-scores of every attribute

from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()

In [20]:
# returns z-scores of the values of the attribute
data['Bedroom_Stdscale'] = std_scale.fit_transform(data[['Bedroom']])

In [21]:
data['Bedroom_Stdscale'].head()

1   -0.080681
2   -1.115676
5   -0.080681
6   -0.080681
7   -0.080681
Name: Bedroom_Stdscale, dtype: float64

In [22]:
data['Bedroom_Stdscale'].describe()

count    8.890000e+03
mean     2.165110e-15
std      1.000056e+00
min     -3.185668e+00
25%     -1.115676e+00
50%     -8.068077e-02
75%      9.543150e-01
max      9.234281e+00
Name: Bedroom_Stdscale, dtype: float64

#### MinMaxScaler
MinMaxScaler normalizes the data using the formula (x - min)/(max - min)

In [23]:
from sklearn.preprocessing import MinMaxScaler

minmax_scale = MinMaxScaler()

In [24]:
data['Car_MinMaxScale'] = minmax_scale.fit_transform(data[['Car']])

In [25]:
data['Car_MinMaxScale'].head()

1    0.1
2    0.0
5    0.4
6    0.2
7    0.1
Name: Car_MinMaxScale, dtype: float64

In [26]:
data['Car_MinMaxScale'].describe()

count    8890.000000
mean        0.169224
std         0.097534
min         0.000000
25%         0.100000
50%         0.200000
75%         0.200000
max         1.000000
Name: Car_MinMaxScale, dtype: float64

#### Log Transformation

In [27]:
import numpy as np

In [28]:
# Transform an attribute using a mathematical transformation.
from sklearn.preprocessing import FunctionTransformer   

log_transformer = FunctionTransformer(np.log1p) # not to bring in other things with log transformer


In [29]:
data['Distance_logtransform'] = log_transformer.fit_transform(data[['Distance']])   # Log Transform the attribute 

In [30]:
data['Distance_logtransform'].head()

1    2.674149
2    1.458615
5    2.001480
6    2.001480
7    2.694627
Name: Distance_logtransform, dtype: float64

In [31]:
data['Distance_logtransform'].describe()

count    8890.000000
mean        2.356024
std         0.550826
min         0.000000
25%         2.001480
50%         2.415914
75%         2.701361
max         3.879500
Name: Distance_logtransform, dtype: float64

#### Exponential Transformation

In [32]:
exp_transformer = FunctionTransformer(np.exp) # Exponential transform 

In [33]:
data['Rooms_exptransform'] = exp_transformer.fit_transform(data[['Rooms']])  #returns the exponential transform of the data

In [34]:
data['Rooms_exptransform'].head()

1    20.085537
2     7.389056
5    54.598150
6    20.085537
7    20.085537
Name: Rooms_exptransform, dtype: float64

In [35]:
data['Rooms_exptransform'].describe()

count      8890.000000
mean         57.862291
std        1743.357006
min           2.718282
25%           7.389056
50%          20.085537
75%          54.598150
max      162754.791419
Name: Rooms_exptransform, dtype: float64