In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

### Melbourne Housing
---

We are provided with a dataset of Melbourne houses.

* Task
** Predict house prices


* Algorithm
** Use Decision trees

In [3]:
# Read dataset and display

melbourne_file_path = './data/melb_data.csv'

melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data.head(8)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
5,Abbotsford,129 Charles St,2,h,941000.0,S,Jellis,7/05/2016,2.5,3067.0,...,1.0,0.0,181.0,,,Yarra,-37.8041,144.9953,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.0,1890.0,Yarra,-37.806,144.9954,Northern Metropolitan,4019.0


In [60]:
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


### Display features

In [4]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

---
All features contains numerical values.  
We could use all features to predict our target. But for this task we select few features list below

* Rooms
* Bathroom
* Landsize
* Latitude
* Longtitude

to predict house values.

---

In [5]:
 # Targets
y = melbourne_data.Price                     

# Features
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]        

X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


----
*We have 13 580 data instances. We can split our datasets into training and validation sets respectively.  
Next we display few houses with their prices*

----

In [15]:
preview = pd.concat([X,y], axis=1)
preview.head(5)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude,Price
0,2,1.0,202.0,-37.7996,144.9984,1480000.0
1,2,1.0,156.0,-37.8079,144.9934,1035000.0
2,3,2.0,134.0,-37.8093,144.9944,1465000.0
3,3,2.0,94.0,-37.7969,144.9969,850000.0
4,4,1.0,120.0,-37.8072,144.9941,1600000.0


In [7]:
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=0)

print("# of training set: {}".format(len(train_X)))
print("# of validation set: {}".format(len(train_X)))

# of training set: 10185
# of validation set: 10185



----
### Building The Model And Train

Building model and making predictions

----

In [13]:
# Define model. Specifyy a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

---
Next we check how well our model performed on first 5 data instances.

In [23]:
print("Making predictions for the following 5 houses\n")

preview_predictions = melbourne_model.predict(train_X.head())
preview_predictionsDf = pd.DataFrame(preview_predictions, columns=['Predictions'])

preview_results = pd.concat([preview, preview_predictionsDf], axis=1)
preview_results.head()

Making predictions for the following 5 houses



Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude,Price,Predictions
0,2,1.0,202.0,-37.7996,144.9984,1480000.0,1335000.0
1,2,1.0,156.0,-37.8079,144.9934,1035000.0,590000.0
2,3,2.0,134.0,-37.8093,144.9944,1465000.0,1420000.0
3,3,2.0,94.0,-37.7969,144.9969,850000.0,552500.0
4,4,1.0,120.0,-37.8072,144.9941,1600000.0,1830000.0


---
Our model predicts that house #1 has price value of 1 335 000 whereas the house is priced at 1 480 000.  
So its off by 1 480 000 - 1 335 00 = 145000.  
Since we have 13 580 data instance, this means our model will be off for each house.  
We can calculate accumulated error and try to optmize the model to make better predictions

### Model Validation
----

In [27]:
val_predictions = melbourne_model.predict(val_X)

print("Accumulated absolute error on the validation set:", mean_absolute_error(val_y, val_predictions))

Accumulated absolute error on the validation set: 250429.6127638684


----
Our model predictions is off. We next improve it

### Underfitting and OverFitting

We create few decision trees of different depth and check which tree perfom better

----

In [28]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return (mae)


# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes,  train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: % d"%(max_leaf_nodes, my_mae))

Max leaf nodes: 5 		 Mean Absolute Error:  354662
Max leaf nodes: 50 		 Mean Absolute Error:  266447
Max leaf nodes: 500 		 Mean Absolute Error:  231301
Max leaf nodes: 5000 		 Mean Absolute Error:  249163


----
### Random Forests

Using many trees to predict same instance and then average the results to get better accuracy

----

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)

print(mean_absolute_error(melb_preds, val_y))

180860.37877504269


With Random Forests our accumulated error is much better