In [21]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

### Melbourne Housing
---

We are provided with a dataset of Melbourne houses. Our task is to use a decision tree to predict house prices and evaluate the accuracy of the model.

In [4]:
# Load the Melbourne housing CSV into a DataFrame, then show summary
# statistics for its numeric columns.
melbourne_file_path = './data/melb_data.csv'
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [5]:
# List every column name so we can choose which ones to use as features.
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

We have a lot of features. We'll select a few of them, along with the target values.

In [6]:
# Prediction target: the house price column.
y = melbourne_data['Price']

# Predictors: a small subset of the available columns.
# 'Lattitude' and 'Longtitude' are spelled exactly as they appear in the
# dataset's column index, so do not "correct" them here.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
X = melbourne_data[melbourne_features]

X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [7]:
# Preview the first five rows of the selected feature columns.
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941


In [22]:
# Split features and target into training and validation sets.
# No test_size is given, so scikit-learn's documented default (0.25) applies;
# a fixed random_state makes the split identical on every run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)


### Building The Model
----

Building model and making predictions

In [24]:
# Define the model. A fixed random_state gives the same tree on every run.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only; the validation split stays unseen.
# fit() is the last expression, so the cell echoes the fitted estimator.
melbourne_model.fit(X=train_X, y=train_y)

DecisionTreeRegressor(random_state=1)

Display a few model predictions

In [25]:
# Show the model's predictions next to the actual prices for five houses.
# NOTE: these rows come from the training split the model was fitted on, so
# close agreement here says nothing about accuracy on unseen houses.
print("Making predictions for the following 5 houses\n")
print(train_X.head(5),'\n')

print("\nThe predictions are")
print(melbourne_model.predict(X=train_X.head(5)))

print("\nActual prices \n",train_y.head(5))

Making predictions for the following 5 houses

       Rooms  Bathroom  Landsize  Lattitude  Longtitude
664        3       2.0     368.0  -37.78460   145.09350
3270       2       1.0     586.0  -37.74350   145.04860
3873       2       1.0     348.0  -37.86720   145.04320
13170      3       1.0     521.0  -37.63854   145.05179
1730       4       2.0     687.0  -37.89310   145.04790 


The predictions are
[1335000.  590000. 1420000.  552500. 1830000.]

Actual prices 
 664      1335000.0
3270      590000.0
3873     1420000.0
13170     552500.0
1730     1830000.0
Name: Price, dtype: float64


### Model validation
----

In [27]:
# Predict on the held-out validation rows, which were not used for fitting.
val_predictions = melbourne_model.predict(X=val_X)

# Mean absolute error: the average absolute gap between actual and predicted price.
print(mean_absolute_error(y_true=val_y, y_pred=val_predictions))

250429.6127638684


Our model's predictions are off by about 250,000 on average on the validation set. Next, we improve it.