In [53]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

#read in data from csv

df = pd.read_csv("~/Melbourne_housing_FULL.csv")

In [54]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [55]:
#delete unneeded columns

del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [56]:
df.columns.unique

<bound method Index.unique of Index(['Suburb', 'Rooms', 'Type', 'Price', 'Distance', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea'],
      dtype='object')>

In [57]:
df.isnull().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
Distance            1
Bedroom2         8217
Bathroom         8226
Car              8728
Landsize        11810
BuildingArea    21115
YearBuilt       19306
CouncilArea         3
dtype: int64

In [58]:
len(df)

34857

In [59]:
#remove rows with missing values

df.dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True)

In [60]:
df.isnull().sum()

Suburb          0
Rooms           0
Type            0
Price           0
Distance        0
Bedroom2        0
Bathroom        0
Car             0
Landsize        0
BuildingArea    0
YearBuilt       0
CouncilArea     0
dtype: int64

In [61]:
len(df)

8895

In [62]:
#convert non-numeric data using one-hot encoding

df = pd.get_dummies(df, columns = ['Suburb', 'CouncilArea', 'Type'])

In [63]:
#assign x and y variables

X = df.drop('Price', axis = 1)
y = df['Price']

In [64]:
#split data into test/train sets (I will use a 70/30 split and shuffle the data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True)

In [65]:
#set up algorithm

model = ensemble.GradientBoostingRegressor(
    n_estimators = 250, learning_rate = 0.1,
    max_depth = 5, min_samples_split = 4, min_samples_leaf = 6, 
    max_features = 0.6, loss = 'huber')

In [66]:
#run model on training data

model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=5, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=250)

In [69]:
#check model accuracy (2 d.p.)
mae_train = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % mae_train)

mae_test = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % mae_test)

Training Set Mean Absolute Error: 123420.34
Test Set Mean Absolute Error: 161640.68


In [72]:
print("Training Score: %.2f" % model.score(X_train, y_train))
print("Test Score: %.2f" % model.score(X_test, y_test))

Training Score: 0.90
Test Score: 0.78
