## 1. Set the environment

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib


## 2. Import the dataset

In [4]:
df = pd.read_csv("Melbourne_housing_FULL.csv")

In [5]:
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


## 3. Scrub the dataset

In [6]:
del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [7]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council


In [8]:
df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=True)

In [10]:
features_df=pd.get_dummies(df,columns=['Suburb','CouncilArea','Type']) #one-hot encoding

In [11]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council


In [12]:
del features_df['Price']

In [15]:
X=features_df.to_numpy() #independent variables
y=df['Price'].to_numpy() #dependent variable

## 4. Split the Dataset

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

##  5. Select Algorithm and configure its Hyerparameters

In [21]:
# using gradient boosting 
# first line is algo and lines below it is hyperparameters
model=ensemble.GradientBoostingRegressor(
n_estimators=150, #how many decision tree to build(high no of tree more accuracy)
learning_rate=0.1, #low rate improve accuracy
max_depth=30, #max no of layer for DT
min_samples_split=4, #min no samples to exec
min_samples_leaf=6, #min no samples in each node
max_features=0.6, #total no features used in model to get best split 
loss='huber') #loss calc model's error rate

In [22]:
model.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=30, max_features=0.6, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=6, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=150,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
joblib.dump(model,'house_trained_model.pkl') #saving training model as file

['house_trained_model.pkl']

## 6. Evaluate the results

In [24]:
mse = mean_absolute_error(y_train,model.predict(X_train))
print("Training Set Mean Absolute Error : %.2f"%mse)

Training Set Mean Absolute Error : 30810.83


In [26]:
mse = mean_absolute_error(y_test,model.predict(X_test))
print("Test Set Mean Absolute Error : %.2f"%mse)

Test Set Mean Absolute Error : 165073.37


In [None]:
"""Hence,our training set mean absolute error is $30810.83 and the
test set mean absolute error is $165073.37. This means that on average, the
training set miscalculated the actual property value by a mere $30810.83.02.
However, the test set miscalculated by an average of $165073.37."""

                               ************************ THE END ********************************