In [24]:
#In machine learning, the libraries that need to be imported vary from project
#to project. For this particular exercise, we are using gradient boosting
#(ensemble modeling) and mean absolute error to measure performance.

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

In [26]:
# Read the Melbourne housing CSV (looked up relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Melbourne_housing_FULL.csv')

In [27]:
df.head(n=5)  # "n" is the number of rows to preview, counted from the top (head) of the DataFrame

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/9/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/2/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/2/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/3/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [28]:
#Scrubbing Process
#Let’s first remove columns from the dataset that we don’t wish to include in
#the model by using the del df['column'] statement and entering the titles of
#the columns that we wish to remove.
#The misspelled titles “Lattitude” and “Longtitude” are used because the two misspellings were not corrected in
#the source file.

In [29]:
# Remove, in place, the columns that will not be used by the model.
# 'Lattitude' and 'Longtitude' are spelled exactly as in the source file.
df.drop(
    columns=[
        'Address',
        'Method',
        'SellerG',
        'Date',
        'Postcode',
        'Lattitude',
        'Longtitude',
        'Regionname',
        'Propertycount',
    ],
    inplace=True,
)

In [30]:
# Display the DataFrame to check which columns remain after the deletions.
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,1480000.0,6.3,4.0,1.0,3.0,593.0,,,Maribyrnong City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council
34855,Yarraville,3,h,1140000.0,6.3,,,,,,,Maribyrnong City Council


In [31]:
#The remaining eleven independent variables (represented as X) in the dataset
#are Suburb, Rooms, Type, Distance, Bedroom2, Bathroom, Car, Landsize,
#BuildingArea, YearBuilt, and CouncilArea. The twelfth variable, located in
#the fifth column of the downloaded dataset, is the dependent variable, which
#is Price (represented as y). As mentioned, decision trees (including gradient
#boosting and random forests) are adept at managing large and high-dimensional
#datasets with a high number of variables.
#The following Pandas function can be used to remove rows with missing values:

In [32]:
# Drop every row that contains at least one missing value, updating df in place.
df.dropna(
    axis='index',
    how='any',
    inplace=True,
)
# Notes on the dropna arguments:
# - Called without any arguments, dropna returns only the rows that have no missing values.
# - To remove columns (rather than rows) that contain a missing value, use axis=1 (or axis='columns').
# - To remove only the rows in which every value is missing, use how='all'.
# - To keep only the rows that hold at least num non-missing values, use thresh=num
#   (thresh counts the values that are present, not the ones that are missing).
# - To look for missing values in specific columns only, use subset=['column_name'].
# - With inplace=True the DataFrame is modified in place and nothing is returned.
#   With inplace=False (the default) the operation returns a modified copy, which
#   you then need to assign to a variable.

In [33]:
#Keep in mind that it’s important to drop rows with missing values after
#applying the del df function to remove columns (as shown in the previous
#step). This way, there’s a better chance that more rows from the original
#dataset will be preserved. Imagine dropping a whole row because it was
#missing the value for a variable that would be later deleted like the post code
#in our model!

In [34]:
df
# Note: after dropping the rows with missing values, the row count fell from 34857 to 8895.

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34847,Wollert,3,h,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,Whittlesea City Council
34849,Wollert,3,h,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,Whittlesea City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council


In [35]:
# One-hot encode the categorical columns Suburb, CouncilArea and Type so that
# their values are represented numerically.
features_df = pd.get_dummies(
    data=df,
    columns=['Suburb', 'CouncilArea', 'Type'],
)

In [36]:
# Price acts as the dependent variable (y), so it is removed from features_df,
# which for now should hold only the eleven independent variables (X).
features_df.drop(columns='Price', inplace=True)

In [37]:
# Finally, create the X and y arrays from the dataset with to_numpy() (the
# replacement for the older as_matrix method, which pandas has removed).
# X contains the independent variables and y contains the dependent variable, Price.
# dtype=float is requested explicitly: recent pandas versions produce boolean
# one-hot columns, and converting a frame that mixes booleans with numbers
# would otherwise yield an object array instead of a numeric matrix.
X = features_df.to_numpy(dtype=float)
y = df['Price'].to_numpy()

In [38]:
# Display the feature matrix X.
X

array([[2. , 2.5, 2. , ..., 1. , 0. , 0. ],
       [3. , 2.5, 3. , ..., 1. , 0. , 0. ],
       [4. , 2.5, 3. , ..., 1. , 0. , 0. ],
       ...,
       [2. , 6.3, 2. , ..., 1. , 0. , 0. ],
       [2. , 6.3, 2. , ..., 0. , 1. , 0. ],
       [2. , 6.3, 2. , ..., 1. , 0. , 0. ]])

In [39]:
# Display the target array y (the Price values).
y

array([1035000., 1465000., 1600000., ...,  888000.,  705000., 1020000.])

In [40]:
# Split the data into training and test segments. A standard 70/30 split is
# requested with test_size=0.3. The rows are shuffled before splitting to avoid
# bias, and the random_state argument seeds that shuffle so the same split is
# produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [41]:
# As you will recall, we are using the gradient boosting algorithm for this
# exercise. Each hyperparameter is explained directly above its value.
model = ensemble.GradientBoostingRegressor(
    # n_estimators: how many decision trees to build. A high number of trees
    # will generally improve accuracy (up to a certain point), but it will also
    # increase the model's processing time. 150 trees is an initial starting point.
    n_estimators=150,
    # learning_rate: controls the rate at which additional decision trees
    # influence the overall prediction, by shrinking the contribution of each
    # tree. A low rate such as 0.1 should improve accuracy.
    learning_rate=0.1,
    # max_depth: the maximum number of layers (depth) of each decision tree.
    # With None, nodes expand until all leaves are pure or until all leaves
    # contain fewer than min_samples_split samples. A high value (30) is chosen
    # here, which will have a dramatic effect on the final result, as we will
    # see later.
    max_depth=30,
    # min_samples_split: the minimum number of samples required to execute a
    # new binary split. For example, min_samples_split = 10 means a node must
    # hold ten samples before a new branch can be created from it.
    min_samples_split=4,
    # min_samples_leaf: the minimum number of samples that must appear in each
    # child node (leaf) for a split to be accepted. This helps to mitigate the
    # impact of outliers and anomalies that would otherwise end up as leaves
    # with very few samples. For example, min_samples_leaf = 4 requires at
    # least four samples within each leaf for a new branch to be created.
    min_samples_leaf=6,
    # max_features: the number of features presented to the model when
    # determining the best split. As mentioned in Chapter 11, random forests
    # and gradient boosting restrict the features shown to each individual
    # tree. An integer means that many features are considered at each split;
    # a float (e.g. 0.6) is the fraction of the total features randomly
    # selected. The search may still look at more than max_features features
    # if no valid split can be found among the ones selected initially.
    max_features=0.6,
    # loss: the loss function used to measure the model's error. 'huber'
    # protects against outliers and anomalies.
    loss='huber',
    # random_state: seeds the random feature selection that max_features < 1.0
    # introduces, so that retraining gives the same model on every run.
    random_state=0,
)

In [42]:
# Start the training process by calling Scikit-learn's fit method on the
# training features and their prices.
model.fit(X=X_train, y=y_train)

In [43]:
# Persist the trained model to a file so it can be reloaded later.
joblib.dump(value=model, filename='house_trained_model.pkl')

['house_trained_model.pkl']

In [44]:
# Use mean absolute error (MAE) to evaluate the model: it measures the average
# absolute difference between the model's predictions and the actual values.
# Here it is computed on the training data; the same process is repeated with
# the test data.
# The result is named train_mae because it is a mean absolute error, not a
# mean squared error.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % train_mae)

Training Set Mean Absolute Error: 29778.71


In [45]:
# Mean absolute error on the held-out test data. The result is named test_mae
# because it is a mean absolute error, not a mean squared error.
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % test_mae)

Test Set Mean Absolute Error: 163002.66


In [None]:
#Final Thoughts:
#In the run recorded above, the training set mean absolute error is $29,778.71. This means that our trained model was very accurate at predicting the actual
#value of properties contained in the training data. While $29,778.71 may
#seem like a lot of money, this average error value is low given the maximum
#range of our dataset is $8 million. As many of the properties in the dataset are
#in excess of seven figures ($1,000,000+), $29,778.71 constitutes a reasonably low error rate.

#But how did the model fare with the test data? These results are less accurate.
#The test data provided less indicative predictions with an average error of
#$163,002.66. A high discrepancy between the training and test data is usually
#a key indicator of overfitting. As our model is tailored to the training data, it
#stumbled when predicting the test data, which probably contains new patterns
#that the model hasn’t adjusted for. The test data, of course, is likely to contain
#slightly different patterns and new potential outliers and anomalies.
#However, in this case, the difference between the training and test data is
#exacerbated by the fact that we configured the model to overfit the training
#data. An example of this issue was setting max_depth to “30.” Although
#setting a high max_depth improves the chances of the model finding patterns
#in the training data, it does tend to lead to overfitting. Another possible cause
#is a poor split of the training and test data, but for this model the data was
#randomized using Scikit-learn.
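#Lastly, please take into account that your own results may differ slightly when
#replicating this model on your own machine. The train/test split itself is
#reproducible because random_state=0 is passed to train_test_split; differences
#come from the random feature selection inside the model (max_features=0.6)
#whenever the model is not seeded, and from differences in library versions.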