In [13]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statistics
import matplotlib.pyplot as plt
import math
from statsmodels.stats.outliers_influence import variance_inflation_factor

# EDA

Reading in the data, and taking an exploratory analysis

In [14]:
df = pd.read_csv('../data/melb_data.csv')
pd.set_option('display.max_columns', None)


Code to look at correlations between price and other variables, commented for runtime

In [16]:
# for col in df.drop('Price', axis = 1):
#     try:    
#         df.plot.scatter(x=col, y='Price')
#         plt.show()
#     except:
#         print(col)


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [18]:
df.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

# =============================================================

Imputing missing values, removing outliers and nulls

In [19]:
df = df[df['BuildingArea'] < 20001]
df = df[df['Price'] <  3_000_000]
df.fillna(value= {'Car':0, 'CouncilArea':df['CouncilArea'].mode()[0], 'BuildingArea': df['BuildingArea'].mean()}, inplace=True)
df.dropna(axis=0, inplace=True)

Separating numeric and dummy variables, then concat them together. Then, create independent and dependent variable sets.

In [20]:
dfn = df._get_numeric_data()
y = df['Price']
dummy_vars = pd.get_dummies(df[['Type','Regionname', 'Suburb', 'CouncilArea', 'YearBuilt']])
X = pd.concat([dfn.drop('Price', axis=1), dummy_vars], axis=1)
X = dfn.drop('Price', axis=1)

Measure multicollinearity between numeric columns

In [25]:
vif_data = pd.DataFrame()
# dfn['Elseroom'] = dfn['Rooms'] - dfn['Bedroom2'] - dfn['Bathroom']
# dfn['AvgRoomArea'] = dfn['BuildingArea'] / dfn['Rooms']
# dfn.drop(['Rooms'], axis=1, inplace=True)
vif_data["feature"] = dfn.columns
vif_data['VIF'] = [variance_inflation_factor(dfn.values, i) for i in range(len(dfn.columns))]
print(vif_data)

          feature            VIF
0           Rooms     129.770076
1           Price      11.831217
2        Distance       7.713347
3        Postcode    1736.289615
4        Bedroom2     123.958207
5        Bathroom      12.536889
6             Car       5.082847
7        Landsize       1.296286
8    BuildingArea       6.386627
9       YearBuilt    3961.855218
10      Lattitude  290566.293903
11     Longtitude  287818.157001
12  Propertycount       4.037542


### Running the Model

Runs the model 100 times with different seeds, and takes the average score

In [23]:
from sklearn.tree import plot_tree

In [28]:
model = DecisionTreeRegressor(min_samples_leaf= 0.005)

x = 0
temp = []
for i in range(100):
    Xt, Xv, yt, yv = train_test_split(X, y, test_size=.2, random_state=x)
    model.fit(Xt, yt)
    temp.append(model.score(Xv, yv))
    x += 1

print(statistics.mean(temp))
print(max(temp))
print(min(temp))
# plot_tree(model)

0.7301355509003262
0.7609937870714528
0.6937729801449706
