### 1. Import libraries & set up the environment

In [5]:
import pandas as pd
import warnings
# Suppress FutureWarning messages so they do not clutter cell output.
warnings.simplefilter('ignore', FutureWarning)
# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.options.display.max_columns = 500

### 2. Read in data & obtain summary stats

In [8]:
# Load the dataset from 'melb_data.csv' in the working directory.
melb_house = pd.read_csv(filepath_or_buffer='melb_data.csv')
# Summary statistics for every column (numeric and non-numeric).
melb_house.describe(include='all')

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
count,13580,13580,13580.0,13580,13580.0,13580,13580,13580,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,12211,13580.0,13580.0,13580,13580.0
unique,314,13378,,3,,5,268,58,,,,,,,,,33,,,8,
top,Reservoir,5 Margaret St,,h,,S,Nelson,27/05/2017,,,,,,,,,Moreland,,,Southern Metropolitan,
freq,359,3,,9449,,9022,1565,473,,,,,,,,,1163,,,4695,
mean,,,2.937997,,1075684.0,,,,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,,-37.809203,144.995216,,7454.417378
std,,,0.955748,,639310.7,,,,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,,0.07926,0.103916,,4378.581772
min,,,1.0,,85000.0,,,,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,,-38.18255,144.43181,,249.0
25%,,,2.0,,650000.0,,,,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,,-37.856822,144.9296,,4380.0
50%,,,3.0,,903000.0,,,,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,,-37.802355,145.0001,,6555.0
75%,,,3.0,,1330000.0,,,,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,,-37.7564,145.058305,,10331.0


*2.1 Check dimensions and missing values*

In [9]:
# (rows, columns) of the raw frame as loaded from the CSV
melb_house.shape

(13580, 21)

In [10]:
# Per-column count of missing values in the raw frame.
print(
    '\nColumns with NaNs:\n',
    melb_house.isnull().sum(),
)


Columns with NaNs:
 Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64


In [11]:
# List every column label in the raw frame
print('Column names: ', melb_house.columns)

Column names:  Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [20]:
# Remove columns with NA for now: any column containing at least one NaN is dropped.
melb_house_clean = melb_house.dropna(axis='columns')

In [21]:
# Summary statistics for the cleaned frame (describe() reports numeric columns only by default)
melb_house_clean.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,558.416127,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,3990.669241,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,177.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,440.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,651.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,433014.0,-37.40853,145.52635,21650.0


In [22]:
# (rows, columns) after dropping the columns that contained NaNs
melb_house_clean.shape

(13580, 17)

### 3. Assign target & predictors

In [23]:
# Target variable: the Price column of the cleaned frame.
y = melb_house_clean['Price']

In [30]:
# Predictors (also called 'features'): the columns used to predict the target.
X = melb_house_clean[
    [
        'Rooms',
        'Bathroom',
        'Landsize',
        'Lattitude',
        'Longtitude',
    ]
]

### 4. Define & Build models
#### In Python, scikit-learn is a widely used library for applying machine learning algorithms to DataFrames
##### *In code, this library is imported as `sklearn`*
#####  *Working with Decision Trees*


In [31]:
from sklearn.tree import DecisionTreeRegressor

In [32]:
#1. Define the model
# An unfitted decision-tree regressor; fitting happens in a later cell.
mod1 = DecisionTreeRegressor(random_state= 1)  # fixed seed for reproducibility

In [34]:
#2. Fit the model on the training portion of a train/validation split
from sklearn.model_selection import train_test_split
# Seed the shuffle so the split is identical on every run. Without
# random_state the rows assigned to train/validation change each time,
# and so does the reported error, even though the model itself is seeded.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Fit only on the training rows; the validation rows stay unseen for evaluation.
mod1.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [36]:
#3. Predict
# Predict on the validation features that were held out from fitting
prediction = mod1.predict(val_X)

In [37]:
#4. Evaluate with mean_absolute_error
from sklearn.metrics import mean_absolute_error
# Compare the validation targets with the model's predictions.
print(
    'Mean absolute error with Decision Tree:',
    mean_absolute_error(y_true=val_y, y_pred=prediction),
)

Mean absolute error with Decision Tree: 244201.24565537556
