In [1]:
import pandas as pd

In [3]:
# Keep the CSV location in a variable so it is easy to find and reuse
melbourne_file_path = 'data/melb_data.csv'
# Read the CSV and store it in a DataFrame named melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# Show summary statistics of the Melbourne data; .T transposes the table
# so that each summarized column is displayed as a row
melbourne_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,13580.0,2.937997,0.955748,1.0,2.0,3.0,3.0,10.0
Price,13580.0,1075684.0,639310.724296,85000.0,650000.0,903000.0,1330000.0,9000000.0
Distance,13580.0,10.13778,5.868725,0.0,6.1,9.2,13.0,48.1
Postcode,13580.0,3105.302,90.676964,3000.0,3044.0,3084.0,3148.0,3977.0
Bedroom2,13580.0,2.914728,0.965921,0.0,2.0,3.0,3.0,20.0
Bathroom,13580.0,1.534242,0.691712,0.0,1.0,1.0,2.0,8.0
Car,13518.0,1.610075,0.962634,0.0,1.0,2.0,2.0,10.0
Landsize,13580.0,558.4161,3990.669241,0.0,177.0,440.0,651.0,433014.0
BuildingArea,7130.0,151.9676,541.014538,0.0,93.0,126.0,174.0,44515.0
YearBuilt,8205.0,1964.684,37.273762,1196.0,1940.0,1970.0,1999.0,2018.0


1. Loading data
2. Review the data
3. Selecting data for modeling

In [24]:
# Show the labels of all columns available in the DataFrame
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# The Melbourne data has some missing values (some houses for which some variables weren't recorded).
# Handling missing values properly is left for a later tutorial.
# (The "Iowa data" mentioned in the original version of this comment is not loaded
# anywhere in the visible notebook.)
# So we take the simplest option for now and drop the affected houses from our data.
# NOTE(review): this drops a house if ANY column is missing, including columns that
# are not used as features below; the summaries in this notebook show the row count
# going from 13580 before this step to 6196 after it.

# dropna(axis=0) drops rows that contain missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)

We use two approaches to select a subset of the data:
- Dot notation, which we use to select the "prediction target"
- Selecting with a column list, which we use to select the "features"

# 1. Selecting the prediction target

In [6]:
# Prediction target: the Price column, selected with dot notation.
# It is stored as y and passed as the target when the models below are fitted.
y = melbourne_data.Price

2. Choosing 'Features'
- The columns that are inputted into our model (and later used to make predictions) are called "features."
- By convention, this data is called X.

In [7]:
# Names of the columns used as model inputs ("features").
# The spellings 'Lattitude' and 'Longtitude' are intentional: they match the
# column names as they appear in the dataset.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

In [8]:
# Feature data: select only the feature columns by indexing with the list of names
X = melbourne_data[melbourne_features]

In [10]:
# Summary statistics of the selected features, transposed so each feature is a row
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,6196.0,2.931407,0.971079,1.0,2.0,3.0,4.0,8.0
Bathroom,6196.0,1.57634,0.711362,1.0,1.0,1.0,2.0,8.0
Landsize,6196.0,471.00694,897.449881,0.0,152.0,373.0,628.0,37000.0
Lattitude,6196.0,-37.807904,0.07585,-38.16492,-37.855438,-37.80225,-37.7582,-37.45709
Longtitude,6196.0,144.990201,0.099165,144.54237,144.926198,144.9958,145.0527,145.52635


In [11]:
# Preview the first few rows of the feature data
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


3. Building Model
- Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
- Fit: Capture patterns from provided data. This is the heart of modeling.
- Predict: Just what it sounds like
- Evaluate: Determine how accurate the model's predictions are.

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model on the full feature data X and target prices y.
# As the last expression in the cell, the fitted estimator is also displayed.
melbourne_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [13]:
# Take the first few houses once and reuse them for both the display and the prediction
first_houses = X.head()

print("Making predictions for the following 5 houses:")
print(first_houses)
print("The predictions are")
print(melbourne_model.predict(first_houses))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [14]:
# Model Validation
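- The prediction error for each house is: error = actual − predicted
- The Mean Absolute Error (MAE) used below is the average of the absolute values of these errors.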



In [17]:
from sklearn.metrics import mean_absolute_error

# Predict on the very same rows the model was fitted on, then score those
# predictions against the true prices. This is an "in-sample" score.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1115.7467183128902

In [18]:
# The Problem with 'In-sample' scores

In [19]:
from sklearn.model_selection import train_test_split

# Split data into training and validation data, for both features and target.
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model. random_state is fixed (same value as the earlier melbourne_model
# cell) so the fitted tree, and therefore the validation MAE printed below, is
# reproducible from run to run; without it, scikit-learn may resolve ties between
# equally good splits differently on each run.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit model on the training portion only
melbourne_model.fit(train_X, train_y)

# Get predicted prices on the validation data and report the mean absolute error
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

274316.74262965354


# Overfitting and Underfitting

- Overfitting is when a model matches the training data almost perfectly, but does poorly in validation and other new data. On the flip side, if we make our tree very shallow, it doesn't divide up the houses into very distinct groups.

- When a model fails to capture important distinctions and patterns in the data, so it performs poorly even in training data, that is called underfitting.


In [21]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor restricted to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y``.

    Parameters:
        max_leaf_nodes: value passed to the tree's ``max_leaf_nodes`` argument.
        train_X, train_y: features and target used to fit the tree.
        val_X, val_y: features and target used to evaluate the tree.

    Returns:
        The mean absolute error between ``val_y`` and the tree's predictions.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    val_predictions = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_predictions)

In [22]:
# Report the validation MAE obtained for several tree sizes (max_leaf_nodes)
report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(report_line % (max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 5000  		 Mean Absolute Error:  271996


Conclusion

A model can suffer from either of two problems:

- Overfitting: capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
- Underfitting: failing to capture relevant patterns, again leading to less accurate predictions.

# Random Forest

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a random forest on the training split (seeded so the result is repeatable),
# then score its predictions on the validation split.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, melb_preds)
print(forest_mae)

218482.25517538196


