In [30]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [31]:
import pandas as pd
    
# Load data
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path) 
# Filter rows with missing values
filtered_melbourne_data = melbourne_data.dropna(axis=0)
# Choose target and features
y = filtered_melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]

from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

In [32]:
train_X.shape, val_X.shape, train_y.shape, val_y.shape

((4647, 7), (1549, 7), (4647,), (1549,))

In [58]:
# compare MAE with differing values of max_leaf_nodes
dd = {}
for max_leaf_nodes in range(360,370):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    dd[max_leaf_nodes] = round(my_mae)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 360  		 Mean Absolute Error:  241804
Max leaf nodes: 361  		 Mean Absolute Error:  241614
Max leaf nodes: 362  		 Mean Absolute Error:  241688
Max leaf nodes: 363  		 Mean Absolute Error:  241510
Max leaf nodes: 364  		 Mean Absolute Error:  241592
Max leaf nodes: 365  		 Mean Absolute Error:  241447
Max leaf nodes: 366  		 Mean Absolute Error:  241472
Max leaf nodes: 367  		 Mean Absolute Error:  241027
Max leaf nodes: 368  		 Mean Absolute Error:  241335
Max leaf nodes: 369  		 Mean Absolute Error:  241485


In [92]:
print(min(dd, key=dd.get),':',dd[min(dd, key=dd.get)])

367 : 241028


# MORE PRACTICE

In [67]:
# Code you have previously used to load data
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Path of the file to read
iowa_file_path = 'train.csv'

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 29,653


In [68]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# 1: Compare Different Tree Sizes

In [74]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(max_leaf_nodes, round(mae))

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
scores = {leaf_size: round(get_mae(leaf_size, train_X, val_X, train_y, val_y)) for leaf_size in candidate_max_leaf_nodes}
print(min(scores, key=scores.get),':',scores[min(scores, key=scores.get)])

5 35045
25 29016
50 27406
100 27283
250 27894
500 29454
100 : 27283


In [85]:
candidate_max_leaf_nodes = range(60,80)
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(max_leaf_nodes, round(mae))

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
scores = {leaf_size: round(get_mae(leaf_size, train_X, val_X, train_y, val_y)) for leaf_size in candidate_max_leaf_nodes}
print('Best number of leaf nodes is',min(scores, key=scores.get),':',scores[min(scores, key=scores.get)])

60 27111
61 27077
62 26990
63 26977
64 26927
65 27044
66 26985
67 26985
68 26944
69 26759
70 26763
71 26704
72 26848
73 27044
74 27157
75 27114
76 27105
77 27298
78 27221
79 27155
Best number of leaf nodes is 71 : 26704


# 2: Fit Model Using All Data

In [88]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes = 71)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=71)

In [90]:
# when fitting the model, you won't need to create validation data
    # because all the modeling decisions have been made, you've determined
        # the minimum mean absolute error (MAE)
    # max_leaf_nodes = 71, therefore you can use the entire dataset.
    # Use ___.fit(X, y) as opposed to ___.fit(train_X, train_y)