In [2]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
iowa_file_path = 'train.csv'

home_data = pd.read_csv(iowa_file_path)

# Target: the SalePrice column is what the model learns to predict
y = home_data.SalePrice

# Predictors: the columns handed to the model as inputs
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Hold out a validation split; the fixed random_state keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# Baseline model: a decision tree with no limit set on the number of leaves
iowa_model = DecisionTreeRegressor(random_state = 1)

# Fit on the training split only, so the validation rows stay unseen
iowa_model.fit(train_X, train_y)

# Predict on the held-out validation rows
val_predictions = iowa_model.predict(val_X)
# Mean absolute error is the average absolute difference between the actual validation
# targets (val_y) and the predictions. scikit-learn's signature is (y_true, y_pred), so the
# actual values are passed first.
val_mae = mean_absolute_error(val_y, val_predictions)

print("Valiation MAE: {:,.0f}".format(val_mae))

Valiation MAE: 29,653


In [3]:
# get_mae: helper for comparing MAE scores across different max_leaf_nodes values

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for a tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fit on ``train_X``/``train_y``, used to
    predict ``val_X``, and those predictions are scored against ``val_y``.
    """
    # fit() returns the fitted estimator, so construction and training can be chained
    tree = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state = 0).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [4]:
# Tree sizes (maximum number of leaves) to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Score every candidate size on the validation split and print one line per size
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(
        max_leaf_nodes,
        train_X, val_X,
        train_y, val_y,
    )
    summary = (max_leaf_nodes, my_mae)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % summary)

Max leaf nodes: 5 		 Mean Absolute Error: 35044
Max leaf nodes: 25 		 Mean Absolute Error: 29016
Max leaf nodes: 50 		 Mean Absolute Error: 27405
Max leaf nodes: 100 		 Mean Absolute Error: 27282
Max leaf nodes: 250 		 Mean Absolute Error: 27893
Max leaf nodes: 500 		 Mean Absolute Error: 29454


In [8]:
## ALTERNATIVE to the explicit loop above
## Build the size -> MAE mapping in one expression, then pick the size with the lowest MAE

scores = dict(
    (leaf_size, get_mae(leaf_size, train_X, val_X, train_y, val_y))
    for leaf_size in candidate_max_leaf_nodes
)
# min() over the (size, mae) pairs keeps the first pair with the smallest MAE
best_tree_size = min(scores.items(), key = lambda entry: entry[1])[0]

print("Optimal tree size:", best_tree_size)
print(scores)

Optimal tree size: 100
{5: 35044.51299744237, 25: 29016.41319191076, 50: 27405.930473214907, 100: 27282.50803885739, 250: 27893.822225701646, 500: 29454.18598068598}


In [None]:
# Fit Model Using All Data

# Use the tree size that scored the lowest validation MAE (best_tree_size, computed in the
# comparison cell above) rather than a hand-copied number that could go stale.
final_model = DecisionTreeRegressor(max_leaf_nodes = best_tree_size, random_state = 1)

# With the size chosen, train on every row (X, y) instead of only the training split
final_model.fit(X, y)