# The Problem with "In-Sample" Scores

In [115]:
import pandas as pd

# Load data
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path) 

# Filter rows with missing price values
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Choose target and features
y = filtered_melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]


from sklearn.tree import DecisionTreeRegressor
# Define model
melbourne_model = DecisionTreeRegressor()
# Fit model
melbourne_model.fit(X, y)

DecisionTreeRegressor()

In [116]:
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

# the mean absolute error (MAE) is suspiciously small

434.71594577146544

# Out-sample scores, training + validation data

In [118]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features X and target Y
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 2)

# Define model
melbourne_model = DecisionTreeRegressor()

# Fit model
melbourne_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

# WOW, the MAE is ~$250,000 compared to the ~$500

251167.6391220142


In [35]:
# This is the difference between almost perfect prediction vs practically unusable
# For reference, the avg home value in the vlidation data is $1,100,000
# There are ways to improve the model by experimenting to find better
        # features X, or different model types

# More Practice

In [121]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
iowa_file_path = 'train.csv'

home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 
                   'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[feature_columns]

# Specify Model
iowa_model = DecisionTreeRegressor()
# Fit Model
iowa_model.fit(X, y)

print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


# 1. Split the Data

In [122]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

train_X.shape,val_X.shape,train_y.shape, val_y.shape

((1095, 7), (365, 7), (1095,), (365,))

# 2. Specify and Fit the Model

In [123]:
iowa_model = DecisionTreeRegressor(random_state = 1)

iowa_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

# 3. Make Predictions with Validation Data

In [124]:
# make prediction using the validation data
# ___.predict(val_X) means to predict the val_X (features X of validation data)
val_predictions = iowa_model.predict(val_X)

In [125]:
# print the top few validation predictions
print("validation data predictions      ",val_predictions[:5])

# print the top few actual prices from validation data
print("actual price from validation data",list(val_y.head(5)))

validation data predictions       [186500. 184000. 130000.  92000. 164500.]
actual price from validation data [231500, 179500, 122000, 84500, 142000]


# 4. Calculate the Mean Absolute Error (MAE) in Validation Data

In [136]:
from sklearn.metrics import mean_absolute_error
val_mae = (mean_absolute_error(val_y, val_predictions))

print(val_mae)

29652.931506849316


In [137]:
in_sample_X = iowa_model.predict(X.head())
in_sample_y = y.head().tolist()

in_sample_MAE = (mean_absolute_error(in_sample_X, in_sample_y))

print(in_sample_MAE)

2400.0


In [None]:
# in_sample MAE is much lower, because it makes prediction on the data that it
    # was fitted on
# while validation data is larger and has more practice use, but it can be
    #improved on