In [62]:
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Load the training set, using the Id column as the row index
train_data_path = 'train.csv'
home_train_data = pd.read_csv(train_data_path, index_col='Id')

# Target: the sale price of every row in the training set
y = home_train_data['SalePrice']

# Predictor columns used for both the training and the testing data
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Predictors for the training set (a copy, so the loaded frame stays untouched)
X = home_train_data.loc[:, features].copy()

X.shape

(1460, 7)

In [63]:
# Load the testing set, indexed by Id in the same way as the training data
test_data_path = 'test.csv'
home_test_data = pd.read_csv(test_data_path, index_col='Id')

# Predictors for the testing set: the same feature columns as X
X_test = home_test_data.loc[:, features].copy()

X_test.shape

(1459, 7)

In [64]:
# Hold out 20% of the training rows as a validation set.
# Terminology: the validation set (split off from the training data) is
# distinct from the testing set (X_test).
split_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_parts

X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


In [None]:
## RECAP Available Variables

# Print the first rows of every dataset defined so far, each under its own label
recap_items = [
    ("y:\n", y),
    ("X:\n", X),
    ("X_test:\n", X_test),
    ("X_train:\n", X_train),
    ("X_valid:\n", X_valid),
    ("y_train:\n", y_train),
    ("y_valid:\n", y_valid),
]
for label, dataset in recap_items:
    print(label, dataset.head())

In [65]:
# Specify (then evaluate) Several RandomForest Models (regression)

from sklearn.ensemble import RandomForestRegressor

# Specify the candidate models.
# NOTE: the absolute-error split criterion is spelled 'absolute_error'. The old
# alias 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2, where it is
# rejected as an invalid parameter. This spelling requires scikit-learn >= 1.0.

model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Collected in a list so they can be scored in a loop
models = [model_1, model_2, model_3, model_4, model_5]
print(models)

[RandomForestRegressor(n_estimators=50, random_state=0), RandomForestRegressor(random_state=0), RandomForestRegressor(criterion='mae', random_state=0), RandomForestRegressor(min_samples_split=20, n_estimators=200, random_state=0), RandomForestRegressor(max_depth=7, random_state=0)]


In [66]:
# Fit, then Evaluate different models 

from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit `model` on training data and return its MAE on validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_t, y_t : training predictors / target. Default: the notebook-level
        ``X_train`` / ``y_train``.
    X_v, y_v : validation predictors / target. Default: the notebook-level
        ``X_valid`` / ``y_valid``.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, predictions)``.

    Notes
    -----
    The defaults are resolved when the function is *called*, not when it is
    defined. Binding ``X_t=X_train`` in the signature would freeze whatever
    split existed when this cell ran, so re-running the split cell would
    leave the function silently scoring against stale data.

    ``model.fit`` is called on the object passed in, so the caller's
    estimator is (re)fitted on ``X_t`` / ``y_t`` as a side effect.
    """
    # Late-bound defaults: fall back to the current notebook-level split
    if X_t is None:
        X_t = X_train
    if X_v is None:
        X_v = X_valid
    if y_t is None:
        y_t = y_train
    if y_v is None:
        y_v = y_valid

    # fit model using training data
    model.fit(X_t, y_t)
    # make predictions using validation data
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

# Score each candidate with score_model and report its validation MAE,
# numbering the models from 1 to see which one performs best
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (model_number, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706


In [60]:
# GENERATE TEST PREDICTIONS

# Use model_3, which had the lowest validation MAE of the five candidates
my_model = model_3
print(my_model)

# Refit on all labelled rows (X, y), then predict for the testing set
preds_test = my_model.fit(X, y).predict(X_test)

# OPTIONAL: predictions in the format used for competition scoring (Id, SalePrice)
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)

print(output)
# Uncomment to write the submission file:
#output.to_csv('submission.csv', index=False)

RandomForestRegressor(criterion='mae', random_state=0)
        Id  SalePrice
0     1461  119433.08
1     1462  158367.50
2     1463  185351.21
3     1464  178343.12
4     1465  192898.29
...    ...        ...
1454  2915   86155.00
1455  2916   89050.00
1456  2917  156296.92
1457  2918  132232.50
1458  2919  230870.60

[1459 rows x 2 columns]


In [61]:
# The MAE of the test predictions CANNOT be calculated here: y holds SalePrice
# for the training rows only, so it is not the ground truth for preds_test
# (the two do not even have the same length, as the counts below show).

length_report = [
    ("Length of target from training data:", len(y)),
    ("Length of X_test:", len(X_test)),
    ("Length of preds_test:", len(preds_test)),
]
for label, n_rows in length_report:
    print(label, n_rows)

## NOTE: Train length: 1460,
##       Test length: 1459

# Kept for reference only -- comparing preds_test against y is not a valid score:
#competition_mae = mean_absolute_error(preds_test, y)
#print("Validation MAE for (Competition) Random Forest Model: {}".format(competition_mae))

Length of target from training data: 1460
Length of X_test: 1459
Length of preds_test: 1459


In [69]:
# Score an unfitted copy of my_model rather than my_model itself: score_model()
# calls fit() on whatever it is given, so passing my_model directly would replace
# its fit on the full training data (X, y) with a fit on X_train only.
# clone() keeps the same hyper-parameters (including random_state), so the
# reported MAE is the same.
my_model_mae = score_model(clone(my_model))
print("MAE for current model is:", my_model_mae)

MAE for current model is: 23528.78421232877


In [71]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a fixed random forest on the given data.

    A fresh ``RandomForestRegressor(n_estimators=100,
    criterion='absolute_error', random_state=0)`` is fitted on
    ``X_train`` / ``y_train`` and used to predict ``X_valid``. Because the
    model is always the same, differences in the returned score come from
    the data passed in.

    Parameters
    ----------
    X_train, y_train : predictors / target used to fit the model.
    X_valid, y_valid : predictors / target used to evaluate it.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    # specify model
    # ('absolute_error' replaces the alias 'mae', which was deprecated in
    #  scikit-learn 1.0 and removed in 1.2; requires scikit-learn >= 1.0)
    model = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
    # fit model with training data
    model.fit(X_train, y_train)
    # use model to predict on validation set
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


# Validation MAE of the reference model on the current train/validation split
print("MAE:", score_dataset(X_train, X_valid, y_train, y_valid), sep="\n")

MAE:
23528.78421232877
