In [1]:
import os

import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

# Load the competition files, using the 'Id' column as the row index
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Separate the target from the predictor columns used in this notebook
y = X_full['SalePrice']
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Hold out 20% of the labelled rows for validation;
# random_state=0 keeps the split reproducible between runs
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [2]:
# Preview the first five rows of the training predictors
X_train.head(n=5)

Id,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


## Step 1: Evaluate several models

[Random forest description](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

- n_estimators - the number of trees in the forest (default = 100 since scikit-learn 0.22; 10 in earlier releases)
- max_depth - the maximum depth of each tree (default = None)
- min_samples_split - the minimum number of samples required to split an internal node (default = 2)

In [6]:
from sklearn.ensemble import RandomForestRegressor

# The mean-absolute-error split criterion was spelled 'mae' before scikit-learn 1.0;
# it was renamed to 'absolute_error' in 1.0 and the old spelling was removed in 1.2.
# Pick whichever spelling the installed version accepts so the cell runs on both.
mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

# Define the models: five random forests that differ in size, split criterion,
# minimum split size and depth limit; random_state=0 keeps each one reproducible
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion=mae_criterion, random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the scoring loop reports models by their 1-based position here
models = [model_1, model_2, model_3, model_4, model_5]

In [7]:
from sklearn.metrics import mean_absolute_error

def score_model(model, X_train=X_train, X_valid=X_valid, y_train=y_train, y_valid=y_valid):
    """Fit `model` on the training split and return its validation MAE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place as a side effect of this call.
    X_train, y_train : training predictors and target
        Default to the notebook-level split as it existed when this
        function was defined.
    X_valid, y_valid : validation predictors and target
        Default to the notebook-level split as it existed when this
        function was defined.

    Returns
    -------
    The mean absolute error between `y_valid` and the model's
    predictions for `X_valid`.
    """
    model.fit(X_train, y_train)
    return mean_absolute_error(y_valid, model.predict(X_valid))

In [9]:
# Fit every candidate in turn and report its validation MAE (lower is better)
report_line = "Model {} mae is {}"
for i, model in enumerate(models, start=1):
    mae = score_model(model)
    print(report_line.format(i, mae))

Model 1 mae is 24015.492818003917
Model 2 mae is 23740.979228636657
Model 3 mae is 23528.78421232877
Model 4 mae is 23996.676789668687
Model 5 mae is 23706.672864217904


## Step 2: Generate test predictions

In [10]:
# Choose best model: in the recorded validation run, model_3 has the lowest MAE
# of the five candidates (23528.78, versus 23706.67 for the next best, model_5)
my_model = model_3

In [11]:
# Refit the chosen model on all labelled rows (training + validation splits combined)
my_model.fit(X, y)

# Generate test predictions
preds_test = my_model.predict(X_test)

# Save predictions in format used for competition scoring
output_path = '../output/submission.csv'
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv(output_path, index=False)

## Step 3: Submit your results