In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [97]:
# Read the data
X_full = pd.read_csv('./train.csv', index_col='Id')
X_test_full = pd.read_csv('./test.csv', index_col='Id')

# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', 'OverallQual', 'FullBath', 'OverallCond', 'TotRmsAbvGrd','Foundation']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [98]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Foundation']


In [102]:
#Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

In [111]:
Label_X = X.copy()
Label_X_test = X_test.copy()

ordinal_encoder = OrdinalEncoder()
Label_X[object_cols] = ordinal_encoder.fit_transform(X[object_cols])
Label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

In [103]:
label_X_train

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,OverallQual,FullBath,OverallCond,TotRmsAbvGrd,Foundation
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
619,11694,2007,1828,9,2,5,9,2.0
871,6600,1962,894,5,1,5,5,1.0
93,13360,1921,964,5,1,7,5,0.0
818,13265,2002,1689,8,2,5,7,2.0
303,13704,2001,1541,7,2,5,6,2.0
...,...,...,...,...,...,...,...,...
764,9430,1999,1268,8,2,5,8,2.0
836,9600,1950,1067,4,2,7,4,1.0
1217,8930,1978,1318,6,2,5,8,3.0
560,3196,2003,1557,7,2,5,7,2.0


In [104]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
Label_imputed_X_train = pd.DataFrame(my_imputer.fit_transform(label_X_train))
Label_imputed_X_valid = pd.DataFrame(my_imputer.transform(label_X_valid))

# Imputation removed column names; put them back
Label_imputed_X_train.columns = label_X_train.columns
Label_imputed_X_valid.columns = label_X_valid.columns

In [105]:
Label_imputed_X_train

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,OverallQual,FullBath,OverallCond,TotRmsAbvGrd,Foundation
0,11694.0,2007.0,1828.0,9.0,2.0,5.0,9.0,2.0
1,6600.0,1962.0,894.0,5.0,1.0,5.0,5.0,1.0
2,13360.0,1921.0,964.0,5.0,1.0,7.0,5.0,0.0
3,13265.0,2002.0,1689.0,8.0,2.0,5.0,7.0,2.0
4,13704.0,2001.0,1541.0,7.0,2.0,5.0,6.0,2.0
...,...,...,...,...,...,...,...,...
1163,9430.0,1999.0,1268.0,8.0,2.0,5.0,8.0,2.0
1164,9600.0,1950.0,1067.0,4.0,2.0,7.0,4.0,1.0
1165,8930.0,1978.0,1318.0,6.0,2.0,5.0,8.0,3.0
1166,3196.0,2003.0,1557.0,7.0,2.0,5.0,7.0,2.0


In [106]:
from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, criterion='mae', random_state=0, max_depth=10)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=50, min_samples_split=5, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
model_6 = RandomForestRegressor(n_estimators=50, random_state=0)
model_7 = RandomForestRegressor(n_estimators=25, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5,model_6, model_7]

from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=Label_imputed_X_train, X_v=Label_imputed_X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 21707
Model 2 MAE: 21420
Model 3 MAE: 21734
Model 4 MAE: 21528
Model 5 MAE: 21820
Model 6 MAE: 21157
Model 7 MAE: 21622


In [113]:
# Fit the model to the training data
model_6.fit(Label_X, y)

# Generate test predictions
preds_test = model_6.predict(Label_X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': Label_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)