In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor
import math
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import ensemble

In [2]:
# This notebook is version 1 of tree based model for the ML project on Ames Housing Data
# This notebook will be very basic as I will remove all columns on train and test data sets whenever
# there is a null value

In [3]:
# Import the train and test data sets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Check the first 5 rows of train database
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Check the first 5 rows of test database
test.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [6]:
# Check the shape of the train data
train.shape

(1460, 81)

In [7]:
# Check the shape of the test data - the test data has 1 field less since it doesn't have the target variable
test.shape

(1459, 80)

In [8]:
# Get the list of column names that has at least 1 null value in train data, then convert to set
train_null_col = train.columns[train.isnull().any()]
set_train_null_col = set(train_null_col)

In [9]:
# Get the list of column names that has at least 1 null value in the test data, then convert to set
test_null_col = test.columns[test.isnull().any()]
set_test_null_col = set(test_null_col)

In [10]:
# Combine the two sets to get a unique list of column names I need to remove
null_col = set_train_null_col | set_test_null_col
null_col = list(null_col)

In [11]:
# Create updated train and test data that has NA columns removed
train_noNA = train.drop(null_col, axis = 1)
test_noNA = test.drop(null_col, axis = 1)

In [12]:
# Check the shape of updated train data
train_noNA.shape

(1460, 47)

In [13]:
# Check the shape of updated test data
test_noNA.shape

(1459, 46)

In [14]:
# Check the data type of columns in train data
train_noNA.dtypes

Id                int64
MSSubClass        int64
LotArea           int64
Street           object
LotShape         object
LandContour      object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
OverallQual       int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
RoofStyle        object
RoofMatl         object
ExterQual        object
ExterCond        object
Foundation       object
Heating          object
HeatingQC        object
CentralAir       object
1stFlrSF          int64
2ndFlrSF          int64
LowQualFinSF      int64
GrLivArea         int64
FullBath          int64
HalfBath          int64
BedroomAbvGr      int64
KitchenAbvGr      int64
TotRmsAbvGrd      int64
Fireplaces        int64
PavedDrive       object
WoodDeckSF        int64
OpenPorchSF       int64
EnclosedPorch     int64
3SsnPorch         int64
ScreenPorch       int64
PoolArea        

In [15]:
# Get a list of column names that have data type "object" - should be the same for test data
obj_col = train_noNA.columns[train_noNA.dtypes == "object"]
obj_col

Index(['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation',
       'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition'],
      dtype='object')

In [16]:
# Create sets of data without the object fields
train_noobj = train_noNA.drop(obj_col, axis = 1)
test_noobj = test_noNA.drop(obj_col, axis = 1)

In [17]:
# Use label encoder to make string categorical variables into numerical variables
# For reference, train_noNA['MSZoning_N'] = labelencoder.fit_transform(train_noNA['MSZoning'])
labelencoder = LabelEncoder()
for obj in obj_col:
    train_noobj[obj] = labelencoder.fit_transform(train_noNA[obj])
    test_noobj[obj] = labelencoder.fit_transform(test_noNA[obj])

In [18]:
# Check if loop is working as expected
train_noobj

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,...,RoofStyle,RoofMatl,ExterQual,ExterCond,Foundation,Heating,HeatingQC,CentralAir,PavedDrive,SaleCondition
0,1,60,8450,7,5,2003,2003,856,854,0,...,1,1,2,4,2,1,0,1,2,4
1,2,20,9600,6,8,1976,1976,1262,0,0,...,1,1,3,4,1,1,0,1,2,4
2,3,60,11250,7,5,2001,2002,920,866,0,...,1,1,2,4,2,1,0,1,2,4
3,4,70,9550,7,5,1915,1970,961,756,0,...,1,1,3,4,0,1,2,1,2,0
4,5,60,14260,8,5,2000,2000,1145,1053,0,...,1,1,2,4,2,1,0,1,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,7917,6,5,1999,2000,953,694,0,...,1,1,3,4,2,1,0,1,2,4
1456,1457,20,13175,6,6,1978,1988,2073,0,0,...,1,1,3,4,1,1,4,1,2,4
1457,1458,70,9042,7,9,1941,2006,1188,1152,0,...,1,1,0,2,4,1,0,1,2,4
1458,1459,20,9717,5,6,1950,1996,1078,0,0,...,3,1,3,4,1,1,2,1,2,4


In [19]:
# Get the list of columns
list_col = list(train_noobj.columns)
list_col

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice',
 'Street',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'PavedDrive',
 'SaleCondition']

In [20]:
# Reorder the columns so the target variable is at the end
final_col = ['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'Street',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'PavedDrive',
 'SaleCondition',
 'SalePrice']

In [21]:
# Create final train data
final_train = train_noobj[final_col]
final_train

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,...,RoofMatl,ExterQual,ExterCond,Foundation,Heating,HeatingQC,CentralAir,PavedDrive,SaleCondition,SalePrice
0,1,60,8450,7,5,2003,2003,856,854,0,...,1,2,4,2,1,0,1,2,4,208500
1,2,20,9600,6,8,1976,1976,1262,0,0,...,1,3,4,1,1,0,1,2,4,181500
2,3,60,11250,7,5,2001,2002,920,866,0,...,1,2,4,2,1,0,1,2,4,223500
3,4,70,9550,7,5,1915,1970,961,756,0,...,1,3,4,0,1,2,1,2,0,140000
4,5,60,14260,8,5,2000,2000,1145,1053,0,...,1,2,4,2,1,0,1,2,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,7917,6,5,1999,2000,953,694,0,...,1,3,4,2,1,0,1,2,4,175000
1456,1457,20,13175,6,6,1978,1988,2073,0,0,...,1,3,4,1,1,4,1,2,4,210000
1457,1458,70,9042,7,9,1941,2006,1188,1152,0,...,1,0,2,4,1,0,1,2,4,266500
1458,1459,20,9717,5,6,1950,1996,1078,0,0,...,1,3,4,1,1,2,1,2,4,142125


In [22]:
# Create train & test set for the model
x = np.array(final_train.iloc[:, 0:46])
y = np.ravel(final_train.iloc[:, -1])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,  random_state = 42)

In [23]:
# Create decision tree for regression
tree_model = tree.DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

DecisionTreeRegressor()

In [24]:
# Look at the errors for decision tree
train_error = tree_model.score(x_train, y_train)
test_error = tree_model.score(x_test, y_test)
print("The training score is: %.5f" %train_error)
print("The test     score is: %.5f" %test_error)

The training score is: 1.00000
The test     score is: 0.81829


In [25]:
# Create random forest for regression
randomForest = ensemble.RandomForestRegressor()
randomForest.set_params(n_estimators=50, random_state=42, max_features=10)
randomForest.fit(x_train, y_train)

RandomForestRegressor(max_features=10, n_estimators=50, random_state=42)

In [26]:
# Look at the errors for random forest
print("The training score of random forest is: %.5f" %(randomForest.score(x_train, y_train))
print("The test     score of random forest is: %.5f" %(randomForest.score(x_test, y_test))

SyntaxError: invalid syntax (<ipython-input-26-f8da13c849ef>, line 3)

In [None]:
#call MSE instead
#consider log transforming the target variable
#grid search CV for rf
#ordinal encoding... think about ordering levels
