In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor
import math
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import ensemble

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
pd.set_option('display.max_rows', 500)
train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1         int64
BsmtFinType2      object
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object


In [5]:
train.isnull().sum(axis = 0)

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [6]:
#Remove the variables that have more than 10% of the observations as null
NullThreshold = 0.1
train = train.loc[:,(train.isnull().sum(axis = 0) / train.shape[0]) < NullThreshold]

In [7]:
#Removed 6 variables
train.shape

(1460, 75)

In [8]:
train.isnull().sum(axis = 0)

Id                0
MSSubClass        0
MSZoning          0
LotArea           0
Street            0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
MasVnrArea        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinSF1        0
BsmtFinType2     38
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtSF       0
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0


In [9]:
NullCol = train.columns[train.isnull().sum(axis = 0) > 0]
NullCol

Index(['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'],
      dtype='object')

In [10]:
#Need to split into train and test here.

In [11]:
# Imputations
for col in NullCol:
    if train[col].dtype == 'O':
        train[col] = train[col].fillna(train[col].mode()[0])
    if train[col].dtype == 'int64':
        train[col] = train[col].fillna(train[col].mean())
    if train[col].dtype == 'float64':
        train[col] = train[col].fillna(train[col].mean())
    else:
        pass

In [12]:
train.isnull().sum(axis = 0)

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd

In [13]:
obj_col = train.columns[train.dtypes == 'object']
obj_col

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [14]:
train_noobj = train.drop(obj_col, axis = 1)

In [15]:
ordinalcol5 = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']
ordinalcol6 = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
ordinalcol7 = ['BsmtFinType1', 'BsmtFintype2']

In [None]:
#EDA on house prices vs. variables above..... also take a look at each variable's distribution & relation to the target variable.



In [None]:
#Series.replace({‘key’:0

In [16]:
for col in ordinalcol5:
    ordinalencoder = OrdinalEncoder(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'])
    train_noobj[col] = ordinalencoder.fit_transform(train[col])

for col in ordinalcol6:
    ordinalencoder = OrdinalEncoder(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])
    train_noobj[col] = ordinalencoder.fit_transform(train[col])
    
for col in ordinalcol7:
    ordinalencoder = OrdinalEncoder(categories=['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
    train_noobj[col] = ordinalencoder.fit_transform(train[col])



ValueError: Expected 2D array, got 1D array instead:
array=['Gd' 'TA' 'Gd' ... 'Ex' 'TA' 'Gd'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [59]:
set_obj_col = set(obj_col)
set_ordinalcol5 = set(ordinalcol5)
set_ordinalcol6 = set(ordinalcol6)
set_ordinalcol7 = set(ordinalcol7)
set_nonordinal_col = set_obj_col - set_ordinalcol5 - set_ordinalcol6 - set_ordinalcol7
nonordinal_col = list(set_nonordinal_col)
nonordinal_col

['LandContour',
 'Functional',
 'Exterior1st',
 'CentralAir',
 'BldgType',
 'Condition2',
 'MasVnrType',
 'Foundation',
 'Heating',
 'BsmtExposure',
 'RoofMatl',
 'SaleType',
 'Street',
 'LotConfig',
 'GarageType',
 'Utilities',
 'Neighborhood',
 'LandSlope',
 'MSZoning',
 'Exterior2nd',
 'PavedDrive',
 'LotShape',
 'GarageFinish',
 'SaleCondition',
 'HouseStyle',
 'Electrical',
 'Condition1',
 'RoofStyle',
 'BsmtFinType2']

In [None]:
labelencoder = LabelEncoder()
for col in obj_col:
    train_noobj[col] = labelencoder.fit_transform(train[col])

In [None]:
list_col = list(train_noobj.columns)
list_col

In [None]:
final_col = #Fill this in
#Just create database that has target

In [None]:
final_train = train_noobj[final_col]
final_train

In [None]:
final_train['logSalePrice'] = log(final_train['SalePrice'])

In [None]:
x = np.array(final_train.iloc[:, 0:74])
y = np.ravel(final_train.iloc[:, -1])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,  random_state = 42)

In [None]:
tree_model = tree.DecisionTreeRegressor()
tree_model.fit(x_train, y_train)

In [None]:
tree_predict = tree_model.predict(x_test)

In [None]:
print(np.sqrt(mean_squared_error(y_test, tree_predict)))

In [None]:
randomForest = ensemble.RandomForestRegressor()
randomForest.set_params(n_estimators=50, random_state=42, max_features=10)
randomForest = randomForest.fit(x_train, y_train)

In [None]:
rf_predict = randomForest.predict(x_test)

In [None]:
print(np.sqrt(mean_squared_error(y_test, rf_predict)))