LabelEncoder used for categorical values.
SimpleImputer used for missing values.

Categorical values that are missing can not be used in LabelEncoder

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [30]:
data = pd.read_csv("H:\\Hobby\\ML Dataset\\house_price\\train.csv")
test = pd.read_csv("H:\\Hobby\\ML Dataset\\house_price\\test.csv")

y = data.SalePrice
data=data.drop(['SalePrice'],axis=1)

In [31]:
cols = (data.dtypes=='object')
cols = list(cols[cols].index)


bad_data = [col for col in cols if data[col].isnull().any()]
bad_test = [col for col in cols if test[col].isnull().any()]

bad = set(bad_data) | set(bad_test)
#Alley,MiscFeature, PoolQC, Fence

In [32]:
data = data.drop(['Alley', 'MiscFeature', 'PoolQC', 'Fence'], axis=1)
test = test.drop(['Alley', 'MiscFeature', 'PoolQC', 'Fence'], axis=1)

In [33]:
# Fill not available categorical data

data['BsmtCond'] = data['BsmtCond'].fillna('TA')
test['BsmtCond'] = test['BsmtCond'].fillna('TA')
data['BsmtExposure'] = data['BsmtExposure'].fillna('No')
test['BsmtExposure'] = test['BsmtExposure'].fillna('No')

In [34]:
data['BsmtFinType1'].value_counts()

Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: BsmtFinType1, dtype: int64

In [6]:
BsmtFinType_mapping = {'Unf':1, 'GLQ':2, 'ALQ':3, 'BLQ':4, 'Rec':5, 'LwQ':6}
data['BsmtFinType1'] = data['BsmtFinType1'].map(BsmtFinType_mapping)
test['BsmtFinType1'] = test['BsmtFinType1'].map(BsmtFinType_mapping)
data['BsmtFinType2'] = data['BsmtFinType2'].map(BsmtFinType_mapping)
test['BsmtFinType2'] = test['BsmtFinType2'].map(BsmtFinType_mapping)

In [7]:
BsmtQual_mapping = {'TA':1, 'Gd':2, 'Ex':3, 'Fa':4}
data['BsmtQual'] = data['BsmtQual'].map(BsmtQual_mapping)
test['BsmtQual'] = test['BsmtQual'].map(BsmtQual_mapping)

In [8]:
Electrical_mapping = {'SBrkr':1, 'FuseA':2, 'FuseF':3, 'FuseP':4, 'Mix': 5}
data['Electrical'] = data['Electrical'].map(Electrical_mapping)
test['Electrical'] = test['Electrical'].map(Electrical_mapping)

In [9]:
Exterior_mapping = {'VinylSd':1, 'HdBoard':2, 'MetalSd':3, 'Wd Sdng':4, 'Wd Shng':4, 'Plywood':5,
                      'CemntBd':6, 'BrkFace':7, 'WdShing':8, 'Stucco':8, 'AsbShng':8,
                       'Stone':8, 'BrkComm':8, 'CBlock':8, 'ImStucc':8, 'AsphShn':8}

data['Exterior1st'] = data['Exterior1st'].map(Exterior_mapping)
test['Exterior1st'] = test['Exterior1st'].map(Exterior_mapping)
data['Exterior2nd'] = data['Exterior2nd'].map(Exterior_mapping)
test['Exterior2nd'] = data['Exterior2nd'].map(Exterior_mapping)

In [10]:
FireplaceQu_mapping = {'Gd':1, 'TA':2, 'Fa':3, 'Ex':4, 'Po':5}
data['FireplaceQu'] = data['FireplaceQu'].map(FireplaceQu_mapping)
test['FireplaceQu'] = test['FireplaceQu'].map(FireplaceQu_mapping)

In [11]:
Functional_mapping = {'Typ':1, 'Min1':2, 'Min2':3, 'Mod':4, 'Maj1':5, 'Maj2':5, 'Sev':5}
data['Functional'] = data['Functional'].map(Functional_mapping)
test['Functional'] = test['Functional'].map(Functional_mapping)

In [12]:
GarageCond_mapping = {'TA':1, 'Fa':2, 'Gd':3, 'Po':3, 'Ex':3}
data['GarageCond'] = data['GarageCond'].map(GarageCond_mapping)
test['GarageCond'] = test['GarageCond'].map(GarageCond_mapping)

In [13]:
GarageFinish_mapping = {'Unf':1, 'RFn':2, 'Fin':3}
data['GarageFinish'] = data['GarageFinish'].map(GarageFinish_mapping)
test['GarageFinish'] = test['GarageFinish'].map(GarageFinish_mapping)

In [14]:
GarageQual_mapping = {'TA':1, 'Fa':2, 'Gd':3, 'Po':3, 'Ex':3}
data['GarageQual'] = data['GarageQual'].map(GarageQual_mapping)
test['GarageQual'] = test['GarageQual'].map(GarageQual_mapping)

In [15]:
GarageType_mapping = {'Attchd':1, 'Detchd':2, 'BuiltIn':3, 'Basment':4, 'CarPort':4, '2Types':4}
data['GarageType'] = data['GarageType'].map(GarageType_mapping)
test['GarageType'] = test['GarageType'].map(GarageType_mapping)

In [16]:
KitchenQual_mapping = {'TA':1, 'Gd':2, 'Ex':3, 'Fa':4}
data['KitchenQual'] = data['KitchenQual'].map(KitchenQual_mapping)
test['KitchenQual'] = test['KitchenQual'].map(KitchenQual_mapping)

In [17]:
MSZoning_mapping = {'RL':1, 'RM':2, 'FV':3, 'C (all)':3, 'RH':3}
data['MSZoning'] = data['MSZoning'].map(MSZoning_mapping)
test['MSZoning'] = test['MSZoning'].map(MSZoning_mapping)

In [18]:
MasVnrType_mapping = {'None':1, 'BrkFace':2, 'Stone':3, 'BrkCmn':4}
data['MasVnrType'] = data['MasVnrType'].map(MasVnrType_mapping)
test['MasVnrType'] = test['MasVnrType'].map(MasVnrType_mapping)

In [19]:
SaleType_mapping = {'WD':1, 'New':2, 'COD':3, 'ConLD':4, 'ConLI':4,
                    'ConLw':4, 'CWD':4, 'Oth':4, 'Con':4}
data['SaleType'] = data['SaleType'].map(SaleType_mapping)
test['SaleType'] = test['SaleType'].map(SaleType_mapping)

In [20]:
Utilities_mapping = {'AllPub':1, 'NoSeWa':2}
data['Utilities'] = data['Utilities'].map(Utilities_mapping)
test['Utilities'] = test['Utilities'].map(Utilities_mapping)

In [21]:
cols = (data.dtypes=='object')
cols = list(cols[cols].index)

In [22]:
encoder = LabelEncoder()
for col in cols:
    data[col] = encoder.fit_transform(data[col])
    test[col] = encoder.transform(test[col])

In [23]:
imputer = SimpleImputer()

temp = test.copy()

data = pd.DataFrame(imputer.fit_transform(data))
test = pd.DataFrame(imputer.transform(test))

data.columns = temp.columns
test.columns = temp.columns
data = data.drop(['Id'], axis=1)
test = test.drop(['Id'], axis=1)

X=data

In [24]:
xTrain, xVal, yTrain, yVal = train_test_split(X, y)

In [28]:
xTrain.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
296,50.0,2.0,75.0,13710.0,1.0,3.0,3.0,1.0,4.0,0.0,9.0,2.0,2.0,0.0,0.0,5.0,5.0,1950.0,1950.0,1.0,1.0,4.0,4.0,1.0,0.0,3.0,4.0,1.0,1.0,3.0,3.0,4.0,420.0,1.0,0.0,490.0,910.0,1.0,4.0,1.0,2.0,910.0,648.0,0.0,1558.0,0.0,0.0,1.0,1.0,4.0,1.0,1.0,6.0,1.0,0.0,1.68961,1.0,1950.0,1.0,1.0,282.0,1.0,1.0,2.0,289.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,1.0,4.0
17,90.0,1.0,72.0,10791.0,1.0,3.0,3.0,1.0,4.0,0.0,19.0,2.0,2.0,2.0,2.0,4.0,5.0,1967.0,1967.0,1.0,1.0,3.0,3.0,1.0,0.0,3.0,4.0,3.0,1.678145,3.0,3.0,2.54884,0.0,1.419831,0.0,0.0,0.0,1.0,4.0,1.0,1.0,1296.0,0.0,0.0,1296.0,0.0,0.0,2.0,0.0,2.0,2.0,1.0,6.0,1.0,0.0,1.68961,4.0,1967.0,1.0,2.0,516.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,10.0,2006.0,1.0,4.0
900,20.0,1.0,70.049958,7340.0,1.0,0.0,3.0,1.0,4.0,0.0,12.0,2.0,2.0,0.0,2.0,4.0,6.0,1971.0,1971.0,1.0,1.0,2.0,2.0,1.0,0.0,3.0,4.0,1.0,1.0,3.0,3.0,3.0,322.0,1.0,0.0,536.0,858.0,1.0,4.0,1.0,1.0,858.0,0.0,0.0,858.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,4.0,1.0,0.0,1.68961,2.0,1979.0,1.0,1.0,684.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,1.0,4.0
841,70.0,2.0,60.0,10440.0,1.0,3.0,3.0,1.0,4.0,0.0,17.0,2.0,2.0,0.0,5.0,5.0,8.0,1904.0,2002.0,1.0,1.0,3.0,3.0,1.0,0.0,3.0,4.0,2.0,1.0,3.0,3.0,1.0,0.0,1.0,0.0,650.0,650.0,1.0,2.0,1.0,1.0,958.0,581.0,0.0,1539.0,0.0,0.0,2.0,0.0,3.0,1.0,2.0,8.0,1.0,1.0,5.0,2.0,1983.0,1.0,2.0,686.0,3.0,1.0,1.0,70.0,78.0,68.0,0.0,0.0,0.0,0.0,6.0,2008.0,1.0,4.0
896,30.0,2.0,50.0,8765.0,1.0,3.0,3.0,1.0,4.0,0.0,9.0,2.0,2.0,0.0,2.0,4.0,6.0,1936.0,1950.0,1.0,1.0,4.0,4.0,1.0,0.0,3.0,4.0,0.0,1.0,3.0,3.0,3.0,285.0,1.0,0.0,666.0,951.0,1.0,0.0,0.0,1.0,951.0,0.0,0.0,951.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,6.0,1.0,0.0,1.68961,2.0,1936.0,1.0,1.0,327.0,1.0,1.0,2.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,4.0,2006.0,1.0,0.0


In [27]:
mini = pow(10,9);optimized=0
for x in range(20,80,5):
    model = RandomForestRegressor(random_state=0, n_estimators=x)
    model.fit(xTrain,yTrain)
    preds = model.predict(xVal)
    error = mean_absolute_error(yVal, preds)
    if error<mini:
        mini=error
        optimized=x
    print(error,x)
print('Least Error: %.3f at %d' %(mini,optimized))

19868.678356164382 20
19666.87671232877 25
19591.12319634703 30
19585.14833659491 35
19602.03582191781 40
19680.825631659056 45
19520.493698630136 50
19517.309539227896 55
19545.38164383562 60
19455.476037934666 65
19375.519334637967 70
19343.16898630137 75
Least Error: 19343.169 at 75


In [452]:
final_model = RandomForestRegressor(n_estimators=65)
final_model.fit(X,y)
y_pred = final_model.predict(test)

output = pd.DataFrame({'Id': temp.Id,
                     'SalePrice': y_pred})

output.to_csv('4thSub.csv', index=False)