In [55]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [56]:
data = pd.read_csv("H:\\Hobby\\ML Dataset\\house_price\\train.csv")
test = pd.read_csv("H:\\Hobby\\ML Dataset\\house_price\\test.csv")

y = data.SalePrice
data = data.drop(['SalePrice'], axis=1)

In [57]:
cols = (data.dtypes=='object')
cols = list(cols[cols].index)


bad_data = [col for col in cols if data[col].isnull().any()]
bad_test = [col for col in cols if test[col].isnull().any()]

bad = set(bad_data) | set(bad_test)
#Alley,MiscFeature, PoolQC, Fence

In [58]:
data = data.drop(['Alley', 'MiscFeature', 'PoolQC', 'Fence'], axis=1)
test = test.drop(['Alley', 'MiscFeature', 'PoolQC', 'Fence'], axis=1)

In [59]:
# Fill not available categorical data

data['BsmtCond'] = data['BsmtCond'].fillna('TA')
test['BsmtCond'] = test['BsmtCond'].fillna('TA')
data['BsmtExposure'] = data['BsmtExposure'].fillna('No')
test['BsmtExposure'] = test['BsmtExposure'].fillna('No')

In [62]:
data['BsmtFinType2'].value_counts()

Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

In [63]:
BsmtFinType1_mapping = {'Unf':1, 'GLQ':0.97, 'ALQ':0.51, 'BLQ':0.34, 'Rec':0.31, 'LwQ':0.17}
BsmtFinType2_mapping = {'Unf':0.98, 'GLQ':0.011, 'ALQ':0.015, 'BLQ':0.026, 'Rec':0.04, 'LwQ':0.037}

data['BsmtFinType1'] = data['BsmtFinType1'].map(BsmtFinType1_mapping)
test['BsmtFinType1'] = test['BsmtFinType1'].map(BsmtFinType1_mapping)
data['BsmtFinType2'] = data['BsmtFinType2'].map(BsmtFinType2_mapping)
test['BsmtFinType2'] = test['BsmtFinType2'].map(BsmtFinType2_mapping)

In [64]:
BsmtQual_mapping = {'TA':1, 'Gd':0.95, 'Ex':0.19, 'Fa':0.05}
data['BsmtQual'] = data['BsmtQual'].map(BsmtQual_mapping)
test['BsmtQual'] = test['BsmtQual'].map(BsmtQual_mapping)

In [66]:
Electrical_mapping = {'SBrkr':1, 'FuseA':0.07, 'FuseF':0.02, 'FuseP':0.003, 'Mix': 0.003}
data['Electrical'] = data['Electrical'].map(Electrical_mapping)
test['Electrical'] = test['Electrical'].map(Electrical_mapping)

In [69]:
Exterior_mapping1 = {'VinylSd':1, 'HdBoard':0.43, 'MetalSd':0.42, 'Wd Sdng':0.39, 'Plywood':0.28,
                      'CemntBd':0.12, 'BrkFace':0.097, 'WdShing':0.075, 'Stucco':0.15, 'AsbShng':0.15,
                       'Stone':0.15, 'BrkComm':0.15, 'CBlock':0.15, 'ImStucc':0.15, 'AsphShn':0.15}

Exterior_mapping2 = {'VinylSd':1, 'HdBoard':0.41, 'MetalSd':0.425, 'Wd Sdng':0.4, 'Plywood':0.21,
                      'CemntBd':0.12, 'BrkFace':0.097, 'WdShing':0.15, 'Stucco':0.19, 'AsbShng':0.19,
                       'Stone':0.19, 'BrkComm':0.19, 'CBlock':0.19, 'ImStucc':0.19, 'AsphShn':0.19}

data['Exterior1st'] = data['Exterior1st'].map(Exterior_mapping1)
test['Exterior1st'] = test['Exterior1st'].map(Exterior_mapping1)
data['Exterior2nd'] = data['Exterior2nd'].map(Exterior_mapping2)
test['Exterior2nd'] = data['Exterior2nd'].map(Exterior_mapping2)

In [74]:
FireplaceQu_mapping = {'Gd':1, 'TA':0.82, 'Fa':0.087, 'Ex':0.063, 'Po':0.053}
data['FireplaceQu'] = data['FireplaceQu'].map(FireplaceQu_mapping)
test['FireplaceQu'] = test['FireplaceQu'].map(FireplaceQu_mapping)

In [79]:
Functional_mapping = {'Typ':1, 'Min1':0.023, 'Min2':0.025, 'Mod':0.011, 'Maj1':0.011, 'Maj2':0.005, 'Sev':0.005}
data['Functional'] = data['Functional'].map(Functional_mapping)
test['Functional'] = test['Functional'].map(Functional_mapping)

In [81]:
GarageCond_mapping = {'TA':1, 'Fa':0.026, 'Gd':0.014, 'Po':0.014, 'Ex':0.014}
data['GarageCond'] = data['GarageCond'].map(GarageCond_mapping)
test['GarageCond'] = test['GarageCond'].map(GarageCond_mapping)

In [83]:
GarageFinish_mapping = {'Unf':1, 'RFn':0.7, 'Fin':0.58}
data['GarageFinish'] = data['GarageFinish'].map(GarageFinish_mapping)
test['GarageFinish'] = test['GarageFinish'].map(GarageFinish_mapping)

In [85]:
GarageQual_mapping = {'TA':1, 'Fa':0.04, 'Gd':0.015, 'Po':0.015, 'Ex':0.015}
data['GarageQual'] = data['GarageQual'].map(GarageQual_mapping)
test['GarageQual'] = test['GarageQual'].map(GarageQual_mapping)

In [87]:
GarageType_mapping = {'Attchd':1, 'Detchd':0.44, 'BuiltIn':0.1, 'Basment':0.04, 'CarPort':0.04, '2Types':0.04}
data['GarageType'] = data['GarageType'].map(GarageType_mapping)
test['GarageType'] = test['GarageType'].map(GarageType_mapping)

In [89]:
KitchenQual_mapping = {'TA':1, 'Gd':0.8, 'Ex':0.013, 'Fa':0.05}
data['KitchenQual'] = data['KitchenQual'].map(KitchenQual_mapping)
test['KitchenQual'] = test['KitchenQual'].map(KitchenQual_mapping)

In [93]:
MSZoning_mapping = {'RL':1, 'RM':0.19, 'FV':0.08, 'C (all)':0.08, 'RH':0.08}
data['MSZoning'] = data['MSZoning'].map(MSZoning_mapping)
test['MSZoning'] = test['MSZoning'].map(MSZoning_mapping)

In [95]:
MasVnrType_mapping = {'None':1, 'BrkFace':0.52, 'Stone':0.15, 'BrkCmn':0.017}
data['MasVnrType'] = data['MasVnrType'].map(MasVnrType_mapping)
test['MasVnrType'] = test['MasVnrType'].map(MasVnrType_mapping)

In [97]:
SaleType_mapping = {'WD':1, 'New':0.096, 'COD':0.034, 'ConLD':0.022, 'ConLI':0.022,
                    'ConLw':0.022, 'CWD':0.022, 'Oth':0.022, 'Con':0.022}
data['SaleType'] = data['SaleType'].map(SaleType_mapping)
test['SaleType'] = test['SaleType'].map(SaleType_mapping)

In [99]:
Utilities_mapping = {'AllPub':1, 'NoSeWa':1}
data['Utilities'] = data['Utilities'].map(Utilities_mapping)
test['Utilities'] = test['Utilities'].map(Utilities_mapping)

In [100]:
cols = (data.dtypes=='object')
cols = list(cols[cols].index)

In [101]:
encoder = LabelEncoder()
for col in cols:
    data[col] = encoder.fit_transform(data[col])
    test[col] = encoder.transform(test[col])

In [102]:
imputer = SimpleImputer()

temp = test.copy()

data = pd.DataFrame(imputer.fit_transform(data))
test = pd.DataFrame(imputer.transform(test))

data.columns = temp.columns
test.columns = temp.columns
data = data.drop(['Id'], axis=1)
test = test.drop(['Id'], axis=1)

X=data

In [103]:
xTrain, xVal, yTrain, yVal = train_test_split(X, y)

In [106]:
mini = pow(10,9);optimized=0
for x in range(75,120,5):
    model = RandomForestRegressor(random_state=0, n_estimators=x)
    model.fit(xTrain,yTrain)
    preds = model.predict(xVal)
    error = mean_absolute_error(yVal, preds)
    if error<mini:
        mini=error
        optimized=x
    print(error,x)
print('Least Error: %.3f at %d' %(mini,optimized))

18602.82312328767 75
18468.39390410959 80
18502.328219178085 85
18533.701369863014 90
18509.395356885365 95
18539.325671232873 100
18607.740639269407 105
18646.016737235368 110
18669.723287671233 115
Least Error: 18468.394 at 80


In [107]:
final_model = RandomForestRegressor(n_estimators=80)
final_model.fit(X,y)
y_pred = final_model.predict(test)

output = pd.DataFrame({'Id': temp.Id,
                     'SalePrice': y_pred})

output.to_csv('5thSub.csv', index=False)