In [40]:
# https://www.kaggle.com/poojaswami123/house-price-prediciton-score-137?scriptVersionId=35576309


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [2]:
# Based on https://www.kaggle.com/hemingwei/top-2-from-laurenstc-on-house-price-prediction

# Read the training and test CSV files and print the (rows, columns) shape
# of each frame right after it is loaded.
train = pd.read_csv("house_train.csv")
print("Train set size:", train.shape)

test = pd.read_csv("house_test.csv")
print("Test set size:", test.shape)

Train set size: (1460, 81)
Test set size: (1459, 80)


In [3]:
# Keep the 'Id' columns aside, then remove them from both frames since they
# are not needed for the prediction process.
train_ID = train['Id']
test_ID = test['Id']
train = train.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [4]:
# Deleting outliers: keep only rows whose GrLivArea is below 4500 and
# renumber the remaining rows from zero.
keep_mask = train['GrLivArea'] < 4500
train = train.loc[keep_mask].reset_index(drop=True)

In [5]:
# The target is the raw SalePrice column. A log1p transform of SalePrice
# (np.log1p, i.e. log(1 + x)) is NOT applied in this notebook.
y = train['SalePrice']
train_features = train.drop(columns=['SalePrice'])
# test_features is just another name for the test frame (no copy is made).
test_features = test


In [6]:
# Stack training and test predictors into one frame with a fresh 0..n-1 row
# index, so every later transformation is applied to both consistently.
features = pd.concat([train_features, test_features], ignore_index=True)
print(features.shape)

(2917, 79)


In [7]:
# Some of the non-numeric predictors are stored as numbers; convert them to strings.
for column in ('MSSubClass', 'YrSold', 'MoSold'):
    features[column] = features[column].astype(str)

# Missing values replaced by a fixed category.
constant_fills = {'Functional': 'Typ', 'Electrical': "SBrkr", 'KitchenQual': "TA"}
for column, filler in constant_fills.items():
    features[column] = features[column].fillna(filler)

# Missing values replaced by the column's most frequent value.
for column in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    most_common = features[column].mode()[0]
    features[column] = features[column].fillna(most_common)

# A missing PoolQC is recorded as the literal category "None".
features["PoolQC"] = features["PoolQC"].fillna("None")

In [8]:

# Numeric garage columns: a missing value becomes 0.
zero_filled = ['GarageYrBlt', 'GarageArea', 'GarageCars']
# Categorical garage and basement columns: a missing value becomes 'None'.
none_filled = [
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]

for col in zero_filled:
    features[col] = features[col].fillna(0)
for col in none_filled:
    features[col] = features[col].fillna('None')


In [9]:
# Fill missing MSZoning values with the most frequent MSZoning found among
# rows sharing the same MSSubClass.
def _fill_with_group_mode(zoning):
    """Return the group's values with NaN replaced by the group's first mode."""
    return zoning.fillna(zoning.mode()[0])


zoning_by_subclass = features.groupby('MSSubClass')['MSZoning']
features['MSZoning'] = zoning_by_subclass.transform(_fill_with_group_mode)


In [10]:
# Names of every column whose dtype is object, in column order.
objects = [name for name in features.columns if features[name].dtype == object]

In [11]:
# Any value still missing in an object column becomes the string 'None'.
object_fill = features[objects].fillna('None')
features.update(object_fill)

# Missing LotFrontage values take the median LotFrontage of their Neighborhood.
frontage_by_neighborhood = features.groupby('Neighborhood')['LotFrontage']
features['LotFrontage'] = frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.median()))

In [12]:
# Filling in the rest of the NA's: every remaining missing numeric value becomes 0.

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = [name for name in features.columns
            if features[name].dtype in numeric_dtypes]
numeric_fill = features[numerics].fillna(0)
features.update(numeric_fill)

In [13]:
# Names of the numeric columns (by dtype), collected for the skewness analysis.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = [name for name in features.columns
             if features[name].dtype in numeric_dtypes]

In [14]:

from scipy import stats
#use: stats.skew
    
# Skewness of every numeric column, sorted from most to least positive.
skew_features = (
    features[numerics2]
    .apply(lambda column: stats.skew(column))
    .sort_values(ascending=False)
)

# Columns with skewness above 0.5 are the ones selected for transformation.
high_skew = skew_features.loc[skew_features > 0.5]
skew_index = high_skew.index

In [15]:
import scipy

In [16]:
# Apply a Box-Cox transform of (1 + x) to each highly skewed column. The
# lambda for a column is estimated by boxcox_normmax on the shifted values.
for column in skew_index:
    shifted = features[column] + 1
    best_lambda = scipy.stats.boxcox_normmax(shifted)
    features[column] = scipy.special.boxcox1p(features[column], best_lambda)

In [17]:
# combining features

# Drop three columns that are not used by the model.
features = features.drop(columns=['Utilities', 'Street', 'PoolQC'])

# Sum of construction year and remodel year.
features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']

# Area totals. The order of the additions is kept as-is so floating point
# results stay exactly the same.
first_floor = features['1stFlrSF']
second_floor = features['2ndFlrSF']
features['TotalSF'] = features['TotalBsmtSF'] + first_floor + second_floor
features['Total_sqr_footage'] = (
    features['BsmtFinSF1'] + features['BsmtFinSF2'] + first_floor + second_floor
)

# Half baths count as 0.5 of a bathroom.
half_baths = 0.5 * features['HalfBath']
bsmt_half_baths = 0.5 * features['BsmtHalfBath']
features['Total_Bathrooms'] = (
    features['FullBath'] + half_baths + features['BsmtFullBath'] + bsmt_half_baths
)

# Accumulate the porch/deck areas left to right.
porch_columns = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
porch_total = features[porch_columns[0]]
for column in porch_columns[1:]:
    porch_total = porch_total + features[column]
features['Total_porch_sf'] = porch_total

In [18]:
# simplified features: each flag is 1 where the source column is > 0, else 0
indicator_sources = [
    ('haspool', 'PoolArea'),
    ('has2ndfloor', '2ndFlrSF'),
    ('hasgarage', 'GarageArea'),
    ('hasbsmt', 'TotalBsmtSF'),
    ('hasfireplace', 'Fireplaces'),
]
for flag_name, source_column in indicator_sources:
    features[flag_name] = (features[source_column] > 0).astype('int64')


In [19]:
# Shape before encoding.
print(features.shape)

# One-hot encode the non-numeric columns, then renumber rows from zero.
encoded = pd.get_dummies(features)
final_features = encoded.reset_index(drop=True)

# Shape after encoding.
print(final_features.shape)

(2917, 86)
(2917, 333)


In [20]:
# The first len(y) rows are the training rows; everything after them belongs
# to the submission (test) set.
n_train = len(y)
X = final_features.iloc[:n_train, :]
X_sub = final_features.iloc[n_train:, :]

print('X-->', X.shape, 'y-->', y.shape, 'X_sub-->', X_sub.shape)

X--> (1458, 333) y--> (1458,) X_sub--> (1459, 333)


In [21]:
#sns.scatterplot(X,X_sub,hue=y)

In [22]:
# Row positions to discard from the training data. They are translated to
# index labels separately for X and for y before dropping.
outliers = [30, 88, 462, 631, 1322]
X_outlier_labels = X.index[outliers]
y_outlier_labels = y.index[outliers]
X = X.drop(labels=X_outlier_labels, axis=0)
y = y.drop(labels=y_outlier_labels)

In [23]:
# Flag columns in which the single most frequent value accounts for more
# than 99.94% of the training rows.
overfit = []
row_count = len(X)
for column in X.columns:
    top_count = X[column].value_counts().iloc[0]
    if top_count / row_count * 100 > 99.94:
        overfit.append(column)
        

In [24]:
# Also schedule the 'MSZoning_C (all)' dummy column for removal.
overfit = list(overfit) + ['MSZoning_C (all)']

In [25]:
# Remove the flagged columns from the training and submission matrices;
# .copy() gives each result its own data.
X = X.drop(columns=overfit).copy()
X_sub = X_sub.drop(columns=overfit).copy()

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

X (1453, 331) y (1453,) X_sub (1459, 331)


In [26]:
import datetime

# ################## ML ########################################
# Record the wall-clock time at which model fitting begins.
ml_started_at = datetime.datetime.now()
print('START ML', ml_started_at)

START ML 2020-06-08 07:42:29.001424


# Using GradientBoosting algorithm

In [27]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split


# Fit a gradient-boosting regressor with hand-picked hyperparameters and
# report its R^2 on a 30% hold-out split of the training data.
np.random.seed(42)
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                max_depth=4, max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=42)

# random_state is passed explicitly so the split no longer depends on the
# state of NumPy's global RNG when this cell runs. Seed 42 should reproduce
# the split previously obtained via np.random.seed(42).
# x_train / x_test / target_train / target_test are reused by later cells.
x_train, x_test, target_train, target_test = train_test_split(
    X, y, test_size=.3, random_state=42)

gbr.fit(x_train, target_train)
gbr.score(x_test, target_test)

  from numpy.core.umath_tests import inner1d


0.9087626761113101

In [28]:
from sklearn.metrics import  r2_score

# R^2 of the tuned model on the hold-out rows, computed from explicit
# predictions (the global NumPy seed is reset first, as before).
np.random.seed(42)
y_prd = gbr.predict(x_test)
r2_score(y_true=target_test, y_pred=y_prd)

0.9087626761113101

# Prediction on test dataset

In [38]:
# Sample submission file; its 'Id' column supplies the ids for the output CSVs.
z = pd.read_csv("sample_submission.csv")

In [73]:
# Predict sale prices for the submission rows with the tuned model `gbr`.
y_pred_result = gbr.predict(X_sub)
y_test_result = pd.DataFrame(y_pred_result, columns=['SalePrice'], index=None)

# Ids from the sample submission are paired with predictions by position, so
# a length mismatch must stop here instead of silently producing NaN rows.
if len(z) != len(y_test_result):
    raise ValueError(
        "sample_submission.csv has %d rows but %d predictions were made"
        % (len(z), len(y_test_result)))

# Two-column submission frame: integer Id, predicted SalePrice.
df1 = pd.DataFrame({
    'Id': z['Id'].astype(int).values,
    'SalePrice': y_test_result['SalePrice'].values,
})

df1.to_csv("HP_GBR.csv", index=False)

df1.head(3)

Unnamed: 0,Id,SalePrice
0,1461,120977.902003
1,1462,164589.426329
2,1463,194907.967001


# Using Linear regression

In [29]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline on the same split. The recorded hold-out
# R^2 for this fit is about -7e9, i.e. far worse than predicting the mean.
lr = LinearRegression().fit(x_train, target_train)
lr.score(X=x_test, y=target_test)

-6983684892.408979

# Using random forest regressor

In [30]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest with default hyperparameters, scored by R^2 on the
# hold-out split.
# The forest is built without its own random_state, so NumPy's global RNG is
# seeded here (scikit-learn falls back to it when random_state is not given).
np.random.seed(42)
rf = RandomForestRegressor()
rf.fit(x_train, target_train)

# Predict once and score those predictions. The former intermediate
# rf.score(...) call computed this same R^2 and discarded it.
# y_p is kept because the following mean_squared_error cell reads it.
y_p = rf.predict(x_test)
r2_score(target_test, y_p)

0.8759008044468598

In [31]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions currently held in y_p on the hold-out rows.
mean_squared_error(y_true=target_test, y_pred=y_p)

663787403.3070183

# Using GradientBoostingRegressor with default hyperparameters

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters, scored by R^2 on the
# hold-out split.
# `gb` is built without its own random_state, so NumPy's global RNG is seeded
# here (scikit-learn falls back to it when random_state is not given).
np.random.seed(42)
gb = GradientBoostingRegressor()
gb.fit(x_train, target_train)

# Predict once and score those predictions. The former intermediate
# gb.score(...) call computed this same R^2 and discarded it.
y_p = gb.predict(x_test)
r2_score(target_test, y_p)

0.9164216535338207

In [112]:
# Predict sale prices for the submission rows with the default model `gb`.
y_pred_result = gb.predict(X_sub)
y_test_result = pd.DataFrame(y_pred_result, columns=['SalePrice'], index=None)

# Ids from the sample submission are paired with predictions by position, so
# a length mismatch must stop here instead of silently producing NaN rows.
if len(z) != len(y_test_result):
    raise ValueError(
        "sample_submission.csv has %d rows but %d predictions were made"
        % (len(z), len(y_test_result)))

# Two-column submission frame: integer Id, predicted SalePrice.
df1 = pd.DataFrame({
    'Id': z['Id'].astype(int).values,
    'SalePrice': y_test_result['SalePrice'].values,
})

df1.to_csv("HP_GBR_91.csv", index=False)

# Grid search cv

In [33]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Candidate tree counts: 100 to 500 inclusive, in steps of 50.
parmeter = {'n_estimators': list(range(100, 501, 50))}

# Grid search over those counts, using `gb` as the base estimator.
gcv = GridSearchCV(estimator=gb, param_grid=parmeter)

In [36]:
# Run the search on the training split, then score the fitted search object
# on the hold-out rows.
gcv.fit(X=x_train, y=target_train)
gcv.score(X=x_test, y=target_test)

0.9258071419888898

In [39]:
# Predict sale prices for the submission rows with the grid-searched model `gcv`.
y_pred_result = gcv.predict(X_sub)
y_test_result = pd.DataFrame(y_pred_result, columns=['SalePrice'], index=None)

# Ids from the sample submission are paired with predictions by position, so
# a length mismatch must stop here instead of silently producing NaN rows.
if len(z) != len(y_test_result):
    raise ValueError(
        "sample_submission.csv has %d rows but %d predictions were made"
        % (len(z), len(y_test_result)))

# Two-column submission frame: integer Id, predicted SalePrice.
df1 = pd.DataFrame({
    'Id': z['Id'].astype(int).values,
    'SalePrice': y_test_result['SalePrice'].values,
})

df1.to_csv("HP_GBR_Grid_search_92.csv", index=False)