In [None]:
from json import load
from math import sqrt
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xgboost
from mpl_toolkits.mplot3d import Axes3D
from pandas import DataFrame, read_csv, concat, get_dummies, Series
from pandas.api.types import CategoricalDtype
from sklearn import metrics
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from xgboost import plot_importance, XGBRegressor

%matplotlib inline

In [None]:
# Identifiers for this experiment: the project slug and the revision tag.
project, version = 'house-prices', 'v0.1'

In [None]:
# Seed for the train/validation split so the partition is repeatable.
fixed_seed = 1234578416

# Both CSV files use their first column as the row index.
fulltrain = read_csv('./train.csv', index_col=0)
test = read_csv('./test.csv', index_col=0)

# Hold out 20% of the labelled rows for validation.
train80, valid20 = train_test_split(
    fulltrain,
    test_size=0.2,
    random_state=fixed_seed,
)

In [None]:
# Load the per-column lists of category labels.  The context manager closes
# the file handle as soon as the JSON has been parsed.
with open('categories.json', "r") as categories_file:
    categories = load(categories_file)

# OverallQual and OverallCond are removed from the mapping, so later cells
# that iterate over `categories` leave these two columns untouched.
categories.pop('OverallQual')
categories.pop('OverallCond')
categories

In [None]:

# Columns removed from every dataset before modelling.
fields = ['MoSold', 'YrSold', 'Utilities', 'MiscVal', 'PoolArea']

# Drop them in place from the three frames.
for frame in (train80, valid20, test):
    frame.drop(columns=fields, inplace=True)

# Forget the label lists of the dropped columns; absent keys are ignored.
for dropped in fields:
    categories.pop(dropped, None)

# The remaining keys of `categories` are the qualitative columns.
qual_columns = list(categories)

In [None]:
# Replace the label 'Names' by 'NAmes' in the Neighborhood label list.
# list.index raises ValueError when 'Names' is not present.
neighborhood_labels = categories['Neighborhood']
misspelled_at = neighborhood_labels.index('Names')
neighborhood_labels[misspelled_at] = 'NAmes'

In [None]:
# Show the list of qualitative column names.
qual_columns

In [None]:
# Show the column labels of the training split.
train80.columns

In [None]:
# Cast each qualitative column to a categorical dtype built from the label
# list in `categories`, so the three frames share the same categories.
# `astype('category', categories=...)` is not accepted by current pandas
# releases, so the labels are passed through a CategoricalDtype.  Plain
# column assignment is used because `df.loc[:, col] = ...` may write into
# the existing column and leave its dtype unchanged.
for category, labels in categories.items():
    category_dtype = CategoricalDtype(categories=labels)
    for frame in (train80, valid20, test):
        frame[category] = frame[category].astype(category_dtype)

Remove outliers

In [None]:
# Upper limits above which a row is discarded, checked in this order.
outlier_limits = {'LotArea': 200000, 'LotFrontage': 250}


def _drop_rows_above(frame, limits):
    """Return `frame` without the rows whose value exceeds a column's limit."""
    for column, limit in limits.items():
        too_large = frame.index[frame[column] > limit]
        frame = frame.drop(index=too_large)
    return frame


train80 = _drop_rows_above(train80, outlier_limits)
valid20 = _drop_rows_above(valid20, outlier_limits)

In [None]:
# For each sale condition of interest, print how many rows carry it:
# first in the validation split, then in the training split.
for cond in ('Abnorml', 'Partial'):
    for frame in (valid20, train80):
        matches = frame['SaleCondition'] == cond
        print(matches.sum())

In [None]:
# Model the logarithm of the sale price.  Assigning the column directly
# replaces it with the new float column; `.loc[:, col] = ...` would first
# try to write the floats into the existing column in place.
# NOTE: running this cell twice applies the logarithm twice.
for frame in (train80, valid20):
    frame['SalePrice'] = np.log(frame['SalePrice'])

In [None]:
# Summary statistics of the training split for every column that is not
# listed in `qual_columns`, keyed by column name.
quantitative_columns = [name for name in train80.columns if name not in qual_columns]

mean = {name: train80[name].mean() for name in quantitative_columns}
# mode() can return several values; the first one is kept.
mode = {name: train80[name].mode()[0] for name in quantitative_columns}
median = {name: train80[name].median() for name in quantitative_columns}

In [None]:
# Mean training-split SalePrice for each neighbourhood label.  A label that
# matches no training row gets NaN (mean of an empty selection).
neighborhoods = categories['Neighborhood']
mean_price_neigh = {
    name: train80.loc[train80['Neighborhood'] == name, 'SalePrice'].mean()
    for name in neighborhoods
}

In [None]:
# Display the training split.
train80

In [None]:
# Separate the target column from the predictors in both splits.
target_column = 'SalePrice'

X_train, y_train = train80.drop(columns=target_column), train80[target_column]
X_val, y_val = valid20.drop(columns=target_column), valid20[target_column]

In [None]:
# Add a `mean_price` feature: the training-split mean SalePrice of the row's
# neighbourhood.  Rows with a missing neighbourhood, and rows whose
# neighbourhood has no usable training mean (its entry in `mean_price_neigh`
# is NaN or absent), fall back to the overall training mean so the feature
# never contains NaN.
for frame in (X_train, X_val, test):
    # Cast to object first so the lookup yields plain floats rather than a
    # categorical result.
    neighborhood_mean = frame['Neighborhood'].astype(object).map(mean_price_neigh)
    frame['mean_price'] = neighborhood_mean.astype(float).fillna(mean['SalePrice'])

In [None]:
# Show the per-neighbourhood mean prices.
mean_price_neigh

In [None]:
# Sanity check: report how many rows of each dataset lack a `mean_price`.
# The count is a scalar, so it is printed directly (with the dataset name)
# instead of being indexed with a boolean.
for label, frame in (('X_train', X_train), ('X_val', X_val), ('test', test)):
    missing = frame['mean_price'].isna().sum()
    print(label, missing)

In [None]:
# Replacement for missing values, per column.
fill_values = {
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    'Functional': 'Typ',
    'MSZoning': 'RL',
    'SaleType': 'WD',
    'Electrical': 'SBrkr',
    # Missing LotFrontage takes the training-split median.
    'LotFrontage': median['LotFrontage'],
}
# These columns default to 0.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'BsmtFullBath', 'BsmtHalfBath'):
    fill_values[col] = 0
# Quality columns: 'NA' for the first group, 'TA' for the second.
for col in ('BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'):
    fill_values[col] = 'NA'
for col in ('ExterQual', 'KitchenQual'):
    fill_values[col] = 'TA'

# Apply the same replacements to the three datasets.
for frame in (X_train, X_val, test):
    for col, value in fill_values.items():
        frame.loc[:, col] = frame[col].fillna(value)

In [None]:
#X_train = get_dummies(X_train, columns=qual_columns, drop_first=True)
#X_val = get_dummies(X_val, columns=qual_columns, drop_first=True)
#test = get_dummies(test, columns=qual_columns, drop_first=True)



In [None]:
# Show the column labels of the training features.
X_train.columns

In [None]:
# Columns whose first listed label is 'Ex' are treated as ordered grades:
# move them out of `categories` into `ordinal_cols`.
ordinal_cols = {key: value for key, value in categories.items() if value[0] == 'Ex'}
for key in ordinal_cols:
    categories.pop(key)

# Replace each graded column by `<column>_ord`, holding the label's position
# in the reversed label list (the label that was listed last gets 0).
# Rows matching none of the labels are left without a value.
for col, tags in ordinal_cols.items():
    tags.reverse()  # in place: the lists stored in `ordinal_cols` end up reversed
    ordinal_name = col + '_ord'
    for frame in (X_train, X_val, test):
        for rank, tag in enumerate(tags):
            frame.loc[frame[col] == tag, ordinal_name] = rank
        frame.drop(columns=col, inplace=True)


In [None]:
#for category in categories.keys():
def _one_hot(frame, columns):
    """Return `frame` with `columns` replaced by their dummy-indicator columns."""
    indicators = get_dummies(frame[columns])
    return concat([frame, indicators], axis=1).drop(columns, axis=1)


# The keys still present in `categories` are the columns to expand.
nominal_columns = list(categories.keys())

X_train = _one_hot(X_train, nominal_columns)
X_val = _one_hot(X_val, nominal_columns)
test = _one_hot(test, nominal_columns)


In [None]:
# Show the feature names, then the number of training rows.
pprint(X_train.columns)
print(len(X_train.index))

Surface Area

In [None]:
# `Surface` is the sum of second floor, first floor and basement areas.
# GrLivArea is then dropped from every dataset.
for frame in (X_train, X_val, test):
    second_floor, first_floor, basement = (
        frame[part] for part in ('2ndFlrSF', '1stFlrSF', 'TotalBsmtSF')
    )
    frame['Surface'] = second_floor + first_floor + basement
    frame.drop(columns='GrLivArea', inplace=True)

In [None]:
# For each dataset, list the columns that still contain missing values
# together with their missing-value count.
for frame in (X_train, X_val, test):
    missing_per_column = frame.isna().sum()
    print(missing_per_column.loc[missing_per_column != 0])

In [None]:
# Columns of the training features in which every value equals zero.
all_zero = X_train.eq(0).all()
zero_cols = X_train.columns[all_zero]
display(zero_cols)

In [None]:
# Remove the all-zero training columns from the three datasets, in place.
for frame in (X_train, X_val, test):
    frame.drop(zero_cols, axis=1, inplace=True)

In [None]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)


def _scaled_like(frame):
    """Return a new DataFrame holding the scaled values with the labels of `frame`."""
    return DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


# Build new frames rather than writing through `frame[:] = ...`, which
# assigns into the existing columns and so depends on their dtypes being
# able to hold the scaled floats.
X_train = _scaled_like(X_train)
X_val = _scaled_like(X_val)
test = _scaled_like(test)

# Outlier detection

In [None]:
# Flag multivariate outliers in the training features with an isolation forest.
# The `behaviour` argument is not passed: it was a transitional option that
# recent scikit-learn releases no longer accept.
clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(X_train)
# One label per training row; later cells treat 1 as inlier and -1 as outlier.
y_pred_train = DataFrame(data=clf.predict(X_train), index=X_train.index)
# String version of the labels ('C1' / 'C-1') used to colour the plot.
y_pred_train_str = ('C' + y_pred_train.astype('str'))[0]

# 2-D t-SNE embedding for visual inspection, seeded so the picture is reproducible.
tsne = TSNE(n_components=2, random_state=fixed_seed)
X_tsne = tsne.fit_transform(X_train)
plt.figure(figsize=(16,12))
ax = sns.scatterplot(x=X_tsne[:,0], y=X_tsne[:,1], hue=y_pred_train_str)

Removing outliers

In [None]:
# Keep only the training rows the isolation forest labelled 1.
outlier_flags = y_pred_train[0]
isnotoutlier = outlier_flags.eq(1)
X_train = X_train.loc[isnotoutlier, :]
y_train = y_train.loc[isnotoutlier]

# Report how many rows were labelled -1.
nb_outliers = outlier_flags.eq(-1).sum()
print('There was ' + str(nb_outliers) + ' outliers')

Removing empty columns

TODO remove constant columns

In [None]:
# Drop uninformative columns.  After standardisation and outlier removal a
# column can hold a single repeated value that is not 0, so the test is
# "at most one distinct value" rather than "all values equal 0"; all-zero
# columns are still caught.
zero_cols = X_train.columns[X_train.nunique(dropna=False) <= 1]
display(zero_cols)
# Rebind instead of dropping in place: X_train is a row selection of an
# earlier frame at this point.
X_train = X_train.drop(columns=zero_cols)
X_val = X_val.drop(columns=zero_cols)
test = test.drop(columns=zero_cols)
X_train.shape

Grid-search hyperparameter estimation

In [None]:
from hypopt import GridSearch

# Candidate values explored for each XGBoost hyper-parameter.
params = {
    'min_child_weight': [6, 7, 8, 9],
    'gamma': [0.01, 0.02, 0.03, 0.04],
    'subsample': [0.2, 0.3, 0.4],
    'colsample_bytree': [0.8, 0.9],
    'max_depth': [3, 4, 5],
}

# The search receives the training split and the validation split.
model = XGBRegressor(booster="gbtree")
grid = GridSearch(model, params)
grid.fit(X_train, y_train, X_val, y_val)
grid.best_params

XGBoost model

In [None]:
# First candidate configuration of the regressor.
# NOTE(review): the next cell assigns `model` again before anything is
# fitted, so when the notebook runs top to bottom this estimator is never
# used. Confirm which configuration is intended.
model = XGBRegressor(booster="gbtree",colsample_bytree=0.9,
                     max_depth=3, n_estimators=400, gamma= 0.01,
                     min_child_weight=6,
                     subsample=0.2)

In [None]:
# Configuration that is actually fitted below: it rebinds `model`,
# replacing the estimator created in the previous cell.
model = XGBRegressor(booster="gbtree",colsample_bytree=0.5,
                     max_depth=3, n_estimators=400, subsample=0.7)

In [None]:
# Fit the regressor on the training features and target.
model.fit(X_train,y_train)

In [None]:
# Predict the target for the validation features.
y_pred = model.predict(X_val)

In [None]:
# Root mean squared error on the validation split.  SalePrice was replaced
# by its logarithm in an earlier cell, so this is an error on log prices.
RMSLE = sqrt(mean_squared_error(y_val,y_pred))
RMSLE

In [None]:
# Predict the target for the test features.
test_pred = model.predict(test)

In [None]:
# exp() undoes the log transform applied to SalePrice before training.
# The frame reuses the index of `test` and is written to a CSV file.
submission = DataFrame({"SalePrice": np.exp (test_pred)}, index=test.index)
submission.to_csv('test-prediction.csv')

In [None]:
# Distribution of the validation residuals (prediction minus target).
sns.distplot(y_pred-y_val)
plt.show()

In [None]:
# Same diagnostics on the training split: print the root mean squared
# error, then plot the distribution of residuals (prediction minus target).
y_t_pred = model.predict(X_train)
print(sqrt(mean_squared_error(y_train,y_t_pred)))
sns.distplot(y_t_pred-y_train)
plt.show()

# Feature importance
Feature importance as reported by XGBoost

In [None]:
# Per-feature "gain" scores reported by the fitted booster.
# NOTE(review): the score dict may not list every feature (presumably only
# those the booster split on) — confirm, since `order` is built from it.
booster = model.get_booster()
importance_dict = booster.get_score(importance_type="gain")

# One-column frame indexed by feature name; column 0 holds the score.
importance_frame = DataFrame.from_dict(importance_dict, orient='index')
importance = importance_frame[0].sort_values(ascending=False)

# Feature names from highest to lowest score.
order = importance.index.tolist()

In [None]:
# Bar charts of the highest-scoring features, up to `nb_bars` per panel.
nb_bars = 25

plt.figure(figsize=(18,20))
plt.subplots_adjust(hspace=0.5)
for panel in range(2):
    # 4-row grid; only the first two slots are drawn.
    plt.subplot(4, 1, panel + 1)
    low, hi = nb_bars * panel, nb_bars * (panel + 1)
    chunk = importance[low:hi]
    bars = sns.barplot(x=chunk.index, y=chunk)
    bars.set_xticklabels(bars.get_xticklabels(), rotation=45)
plt.show()

Reordering features by importance

In [None]:
# Select the columns of every dataset in the sequence given by `order`.
X_train, X_val, test = (frame[order] for frame in (X_train, X_val, test))
X_train.shape

Save prepared datasets

In [None]:
# Write the prepared features and targets to CSV files in the working directory.
X_train.to_csv('X_train.csv')
X_val.to_csv('X_val.csv')
test.to_csv('X_test.csv')
# header=True is passed explicitly for the two target series.
y_train.to_csv('y_train.csv', header=True)
y_val.to_csv('y_val.csv', header=True)