In [None]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame, read_csv, concat, get_dummies, Series, CategoricalDtype
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import normalize, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator
import xgboost
from xgboost import plot_importance, XGBRegressor
from pprint import pprint
from json import load
import seaborn as sns
import numpy as np
from math import sqrt

%matplotlib inline

In [None]:
# Run identification: project name and version tag of this notebook.
project, version = 'house-prices', 'v0.1'

In [None]:
# Seed used for the train/validation split so the split is repeatable.
fixed_seed = 1234578416

# Both CSV files use their first column as the row index.
fulltrain = read_csv('./train.csv', index_col=0)
test = read_csv('./test.csv', index_col=0)

# Hold out 20% of the labelled data for validation.
train80, valid20 = train_test_split(
    fulltrain,
    test_size=0.2,
    random_state=fixed_seed,
)

In [None]:
# Separate the regression target from the features for both splits.
target_column = 'SalePrice'

X_train, y_train = train80.drop(columns=target_column), train80[target_column]
X_val, y_val = valid20.drop(columns=target_column), valid20[target_column]

In [None]:
# Load the mapping {column name: list of category levels} from disk.
# The context manager guarantees the file handle is closed after parsing.
with open('categories.json', "r") as categories_file:
    categories = load(categories_file)

In [None]:
# Every feature that has no entry in `categories` is handled as numeric.
num_columns = [name for name in X_train.columns if name not in categories]

In [None]:
# Split the category specification into ordinal and nominal features.
# A feature is treated as ordinal when its level list starts with 'Ex'; those
# lists are then reversed in place, so each ordinal list ends with 'Ex'.
# NOTE: the ordinal entries are removed from `categories`, so re-running this
# cell without reloading `categories` first produces empty ordinal lists.
ordinals = {}
for key, value in list(categories.items()):  # snapshot: entries are popped inside the loop
    if value and value[0] == 'Ex':  # `value and` guards against an empty level list
        ordinals[key] = value
        categories.pop(key)
for tags in ordinals.values():
    tags.reverse()

ord_columns = list(ordinals.keys())
ord_values = list(ordinals.values())
cat_columns = list(categories.keys())
# Fixed: this used `categories.keys()`, which duplicated the column names
# instead of collecting the level lists of the nominal features.
cat_values = list(categories.values())

In [None]:
# Inspect the level lists collected for the ordinal features.
ordinals.values()

In [None]:
# for col in ord_columns:
#     print(X_train[col].unique())
# for col in cat_columns:
#     print(X_train[col].unique())

## Ordinal Encoding

In [None]:
# Ordinal features: fill missing entries with the most frequent level, then
# encode each column using the level order supplied in `ord_values`.
ordinal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ord_values)),
]
ordinal_pipe = Pipeline(ordinal_steps)

## OneHot Encoding of categorical features

In [None]:
# Nominal features: fill missing entries with the most frequent level, then
# one-hot encode; levels unseen during fit are ignored at transform time.
# Alternative imputer kept for reference:
#    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipe = Pipeline(categorical_steps)

## Numerical columns

### Surface Area

In [None]:
class AddSurface(TransformerMixin, BaseEstimator):
    """Stateless transformer that appends a 'Surface' column.

    'Surface' is the sum of the '2ndFlrSF', '1stFlrSF' and 'TotalBsmtSF'
    columns. The transformer has no parameters and learns nothing in `fit`.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` with the extra 'Surface' column.

        `X` must support column access by the three source column names and
        provide `assign` (e.g. a pandas DataFrame). The input is left
        untouched: writing the column in place would modify the caller's
        frame and raises SettingWithCopyWarning when `X` is a column slice.
        """
        surface = X['2ndFlrSF'] + X['1stFlrSF'] + X['TotalBsmtSF']
        return X.assign(Surface=surface)

In [None]:
# Numeric features: add the combined surface column, fill missing values with
# the column median, then standardise.
numeric_steps = [
    ('surface', AddSurface()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipe = Pipeline(numeric_steps)

## Preprocessing

In [None]:
# Route each group of columns to its dedicated sub-pipeline.
column_routes = [
    ('cat', categorical_pipe, cat_columns),
    ('num', numeric_pipe, num_columns),
    ('ord', ordinal_pipe, ord_columns),
]
preprocess_pipe = ColumnTransformer(column_routes)

## Removing outliers

In [None]:
# Outlier detector operating on the preprocessed feature matrix.
isolation_forest = IsolationForest(max_samples=100, random_state=42)
outlier_detection = Pipeline([('pp', preprocess_pipe), ('outlier', isolation_forest)])

In [None]:
# Label every training row, then keep only the rows labelled 1.
outliers = outlier_detection.fit_predict(X_train)

inlier_mask = outliers == 1
X_train_clean = X_train.loc[inlier_mask, :]
y_train_clean = y_train.loc[inlier_mask]

## Preprocess Train, Test and Valid sets

In [None]:
# Fit the preprocessing on the cleaned training rows only...
X_train_process = preprocess_pipe.fit_transform(X_train_clean)
# ...then apply the fitted transformation to the validation and test features.
X_valid_process = preprocess_pipe.transform(X_val)
test_process = preprocess_pipe.transform(test)

## Output processing

In [None]:
# Target transformation: natural log (inverted with exp) followed by
# standardisation.
log_step = FunctionTransformer(func=np.log, inverse_func=np.exp)
output_pipe = Pipeline([('log', log_step), ('scaler', StandardScaler())])

In [None]:
# The output pipeline works on column vectors, so reshape the targets to (n, 1).
y_train_column = y_train_clean.to_numpy().reshape(-1, 1)
y_val_column = y_val.to_numpy().reshape(-1, 1)

# Fit on the cleaned training target only, then transform both splits.
output_pipe.fit(y_train_column)

y_train_process = output_pipe.transform(y_train_column)
y_val_process = output_pipe.transform(y_val_column)

## List columns with a single value

In [None]:
def nunique_percol_sort(a):
    """Count the distinct values in each column of `a`.

    Each column is sorted independently; the number of positions where two
    consecutive sorted entries differ, plus one, is the distinct-value count.
    """
    ordered = np.sort(a, axis=0)
    value_changes = ordered[1:] != ordered[:-1]
    return value_changes.sum(axis=0) + 1

In [None]:
# Column positions of the preprocessed training matrix that hold one single
# value; `j` carries those positions and is displayed as the cell output.
unique_counts = nunique_percol_sort(X_train_process.todense())
i, j = np.where(unique_counts == 1)
j

## Grid-search hyperparameter estimation

In [None]:
# from hypopt import GridSearch
# params = {'min_child_weight':[6,7,8,9], 'gamma':[i/100.0 for i in range(1,5)],  'subsample':[i/10.0 for i in range(2,5)],
# 'colsample_bytree':[i/10.0 for i in range(8,10)], 'max_depth': [3,4,5]}

# model = XGBRegressor(booster="gbtree")
# grid = GridSearch(model, params)
# grid.fit(X_train, y_train, X_val, y_val)
# grid.best_params

## XGBoost model

In [None]:
# Hyperparameters of the gradient-boosted tree regressor.
xgb_params = {
    'booster': "gbtree",
    'colsample_bytree': 0.9,
    'max_depth': 4,
    'n_estimators': 400,
    'gamma': 0.01,
    'min_child_weight': 8,
    'subsample': 0.3,
}
model = XGBRegressor(**xgb_params)

In [None]:
# End-to-end estimator: feature preprocessing followed by the XGBoost model.
full_steps = [('pp', preprocess_pipe), ('xgb', model)]
full_pipe = Pipeline(full_steps)

# Model training

In [None]:
# Compare the shape of the transformed target with that of the cleaned target.
y_train_process.shape, y_train_clean.shape

In [None]:
# Fit preprocessing + model on the cleaned training features against the
# transformed target; the trailing semicolon suppresses the cell output.
full_pipe.fit(X_train_clean, y_train_process);

In [None]:
def RMSLE(y_true, y_pred):
    """Square root of the mean squared error between log(y_true) and log(y_pred)."""
    log_true = np.log(y_true)
    log_pred = np.log(y_pred)
    return sqrt(mean_squared_error(log_true, log_pred))

## Evaluation

### Train

In [None]:
# Predict on the cleaned training set (values are in the transformed target space).
y_pred_process = full_pipe.predict(X_train_clean)
# The output pipeline was fitted on an (n, 1) column, and recent scikit-learn
# releases reject 1-D input to the scaler's inverse_transform. Hand it a column
# vector and flatten the result so `y_pred` stays 1-D for the metric and plots.
y_pred = output_pipe.inverse_transform(
    np.asarray(y_pred_process).reshape(-1, 1)
).ravel()

In [None]:
# Training error of the back-transformed predictions, shown as the cell output.
score = RMSLE(y_train_clean, y_pred)
score

In [None]:
# Distribution of the training residuals (prediction minus observed price).
plt.figure(figsize=(15, 8))
train_residuals = y_pred - y_train_clean
sns.distplot(train_residuals)
plt.show()

### Validation

In [None]:
# Predict on the validation set (values are in the transformed target space).
y_pred_process = full_pipe.predict(X_val)
# The output pipeline was fitted on an (n, 1) column, and recent scikit-learn
# releases reject 1-D input to the scaler's inverse_transform. Hand it a column
# vector and flatten the result so `y_pred` stays 1-D for the metric and plots.
y_pred = output_pipe.inverse_transform(
    np.asarray(y_pred_process).reshape(-1, 1)
).ravel()

In [None]:
# Validation error of the back-transformed predictions, shown as the cell output.
score = RMSLE(y_val, y_pred)
score

In [None]:
# Distribution of the validation residuals (prediction minus observed price).
plt.figure(figsize=(15, 8))
val_residuals = y_pred - y_val
sns.distplot(val_residuals)
plt.show()

## Test

In [None]:
# Predict on the unlabelled test set (values are in the transformed target space).
test_pred_process = full_pipe.predict(test)
# The output pipeline was fitted on an (n, 1) column, and recent scikit-learn
# releases reject 1-D input to the scaler's inverse_transform. Hand it a column
# vector and flatten the result so `test_pred` is a 1-D array of prices.
test_pred = output_pipe.inverse_transform(
    np.asarray(test_pred_process).reshape(-1, 1)
).ravel()

In [None]:
# Submission table: one predicted price per test row, indexed like `test`.
submission = DataFrame(data={"SalePrice": test_pred}, index=test.index)
submission.to_csv(path_or_buf='test-prediction.csv')

# Feature importance
Feature importance as reported by XGBoost

In [None]:
# importance_dict = model.get_booster().get_score(importance_type="gain")
# importance = DataFrame.from_dict(importance_dict, orient='index')[0].sort_values(ascending=False)
# order = list(importance.index)

In [None]:
# Importance of every input column of the fitted model, largest first.
# The index holds the column positions in the preprocessed feature matrix.
importance = (
    DataFrame(data=model.feature_importances_)[0]
    .sort_values(ascending=False)
)
order = importance.index

In [None]:
# Index labels of the ten largest importance values.
importance.index[0:10]

In [None]:
# Number of bars drawn per panel.
nb_bars = 25

plt.figure(figsize=(18, 20))
plt.subplots_adjust(hspace=0.5)
# Two panels (slots 1 and 2 of a 4-row grid), each showing the next `nb_bars`
# entries of the sorted importance series.
for panel in range(2):
    plt.subplot(4, 1, panel + 1)
    labels = importance.index[nb_bars * panel:nb_bars * (panel + 1)]
    bars = sns.barplot(x=labels, y=importance[labels])
    bars.set_xticklabels(bars.get_xticklabels(), rotation=45)
plt.show()

## Save prepared datasets

In [None]:
# Persist the feature tables...
feature_outputs = (
    (X_train_clean, 'X_train.csv'),
    (X_val, 'X_val.csv'),
    (test, 'X_test.csv'),
)
for frame, path in feature_outputs:
    frame.to_csv(path)

# ...and the matching targets, written with an explicit header row.
target_outputs = (
    (y_train_clean, 'y_train.csv'),
    (y_val, 'y_val.csv'),
)
for target, path in target_outputs:
    target.to_csv(path, header=True)