<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Ordinal-Encoding" data-toc-modified-id="Ordinal-Encoding-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Ordinal Encoding</a></span></li><li><span><a href="#OneHot-Encoding-of-categorical-features" data-toc-modified-id="OneHot-Encoding-of-categorical-features-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>OneHot Encoding of categorical features</a></span></li><li><span><a href="#Numerical-columns" data-toc-modified-id="Numerical-columns-0.3"><span class="toc-item-num">0.3&nbsp;&nbsp;</span>Numerical columns</a></span><ul class="toc-item"><li><span><a href="#Surface-Area" data-toc-modified-id="Surface-Area-0.3.1"><span class="toc-item-num">0.3.1&nbsp;&nbsp;</span>Surface Area</a></span></li></ul></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-0.4"><span class="toc-item-num">0.4&nbsp;&nbsp;</span>Preprocessing</a></span></li><li><span><a href="#Removing-outliers" data-toc-modified-id="Removing-outliers-0.5"><span class="toc-item-num">0.5&nbsp;&nbsp;</span>Removing outliers</a></span></li><li><span><a href="#Output-processing" data-toc-modified-id="Output-processing-0.6"><span class="toc-item-num">0.6&nbsp;&nbsp;</span>Output processing</a></span></li><li><span><a href="#XGBoost-model" data-toc-modified-id="XGBoost-model-0.7"><span class="toc-item-num">0.7&nbsp;&nbsp;</span>XGBoost model</a></span></li></ul></li><li><span><a href="#Model-training" data-toc-modified-id="Model-training-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Model training</a></span><ul class="toc-item"><li><span><a href="#Gridsearch-hyperparameters-estimation" data-toc-modified-id="Gridsearch-hyperparameters-estimation-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Gridsearch hyperparameters estimation</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-1.2"><span 
class="toc-item-num">1.2&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#Evaluation-on-the-training-set" data-toc-modified-id="Evaluation-on-the-training-set-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Evaluation on the training set</a></span></li><li><span><a href="#Evaluation-on-the-validation-set" data-toc-modified-id="Evaluation-on-the-validation-set-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Evaluation on the validation set</a></span></li></ul></li><li><span><a href="#Test" data-toc-modified-id="Test-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Test</a></span></li></ul></li></ul></div>

In [None]:
import warnings
from json import load
from math import sqrt
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xgboost
from mpl_toolkits.mplot3d import Axes3D
from pandas import DataFrame, read_csv, concat, get_dummies, Series, CategoricalDtype
from scipy.stats import norm
from sklearn import metrics
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (normalize, StandardScaler, LabelEncoder,
                                   OneHotEncoder, OrdinalEncoder, FunctionTransformer,
                                   PowerTransformer)
from xgboost import plot_importance, XGBRegressor

%matplotlib inline

In [None]:
# Labels identifying this experiment and the pipeline revision.
project = "house-prices"
version = "v0.1-pipe"

In [None]:
# Seed reused by the train/validation split and by the XGBoost model.
fixed_seed = 12345

# Both CSV files use their first column as the row index.
fulltrain = read_csv('./train.csv', index_col=0)
test = read_csv('./test.csv', index_col=0)

In [None]:
# Separate the target from the features, then hold out 20% for validation.
target_column = 'SalePrice'
y_fulltrain = fulltrain[target_column]
X_fulltrain = fulltrain.drop(columns=target_column)
X_train, X_val, y_train, y_val = train_test_split(
    X_fulltrain,
    y_fulltrain,
    test_size=0.2,
    random_state=fixed_seed,
)

In [None]:
# Mapping loaded from categories.json; later cells read it as
# {column name: list of levels}. The context manager closes the file handle,
# which the bare `load(open(...))` form left open.
with open('categories.json', "r") as categories_file:
    categories = load(categories_file)

In [None]:
# Every feature without an entry in `categories` is handled as numeric.
num_columns = [column for column in X_train.columns if column not in categories]

In [None]:
# Split the declared categorical features into two groups:
#   * ordinals   - features whose level list starts with 'Ex'; their levels are
#                  stored in reversed order (presumably worst -> best, assuming
#                  categories.json lists these scales best-first - confirm).
#   * categories - everything else, treated as nominal.
# NOTE: entries moved to `ordinals` are removed from `categories`, so this cell
# is not idempotent; reload `categories` before re-running it.
ordinals = {}
for key, value in list(categories.items()):
    # `value and ...` keeps an empty level list from raising IndexError.
    if value and value[0] == 'Ex':
        ordinals[key] = value[::-1]
        categories.pop(key)

ord_columns = list(ordinals.keys())
ord_values = list(ordinals.values())
cat_columns = list(categories.keys())
# Level lists of the nominal features (previously a copy of the column names).
cat_values = list(categories.values())

In [None]:
# Display the level lists of the ordinal features for a quick visual check.
ordinals.values()

In [None]:
# for col in ord_columns:
#     print(X_train[col].unique())
# for col in cat_columns:
#     print(X_train[col].unique())

## Ordinal Encoding

In [None]:

# Quantiles are clamped to [EPSILON, 1 - EPSILON] so that norm.ppf stays finite.
EPSILON = 0.001


class GaussianOrdinals(TransformerMixin, BaseEstimator):
    """Replace integer ordinal codes by standard-normal scores.

    For every column, level ``i`` is assigned the quantile at the midpoint of
    its cumulative-frequency interval in the training data,
    ``(n_below + n_up_to_and_including) / (2 * n_samples)``, and ``transform``
    maps the level to ``norm.ppf`` of that quantile.

    Attributes
    ----------
    quantiles : list of list of float
        ``quantiles[col][level]`` is the clamped quantile learned by ``fit``.
    """

    def fit(self, X, y=None):
        """Learn one quantile per level and column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Non-negative integer-valued codes (e.g. OrdinalEncoder output).
        y : ignored

        Returns
        -------
        self
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        self.quantiles = []
        for col in range(X.shape[1]):
            column = X[:, col]
            levels, level_counts = np.unique(column, return_counts=True)
            counts = dict(zip(levels.astype(int), level_counts))
            col_quantiles = []
            n_below = 0
            for level in range(int(np.max(column)) + 1):
                n_up_to = n_below + counts.get(level, 0)
                col_quantiles.append((n_below + n_up_to) / (2 * n_samples))
                n_below = n_up_to
            # Clamp every level, not only the first and last: a level that is
            # absent or very rare in the training data could otherwise get
            # quantile 0 (norm.ppf -> -inf) or a score that sorts on the wrong
            # side of the clamped end level.
            self.quantiles.append(
                np.clip(col_quantiles, EPSILON, 1 - EPSILON).tolist()
            )
        return self

    def transform(self, X):
        """Map each code to the normal score of its learned quantile.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)

        Returns
        -------
        numpy.ndarray of float, same shape as ``X``.
            Codes above the largest level seen during ``fit`` are mapped to
            ``norm.ppf(1 - EPSILON)`` and a warning is emitted.
        """
        X = np.asarray(X)
        # Float copy: writing scores into an integer array would truncate them.
        Xout = X.astype(float)
        upper_tail_score = norm.ppf(1 - EPSILON)
        for col in range(X.shape[1]):
            column = X[:, col]
            col_quantiles = self.quantiles[col]
            # Masks are computed on the untouched input, so an already written
            # score can never be mistaken for a level code.
            for level, score in enumerate(norm.ppf(col_quantiles)):
                Xout[column == level, col] = score
            unseen = column >= len(col_quantiles)
            if np.any(unseen):
                warnings.warn(
                    "GaussianOrdinals: column %d contains level %s, above the "
                    "largest level seen during fit (%d); mapping it to the "
                    "upper-tail score."
                    % (col, np.max(column), len(col_quantiles) - 1)
                )
                Xout[unseen, col] = upper_tail_score
        return Xout

In [None]:
# Ordinal features: impute, integer-encode, then map the codes to normal scores.
ordinal_pipe = Pipeline(steps=[
    # Missing values take each column's most frequent level.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Integer codes follow the level order given in ord_values.
    ('encoder', OrdinalEncoder(categories=ord_values)),
    # Alternatives tried for this last step: PowerTransformer(), StandardScaler().
    ('gauss', GaussianOrdinals()),
])

In [None]:
# Compare three ways of rescaling the encoded ordinal features.
# Each variant gets its own pipeline that reuses the imputer/encoder steps of
# `ordinal_pipe` and swaps only the final step, so `ordinal_pipe` itself is
# never modified (ColumnTransformer fits clones of the transformers it is given).
ordinal_variants = {
    'StandardScaler': ('scaler', StandardScaler()),
    'PowerTransformer': ('gauss', PowerTransformer()),
    'GaussianOrdinals': ('gauss', GaussianOrdinals()),
}

variant_outputs = {}
for variant_name, final_step in ordinal_variants.items():
    test_pipe = ColumnTransformer([
        ('ord', Pipeline(ordinal_pipe.steps[:-1] + [final_step]), ord_columns)
    ])
    variant_outputs[variant_name] = test_pipe.fit_transform(X_train)

# Kept under their historical names; `test_pipe` ends up holding the
# GaussianOrdinals variant.
X1, X2, X3 = variant_outputs.values()

# Position (within ord_columns) of the ordinal feature plotted below.
plotted_ordinal = 4

fig, axes = plt.subplots(3, 1, figsize=(18, 8))
for ax, (variant_name, transformed) in zip(axes, variant_outputs.items()):
    sns.distplot(transformed[:, plotted_ordinal], hist=True, kde_kws={'bw': 0.5}, ax=ax)
    ax.set_title('%s - %s' % (ord_columns[plotted_ordinal], variant_name))
fig.tight_layout()
plt.show()

## OneHot Encoding of categorical features

In [None]:
# Nominal features: impute, then one-hot encode.
categorical_pipe = Pipeline(steps=[
    # Missing values take each column's most frequent category.
    # Alternative tried: SimpleImputer(strategy='constant', fill_value='NA').
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown="ignore": categories unseen during fit do not raise.
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
])

## Numerical columns

### Surface Area

In [None]:
class AddSurface(TransformerMixin, BaseEstimator):
    """Append a ``Surface`` column: the sum of '2ndFlrSF', '1stFlrSF' and 'TotalBsmtSF'."""

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the extra ``Surface`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns '2ndFlrSF', '1stFlrSF' and 'TotalBsmtSF'.

        Returns
        -------
        pandas.DataFrame
            New frame; the caller's DataFrame is left untouched. Writing the
            column in place modified the input and could trigger
            SettingWithCopyWarning on a column subset.
        """
        return X.assign(Surface=X['2ndFlrSF'] + X['1stFlrSF'] + X['TotalBsmtSF'])

In [None]:
# Numeric features: add the Surface feature, impute, then standardise.
numeric_pipe = Pipeline(steps=[
    # Runs first, while the data still carries column names.
    ('surface', AddSurface()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Preprocessing

In [None]:
# Route each group of columns through its own sub-pipeline.
preprocess_pipe = ColumnTransformer(transformers=[
    ('cat', categorical_pipe, cat_columns),
    ('num', numeric_pipe, num_columns),
    ('ord', ordinal_pipe, ord_columns),
])

## Removing outliers

In [None]:
class OutlierRemoverComposer(RegressorMixin, BaseEstimator):
    """Fit a model only on the training rows an outlier detector keeps.

    This class exposes ``predict``, not ``transform``, so it derives from
    ``RegressorMixin``; with ``TransformerMixin`` the inherited
    ``fit_transform`` would fail for lack of a ``transform`` method.

    Parameters
    ----------
    model : estimator
        Fitted in place on the retained rows; must provide ``fit`` and ``predict``.
    outlier_estimator : estimator
        Must provide ``fit_predict`` returning 1 for rows to keep.
    **kwargs
        Stored on the instance as ``kwargs``; not used by this class.
    """

    def __init__(self, model, outlier_estimator, **kwargs):
        self.outlier_estimator = outlier_estimator
        self.model = model
        self.kwargs = kwargs

    def fit(self, X, y):
        """Drop the rows flagged by the detector, then fit ``model`` on the rest.

        ``X`` and ``y`` must both support boolean-mask indexing
        (numpy arrays, pandas DataFrame/Series).
        """
        # Rows labelled 1 by fit_predict are kept; every other label is dropped.
        inlier_mask = self.outlier_estimator.fit_predict(X) == 1

        self.model.fit(X[inlier_mask], y[inlier_mask])

        return self

    def predict(self, X, y=None):
        """Predict with the wrapped model; no outlier filtering is applied here."""
        return self.model.predict(X)


In [None]:
# Outlier detector: shared preprocessing followed by an isolation forest.
# NOTE(review): the forest uses its own seed (42) rather than `fixed_seed`.
outlier_detection = Pipeline(steps=[
    ('pp', preprocess_pipe),
    ('outlier', IsolationForest(max_samples=100, random_state=42)),
])

## Output processing

In [None]:
# Target transform: natural log (np.exp is its declared inverse), then standardisation.
output_pipe = Pipeline(steps=[
    ('log', FunctionTransformer(func=np.log, inverse_func=np.exp)),
    ('scaler', StandardScaler()),
])

## XGBoost model

In [None]:
# Gradient-boosted tree regressor with hand-picked hyperparameters.
model = XGBRegressor(
    booster="gbtree",
    n_estimators=400,
    max_depth=4,
    min_child_weight=8,
    gamma=0.01,
    subsample=0.3,
    colsample_bytree=0.9,
    random_state=fixed_seed,
)

In [None]:
# Feature preprocessing followed by the XGBoost regressor.
model_pipe = Pipeline(steps=[
    ('pp', preprocess_pipe),
    ('xgb', model),
])

In [None]:
# Train on the transformed target; predictions are mapped back through
# output_pipe's inverse transform.
full_pipe = TransformedTargetRegressor(
    regressor=model_pipe,
    transformer=output_pipe,
)

In [None]:
# Same model, fitted only on the rows the outlier detector keeps.
outlier_pipe = OutlierRemoverComposer(
    model=full_pipe,
    outlier_estimator=outlier_detection,
)

# Model training

In [None]:
# Estimator used for training and evaluation below; switch to the commented
# line to train without outlier removal.
pipe = outlier_pipe
#pipe = full_pipe

## Gridsearch hyperparameters estimation

In [None]:
# from hypopt import GridSearch
# params = {'min_child_weight':[6,7,8,9], 'gamma':[i/100.0 for i in range(1,5)],  'subsample':[i/10.0 for i in range(2,5)],
# 'colsample_bytree':[i/10.0 for i in range(8,10)], 'max_depth': [3,4,5]}

# model = XGBRegressor(booster="gbtree")
# grid = GridSearch(model, params)
# grid.fit(X_train, y_train, X_val, y_val)
# grid.best_params

## Training

In [None]:
# Fit on the training split; the trailing semicolon hides the estimator repr.
pipe.fit(X=X_train, y=y_train);

In [None]:
def RMSLE(y_true, y_pred):
    """Root mean squared error between the natural logs of two value arrays.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are passed through ``np.log``.

    Returns
    -------
    float
    """
    log_true = np.log(y_true)
    log_pred = np.log(y_pred)
    squared_log_error = mean_squared_error(log_true, log_pred)
    return sqrt(squared_log_error)

## Evaluation

### Evaluation on the training set

In [None]:
# Predictions on the training split.
y_pred = pipe.predict(X_train)
#y_pred = output_pipe.inverse_transform(y_pred_process)

In [None]:
# Training-set RMSLE; the bare name on the last line displays the value.
score = RMSLE(y_train, y_pred)
score

In [None]:
# Distribution of the training residuals (prediction minus observed value).
fig, ax = plt.subplots(figsize=(15, 8))
sns.distplot(y_pred - y_train, ax=ax)
ax.set(title='Residuals on the training set', xlabel='Predicted - observed')
plt.show()

### Evaluation on the validation set

In [None]:
# Predictions on the validation split (overwrites the training predictions).
y_pred = pipe.predict(X_val)
#y_pred = output_pipe.inverse_transform(y_pred_process)

In [None]:
# Validation-set RMSLE; the bare name on the last line displays the value.
score = RMSLE(y_val, y_pred)
score

In [None]:
# Distribution of the validation residuals (prediction minus observed value).
fig, ax = plt.subplots(figsize=(15, 8))
sns.distplot(y_pred - y_val, ax=ax)
ax.set(title='Residuals on the validation set', xlabel='Predicted - observed')
plt.show()

## Test

In [None]:
# Predictions for the test set loaded from ./test.csv.
test_pred = pipe.predict(test)
#test_pred = output_pipe.inverse_transform(test_pred_process)

In [None]:
# One predicted SalePrice per test row, keyed by the test set's index.
submission = DataFrame(
    data={"SalePrice": test_pred},
    index=test.index,
)
submission.to_csv('test-prediction.csv')

Save prepared datasets

In [None]:
# X_train_process.to_csv('X_train.csv')
# X_val.to_csv('X_val.csv')
# test.to_csv('X_test.csv')
# y_train_clean.to_csv('y_train.csv', header=True)
# y_val.to_csv('y_val.csv', header=True)