In [None]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame, read_csv, concat, get_dummies, Series, CategoricalDtype
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import normalize, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.base import BaseEstimator
import xgboost
from xgboost import plot_importance, XGBRegressor
from pprint import pprint
from json import load
import seaborn as sns
import numpy as np
from math import sqrt

%matplotlib inline

In [None]:
# Identifiers for this experiment: project name and notebook revision.
project, version = 'house-prices', 'v0.1'

In [None]:
# Seed passed as `random_state` to the train/validation split below.
fixed_seed = 12345

# Both CSV files are read with their first column as the row index.
fulltrain = read_csv('./train.csv', index_col=0)
test = read_csv('./test.csv', index_col=0)

# Hold out 20% of the labelled rows for validation.
train80, valid20 = train_test_split(
    fulltrain,
    test_size=0.2,
    random_state=fixed_seed,
)

In [None]:
target_column = 'SalePrice'

# Separate the feature columns from the target for each split.
X_train, y_train = train80.drop(columns=target_column), train80[target_column]
X_val, y_val = valid20.drop(columns=target_column), valid20[target_column]

In [None]:
# Mapping read from JSON: keys are column names, values are the lists of levels
# for those columns. A context manager is used so the file handle is closed
# as soon as the JSON has been parsed.
with open('categories.json', "r") as categories_file:
    categories = load(categories_file)

In [None]:
# Every feature that has no entry in `categories` is treated as numeric.
num_columns = [column for column in X_train.columns if column not in categories]

In [None]:
# Split the `categories` mapping into ordinal and nominal features.
# A feature is treated as ordinal when its level list starts with 'Ex'; such
# entries are moved out of `categories`, which afterwards holds only the
# nominal features.
ordinals = {}
for key, value in list(categories.items()):  # snapshot: the dict is mutated in the loop
    # `value and ...` guards against an empty level list (value[0] would raise).
    if value and value[0] == 'Ex':
        # Store a reversed copy so that 'Ex' comes last and therefore receives
        # the highest code when the list is given to OrdinalEncoder.
        ordinals[key] = value[::-1]
        categories.pop(key)

ord_columns = list(ordinals.keys())
ord_values = list(ordinals.values())
cat_columns = list(categories.keys())
# Level lists of the nominal features (this previously repeated the column
# names by calling `.keys()` a second time).
cat_values = list(categories.values())

In [None]:
# Display the level lists of the ordinal features (one list per column in `ord_columns`).
ordinals.values()

In [None]:
# for col in ord_columns:
#     print(X_train[col].unique())
# for col in cat_columns:
#     print(X_train[col].unique())

## Ordinal Encoding

In [None]:
# Ordinal features: fill missing entries with the most frequent level, then
# encode each level by its position in the matching list of `ord_values`.
ordinal_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ord_values)),
])

## OneHot Encoding of categorical features

In [None]:
# Nominal features: fill missing entries with the most frequent level, then
# one-hot encode. Levels unseen during fit are ignored at transform time.
# Alternative imputer that was tried: SimpleImputer(strategy='constant', fill_value='NA')
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
])

## Numerical columns

### Surface Area

In [None]:
class AddSurface(TransformerMixin, BaseEstimator):
    """Add a ``Surface`` column: second floor + first floor + basement area.

    The transformer is stateless; ``fit`` learns nothing from the data.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the estimator API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an additional ``Surface`` column.

        Parameters
        ----------
        X : column-indexable container with a ``copy()`` method
            Must provide the columns '2ndFlrSF', '1stFlrSF' and 'TotalBsmtSF'.

        Returns
        -------
        A copy of ``X`` in which ``Surface`` is the sum of the three columns.
        """
        # Work on a copy: assigning the new column directly would modify the
        # caller's object (and warns when X is a slice of a larger frame).
        X = X.copy()
        X['Surface'] = X['2ndFlrSF'] + X['1stFlrSF'] + X['TotalBsmtSF']
        return X

In [None]:
# Numeric features: add the combined surface column, fill missing values with
# the column median, then standardise.
numeric_pipe = Pipeline(steps=[
    ('surface', AddSurface()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Preprocessing

In [None]:
# Route each group of columns through its own sub-pipeline; the outputs are
# concatenated in the order listed here (nominal, numeric, ordinal).
preprocess_pipe = ColumnTransformer(transformers=[
    ('cat', categorical_pipe, cat_columns),
    ('num', numeric_pipe, num_columns),
    ('ord', ordinal_pipe, ord_columns),
])

## Removing outliers

In [None]:
class OutlierRemoverComposer(TransformerMixin, BaseEstimator):
    """Train a model only on the rows an outlier detector labels as inliers.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the filtered training rows.
    outlier_estimator : estimator with ``fit_predict(X)``
        Rows for which it returns ``1`` are kept; all others are discarded.
    **kwargs
        Stored on the instance as ``kwargs``; not used by this class.
    """

    def __init__(self, model, outlier_estimator, **kwargs):
        self.model = model
        self.outlier_estimator = outlier_estimator
        self.kwargs = kwargs

    def fit(self, X, y):
        """Filter ``X``/``y`` with the outlier detector, then fit ``model``.

        Returns ``self``.
        """
        labels = self.outlier_estimator.fit_predict(X)
        keep = labels == 1  # boolean row mask: True where the detector returned 1
        self.model.fit(X[keep], y[keep])
        return self

    def predict(self, X, y=None):
        """Delegate to ``model.predict``; no filtering is applied here. ``y`` is ignored."""
        return self.model.predict(X)


In [None]:
# Outlier detector: preprocess the features, then score rows with an
# isolation forest (its own fixed seed, 100 samples per tree).
outlier_detection = Pipeline(steps=[
    ('pp', preprocess_pipe),
    ('outlier', IsolationForest(max_samples=100, random_state=42)),
])

## Output processing

In [None]:
# Target transformation: natural log (inverted with exp), then standardise.
output_pipe = Pipeline(steps=[
    ('log', FunctionTransformer(func=np.log, inverse_func=np.exp)),
    ('scaler', StandardScaler()),
])

## XGBoost model

In [None]:
# Gradient-boosted tree regressor with fixed hyperparameters.
model = XGBRegressor(
    booster="gbtree",
    n_estimators=400,
    max_depth=4,
    min_child_weight=8,
    gamma=0.01,
    subsample=0.3,
    colsample_bytree=0.9,
    random_state=fixed_seed,
)

In [None]:
# Feature preprocessing followed by the XGBoost regressor.
model_pipe = Pipeline(steps=[
    ('pp', preprocess_pipe),
    ('xgb', model),
])

In [None]:
# Wrap the regression pipeline so the target is transformed by `output_pipe`
# for training and predictions are mapped back through its inverse.
full_pipe = TransformedTargetRegressor(
    regressor=model_pipe,
    transformer=output_pipe,
)

In [None]:
# Final estimator: drop the rows flagged by `outlier_detection`, then fit `full_pipe`.
outlier_pipe = OutlierRemoverComposer(
    model=full_pipe,
    outlier_estimator=outlier_detection,
)

# Model training

In [None]:
# Fit on the 80% training split; the trailing `;` hides the estimator repr.
outlier_pipe.fit(X=X_train, y=y_train);

In [None]:
def RMSLE(y_true, y_pred):
    """Root mean squared error between the natural logs of two value sets.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are passed through ``np.log``.

    Returns
    -------
    float
        ``sqrt`` of the mean squared difference of the logged values.
    """
    log_true = np.log(y_true)
    log_pred = np.log(y_pred)
    return sqrt(mean_squared_error(log_true, log_pred))

## Evaluation

## Grid search hyperparameter estimation

In [None]:
# from hypopt import GridSearch
# params = {'min_child_weight':[6,7,8,9], 'gamma':[i/100.0 for i in range(1,5)],  'subsample':[i/10.0 for i in range(2,5)],
# 'colsample_bytree':[i/10.0 for i in range(8,10)], 'max_depth': [3,4,5]}

# model = XGBRegressor(booster="gbtree")
# grid = GridSearch(model, params)
# grid.fit(X_train, y_train, X_val, y_val)
# grid.best_params

### Train

In [None]:
# Predictions for the training rows. No manual inverse transform is needed:
# the wrapped TransformedTargetRegressor already maps predictions back.
y_pred = outlier_pipe.predict(X=X_train)

In [None]:
# RMSLE on the training split; the bare name on the last line displays it.
score = RMSLE(y_true=y_train, y_pred=y_pred)
score

In [None]:
# Distribution of the training residuals (prediction minus observed price).
fig, ax = plt.subplots(figsize=(15, 8))
sns.distplot(y_pred - y_train, ax=ax)
# Label the figure explicitly: the residual Series keeps the target's name,
# which would otherwise be used as the x-axis label.
ax.set(title='Training residuals', xlabel='Predicted - actual SalePrice')
plt.show()

### Validation

In [None]:
# Predictions for the validation rows. No manual inverse transform is needed:
# the wrapped TransformedTargetRegressor already maps predictions back.
y_pred = outlier_pipe.predict(X=X_val)

In [None]:
# RMSLE on the validation split; the bare name on the last line displays it.
score = RMSLE(y_true=y_val, y_pred=y_pred)
score

In [None]:
# Distribution of the validation residuals (prediction minus observed price).
fig, ax = plt.subplots(figsize=(15, 8))
sns.distplot(y_pred - y_val, ax=ax)
# Label the figure explicitly: the residual Series keeps the target's name,
# which would otherwise be used as the x-axis label.
ax.set(title='Validation residuals', xlabel='Predicted - actual SalePrice')
plt.show()

## Test

In [None]:
# Predictions for the unlabelled test set. No manual inverse transform is
# needed: the wrapped TransformedTargetRegressor already maps predictions back.
test_pred = outlier_pipe.predict(X=test)

In [None]:
# One predicted SalePrice per test row, indexed like the test set, written to CSV.
submission = DataFrame(data={"SalePrice": test_pred}, index=test.index)
submission.to_csv(path_or_buf='test-prediction.csv')

## Save prepared datasets

In [None]:
# X_train_process.to_csv('X_train.csv')
# X_val.to_csv('X_val.csv')
# test.to_csv('X_test.csv')
# y_train_clean.to_csv('y_train.csv', header=True)
# y_val.to_csv('y_val.csv', header=True)