# Set-up

In [3]:
# Import libraries

# General
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, make_pipeline  # Same, but with the latter it is not necessary to name estimator and transformer
#from imblearn.pipeline import Pipeline as Imb_Pipe
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, f_classif, GenericUnivariateSelect, mutual_info_classif
import eli5

# Predictive Modeling (Models)
from sklearn.dummy import DummyClassifier, DummyRegressor
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, NuSVC, SVR
from sklearn.linear_model import LinearRegression, LogisticRegression, PassiveAggressiveRegressor, ElasticNet, SGDRegressor, RANSACRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, VotingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost import XGBClassifier, XGBRegressor
from scipy.stats import randint

# Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, fbeta_score, accuracy_score, confusion_matrix, f1_score, precision_recall_curve, recall_score, precision_score
from scipy.sparse import csr_matrix

# Neural Networks
from keras import models, layers, optimizers, regularizers
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG

In [4]:
# Dashboard (Global Variables)
dataset_loc = "berlin"  # "berlin", "paris", "amsterdam"
dataset_date = "2020-03-17"  # "2019-12-11", "2020-01-10", "2020-02-18", "2020-03-17", "2020-05-14"
target = 'price_log'  # for regression: 'occupancy_rate', 'price_log' | for classification: 'price_class', 'occupancy_class'
drop_cols = [
    'occupancy_rate', 'occupancy_class', 'price_class',
    "review_scores_class_new", "review_scores_class", "review_scores_calc"
]  # additional columns to drop: 'occupancy_class', 'occupancy_rate', 'price_log', 'price_class'
scoring = 'neg_mean_squared_error'  # for regression: 'neg_mean_squared_error', 'r2', 'neg_mean_poisson_deviance', 'neg_median_absolute_error' | for classification: "f1", "recall", "precision", "accuracy", "roc_auc"
test_size = 0.2
random_state = 42
pd.set_option('display.max_rows', 200)

In [5]:
# Import data_engineered
data = pd.read_pickle(f"saves/{dataset_loc}_{dataset_date}/data_engineered.pkl")

In [6]:
# Alternative: Import from csv
#data_types_engineered = pd.read_csv('saves/types_engineered.csv')['types']
#data = pd.read_csv("saves/data_engineered.csv", dtype=data_types_engineered.to_dict())
#data.set_index('id', inplace=True)

# Preprocessing (Train/Test Split and Pipeline)

In [7]:
# Drop columns
data = data.drop(drop_cols, axis=1)

In [8]:
# Drop rows (optional, just temporary)
#data = data[data.number_of_reviews_ltm_log>1.7]

## Preprocessing pipeline

In [9]:
# Create list for categorical predictors/features (used in "Scaling with Preprocessing Pipeline")
cat_features = list(data.columns[data.dtypes == object])
#cat_features.remove("neighbourhood")
#cat_features.remove("zipcode")
cat_features

['cancellation_policy',
 'host_response_time',
 'neighbourhood_cleansed',
 'property_type',
 'room_type',
 'zipcode']

In [10]:
# Create list for numerical predictors/features (removing target column, used in "Scaling with Preprocessing Pipeline")
num_features = list(data.columns[data.dtypes != object])
num_features.remove(target)
num_features

['accommodates',
 'accommodates_per_bed',
 'am_balcony',
 'am_breakfast',
 'am_child_friendly',
 'am_elevator',
 'am_essentials',
 'am_nature_and_views',
 'am_pets_allowed',
 'am_private_entrance',
 'am_smoking_allowed',
 'am_tv',
 'am_white_goods',
 'availability_90',
 'bathrooms_log',
 'bedrooms',
 'calc_host_lst_count_sqrt_log',
 'first_review_days_sqrt',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_response_rate',
 'instant_bookable',
 'last_review_days_sqrt',
 'latitude',
 'listing_no',
 'longitude',
 'maximum_nights',
 'minimum_nights_log',
 'number_of_reviews_ltm_log',
 'price_extra_fees_sqrt',
 'price_extra_people',
 'review_scores_location',
 'review_scores_rating_sqrt',
 'text_len_sqrt',
 'wk_mth_discount']

In [11]:
# Build preprocessor pipeline
# Pipeline for numerical features
num_pipeline = Pipeline([('imputer_num', SimpleImputer(strategy='median')),
                         ('std_scaler', StandardScaler())])

# Pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(drop='first', handle_unknown='error'))
])

# Complete pipeline
preprocessor = ColumnTransformer([('num', num_pipeline, num_features),
                                  ('cat', cat_pipeline, cat_features)])

## Train/test split

In [12]:
# Define predictors and target variable
X = data.drop([target], axis=1)
y = data[target]

In [13]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    shuffle=True)
#                                                   stratify=y) # Use stratify=y if labels are inbalanced (e.g. most wines are 5 or 6; check with value_counts()!)

In [14]:
# Saving preprocessed X_train and X_test
X_train_prep_preprocessor = preprocessor.fit(X_train)

X_train_prep = X_train_prep_preprocessor.transform(X_train)
X_train_num_prep = num_pipeline.fit_transform(X_train[num_features])
X_test_prep = X_train_prep_preprocessor.transform(X_test)

In [15]:
# Get feature names from pipeline after one-hot encoding as "column_names"
onehot_columns = list(preprocessor.named_transformers_['cat']['1hot'].get_feature_names(cat_features))
column_names = num_features + onehot_columns

In [23]:
# Define function for feature importances
def feat_importances(model,
                     cv_model=True,
                     named_steps=None,
                     column_names=column_names):
    if cv_model == True:
        if named_steps == None:
            print("Please provide named_steps parameter")
#            break
        else:
            importances = pd.DataFrame(
                model.best_estimator_.named_steps[named_steps].
                feature_importances_,
                index=column_names,
                columns=['weight'])
    else:
        importances = pd.DataFrame(model.feature_importances_,
                                   index=column_names,
                                   columns=['weight'])
    importances.sort_values('weight', inplace=True, ascending=False).plot(
        kind='barh', figsize=(20, len(importances) / 2))
    return importances

## Outlier Detection

In [14]:
# Preprocess data
#train_outl = num_pipeline.fit_transform(X_train[num_features], y_train)

In [15]:
# Fit DBSCAN model
#outl_model = DBSCAN(eps=3.0, min_samples=10).fit(train_outl)
#outl_labels = outl_model.labels_

In [16]:
# Display results (# of outliers)
#pd.Series(outl_labels).value_counts()

In [17]:
# Illustrate results
#plt.figure(figsize=(10,10))
#
#unique_labels = set(outl_labels)
#colors = ['blue', 'red']
#
#for color,label in zip(colors, unique_labels):
#    sample_mask = [True if l == label else False for l in outl_labels]
#    plt.plot(train_outl[:,0][sample_mask], train_outl[:, 1][sample_mask], 'o', color=color);
#plt.xlabel('accommodates_per_bed');
#plt.ylabel('accommodates_per_room');

Interpreting the results:

- https://www.kaggle.com/kevinarvai/outlier-detection-practice-uni-multivariate
- https://datascience.stackexchange.com/questions/46092/how-do-we-interpret-the-outputs-of-dbscan-clustering

## Perform Feature Selection (add most useful to modeling pipeline)

In [18]:
# Set X_fs to desired variable
X_fs = X_train[
    num_features]  # X_train_prep, X_train_num_prep, X_train[num_features]
#X_fs = pd.DataFrame(X_fs, columns = X_train_prep_cols)

**GenericUnivariateSelect** (Classification and Regression)

In [19]:
# Apply GenericUnivariateSelect
trans_GUS = GenericUnivariateSelect(score_func=lambda X, y: X.mean(axis=0),
                                    mode='k_best',
                                    param=15)  #mode='percentile', 'k_best'
X_train_GUS = trans_GUS.fit_transform(X_fs, y_train)

**mutual_info_classif** (Classification)

In [20]:
# Fit mutual_info_classif
#X_train_mic = mutual_info_classif(X_fs, y_train)

In [21]:
# Plot feature importance
#plt.subplots(1, figsize=(26, 1))
#sns.heatmap(X_train_mic[:, np.newaxis].T, cmap='Blues', cbar=False, linewidths=1, annot=True)
#plt.yticks([], [])
#plt.gca().set_xticklabels(X_fs.columns, rotation=45, ha='right', fontsize=12)
#plt.suptitle("Feature Importance (mutual_info_classif)", fontsize=18, y=1.2)
#plt.gcf().subplots_adjust(wspace=0.2)
#pass

In [22]:
# Apply GenericUnivariateSelect to reduce features (optional)
#trans_mic = GenericUnivariateSelect(score_func=mutual_info_classif, mode='k_best', param=15) #mode='percentile', 'k_best',
#X_train_mic_GUS = trans_mic.fit_transform(X_fs, y_train)

In [23]:
# Print kept features
#print("We started with {0} features but retained only {1} of them!".format(
#    X_fs.shape[1] - 1, X_train_mic_GUS.shape[1]))

#columns_retained_Select = X_fs.columns[trans_mic.get_support()].values
#pd.DataFrame(X_train_mic_GUS, columns=columns_retained_Select).head()

**chi2** (Classification)

**mutual_info_regression** (Regression)

# Predictive Modeling: Classification ("price_class")

## Apply Classification Models

In [133]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" +
      " as the target")
print("The target variable y is currently set to " +
      f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_CLASS[0m as the target
The target variable y is currently set to [1mPRICE_CLASS[0m


In [134]:
# Select models for comparison
models = {
    'Baseline':
    DummyClassifier(strategy='most_frequent'),
    'LogReg':
    LogisticRegression(max_iter=1000),
    'KNN':
    KNeighborsClassifier(),
    'SVC':
    SVC(kernel='rbf', C=1E6),
    'Decision Tree':
    DecisionTreeClassifier(criterion="gini",
                           max_depth=3,
                           random_state=random_state),
    'Random Forest':
    RandomForestClassifier(random_state=random_state,
                           max_features='sqrt',
                           n_jobs=-1),
    'Gradient Boost':
    GradientBoostingClassifier(random_state=random_state),
    'XGBoost':
    XGBClassifier(),
    'AdaBoost':
    AdaBoostClassifier(random_state=random_state)
}

In [135]:
# Calculate and display results
results = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'R2'])
i = 0
for m in models.items():
    # Building a full pipeline with our preprocessor and a Classifier
    pipe = Pipeline([('preprocessor', preprocessor), (m[0], m[1])])
    # Making predictions on the training set using cross validation as well as calculating the probabilities
    y_train_pred = cross_val_predict(pipe,
                                     X_train,
                                     y_train.values.ravel(),
                                     cv=5,
                                     verbose=4,
                                     n_jobs=-1)
    # Calculating metrices
    temp = pd.DataFrame(
        {
            'Model': m[0],
            'MSE': mean_squared_error(y_train, y_train_pred),
            'RMSE': mean_squared_error(y_train, y_train_pred, squared=False),
            'MAE': mean_absolute_error(y_train, y_train_pred),
            'R2': r2_score(y_train, y_train_pred),
            'Accuracy': accuracy_score(y_train, y_train_pred),
            'Recall': recall_score(y_train, y_train_pred, average="weighted"),
            'Precision': precision_score(
                y_train, y_train_pred, average="weighted"),
            'F1 Score': f1_score(y_train, y_train_pred, average="weighted")
        },
        index=[i])
    print(f"Confusion Matrix {m[0]}: \n" +
          str(confusion_matrix(y_train, y_train_pred)))
    i += 1
    results = pd.concat([results, temp])
results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix Baseline: 
[[   0    0 1085    0    0]
 [   0    0 1508    0    0]
 [   0    0 1934    0    0]
 [   0    0 1174    0    0]
 [   0    0  903    0    0]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.3s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix LogReg: 
[[ 527  411  116   14   17]
 [ 309  704  418   60   17]
 [  97  377 1123  253   84]
 [  20   63  536  342  213]
 [  10    7  148  201  537]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix KNN: 
[[452 412 173  41   7]
 [384 634 394  73  23]
 [287 541 811 219  76]
 [ 99 206 446 294 129]
 [ 30  76 207 184 406]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.0s remaining:   13.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix SVC: 
[[539 374 147  18   7]
 [397 619 404  71  17]
 [168 430 927 333  76]
 [ 29 110 438 390 207]
 [  7  28 151 233 484]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
  _warn_prf(average, modifier, msg_start, len(result))


Confusion Matrix Decision Tree: 
[[ 248  654  101    0   82]
 [ 176  903  375    0   54]
 [  57  540 1093    0  244]
 [  23  106  699    0  346]
 [  22   38  275    0  568]]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.1s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix Random Forest: 
[[ 479  462  123    9   12]
 [ 268  746  448   37    9]
 [  72  395 1226  180   61]
 [  11   67  592  314  190]
 [   3   10  194  155  541]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.0s remaining:   22.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix Gradient Boost: 
[[ 541  435   86   14    9]
 [ 298  741  410   37   22]
 [  85  430 1111  215   93]
 [  10   72  568  310  214]
 [   6   16  158  167  556]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.0s remaining:   13.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Confusion Matrix XGBoost: 
[[ 505  463   96    9   12]
 [ 283  764  414   25   22]
 [  74  434 1162  172   92]
 [  13   76  603  262  220]
 [   6   14  177  144  562]]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.0s remaining:    1.5s


Confusion Matrix AdaBoost: 
[[ 364  574  113    6   28]
 [ 260  757  421   34   36]
 [ 103  367 1127  203  134]
 [  18   66  542  275  273]
 [  16   10  166  158  553]]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.5s finished


Unnamed: 0,Model,MSE,RMSE,R2,MAE,Accuracy,Recall,Precision,F1 Score
0,Baseline,1.610236,1.268951,-0.006986,1.008177,0.292853,0.292853,0.085763,0.132672
1,LogReg,0.900061,0.948715,0.437133,0.626287,0.489552,0.489552,0.487786,0.48555
2,KNN,1.444579,1.201906,0.09661,0.850697,0.393247,0.393247,0.404764,0.393662
3,SVC,1.023622,1.011742,0.359862,0.695639,0.448062,0.448062,0.449577,0.448038
4,Decision Tree,1.305421,1.14255,0.183635,0.774833,0.425803,0.425803,0.355363,0.374278
5,Random Forest,0.832374,0.912346,0.479462,0.601908,0.500606,0.500606,0.506176,0.49385
6,Gradient Boost,0.856905,0.925692,0.464121,0.612508,0.493489,0.493489,0.493111,0.488081
7,XGBoost,0.863568,0.929283,0.459955,0.614627,0.492883,0.492883,0.493327,0.48366
8,AdaBoost,0.998789,0.999394,0.375392,0.668686,0.465778,0.465778,0.461504,0.454862


# Predictive Modeling: Regression ("price_log")

## Apply Regression Models

In [24]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" +
      " as the target")
print("The target variable y is currently set to " +
      f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [25]:
# Select models for comparison
models = {
    'Baseline':
    DummyRegressor(strategy='mean'),
    'LinReg':
    LinearRegression(),
    'Passive Aggressive':
    PassiveAggressiveRegressor(),
    #        'RANSAC' : RANSACRegressor(),
    'ElasticNet':
    ElasticNet(),
    'Stochastic Gradient Descent':
    SGDRegressor(max_iter=1000, tol=1e-3),
    'Decision Tree':
    DecisionTreeRegressor(criterion="mse",
                          max_depth=3,
                          random_state=random_state),
    'Random Forest':
    RandomForestRegressor(random_state=random_state,
                          max_features='sqrt',
                          n_jobs=-1),
    'Gradient Boost':
    GradientBoostingRegressor(random_state=random_state),
    'XGBoost':
    XGBRegressor(),
    'AdaBoost':
    AdaBoostRegressor(random_state=random_state),
    'SVR':
    SVR()
}

In [26]:
# Calculate and display results
results = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'R2'])
i = 0
for m in models.items():
    # Building a full pipeline with our preprocessor and a Classifier
    pipe = Pipeline([('preprocessor', preprocessor), (m[0], m[1])])
    # Making predictions on the training set using cross validation as well as calculating the probabilities
    y_train_pred = cross_val_predict(pipe,
                                     X_train,
                                     y_train.values.ravel(),
                                     cv=5,
                                     verbose=4,
                                     n_jobs=-1)
    # Calculating metrices
    temp = pd.DataFrame(
        {
            'Model':
            m[0],
            'MSE':
            "{:.2f}".format(mean_squared_error(y_train, y_train_pred)),
            'RMSE':
            "{:.2f}".format(
                mean_squared_error(y_train, y_train_pred, squared=False)),
            'MAE':
            "{:.2f}".format(mean_absolute_error(y_train, y_train_pred)),
            'R2':
            "{:.2f}".format(r2_score(y_train, y_train_pred))
        },
        index=[i])
    i += 1
    results = pd.concat([results, temp])
results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.1s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.7s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_

Unnamed: 0,Model,MSE,RMSE,R2,MAE
0,Baseline,0.33,0.57,-0.0,0.46
1,LinReg,0.14,0.37,0.57,0.29
2,Passive Aggressive,0.28,0.53,0.14,0.41
3,ElasticNet,0.33,0.57,-0.0,0.46
4,Stochastic Gradient Descent,0.25,0.5,0.24,0.38
5,Decision Tree,0.2,0.45,0.39,0.34
6,Random Forest,0.14,0.37,0.59,0.28
7,Gradient Boost,0.13,0.36,0.61,0.27
8,XGBoost,0.13,0.36,0.61,0.27
9,AdaBoost,0.19,0.43,0.44,0.34


## Reg Model 1: XGBoost

In [27]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" +
      " as the target")
print("The target variable y is currently set to " +
      f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [28]:
# Create pipeline to use in RandomSearchCV and GridSearchCV
pipeline_xgb_reg = Pipeline([('preprocessor', preprocessor),
                             ('xgb_reg',
                              XGBRegressor(n_estimators=184,
                                           learning_rate=0.05,
                                           random_state=random_state,
                                           max_depth=8,
                                           bootstrap=True,
                                           max_features=20,
                                           scoring=scoring,
                                           n_jobs=-1))])

### Hyperparameter Pre-Tuning with RandomizedSearchCV

In [29]:
# Display possible hyperparameters for XGBoost Regressor
test_xgb_reg = XGBRegressor()
test_xgb_reg.get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'importance_type', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

**Default values for XGBRegressor** (as base for hyperparameter search):

max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain'

In [30]:
# Define hyperparameter distribution
param_distribs_xgb_reg = {
    'xgb_reg__n_estimators': randint(low=10, high=200),
    'xgb_reg__max_depth': randint(low=1, high=10),
    'xgb_reg__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]
}

In [31]:
# Create and fit RandomizedSearchCV, save "best_model"
rnd_xgb_reg = RandomizedSearchCV(pipeline_xgb_reg,
                                 param_distribs_xgb_reg,
                                 cv=5,
                                 scoring=scoring,
                                 n_iter=20,
                                 return_train_score=True,
                                 verbose=4,
                                 n_jobs=-1,
                                 random_state=random_state)

best_model_rnd_xgb_reg = rnd_xgb_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.3min finished




In [32]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(rnd_xgb_reg.best_score_))
print("Best parameters:\n{}".format(rnd_xgb_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
-0.12
Best parameters:
{'xgb_reg__learning_rate': 0.1, 'xgb_reg__max_depth': 7, 'xgb_reg__n_estimators': 181}


### Hyperparameter Tuning with GridSearchCV

In [33]:
# Define hyperparameter grid
param_grid_xgb_reg = {
#    'xgb_reg__bootstrap': [True, False],
#    'xgb_reg__n_estimators': [190, 230, 290],
    'xgb_reg__max_features': [15, 30, 45],
#    'xgb_reg__max_depth': [6, 8, 10],
    'xgb_reg__learning_rate': [0.01, 0.02, 0.05, 0.1]
}

In [34]:
# Create and fit GridSearchCV, save "best_model"
grid_xgb_reg = GridSearchCV(pipeline_xgb_reg,
                            param_grid_xgb_reg,
                            cv=5,
                            scoring=scoring,
                            return_train_score=True,
                            verbose=4,
                            n_jobs=-1)

grid_xgb_reg.fit(X_train, y_train)
best_model_xgb_reg = grid_xgb_reg.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.1min finished




In [35]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(grid_xgb_reg.best_score_))
print("Best parameters:\n{}".format(grid_xgb_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
-0.12
Best parameters:
{'xgb_reg__learning_rate': 0.05, 'xgb_reg__max_features': 15}


### Feature Importances

In [36]:
type(grid_xgb_reg)

sklearn.model_selection._search.GridSearchCV

In [37]:
# Display feature importances
fi_xgb_reg = pd.DataFrame(grid_xgb_reg.best_estimator_.named_steps['xgb_reg'].feature_importances_, columns=['weight'], index=column_names)
fi_xgb_reg.sort_values('weight', inplace=True, ascending=False)
fi_xgb_reg

Unnamed: 0,weight
room_type_Private room,0.433737
property_type_Boutique hotel,0.029881
bedrooms,0.028565
room_type_Shared room,0.027884
neighbourhood_cleansed_Osloer Straße,0.01943
room_type_Hotel room,0.012455
neighbourhood_cleansed_Alexanderplatz,0.010959
zipcode_zip_10178,0.009162
zipcode_zip_12099,0.008989
bathrooms_log,0.008735


### Final Evaluation Best Model

In [None]:
# Predict target with "best model"
y_train_pred_xgb_reg = best_model_xgb_reg.predict(X_train_prep)

In [None]:
# Final evaluation of "best model"
print("MSE: {:.2f}".format(mean_squared_error(y_test, y_train_pred_xgb_reg))),
print("RMSE: {:.2f}".format(
    mean_squared_error(y_test, y_train_pred_xgb_reg, squared=False))),
print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_train_pred_xgb_reg))),
print("R2: {:.2f}".format(r2_score(y_test, y_train_pred_xgb_reg))),

In [None]:
# Display confidence interval (scipy stats)
confidence = 0.95
squared_errors = (y_train_pred_xgb_reg - y_test)**2
np.sqrt(
    stats.t.interval(confidence,
                     len(squared_errors) - 1,
                     loc=squared_errors.mean(),
                     scale=stats.sem(squared_errors)))

## Reg Model 2: Support Vector Machines

In [58]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" +
      " as the target")
print("The target variable y is currently set to " +
      f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [59]:
# Create pipeline to use in RandomSearchCV and GridSearchCV
pipeline_svm_reg = Pipeline([('preprocessor', preprocessor),
                             ('svm_reg', SVR(kernel='rbf', C=1, degree=3))])

### Hyperparameter Pre-Tuning with RandomizedSearchCV

In [60]:
# Display possible hyperparameters for Supply Vector Machine
test_svr_reg = SVR()
test_svr_reg.get_params().keys()

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])

**Default values for Supply Vector Machine** (as base for hyperparameter search):

kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1

In [61]:
# Define hyperparameter distribution
param_distribs_svm_reg = {
    'svm_reg__kernel': ['linear', 'poly', 'rbf'],
    'svm_reg__C': [0.1, 0.5, 1, 2, 5, 10, 50, 100, 500, 1000],
    'svm_reg__degree': randint(low=1, high=5)
}

In [62]:
# Create and fit RandomizedSearchCV, save "best_model"
rnd_svm_reg = RandomizedSearchCV(pipeline_svm_reg,
                                 param_distribs_svm_reg,
                                 cv=2,
                                 scoring=scoring,
                                 n_iter=1,
                                 return_train_score=True,
                                 verbose=4,
                                 n_jobs=-1,
                                 random_state=random_state)

best_model_rnd_svm_reg = rnd_svm_reg.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  6.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  6.0min finished


In [63]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(rnd_svm_reg.best_score_))
print("Best parameters:\n{}".format(rnd_svm_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
0.56
Best parameters:
{'svm_reg__C': 50, 'svm_reg__degree': 4, 'svm_reg__kernel': 'linear'}


### Hyperparameter Tuning with GridSearchCV

In [51]:
# Define hyperparameter grid
param_grid_svm_reg = {
#    'svm_reg__kernel': ['linear', 'poly', 'rbf'],
#    'xgb_reg__C': [0.01, 0.05, 0.1, 1, 5, 10, 50, 100],
#    'xgb_reg__degree': [1, 3, 5]
}

In [55]:
# Create and fit GridSearchCV, save "best_model"
grid_svm_reg = GridSearchCV(pipeline_svm_reg,
                            param_grid_svm_reg,
                            cv=5,
                            scoring=scoring,
                            return_train_score=True,
                            verbose=4,
                            n_jobs=-1)

grid_svm_reg.fit(X_train, y_train)
best_model_svm_reg = grid_svm_reg.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.5min finished




In [56]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(grid_svm_reg.best_score_))
print("Best parameters:\n{}".format(grid_svm_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
-0.12
Best parameters:
{'xgb_reg__learning_rate': 0.05, 'xgb_reg__max_features': 15}


### Feature Importances

In [45]:
# Display feature importances
fi_svm_reg = pd.DataFrame(grid_svm_reg.best_estimator_.named_steps['svm_reg'].feature_importances_, columns=['weight'], index=column_names)
fi_svm_reg.sort_values('weight', inplace=True, ascending=False)
fi_svm_reg

Unnamed: 0,weight
room_type_Private room,0.405381
bedrooms,0.028635
room_type_Shared room,0.027683
property_type_Boutique hotel,0.025367
neighbourhood_cleansed_Osloer Straße,0.017542
room_type_Hotel room,0.013085
neighbourhood_cleansed_Alexanderplatz,0.013
bathrooms_log,0.011078
calc_host_lst_count_sqrt_log,0.009484
zipcode_zip_10119,0.009205


### Final Evaluation Best Model

## Final Evaluation with Testing Set

In [None]:
best_models_reg = [best_model_xgb_reg, best_model_svm_reg]

In [None]:
# Transform X_test for final evaluation
#X_test_prep = preprocessor.transform(X_test)

In [None]:
# Predict target with "best model"
#y_pred_rf_reg = best_model_rf_reg.predict(X_test_prep)

In [None]:
# Final evaluation of "best model"
#print("MSE: {:.2f}".format(mean_squared_error(y_test, y_pred_rf_reg))),
#print("RMSE: {:.2f}".format(mean_squared_error(y_test, y_pred_rf_reg, squared=False))),
#print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_pred_rf_reg))),
#print("R2: {:.2f}".format(r2_score(y_test, y_pred_rf_reg))),

In [None]:
# Illustrate best model
#fig, axes = plt.subplots(1, 2, figsize = (14, 6))
#axes = axes.flatten()

#y_pred = best_model.predict(X_test_prep)
#axes[0].scatter(y_test, y_pred)
#axes[0].set_xlabel('y_test')
#axes[0].set_ylabel('y_pred')

#coef = best_model.best_estimator_.named_steps['xgb'].coef_
#mean_coef = np.mean(coef)
#axes[1].plot(coef, 'o')
#axes[1].set_xlabel('coefficient index')
#axes[1].set_ylabel('coefficient size')
#axes[1].axhline(y = mean_coef, color = 'red', linestyle = '--', alpha = 0.5)
#plt.show()

In [None]:
# Display confidence interval (scipy stats)
#confidence = 0.95
#squared_errors = (y_pred_rf_reg - y_test) ** 2
#np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
#                         loc=squared_errors.mean(),
#                         scale=stats.sem(squared_errors)))

# Predictive Modeling: Neural Networks ("price_log")

In [None]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" +
      " as the target")
print("The target variable y is currently set to " +
      f"\033[1m{y_upper}\033[0m")

In [None]:
# Build the model
model_nn_seq = models.Sequential()
model_nn_seq.add(
    layers.Dense(128,
                 input_shape=(X_train_prep.shape[1], ),
                 kernel_regularizer=regularizers.l1(0.005),
                 activation='relu'))
model_nn_seq.add(
    layers.Dense(256,
                 kernel_regularizer=regularizers.l1(0.005),
                 activation='relu'))
model_nn_seq.add(
    layers.Dense(256,
                 kernel_regularizer=regularizers.l1(0.005),
                 activation='relu'))
model_nn_seq.add(
    layers.Dense(512,
                 kernel_regularizer=regularizers.l1(0.005),
                 activation='relu'))
model_nn_seq.add(layers.Dense(1, activation='linear'))

# Compile the model
model_nn_seq.compile(loss='mean_squared_error',
                     optimizer='adam',
                     metrics=['mean_squared_error'])

# Model summary
print(model_nn_seq.summary())

# Visualize the neural network
#SVG(model_to_dot(model_nn_seq, show_layer_names=False, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# Train the model
#model_nn_seq_start = time.time()

model_nn_seq_history = model_nn_seq.fit(X_train_prep,
                                        y_train,
                                        epochs=20,
                                        batch_size=256,
                                        validation_split=0.2)

#model_nn_seq_end = time.time()

#print(f"Time taken to run: {round((model_nn_seq_end - model_nn_seq_start)/60,1)} minutes")

In [None]:
# Evaluate the model
#model_nn_seq_model_evaluation(model_nn_seq, skip_epochs=2, X_train=X_train, X_test=X_test)

#score_nn_seq = model_nn_seq.evaluate(X_train_prep, y_train,verbose=1)
#print(score_nn_seq)