### Imports

In [None]:
# !pip install --upgrade category_encoders rich catboost
# Route notebook output through a rich Console.
from rich.console import Console
console = Console()
# NOTE: this rebinds the builtin `print` to `console.print` for every cell below.
print = console.print
# X and y come from the project-local `wrangling` module
# (presumably the feature matrix and the target — TODO confirm).
from wrangling import X, y

In [None]:
import time
import math
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import matplotlib.pyplot as plt

import yellowbrick as yb
from yellowbrick.features import Rank1D
from yellowbrick.regressor import AlphaSelection, PredictionError, ResidualsPlot
from yellowbrick.datasets import load_energy
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.style import set_palette

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lars
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import OrthogonalMatchingPursuit

from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.isotonic import IsotonicRegression

import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
import catboost as ctb

from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe

np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=21,
)

In [None]:
# `plt.style.context(...)` only builds a context manager; called outside a
# `with` statement it changes nothing. `plt.style.use` applies the style to
# every figure created from here on.
plt.style.use('dark_background')
set_palette('sns_bright')

# White-to-green colormap used for DataFrame background gradients.
cm = sns.color_palette("blend:white,#00ff77", as_cmap=True)

def headd(i):
    """Return a Styler for *i* shaded with the module-level colormap ``cm``.

    ``axis=None`` makes the gradient span the whole table rather than
    being computed per row or per column.
    """
    styler = i.style
    return styler.background_gradient(cmap=cm, axis=None)

### Visuals

In [None]:
## Ranking the features

fig, ax = plt.subplots(1, figsize=(10, 35))
vzr = Rank1D(ax=ax, color='#00ff77')
vzr.fit(X_train, y_train)
vzr.transform(X_train)
sns.despine(left=True, bottom=True)
# `poof()` is the deprecated alias of `show()` in Yellowbrick >= 1.0.
vzr.show();

In [None]:
# Residuals plot for a Ridge model. A residual is the difference between the
# observed target value and the value fitted by the model.

model = Ridge()
visualizer = ResidualsPlot(
    model,
    hist=False,
    qqplot=True,
    size=(600, 200),
    train_color="indigo",
    test_color="#00ff77",
)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# `poof()` is the deprecated alias of `show()` in Yellowbrick >= 1.0.
g = visualizer.show();

In [None]:
# Per-column outlier summary: a value is an outlier when it lies more than
# 3 * IQR below the first quartile or above the third quartile.
X_outliers = pd.DataFrame(index=X.columns, columns=['outliers', 'outliers%'])
n_rows = len(X)

for col in X.columns:
    dtype_name = str(X[col].dtype)
    # Only columns whose dtype name contains 'int' or 'float' are summarised
    # ('int' also covers 'int64' and 'uint8').
    if 'int' not in dtype_name and 'float' not in dtype_name:
        continue

    series = X[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (3 * iqr)
    upper = q3 + (3 * iqr)
    n_outliers = ((series < lower) | (series > upper)).sum()

    # Written one cell at a time, in this order, so new columns are created
    # in the same sequence as before.
    stats = {
        'count': n_rows,
        'q1': q1,
        'q3': q3,
        'iqr': iqr,
        'lower': lower,
        'upper': upper,
        'min': series.min(),
        'max': series.max(),
        'outliers': n_outliers,
        'outliers%': np.round(n_outliers / n_rows * 100),
    }
    for stat_name, stat_value in stats.items():
        X_outliers.loc[col, stat_name] = stat_value

# headd(X_outliers.head(10))

In [None]:
# Distribution of the maximum number of guests (numberOfGuests)
%matplotlib inline

# Compare the numberOfGuests distribution before and after dropping the
# largest listings. The threshold lives in one place so the title, the
# comments and the filter cannot disagree.
max_guests = 400

fig, axs = plt.subplots(ncols=2, figsize=(14, 4))
fig.suptitle(
    'Distribution of max guests (before and after removing large listings > {})'.format(max_guests),
    weight='bold',
    fontsize=12,
)

# Before cleaning
# (`sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it.)
x_axis = X['numberOfGuests'].dropna()
sns.histplot(x=x_axis, kde=True, stat='density', ax=axs[0])
axs[0].set_xlabel('Max guests (before cleaning)')

# Remove listings where numberOfGuests > max_guests
condition = X[X['numberOfGuests'] > max_guests]
rows_to_drop = condition.index
print("You dropped {} rows.".format(condition.shape[0]))
# NOTE(review): only X is filtered here; y and the train/test split created
# earlier still contain these rows — confirm that is intended.
X = X.drop(rows_to_drop, axis=0)
print("Dataset has {} rows, {} columns.".format(*X.shape))

# After cleaning
x_axis = X['numberOfGuests'].dropna()
sns.histplot(x=x_axis, kde=True, stat='density', ax=axs[1])
axs[1].set_xlabel('Max guests (after cleaning)');

In [None]:
## Adding est. annual revenue
print("Dataset has {} rows, {} columns before engineering.".format(*X.shape))
avg_occupancy_per_week = 4
# yield = occupancy per week * y * 52 weeks.
# NOTE(review): y looks like the per-night price target — confirm. Because
# this column is derived from y, it would leak the target if X (with 'yield')
# were later used as model input.
X['yield'] = avg_occupancy_per_week * y * 52

print("Dataset has {} rows, {} columns.".format(*X.shape))


### Linear Regression

In [None]:
# Standardise the features using statistics learned from the training split
# only, then apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Ordinary least-squares baseline fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions for the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
# Actual-vs-predicted scatter; the dashed red diagonal spans the range of the
# actual values and marks where prediction equals truth.
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test, edgecolors=(0, 0, 1))

lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=3)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Evaluate the current predictions against the test targets.

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)


print("The model performance for testing set")
print("--------------------------------------")
# Error metrics are shown as whole numbers, R2 with three decimals.
for label, value in (
    ('MAE:  ', round(mae)),
    ('MSE:  ', round(mse)),
    ('RMSE: ', round(rmse)),
    ('R2:   ', round(r2, 3)),
):
    print('{}{}'.format(label, value))

### Multiple Models

In [None]:
# One default-configured instance of each candidate regressor, keyed by the
# estimator's class name.
regressor_classes = (
    XGBRegressor,
    RandomForestRegressor,
    DecisionTreeRegressor,
    SVR,
    NuSVR,
    LinearSVR,
    KernelRidge,
    LinearRegression,
    Ridge,
    HuberRegressor,
    PassiveAggressiveRegressor,
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    OrthogonalMatchingPursuit,
)
regressors = {cls.__name__: cls() for cls in regressor_classes}


In [None]:
# Fit every candidate, score it on the test split and with 10-fold CV on the
# training split, and collect one summary row per model.
model_rows = []

for key, regressor in regressors.items():

    print('✓', key)

    start_time = time.time()

    model = regressor.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The scorer returns negated MSE, hence the sign flip before the sqrt below.
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             scoring="neg_mean_squared_error",
                             cv=10)

    test_mse = mean_squared_error(y_test, y_pred)
    model_rows.append({
        'Model': key,
        # Minutes spent on fit + predict + cross-validation, kept numeric so
        # the column sorts correctly.
        'Run_Time': round((time.time() - start_time) / 60, 2),
        'MAE': round(mean_absolute_error(y_test, y_pred)),
        'MSE': round(test_mse),
        'R2': round(r2_score(y_test, y_pred), 3),
        'RMSE': round(np.sqrt(test_mse)),
        'RMSE_CV': round(np.mean(np.sqrt(-scores))),
    })

# DataFrame.append was removed in pandas 2.0; build the frame once from the
# collected rows instead of growing it inside the loop.
df_models = pd.DataFrame(
    model_rows,
    columns=['Model', 'Run_Time', 'MAE', 'MSE', 'R2', 'RMSE', 'RMSE_CV'],
)

In [None]:
# Lowest cross-validated RMSE first (ascending is the default order).
df_models.sort_values('RMSE_CV')

### Focusing on XGB

In [None]:
# Candidate values for the XGBoost hyper-parameter search.
hyperparameter_grid = dict(
    n_estimators=[100],
    max_depth=[2, 3, 5],
    learning_rate=[0.001, 0.01],
)

In [None]:
# Randomised search over `hyperparameter_grid` with 3-fold CV, scored by
# negated mean absolute error.
# NOTE(review): n_iter=30 is larger than the number of combinations in the
# grid — check how the installed scikit-learn handles that case.
search_options = {
    'cv': 3,
    'n_iter': 30,
    'scoring': 'neg_mean_absolute_error',
    'n_jobs': -1,
    'verbose': 5,
    'return_train_score': True,
    'random_state': 13,
}
random_cv = RandomizedSearchCV(XGBRegressor(), hyperparameter_grid, **search_options)

random_cv.fit(X_train, y_train)

random_cv.best_estimator_

In [None]:
# Fit the best estimator from the search on the training split and predict
# the test split.
regressor = random_cv.best_estimator_
y_pred = regressor.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the current predictions against the test targets.

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("The model performance for testing set")
print("--------------------------------------")
# Error metrics are shown as whole numbers, R2 with three decimals.
print(f'MAE:  {round(mae)}')
print(f'MSE:  {round(mse)}')
print(f'RMSE: {round(rmse)}')
print(f'R2:   {round(r2, 3)}')

In [None]:
# Side-by-side table of actual targets and predictions rounded to integers.
comparison = {'Actual': y_test, 'Predicted': np.round(y_pred, decimals=0)}
df = pd.DataFrame(comparison)
df

### XGB with DMatrix

In [None]:
# Wrap both splits in DMatrix containers for XGBoost's native train/cv API.
dtrain, dtest = (
    xgb.DMatrix(X_train, label=y_train),
    xgb.DMatrix(X_test, label=y_test),
)

In [None]:
# Naive baseline: predict the mean of the training target for every test row.
mean_train = np.mean(y_train)
baseline_predictions = np.full(y_test.shape, mean_train, dtype=float)
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print(f"Baseline MAE: {mae_baseline:.2f}")

In [None]:
# Starting parameters for the native XGBoost API, evaluated with MAE.
params = dict(
    max_depth=6,
    min_child_weight=1,
    eta=0.3,
    subsample=1,
    colsample_bytree=1,
    objective='reg:squarederror',
    eval_metric="mae",
)

# Upper bound on boosting rounds; early stopping is configured with a
# patience of 10 rounds on the eval set.
num_boost_round = 999

# NOTE(review): the eval set used for early stopping is the test split, so
# the reported test error is optimistically biased — consider a separate
# validation split.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    early_stopping_rounds=10,
    evals=[(dtest, "Test")],
)

In [None]:
# 5-fold cross-validation of the current `params` on the training DMatrix,
# tracking MAE with early stopping.
cv_options = dict(
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10,
)
cv_results = xgb.cv(params, dtrain, **cv_options)
cv_results

In [None]:
# Best (lowest) mean test MAE across the CV rounds, rounded to an integer.
best_cv_mae = cv_results["test-mae-mean"].min()
print('CV-MAE: {}'.format(round(best_cv_mae)))

In [None]:
# Tune max_depth and min_child_weight jointly, scoring each pair with
# 5-fold CV on MAE and keeping the pair with the lowest mean test MAE.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9, 12)
    for min_child_weight in range(5, 8)
]

min_mae = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10,
    )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a zero-based row position, so the round count is position + 1.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth, min_child_weight)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
# NOTE(review): these values are hard-coded rather than taken from the result
# of the max_depth / min_child_weight search — confirm they still match it.
params['max_depth'] = 9
params['min_child_weight'] = 5

# Tune subsample and colsample_bytree jointly over 0.7 .. 1.0 in steps of 0.1.
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7, 11)]
    for colsample in [i/10. for i in range(7, 11)]
]

min_mae = float("Inf")
best_params = None

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a zero-based row position, so the round count is position + 1.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample, colsample)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
%time

# NOTE(review): these values are hard-coded rather than taken from the result
# of the subsample / colsample search — confirm they still match it.
params['subsample'] = 1
params['colsample_bytree'] = 1

min_mae = float("Inf")
best_params = None

# Try progressively smaller learning rates, scoring each with 5-fold CV on MAE.
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
        )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a zero-based row position, so the round count is position + 1.
    boost_rounds = cv_results['test-mae-mean'].argmin() + 1
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta

print("Best params: {}, MAE: {}".format(best_params, min_mae))

In [None]:
# Train with a fixed learning rate of 0.05, early-stopping on the "Test"
# eval set with a patience of 10 rounds.
params.update(eta=0.05)

model = xgb.train(
    params,
    dtrain,
    evals=[(dtest, "Test")],
    num_boost_round=num_boost_round,
    early_stopping_rounds=10,
)

In [None]:
# Retrain without early stopping for best_iteration + 1 rounds.
# NOTE: this overwrites the module-level `num_boost_round` used by the
# earlier search cells.
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    evals=[(dtest, "Test")],
    num_boost_round=num_boost_round,
)

In [None]:
# model evaluation for testing set
# Predict once and reuse the result instead of re-running inference for
# every metric and again for the comparison table.
best_pred = best_model.predict(dtest)

mae = mean_absolute_error(y_test, best_pred)
mse = mean_squared_error(y_test, best_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, best_pred)

print("The model performance for testing set")
print("--------------------------------------")
print('MAE:  {}'.format(round(mae)))
print('MSE:  {}'.format(round(mse)))
print('RMSE: {}'.format(round(rmse)))
print('R2:   {}'.format(round(r2, 3)))

# Side-by-side table of actual targets and model predictions.
df = pd.DataFrame({'Actual': y_test, 'Predicted': best_pred})
df

### Working with HyperOpt

In [None]:
# XGB parameters
# Hyperopt search space for the XGBoost regressor's constructor arguments.
xgb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    colsample_bytree=hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    subsample=hp.uniform('subsample', 0.8, 1),
    n_estimators=100,
)
# Keyword arguments forwarded to the regressor's fit().
# NOTE(review): newer XGBoost releases expect eval_metric and
# early_stopping_rounds on the estimator rather than in fit() — confirm
# against the installed version.
xgb_fit_params = dict(
    eval_metric='rmse',
    early_stopping_rounds=10,
    verbose=False,
)
# Bundle of constructor params, fit params and the loss (RMSE on the
# supplied targets and predictions).
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda y_true, y_hat: np.sqrt(mean_squared_error(y_true, y_hat)),
}

In [None]:
# LightGBM parameters
lgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.8, 1),
    'n_estimators':     100,
}
lgb_fit_params = {
    'eval_metric': 'l2',
    'early_stopping_rounds': 10,
    'verbose': False
}
lgb_para = dict()
lgb_para['reg_params'] = lgb_reg_params
lgb_para['fit_params'] = lgb_fit_params
lgb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))


In [None]:
# CatBoost parameters
ctb_reg_params = {
    'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 0.31, 0.05)),
    'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
    'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    'n_estimators':      100,
    'eval_metric':       'RMSE',
}
ctb_fit_params = {
    'early_stopping_rounds': 10,
    'verbose': False
}
ctb_para = dict()
ctb_para['reg_params'] = ctb_reg_params
ctb_para['fit_params'] = ctb_fit_params
ctb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))

In [None]:
class HPOpt:
    """Hyperopt driver that tunes XGBoost, LightGBM and CatBoost regressors.

    The instance stores one train/test split; each ``*_reg`` method is an
    objective that builds a regressor from the sampled parameters, fits it
    and reports its loss on the test split.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train, self.x_test = x_train, x_test
        self.y_train, self.y_test = y_train, y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Minimise the objective method named *fn_name* with ``fmin``.

        Returns ``(result, trials)`` on success. If ``fmin`` raises, a dict
        with ``'status'`` set to ``STATUS_FAIL`` and the exception text under
        ``'exception'`` is returned instead. An unknown *fn_name* raises
        ``AttributeError`` because the lookup happens outside the ``try``.
        """
        objective = getattr(self, fn_name)
        try:
            best = fmin(fn=objective, space=space, algo=algo,
                        max_evals=max_evals, trials=trials)
        except Exception as err:
            failure = {'status': STATUS_FAIL,
                       'exception': str(err)}
            return failure
        else:
            return best, trials

    def xgb_reg(self, para):
        """Objective for an ``xgb.XGBRegressor`` built from ``para['reg_params']``."""
        return self.train_reg(xgb.XGBRegressor(**para['reg_params']), para)

    def lgb_reg(self, para):
        """Objective for an ``lgb.LGBMRegressor`` built from ``para['reg_params']``."""
        return self.train_reg(lgb.LGBMRegressor(**para['reg_params']), para)

    def ctb_reg(self, para):
        """Objective for a ``ctb.CatBoostRegressor`` built from ``para['reg_params']``."""
        return self.train_reg(ctb.CatBoostRegressor(**para['reg_params']), para)

    def train_reg(self, reg, para):
        """Fit *reg*, predict the test split and return the hyperopt result dict.

        Both splits are passed as ``eval_set`` together with
        ``para['fit_params']``; the loss is ``para['loss_func']`` applied to
        the test targets and the test predictions.
        """
        eval_pairs = [(self.x_train, self.y_train), (self.x_test, self.y_test)]
        reg.fit(self.x_train, self.y_train, eval_set=eval_pairs, **para['fit_params'])
        test_pred = reg.predict(self.x_test)
        return {'loss': para['loss_func'](self.y_test, test_pred), 'status': STATUS_OK}

In [None]:
# Run a 100-evaluation TPE search for each booster on the same splits, each
# with its own fresh Trials object.
obj = HPOpt(X_train, X_test, y_train, y_test)
search_settings = {'algo': tpe.suggest, 'max_evals': 100}

xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), **search_settings)
lgb_opt = obj.process(fn_name='lgb_reg', space=lgb_para, trials=Trials(), **search_settings)
ctb_opt = obj.process(fn_name='ctb_reg', space=ctb_para, trials=Trials(), **search_settings)