### Import Libraries and Read in Data

In [73]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objects as go
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import time

In [74]:
# Read in Data
# All three CSV files live in the same input directory, so the directory is
# named once here instead of being repeated in every path.
DATA_DIR = "../input/math301-final-project-data"

# index_col=0: the first column of each file is used as the DataFrame index.
df = pd.read_csv(DATA_DIR + "/encoded_data.csv", index_col=0)
train = pd.read_csv(DATA_DIR + "/train.csv", index_col=0)
test = pd.read_csv(DATA_DIR + "/test.csv", index_col=0)

### RF on Features selected by RF Feature Importance - with increased number of estimators

In [75]:
# Drop features with feature importance 0 (by RF Feature Importance)

# 'price' is the regression target; the remaining labels are the features
# being dropped. Keeping the list in one place guarantees that train and test
# are reduced in exactly the same way.
TARGET_COL = 'price'
ZERO_IMPORTANCE_COLS = ['year_2018', 'year_2020', 'room_type_Hotel room', 'year_2019']
DROPPED_COLS = [TARGET_COL] + ZERO_IMPORTANCE_COLS

train_features = train.drop(labels=DROPPED_COLS, axis=1)
test_features = test.drop(labels=DROPPED_COLS, axis=1)

# .values discards the column names, so a difference in column order between
# the two frames would silently feed mismatched features to the models.
assert train_features.columns.tolist() == test_features.columns.tolist(), \
    "train and test feature columns differ in name or order"

X_train = train_features.values
# Double brackets select a one-column DataFrame, so the targets are 2-D
# arrays with a single column (later cells flatten y_train before fitting).
y_train = train[[TARGET_COL]].values

X_test = test_features.values
y_test = test[[TARGET_COL]].values

In [76]:
# RandomForestRegressor Estimator (with increased number of estimators)
# The estimator is only configured here; fitting happens in the next cell.
# random_state is fixed so repeated fits give the same forest.
rf_fi_fs = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

In [77]:
%%time
# Fit Feature Selected Data
# y_train is a single-column 2-D array, so it is flattened to a 1-D target
# vector before being handed to the estimator.

rf_fi_fs.fit(X_train, y_train.reshape(-1))

CPU times: user 6min 50s, sys: 3.54 s, total: 6min 53s
Wall time: 1min 46s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [78]:
%%time
# Save Predictions
# Predict the target for the test feature matrix with the fitted random forest.
# rf_fi_fs_pred is compared against y_test by the metric prints that follow.

rf_fi_fs_pred = rf_fi_fs.predict(X_test)

CPU times: user 3.08 s, sys: 48.8 ms, total: 3.13 s
Wall time: 906 ms


In [79]:
# Test-set error metrics for the random forest predictions, each rounded to
# 5 decimal places and printed on its own line.
rf_fi_fs_metrics = (
    ("Median Absolute Error", median_absolute_error),
    ("Mean Absolute Error", mean_absolute_error),
    ("R Squared", r2_score),
    ("Mean Squared Error", mean_squared_error),
)
for metric_name, metric_fn in rf_fi_fs_metrics:
    metric_value = round(metric_fn(y_test, rf_fi_fs_pred), 5)
    print("RF FI FS with increased n_estimators | Predicton {}: {}".format(metric_name, metric_value))

RF FI FS with increased n_estimators | Predicton Median Absolute Error: 0.46667
RF FI FS with increased n_estimators | Predicton Mean Absolute Error: 1.18045
RF FI FS with increased n_estimators | Predicton R Squared: 0.99958
RF FI FS with increased n_estimators | Predicton Mean Squared Error: 11.34055


### Permutation Importance

In [80]:
%%time
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance for rf_fi_fs, the random forest fitted in an earlier
# cell; random_state=42 seeds the wrapper so the result is repeatable.
# NOTE(review): the importances are scored on the training data
# (X_train, y_train), not on the test set — confirm this is intended, since
# scores on data the forest has already seen may not reflect generalization.
# NOTE(review): presumably eli5 reuses the already-fitted estimator rather
# than refitting it — confirm against the eli5 documentation.

perm = PermutationImportance(rf_fi_fs, random_state=42).fit(X_train,y_train.ravel())

CPU times: user 19min, sys: 5.57 s, total: 19min 6s
Wall time: 5min 3s


In [81]:
# Column names for the feature matrix, obtained by dropping the same labels
# from `train` that were dropped when X_train was built.
perm_feature_names = train.drop(
    labels=['price', 'year_2018', 'year_2020', 'room_type_Hotel room', 'year_2019'],
    axis=1,
).columns.tolist()

# Display the permutation-importance weights labelled with those names.
eli5.show_weights(perm, feature_names=perm_feature_names, top=300)

Weight,Feature
1.0405  ± 0.1047,room_type_Private room
0.7960  ± 0.0494,longitude
0.7375  ± 0.1304,minimum_nights
0.3746  ± 0.0387,availability_365
0.2852  ± 0.0129,latitude
0.1689  ± 0.0359,calculated_host_listings_count
0.1547  ± 0.0163,last_review_month
0.0798  ± 0.0067,number_of_reviews
0.0792  ± 0.0102,name_cc
0.0581  ± 0.0056,room_type_Shared room


### LGBM Baseline

In [82]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor

# Baseline LightGBM regressor. It is only configured here; fitting happens
# in the next cell. random_state is fixed for repeatable results.
lgbm = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=-1,
    n_jobs=-1,
    random_state=42,
)

In [83]:
%%time
# Fit the LightGBM baseline on the training features; the single-column
# target array is flattened to 1-D first.
lgbm.fit(X_train, y_train.reshape(-1))

CPU times: user 51.6 s, sys: 54.5 ms, total: 51.6 s
Wall time: 51.6 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.03, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=31,
              objective='regression', random_state=42, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [84]:
# Save Predictions
# Predict the target for the test feature matrix with the fitted LightGBM
# model. lgbm_pred is compared against y_test by the metric prints that follow.

lgbm_pred = lgbm.predict(X_test)

In [85]:
# LGBM Baseline Prediction Performance
# Each metric is computed on the test set, rounded to 5 decimal places and
# printed on its own line.

lgbm_metrics = (
    ("Median Absolute Error", median_absolute_error),
    ("Mean Absolute Error", mean_absolute_error),
    ("R Squared", r2_score),
    ("Mean Squared Error", mean_squared_error),
)
for metric_name, metric_fn in lgbm_metrics:
    metric_value = round(metric_fn(y_test, lgbm_pred), 5)
    print("LGBM Baseline | Predicton {}: {}".format(metric_name, metric_value))

LGBM Baseline | Predicton Median Absolute Error: 11.79497
LGBM Baseline | Predicton Mean Absolute Error: 17.75596
LGBM Baseline | Predicton R Squared: 0.97036
LGBM Baseline | Predicton Mean Squared Error: 798.11967


### XGBoost Baseline

In [86]:
%%time

from xgboost import XGBRegressor

# Baseline XGBoost regressor: shallow trees (max_depth=3) with small L1/L2
# regularisation terms.
xgb = XGBRegressor(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.1,
    min_child_weight=3,
    reg_alpha=0.001,
    reg_lambda=1e-6,
    n_jobs=-1,
)
# The single-column target array is flattened to 1-D before fitting.
xgb.fit(X_train, y_train.reshape(-1))

CPU times: user 1min 40s, sys: 20.6 ms, total: 1min 40s
Wall time: 1min 40s


XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0.001,
             reg_lambda=1e-06, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

In [87]:
%%time

# Predict the target for the test feature matrix with the fitted XGBoost
# model. xgb_pred is compared against y_test by the metric prints that follow.
xgb_pred = xgb.predict(X_test)

CPU times: user 920 ms, sys: 0 ns, total: 920 ms
Wall time: 918 ms


In [88]:
# XGB Baseline Prediction Performance
# Each metric is computed on the test set, rounded to 5 decimal places and
# printed on its own line.

xgb_metrics = (
    ("Median Absolute Error", median_absolute_error),
    ("Mean Absolute Error", mean_absolute_error),
    ("R Squared", r2_score),
    ("Mean Squared Error", mean_squared_error),
)
for metric_name, metric_fn in xgb_metrics:
    metric_value = round(metric_fn(y_test, xgb_pred), 5)
    print("XGB Baseline | Predicton {}: {}".format(metric_name, metric_value))

XGB Baseline | Predicton Median Absolute Error: 22.4877
XGB Baseline | Predicton Mean Absolute Error: 38.30211
XGB Baseline | Predicton R Squared: 0.80093
XGB Baseline | Predicton Mean Squared Error: 5359.63985


### Bagging on Best Performing RF Model

In [89]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble whose base estimator is the same random forest
# configuration used for rf_fi_fs (300 trees, seed 42).
# random_state is set on the BaggingRegressor itself so the bootstrap
# resampling is seeded and the reported metrics can be reproduced; seeding
# only the inner forest leaves the resampling unseeded.
# NOTE(review): BaggingRegressor appears to re-seed its base estimators from
# its own random_state, so the inner random_state=42 alone may have no
# effect — confirm against the scikit-learn documentation.
bagr = BaggingRegressor(
    RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42),
    random_state=42,
)

In [90]:
%%time

# Fit the bagging ensemble on the training features; the single-column
# target array is flattened to 1-D first.
bagr.fit(X_train, y_train.reshape(-1))

CPU times: user 21.4 s, sys: 10.5 s, total: 31.9 s
Wall time: 12min 26s


BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                      ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features='auto',
                                                      max_leaf_nodes=None,
                                                      max_samples=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      n_estimators=300,
                                 

In [91]:
%%time 

# Predict the target for the test feature matrix with the fitted bagging
# ensemble. bagr_pred is compared against y_test by the metric prints that follow.
bagr_pred = bagr.predict(X_test)

CPU times: user 28.7 s, sys: 396 ms, total: 29.1 s
Wall time: 8.68 s


In [92]:
# Bagging on Best Performing RF - Prediction Performance
# Each metric is computed on the test set, rounded to 5 decimal places and
# printed on its own line.

bagr_metrics = (
    ("Median Absolute Error", median_absolute_error),
    ("Mean Absolute Error", mean_absolute_error),
    ("R Squared", r2_score),
    ("Mean Squared Error", mean_squared_error),
)
for metric_name, metric_fn in bagr_metrics:
    metric_value = round(metric_fn(y_test, bagr_pred), 5)
    print("Bagging on Best Performing RF | Predicton {}: {}".format(metric_name, metric_value))

Bagging on Best Performing RF | Predicton Median Absolute Error: 2.94133
Bagging on Best Performing RF | Predicton Mean Absolute Error: 6.36247
Bagging on Best Performing RF | Predicton R Squared: 0.98693
Bagging on Best Performing RF | Predicton Mean Squared Error: 352.01356
