In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
import time
import torch
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,  mean_absolute_error
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, average_precision_score, log_loss

In [5]:
# Train data
data = pd.read_csv('C:/Users/susum/Documents/year5_sem1/QBUS3820/Group_Assignment/final_version/data_train.csv')

In [6]:
# Test data
data_test = pd.read_csv('C:/Users/susum/Documents/year5_sem1/QBUS3820/Group_Assignment/final_version/data_test.csv')

# Prep data

In [7]:
# Prep data
predictors = list(data.columns)
predictors.remove('price')
response = ['price']

X_train1 = data.loc[:, predictors]
y_train1  = np.log(data.loc[:, response])


X_test1 = data_test.loc[:, predictors]



# OLS

In [75]:
from sklearn.linear_model import LinearRegression
ols1 = LinearRegression()
ols1.fit(X_train1, y_train1)

y_pred_train = ols1.predict(X_test1)

In [76]:
y_pred = ols1.predict(X_test1)
y_pred_clean = []
for i in y_pred:
    for j in i:
        y_pred_clean.append(j)
y_pred_clean = np.array(y_pred_clean)
y_test_exp = np.exp(y_pred_clean)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

Unnamed: 0,price
0,141.494358
1,441.677048
2,162.504312
3,91.106566
4,104.487113
...,...
24813,114.485013
24814,96.184730
24815,91.776628
24816,88.750147


In [78]:
ids = df.index.values
values = df.values.flatten()

with open('out_OLS.csv', 'w') as f:
    f.write('id,price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

# GBR

In [79]:
from sklearn.ensemble import GradientBoostingRegressor

time_start = time.time()

gbr = GradientBoostingRegressor(learning_rate= 0.05, max_depth = 4, n_estimators= 1500, subsample = 0.8)
gbr.fit(X_train1, y_train1)

y_pred_train = gbr.predict(X_train1)

time_end = time.time()

print(f'time taken is {time_end - time_start} seconds')

print(f'\nTrain MSE is {(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train RMSE is {np.sqrt(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train r2 is {r2_score(y_train1, y_pred_train)}')

  y = column_or_1d(y, warn=True)


time taken is 76.11376476287842 seconds

Train MSE is 0.03653837122083797
Train RMSE is 0.19115012744133333
Train r2 is 0.9319444811525348


In [80]:
y_pred = gbr.predict(X_test1)
y_test_exp = np.exp(y_pred)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

Unnamed: 0,price
0,147.390525
1,518.692220
2,170.676098
3,83.226966
4,73.192505
...,...
24813,100.505675
24814,75.691087
24815,142.142813
24816,79.427262


In [81]:
ids = df.index.values
values = df.values.flatten()

with open('out_gbr.csv', 'w') as f:
    f.write('id,price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

In [96]:
gbr_table = pd.DataFrame(gbr.feature_importances_, columns=['weight'], index=X_train1.columns)
gbr_table.sort_values('weight', ascending=False, inplace=True)
gbr_table.head(10)

Unnamed: 0,weight
bedrooms,0.308062
accommodates,0.210201
cleaning_fee_perc,0.072622
longitude,0.057509
Entire_home_apt,0.043201
bathrooms,0.040285
Private_room,0.034636
latitude,0.023441
security_deposit_perc,0.020763
availability_90,0.016495


# LightGBM

In [82]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb

time_start = time.time()

lbst = lgb.LGBMRegressor(objective='regression', subsample = 1.0, num_leaves = 7, 
                                       n_estimators = 1500, learning_rate = 0.05)

lbst.fit(X_train1, y_train1)

y_pred_train = lbst.predict(X_train1)

time_end = time.time()

print(f'time taken is {time_end - time_start} seconds')

print(f'\nTrain MSE is {(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train RMSE is {np.sqrt(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train r2 is {r2_score(y_train1, y_pred_train)}')


time taken is 1.3100035190582275 seconds

Train MSE is 0.06420380903747057
Train RMSE is 0.2533847056107976
Train r2 is 0.8804154813136077


In [83]:
y_pred = lbst.predict(X_test1)
y_test_exp = np.exp(y_pred)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

Unnamed: 0,price
0,135.865639
1,489.548777
2,168.724694
3,78.820959
4,74.608435
...,...
24813,102.400161
24814,76.068736
24815,148.739687
24816,90.442992


In [84]:
ids = df.index.values
values = df.values.flatten()

with open('out_lbst.csv', 'w') as f:
    f.write('id,price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

In [97]:
lbst_table = pd.DataFrame(lbst.feature_importances_, columns=['weight'], index=X_train1.columns)
lbst_table.sort_values('weight', ascending=False, inplace=True)
lbst_table.head(10)

Unnamed: 0,weight
cleaning_fee_perc,728
latitude,585
longitude,551
host_since,463
first_review,385
security_deposit_perc,336
last_review,332
accommodates,331
maximum_nights,310
zipcode,301


# XG Boosting

In [85]:
xgb_reg_start = time.time()

xgb_reg = xgb.XGBRegressor(subsample = 0.8, n_estimators = 1500, max_depth = 4, learning_rate = 0.05)
xgb_reg.fit(X_train1, y_train1)
training_preds_xgb_reg = xgb_reg.predict(X_train1)

xgb_reg_end = time.time()

print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")

print(f'\nTrain MSE is {(mean_squared_error(y_train1, training_preds_xgb_reg))}')
print(f'Train RMSE is {np.sqrt(mean_squared_error(y_train1, training_preds_xgb_reg))}')
print(f'Train r2 is {r2_score(y_train1, training_preds_xgb_reg)}')



Time taken to run: 0.6 minutes

Train MSE is 0.03878864283041783
Train RMSE is 0.19694832527954592
Train r2 is 0.9277531776865954


In [86]:
y_pred = xgb_reg.predict(X_test1)
y_test_exp = np.exp(y_pred)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

Unnamed: 0,price
0,136.761902
1,497.719055
2,170.689209
3,84.113335
4,74.932732
...,...
24813,99.378433
24814,74.352226
24815,130.234055
24816,79.714218


In [87]:
ids = df.index.values
values = df.values.flatten()

with open('out_xgb.csv', 'w') as f:
    f.write('id,price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

# Random Forest

In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

regr_start = time.time()

rf = RandomForestRegressor(n_estimators = 10000, min_samples_split = 5, min_samples_leaf = 2,
                                        max_features = 'auto', max_depth = 70, bootstrap = True)

rf.fit(X_train1, y_train1)

y_pred_train = rf.predict(X_train1)

print(f'\nTrain MSE is {(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train RMSE is {np.sqrt(mean_squared_error(y_train1, y_pred_train))}')
print(f'Train r2 is {r2_score(y_train1, y_pred_train)}')

In [None]:
y_pred = rf.predict(X_test1)
y_test_exp = np.exp(y_pred)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

In [None]:
ids = df.index.values
values = df.values.flatten()

with open('out_rf.csv', 'w') as f:
    f.write('id,price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

# Stacking Model

In [8]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR        
from pygam import LinearGAM
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.svm import SVR

In [9]:
def get_stacking():
    
    #define the base models
    level0 = list()
    level0.append(('lgbm', lgb.LGBMRegressor(objective='regression', subsample = 0.8, num_leaves = 24, 
                                       n_estimators = 500, learning_rate = 0.05)))
    level0.append(('rf',  RandomForestRegressor(n_estimators = 1800, min_samples_split = 5, min_samples_leaf = 2,
                                        max_features = 'auto', max_depth = 70, bootstrap = True)))
    level0.append(('xgb', xgb.XGBRegressor(colsample_bytree = 0.306, subsample = 1.0, n_estimators = 1975, max_depth = 6,
                            learning_rate = 0.05, reg_lambda = 1000)))
    level0.append(('gbr',GradientBoostingRegressor(learning_rate= 0.05, max_depth = 4, n_estimators= 1500, subsample = 0.8)))
    
    # define meta learner model
    level1 = None

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5, verbose=2, n_jobs=-1)
    return model

In [None]:
stacking = get_stacking()
stacking.fit(X_train,y_train)

In [None]:
# Find prediction 
y_test = stacking.predict(X_test)

In [None]:
y_test_exp = np.exp(y_test)
df = []
df = pd.DataFrame({'price':y_test_exp})
df

In [None]:
ids = df.index.values
values = df.values.flatten()

with open('out_stack3.csv', 'w') as f:
    f.write('id, price\n')
    for id_, val_ in zip(ids, values):
        f.write('%d, %.5f\n'%(id_, val_))

# Plot

In [10]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X_train1, y_train1, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [11]:
def get_models():
    models = dict()
    models['lgbm'] = lgb.LGBMRegressor(objective='regression', subsample = 0.8, num_leaves = 24, 
                                       n_estimators = 500, learning_rate = 0.05)
    models['rf'] =  RandomForestRegressor(n_estimators = 1800, min_samples_split = 5, min_samples_leaf = 2,
                                        max_features = 'auto', max_depth = 70, bootstrap = True)
    models['xgb'] = xgb.XGBRegressor(colsample_bytree = 0.306, subsample = 1.0, n_estimators = 1975, max_depth = 6,
                            learning_rate = 0.05, reg_lambda = 1000)
    models['gbr'] = GradientBoostingRegressor(learning_rate= 0.05, max_depth = 4, n_estimators= 1500, subsample = 0.8)
    models['stack'] = get_stacking()
    return models

In [None]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

>lgbm -0.101 (0.007)
>rf -0.121 (0.008)
>xgb -0.099 (0.007)
>gbr -0.099 (0.007)


