In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor                        

from sklearn.metrics import *
from src.model_helper import *

%matplotlib inline
plt.style.use('ggplot')

### load and split data

In [125]:
df = pd.read_csv('data/processed_data.csv')
df.shape

(48782, 235)

In [126]:
df.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,nb_g_Bronx,...,nb_Westerleigh,nb_Whitestone,nb_Williamsbridge,nb_Williamsburg,nb_Willowbrook,nb_Windsor Terrace,nb_Woodhaven,nb_Woodlawn,nb_Woodrow,nb_Woodside
0,1,9,0.21,6,365,149,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,45,0.38,2,355,225,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0.0,1,365,150,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,270,4.64,1,194,89,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10,9,0.1,1,0,80,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
X_train, X_test, y_train, y_test = load_and_split_data()

In [128]:
print(f'X_train shape:  {X_train.shape}')
print(f'y_train length: {len(y_train)}')
print(f'y_train + y_test length: {len(y_train)+len(y_test)}')
print((len(y_train)+len(y_test))==df.shape[0])

X_train shape:  (36586, 234)
y_train length: 36586
y_train + y_test length: 48782
True


In [129]:
y_test[:10]

47844     75
38685    481
31439    140
19645     70
10986     29
21477     28
25325    325
16483    115
29918     90
16617     67
Name: price, dtype: int64

#### Linear Regression

In [130]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

In [131]:
def rmsle(actual, predictions):
    log_diff = np.log(predictions+1) - np.log(actual+1)
    return np.sqrt(np.mean(log_diff**2))

In [132]:
lr_mse = mean_squared_error(y_test, lr_y_pred)
print(f'mse for LinearRegression: {lr_mse}')
print(f'R2 for LinearRegression: {lr.score(X_train, y_train)}')
# lr_mlse=mean_squared_log_error(y_test, lr_y_pred)

mse for LinearRegression: 96069927605401.44
R2 for LinearRegression: 0.2941213552423504


In [133]:
(lr_y_pred<0).sum()

48

#### Lasso

In [140]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train)

0.2853069085320342

#### Define models

In [134]:
lr = LinearRegression()

rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=1)

gdbr = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                 n_estimators=100, random_state=1)

abr = AdaBoostRegressor(DecisionTreeRegressor(), learning_rate=0.1,
                                 loss='linear', n_estimators=100, random_state=1)

#### compare model score

In [135]:
# from sklearn.model_selection import cross_val_score
# def cross_val(model, X, y, nfolds):
    
#     mse = cross_val_score(model, X, y, 
#                           scoring='neg_mean_squared_error',
#                           cv=nfolds, n_jobs=-1) * -1
#     r2  = cross_val_score(model, X, y, 
#                           scoring='r2',
#                           cv=nfolds, n_jobs=-1)
#     mean_mse = mse.mean()
#     mean_r2 = r2.mean()
#     name = model.__class__.__name__
#     print(f'{name}  Train CV | MSE: {mean_mse.round(2)} | R2: {mean_r2.round(3)}')
#     return mean_mse,mean_r2 

In [136]:
k = 10 # number of folds in the cross-validation 
cross_val(lr, X_train, y_train, k) 
cross_val(rf, X_train, y_train, k) 
cross_val(gdbr, X_train, y_train, k) 
cross_val(abr, X_train, y_train, k); 

LinearRegression           Train CV | MSE: 13220545063196330.000 | R2: -791988536573.692
RandomForestRegressor      Train CV | MSE: 13807.215 | R2: 0.288
GradientBoostingRegressor  Train CV | MSE: 13284.315 | R2: 0.316


KeyboardInterrupt: 

In [None]:
####  Choosing RandomForestRegressor

####  Choosing GradientBoostingRegressor

In [None]:
#Try some gradient boosting regressor with different n_estimators

In [96]:
#learning_rate=1.0
gdbr_lr10 = GradientBoostingRegressor(learning_rate=1.0, loss='ls',
                                     n_estimators=100, random_state=1) 

In [97]:
# learning_rate=0.8
gdbr_lr8 = GradientBoostingRegressor(learning_rate=0.8,loss='ls',
                                     n_estimators=100, random_state=1) 

In [98]:
# learning_rate=0.5
gdbr_lr5 = GradientBoostingRegressor(learning_rate=0.5, loss='ls',
                                     n_estimators=100, random_state=1) 

In [99]:
# learning_rate=0.3
gdbr_lr3 = GradientBoostingRegressor(learning_rate=0.3, loss='ls',
                                     n_estimators=100, random_state=1) 

In [101]:
print("Cross validation score of Gradient Boosting Regressor with lr = 0.1:")
cross_val(gdbr, X_train, y_train, k) 
print("Cross validation score of Gradient Boosting Regressor with lr = 0.3:")
cross_val(gdbr_lr3, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 0.5:")
cross_val(gdbr_lr5, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 0.8:")
cross_val(gdbr_lr8, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 1.0:")
cross_val(gdbr_lr10, X_train, y_train, k);

Cross validation score of Gradient Boosting Regressor with lr = 0.1:
GradientBoostingRegressor  Train CV | MSE: 56229.54 | R2: 0.144
Cross validation score of Gradient Boosting Regressor with lr = 0.3:
GradientBoostingRegressor  Train CV | MSE: 58143.46 | R2: 0.115
Cross validation score of Gradient Boosting Regressor with lr = 0.5:
GradientBoostingRegressor  Train CV | MSE: 66882.39 | R2: -0.043
Cross validation score of Gradient Boosting Regressor with lr = 0.8:


KeyboardInterrupt: 

In [105]:
#n_estimators =50
# nums50 = 50
gdbr_ne50 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=50, random_state=1)
#n_estimators =200
gdbr_ne200 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=200, random_state=1)
#n_estimators =300
gdbr_ne300 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=300, random_state=1)


In [106]:
print("Cross validation score of Gradient Boosting Regressor with ne = 50:")
cross_val(gdbr_ne50, X_train, y_train, k) 
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 100:")
cross_val(gdbr, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 200:")
cross_val(gdbr_ne200, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 300:")
cross_val(gdbr_ne300, X_train, y_train, k)

Cross validation score of Gradient Boosting Regressor with ne = 50:
GradientBoostingRegressor  Train CV | MSE: 56416.36 | R2: 0.14
Cross validation score of Gradient Boosting Regressor with n_estimators = 100:
GradientBoostingRegressor  Train CV | MSE: 56229.54 | R2: 0.144
Cross validation score of Gradient Boosting Regressor with n_estimators = 200:
GradientBoostingRegressor  Train CV | MSE: 56700.93 | R2: 0.14
Cross validation score of Gradient Boosting Regressor with n_estimators = 300:


KeyboardInterrupt: 

#### Tuning the number of estimators

In [107]:
N_ESTIMATORS = 3000

In [None]:
fig, ax = plt.subplots()
model = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, 
                                  learning_rate=0.01)
model.fit(X_train, y_train)


In [115]:
model.score(X_train, y_train)

0.38836562852594225