In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor                        

from sklearn.metrics import mean_squared_error, r2_score
from src.model_helper import *

%matplotlib inline
plt.style.use('ggplot')

### load and split data

In [2]:
df = pd.read_csv('processed_v0.csv')
df.shape

(48879, 237)

In [3]:
df.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,room_type_Entire home/apt,room_type_Private room,...,neighbourhood_Westerleigh,neighbourhood_Whitestone,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside
0,40.64749,-73.97237,1,9,0.21,6,365,149,0,1,...,0,0,0,0,0,0,0,0,0,0
1,40.75362,-73.98377,1,45,0.38,2,355,225,1,0,...,0,0,0,0,0,0,0,0,0,0
2,40.80902,-73.9419,3,0,0.0,1,365,150,0,1,...,0,0,0,0,0,0,0,0,0,0
3,40.68514,-73.95976,1,270,4.64,1,194,89,1,0,...,0,0,0,0,0,0,0,0,0,0
4,40.79851,-73.94399,10,9,0.1,1,0,80,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
path = 'processed_v0.csv'

In [8]:
X_train, X_test, y_train, y_test = load_and_split_data(path)

In [9]:
print(f'X_train shape:  {X_train.shape}')
print(f'y_train length: {len(y_train)}')
print(f'X_test shape:  {X_test.shape}')
print(f'y_train + y_test length: {len(y_train)+len(y_test)}')
print((len(y_train)+len(y_test))==df.shape[0])

X_train shape:  (36659, 236)
y_train length: 36659
X_test shape:  (12220, 236)
y_train + y_test length: 48879
True


In [10]:
y_test[:10]

19625    265
8473      59
37155     50
43737     33
24745     33
43723    475
10676     42
43721    390
27699    700
28965    100
Name: price, dtype: int64

#### Linear Regression

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

In [12]:
def rmsle(actual, predictions):
    log_diff = np.log(predictions+1) - np.log(actual+1)
    return np.sqrt(np.mean(log_diff**2))

In [13]:
lr_mse = mean_squared_error(y_test, lr_y_pred)
print(f'mse for LinearRegression: {lr_mse}')
print(f'R2 for LinearRegression: {lr.score(X_train, y_train)}')
# lr_mlse=mean_squared_log_error(y_test, lr_y_pred)

mse for LinearRegression: 33503.132384384735
R2 for LinearRegression: 0.10968501095473993


In [14]:
(lr_y_pred<0).sum()

52

#### Lasso

In [15]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train)

  positive)


0.10891961451088605

#### Define models

In [16]:
lr = LinearRegression()

lasso = Lasso(alpha=0.01)

rf = RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=1)

gdbr = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                 n_estimators=300, random_state=1)


#### compare model score

In [17]:
# from sklearn.model_selection import cross_val_score
# def cross_val(model, X, y, nfolds):
    
#     mse = cross_val_score(model, X, y, 
#                           scoring='neg_mean_squared_error',
#                           cv=nfolds, n_jobs=-1) * -1
#     r2  = cross_val_score(model, X, y, 
#                           scoring='r2',
#                           cv=nfolds, n_jobs=-1)
#     mean_mse = mse.mean()
#     mean_r2 = r2.mean()
#     name = model.__class__.__name__
#     print(f'{name}  Train CV | MSE: {mean_mse.round(2)} | R2: {mean_r2.round(3)}')
#     return mean_mse,mean_r2 

In [20]:
k = 10 # number of folds in the cross-validation 
cross_val(lr, X_train, y_train, k) 

cross_val(lasso, X_train, y_train, k); 

cross_val(rf, X_train, y_train, k) 

cross_val(gdbr, X_train, y_train, k) 

LinearRegression           Train CV | MSE: 16213094469414884.000 | R2: -229268990926.838
Lasso                      Train CV | MSE: 57301.107 | R2: 0.120
RandomForestRegressor      Train CV | MSE: 59357.008 | R2: 0.073
GradientBoostingRegressor  Train CV | MSE: 57106.071 | R2: 0.132


(57106.071248720844, 0.13150273549601993)

In [None]:
####  Choosing RandomForestRegressor



####  Choosing GradientBoostingRegressor

In [None]:
#Try some gradient boosting regressor with different n_estimators

In [96]:
#learning_rate=1.0
gdbr_lr10 = GradientBoostingRegressor(learning_rate=1.0, loss='ls',
                                     n_estimators=100, random_state=1) 

In [97]:
# learning_rate=0.8
gdbr_lr8 = GradientBoostingRegressor(learning_rate=0.8,loss='ls',
                                     n_estimators=100, random_state=1) 

In [98]:
# learning_rate=0.5
gdbr_lr5 = GradientBoostingRegressor(learning_rate=0.5, loss='ls',
                                     n_estimators=100, random_state=1) 

In [99]:
# learning_rate=0.3
gdbr_lr3 = GradientBoostingRegressor(learning_rate=0.3, loss='ls',
                                     n_estimators=100, random_state=1) 

In [101]:
print("Cross validation score of Gradient Boosting Regressor with lr = 0.1:")
cross_val(gdbr, X_train, y_train, k) 
print("Cross validation score of Gradient Boosting Regressor with lr = 0.3:")
cross_val(gdbr_lr3, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 0.5:")
cross_val(gdbr_lr5, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 0.8:")
cross_val(gdbr_lr8, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with lr = 1.0:")
cross_val(gdbr_lr10, X_train, y_train, k);

Cross validation score of Gradient Boosting Regressor with lr = 0.1:
GradientBoostingRegressor  Train CV | MSE: 56229.54 | R2: 0.144
Cross validation score of Gradient Boosting Regressor with lr = 0.3:
GradientBoostingRegressor  Train CV | MSE: 58143.46 | R2: 0.115
Cross validation score of Gradient Boosting Regressor with lr = 0.5:
GradientBoostingRegressor  Train CV | MSE: 66882.39 | R2: -0.043
Cross validation score of Gradient Boosting Regressor with lr = 0.8:


KeyboardInterrupt: 

In [105]:
#n_estimators =50
# nums50 = 50
gdbr_ne50 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=50, random_state=1)
#n_estimators =200
gdbr_ne200 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=200, random_state=1)
#n_estimators =300
gdbr_ne300 = GradientBoostingRegressor(learning_rate=0.1, loss='ls',
                                     n_estimators=300, random_state=1)


In [106]:
print("Cross validation score of Gradient Boosting Regressor with ne = 50:")
cross_val(gdbr_ne50, X_train, y_train, k) 
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 100:")
cross_val(gdbr, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 200:")
cross_val(gdbr_ne200, X_train, y_train, k)
print("Cross validation score of Gradient Boosting Regressor with n_estimators = 300:")
cross_val(gdbr_ne300, X_train, y_train, k)

Cross validation score of Gradient Boosting Regressor with ne = 50:
GradientBoostingRegressor  Train CV | MSE: 56416.36 | R2: 0.14
Cross validation score of Gradient Boosting Regressor with n_estimators = 100:
GradientBoostingRegressor  Train CV | MSE: 56229.54 | R2: 0.144
Cross validation score of Gradient Boosting Regressor with n_estimators = 200:
GradientBoostingRegressor  Train CV | MSE: 56700.93 | R2: 0.14
Cross validation score of Gradient Boosting Regressor with n_estimators = 300:


KeyboardInterrupt: 

#### Tuning the number of estimators

In [22]:
N_ESTIMATORS = 300

In [23]:
# fig, ax = plt.subplots()
gdbr_model = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, 
                                  learning_rate=0.01)
gdbr_model.fit(X_train, y_train)
gdbr_pred = model.predict(X_test)

NameError: name 'model' is not defined

In [None]:
model.score(X_train, y_train)

In [None]:
r2_score(y_test, gdbr_pred)