## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

from feature_engine.encoding import RareLabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, PowerTransformer, MinMaxScaler
)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)

from sklearn.metrics import (
    mean_squared_log_error, r2_score
)


In [2]:
df = pd.read_csv("../Data/airbnb_imputed.csv")

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,34194.0,19089490.0,10982050.0,2539.0,9511539.0,19760090.0,29231520.0,36487240.0
host_id,34194.0,67761780.0,78774260.0,2438.0,7858210.0,30869690.0,107434400.0,274321300.0
latitude,34194.0,40.72884,0.05462905,40.50641,40.68992,40.7229,40.76315,40.91306
longitude,34194.0,-73.95213,0.04608519,-74.24285,-73.98303,-73.95568,-73.93615,-73.7169
price,34194.0,152.0763,234.5629,10.0,69.0,105.0,175.0,10000.0
minimum_nights,34194.0,7.057729,20.71104,1.0,1.0,3.0,5.0,1250.0
number_of_reviews,34194.0,23.20214,44.32853,0.0,1.0,5.0,23.0,629.0
reviews_per_month,34194.0,1.158098,1.592788,0.01,0.13,0.51,1.59,58.5
calculated_host_listings_count,34194.0,7.181143,32.9744,1.0,1.0,1.0,2.0,327.0
availability_365,34194.0,113.5531,131.9172,0.0,0.0,46.0,230.0,365.0


In [4]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
name,34194,33681,Hillside Hotel,15
host_name,34194,9148,Michael,309
neighbourhood_group,34194,5,Manhattan,15134
neighbourhood,34194,217,Williamsburg,2718
room_type,34194,3,Entire home/apt,17800
last_review,34194,1675,2019-06-23,1212


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34194 entries, 0 to 34193
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              34194 non-null  int64  
 1   name                            34194 non-null  object 
 2   host_id                         34194 non-null  int64  
 3   host_name                       34194 non-null  object 
 4   neighbourhood_group             34194 non-null  object 
 5   neighbourhood                   34194 non-null  object 
 6   latitude                        34194 non-null  float64
 7   longitude                       34194 non-null  float64
 8   room_type                       34194 non-null  object 
 9   price                           34194 non-null  int64  
 10  minimum_nights                  34194 non-null  int64  
 11  number_of_reviews               34194 non-null  int64  
 12  last_review                     

In [6]:
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [7]:
df = df.query("price > 0 & price < 800")
#df = df[df['price']>0]

In [8]:
df['last_review'] = pd.to_datetime(df['last_review'])
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,299531,Feel like you never leave your home,1220404,Tom,Brooklyn,East New York,40.66795,-73.89232,Entire home/apt,100,1,119,2019-06-30,1.39,2,289
1,2461439,Pristine Lower East Side Sanctuary,12586492,Sausan,Manhattan,Lower East Side,40.72007,-73.98946,Entire home/apt,133,14,177,2019-05-03,2.82,2,221
2,127387,"Luxe, Spacious 2BR 2BA Nr Trains",23276,Katharine,Brooklyn,Gowanus,40.66862,-73.99260,Entire home/apt,260,30,3,2014-08-04,0.03,1,316
3,629315,1BD brownstone apt in Fort Greene!,2397437,Lauren,Brooklyn,Fort Greene,40.68935,-73.96950,Entire home/apt,120,3,22,2015-10-28,0.27,1,189
4,4607923,LOVELY LARGE SUNNY ROOM Sunset Park,1113080,Audrey,Brooklyn,Sunset Park,40.64722,-74.00475,Private room,55,7,98,2019-05-22,1.75,3,312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34189,32786275,Clean and Simple,82940021,Todd,Manhattan,Hell's Kitchen,40.76341,-73.99306,Entire home/apt,145,3,9,2019-07-01,3.55,1,6
34190,29829054,Best location in Williamsburg!,20827165,Melissa,Brooklyn,Williamsburg,40.71545,-73.94383,Entire home/apt,99,2,1,2018-11-19,0.13,2,0
34191,31857472,Hamilton Studio. 2Queen. priv bath. kitchenette,238750007,Hamilton,Manhattan,Harlem,40.82335,-73.94939,Entire home/apt,145,4,20,2019-07-02,3.66,3,310
34192,19197129,Best CoLiving next to Bushwick!,134293540,Valentin,Queens,Ridgewood,40.70508,-73.90217,Shared room,26,31,5,2018-05-04,0.22,4,365


In [9]:
df['price'] = df['price'].astype("float64")
df['name'] = df['name'].astype(object)
df.dtypes

id                                         int64
name                                      object
host_id                                    int64
host_name                                 object
neighbourhood_group                       object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

In [10]:
df['year'] = df.last_review.dt.year
df['month'] = df.last_review.dt.month

df[['year','month']].sample(5)

Unnamed: 0,year,month
15619,2016,7
30302,2019,2
20805,2019,7
6938,2018,7
22376,2017,12


In [11]:
X = df.drop(['price', 'host_id', 'host_name', 'id'],axis = 1)
y = np.log(df['price'])

num_cols = X.select_dtypes(include = np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude = np.number).drop(['name','last_review'], axis=1).columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X_train.tail())

                                                    name neighbourhood_group  \
17012                            Great Williamsburg Spot            Brooklyn   
6310   Private room in East Harlem close to heart of NYC           Manhattan   
11385                 Cozy 1 BD apartment in Sunset park            Brooklyn   
864             ENJOY MANHATTAN\r\nNEAR TO YANKE STADIUM               Bronx   
15944                    Large & bright 900ft² 1br in WV           Manhattan   

      neighbourhood  latitude  longitude        room_type  minimum_nights  \
17012  Williamsburg  40.71071  -73.95957     Private room               2   
6310    East Harlem  40.80113  -73.94283     Private room               2   
11385   Sunset Park  40.65046  -74.00402  Entire home/apt               1   
864      Mott Haven  40.80958  -73.91895     Private room               1   
15944  West Village  40.73291  -74.00059  Entire home/apt               4   

       number_of_reviews last_review  reviews_per_month 

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
pipe_cat = Pipeline(
    steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

pipe_num = Pipeline(
    steps = [
    ('scale', MinMaxScaler())
    ]
)

pipe_text = Pipeline(
    steps = [
        ('text_vec', CountVectorizer(analyzer='word',
                                     stop_words='english',
                                     max_features=30))
    ]
)

#pd.DataFrame(pipe_text.fit_transform(X_train['name']).toarray(), columns=pipe_text.get_feature_names_out())

In [47]:

preprocess = ColumnTransformer(
    transformers = [
    #('num', pipe_num, num_cols),
    ('text', pipe_text, 'name'),
    ('cat', pipe_cat, cat_cols)
    ],
    remainder='passthrough'
)

pd.DataFrame(preprocess.fit_transform(X_train), columns=preprocess.get_feature_names_out())

Unnamed: 0,text__apartment,text__apt,text__beautiful,text__bed,text__bedroom,text__bright,text__brooklyn,text__central,text__cozy,text__east,...,cat__room_type_Private room,cat__room_type_Shared room,remainder__latitude,remainder__longitude,remainder__minimum_nights,remainder__number_of_reviews,remainder__last_review,remainder__reviews_per_month,remainder__calculated_host_listings_count,remainder__availability_365
0,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,40.86674,-73.89284,2,2,2018-02-26,0.11,2,90
1,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,40.70728,-73.95461,1,122,2019-06-17,5.42,1,193
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,40.70648,-74.01241,2,7,2019-05-26,1.69,327,337
3,0,0,0,0,1,0,0,0,0,0,...,1.0,0.0,40.70297,-73.895,7,0,2019-06-23,0.08,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,40.79819,-73.94294,1,23,2019-06-05,3.37,1,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23706,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,40.71071,-73.95957,2,0,2019-07-06,0.03,2,86
23707,0,0,0,0,0,0,0,0,0,1,...,1.0,0.0,40.80113,-73.94283,2,0,2019-06-27,0.13,1,0
23708,1,0,0,0,0,0,0,0,1,0,...,0.0,0.0,40.65046,-74.00402,1,26,2019-03-24,2.52,1,0
23709,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,40.80958,-73.91895,1,29,2019-06-28,3.15,4,312


In [62]:
from feature_engine.datetime import DatetimeFeatures
model = Pipeline(steps = [
    ('date_features', DatetimeFeatures(features_to_extract=['year','month','quarter','day_of_week','weekend'], variables=['last_review'])),
    ('rare', RareLabelEncoder(tol=0.03, variables=['neighbourhood'])),
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor())
])


In [63]:
model_fit = model.fit(X_train, y_train) 
model_fit.score(X_test, y_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1525
[LightGBM] [Info] Number of data points in the train set: 23711, number of used features: 61
[LightGBM] [Info] Start training from score 4.704162


0.6583388665571586

In [55]:
y_pred = model_fit.predict(X_test)
y_pred

array([4.45581389, 4.42916981, 3.49646764, ..., 5.22453883, 4.55709312,
       4.50164985])

In [56]:
print(f"RMSLE for LGBM: {np.sqrt(mean_squared_log_error(y_test, y_pred))},\
\nR2 for LGBM: {r2_score(y_test, y_pred)}")

RMSLE for LGBM: 0.06689414671711624,
R2 for LGBM: 0.6583388665571586


In [58]:
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor, AdaBoostRegressor
)
from xgboost import XGBRegressor

models = [
    ('huber', HuberRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('ada', AdaBoostRegressor()),
    ('xgb', XGBRegressor())
] 

for name, model in models:
    pipe = Pipeline(steps=[
        ('date_features', DatetimeFeatures(features_to_extract=['year','month','quarter','day_of_week','weekend'], variables=['last_review'])),
        ('rare', RareLabelEncoder(tol=0.03, variables=['neighbourhood'])),
        ('preprocessor', preprocess),
        (name, model)
    ])
    
    pipe.fit(X_train,y_train)
    preds = pipe.predict(X_test)
    print(f"""{name} R2: {r2_score(y_test, preds)}
          {name} RMSLE: {np.sqrt(mean_squared_log_error(y_test, preds))}""")
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


huber R2: 0.4686383697227017
          huber RMSLE: 0.08468702414376847
gb R2: 0.6379808547970727
          gb RMSLE: 0.06871928256643577
ada R2: 0.47233040559159534
          ada RMSLE: 0.08442038979263566
xgb R2: 0.6489449637792912
          xgb RMSLE: 0.06783104540529283


## Otimização dos parâmetros do modelo

In [59]:
lgbm_grid = {
    'lgbm__num_leaves': [7, 14, 21],
    'lgbm__learning_rate': [0.1, 0.03, 0.001],
    'lgbm__max_depth': [-1, 3, 5],
    'lgbm__n_estimators': [200, 500, 1000],
    'preprocessor__text__text_vec__max_features': [10,20,50,150]
}

model.get_params().keys()

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'feature_types', 'gamma', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_threshold', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'multi_strategy', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [64]:
lgbm_tune = RandomizedSearchCV(
    model, 
    param_distributions = lgbm_grid,
    cv = 5,
    scoring = 'neg_mean_squared_log_error',
    return_train_score = True,
    n_iter = 10,
    verbose = 1,
    random_state = 42
)

In [65]:
lgbm_tune.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1460
[LightGBM] [Info] Number of data points in the train set: 18968, number of used features: 41
[LightGBM] [Info] Start training from score 4.702090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1460
[LightGBM] [Info] Number of data points in the train set: 18969, number of used features: 41
[LightGBM] [Info] Start training from score 4.703432
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001276 seconds.
You can set `force_row_

In [66]:
from joblib import dump
best_lgbm = lgbm_tune.best_estimator_
dump(best_lgbm, '../models/best_lgbm.pkl')

['../models/best_lgbm.pkl']

In [None]:
lgbm_tune.best_score_