## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

from feature_engine.encoding import RareLabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, PowerTransformer, MinMaxScaler
)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
df = pd.read_csv("../Data/airbnb_imputed.csv")

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,34194.0,19089490.0,10982050.0,2539.0,9511539.0,19760090.0,29231520.0,36487240.0
host_id,34194.0,67761780.0,78774260.0,2438.0,7858210.0,30869690.0,107434400.0,274321300.0
latitude,34194.0,40.72884,0.05462905,40.50641,40.68992,40.7229,40.76315,40.91306
longitude,34194.0,-73.95213,0.04608519,-74.24285,-73.98303,-73.95568,-73.93615,-73.7169
price,34194.0,152.0763,234.5629,10.0,69.0,105.0,175.0,10000.0
minimum_nights,34194.0,7.057729,20.71104,1.0,1.0,3.0,5.0,1250.0
number_of_reviews,34194.0,23.20214,44.32853,0.0,1.0,5.0,23.0,629.0
reviews_per_month,34194.0,1.158098,1.592788,0.01,0.13,0.51,1.59,58.5
calculated_host_listings_count,34194.0,7.181143,32.9744,1.0,1.0,1.0,2.0,327.0
availability_365,34194.0,113.5531,131.9172,0.0,0.0,46.0,230.0,365.0


In [4]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
name,34194,33681,Hillside Hotel,15
host_name,34194,9148,Michael,309
neighbourhood_group,34194,5,Manhattan,15134
neighbourhood,34194,217,Williamsburg,2718
room_type,34194,3,Entire home/apt,17800
last_review,34194,1675,2019-06-23,1212


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34194 entries, 0 to 34193
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              34194 non-null  int64  
 1   name                            34194 non-null  object 
 2   host_id                         34194 non-null  int64  
 3   host_name                       34194 non-null  object 
 4   neighbourhood_group             34194 non-null  object 
 5   neighbourhood                   34194 non-null  object 
 6   latitude                        34194 non-null  float64
 7   longitude                       34194 non-null  float64
 8   room_type                       34194 non-null  object 
 9   price                           34194 non-null  int64  
 10  minimum_nights                  34194 non-null  int64  
 11  number_of_reviews               34194 non-null  int64  
 12  last_review                     

In [6]:
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [7]:
df.query("price == 0")
df = df[df['price']>0]

In [9]:
df['last_review'] = pd.to_datetime(df['last_review'])

In [10]:
df['price'] = df['price'].astype("float64")
df['name'] = df['name'].astype(object)
df.dtypes

id                                         int64
name                                      object
host_id                                    int64
host_name                                 object
neighbourhood_group                       object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

In [11]:
df['year'] = df.last_review.dt.year
df['month'] = df.last_review.dt.month

df[['year','month']].sample(5)

Unnamed: 0,year,month
4110,2018,12
27459,2018,12
4644,2017,9
13812,2018,7
20771,2019,6


In [32]:
X = df.drop(['price', 'host_id', 'host_name', 'id', 'last_review'],axis = 1)
y = df['price']

num_cols = X.select_dtypes(include = np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude = np.number).drop('name', axis=1).columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X_train[cat_cols].tail())   

      neighbourhood_group       neighbourhood        room_type
16850           Manhattan      Hell's Kitchen  Entire home/apt
6265             Brooklyn            Bushwick  Entire home/apt
11284              Queens            Elmhurst     Private room
860              Brooklyn  Bedford-Stuyvesant  Entire home/apt
15795              Queens           Ridgewood  Entire home/apt


In [33]:
from sklearn.feature_extraction.text import CountVectorizer
pipe_cat = Pipeline(
    steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

pipe_num = Pipeline(
    steps = [
    ('scale', MinMaxScaler())
    ]
)

pipe_text = Pipeline(
    steps = [
        ('text_vec', CountVectorizer(analyzer='word',
                                     stop_words='english',
                                     max_features=30))
    ]
)

#pd.DataFrame(pipe_text.fit_transform(X_train['name']).toarray(), columns=pipe_text.get_feature_names_out())

In [38]:
preprocess = ColumnTransformer(
    transformers = [
    #('num', pipe_num, num_cols),
    ('text', pipe_text, 'name'),
    ('cat', pipe_cat, cat_cols)
    ],
    remainder='passthrough'
)

pd.DataFrame(preprocess.fit_transform(X_train), columns=preprocess.get_feature_names_out())

Unnamed: 0,text__1br,text__apartment,text__apt,text__beautiful,text__bed,text__bedroom,text__bright,text__brooklyn,text__central,text__cozy,...,cat__room_type_Shared room,remainder__latitude,remainder__longitude,remainder__minimum_nights,remainder__number_of_reviews,remainder__reviews_per_month,remainder__calculated_host_listings_count,remainder__availability_365,remainder__year,remainder__month
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.73812,-73.92024,30.0,1.0,0.25,103.0,186.0,2019.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.80464,-73.95637,2.0,3.0,0.07,1.0,0.0,2016.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.72088,-73.98533,2.0,1.0,0.03,1.0,0.0,2016.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.72028,-73.98794,30.0,1.0,0.77,1.0,96.0,2019.0,5.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,40.67747,-73.94660,9.0,7.0,0.54,1.0,0.0,2019.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23930,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.76189,-73.98765,31.0,0.0,0.25,5.0,341.0,2019.0,6.0
23931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.69231,-73.92567,3.0,58.0,7.91,2.0,2.0,2019.0,7.0
23932,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,40.74262,-73.87907,2.0,100.0,3.05,1.0,296.0,2019.0,6.0
23933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40.68209,-73.92015,5.0,9.0,0.52,1.0,0.0,2018.0,12.0


In [40]:

model = Pipeline(steps = [
    ('rare', RareLabelEncoder(tol=0.03, variables=['neighbourhood'])),
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor())
])


In [41]:
model_fit = model.fit(X_train, y_train) 
model_fit.score(X_test, y_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 23935, number of used features: 58
[LightGBM] [Info] Start training from score 152.341216


0.17829027895435845

In [42]:
y_pred = model_fit.predict(X_test)

y_pred 

array([ 95.82498164, 123.88288071,  62.77318734, ..., 114.93909828,
        60.03922989,  61.8740596 ])

In [43]:
print(f"RMSE for LGBM: {np.sqrt(mean_squared_error(y_test, y_pred))},\
\nMAE for LGBM: {mean_absolute_error(y_test, y_pred)}")

RMSE for LGBM: 212.4707509215737,
MAE for LGBM: 64.09701880934102


## Otimização dos parâmetros do modelo

In [44]:
lgbm_grid = {
    'lgbm__num_leaves': [7, 14, 21],
    'lgbm__learning_rate': [0.1, 0.03, 0.001],
    'lgbm__max_depth': [-1, 3, 5],
    'lgbm__n_estimators': [200, 500, 1000],
    'preprocessor__text__text_vec__max_features': [10,20,50,150]
}

model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'rare', 'preprocessor', 'lgbm', 'rare__ignore_format', 'rare__max_n_categories', 'rare__missing_values', 'rare__n_categories', 'rare__replace_with', 'rare__tol', 'rare__variables', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__text', 'preprocessor__cat', 'preprocessor__text__memory', 'preprocessor__text__steps', 'preprocessor__text__verbose', 'preprocessor__text__text_vec', 'preprocessor__text__text_vec__analyzer', 'preprocessor__text__text_vec__binary', 'preprocessor__text__text_vec__decode_error', 'preprocessor__text__text_vec__dtype', 'preprocessor__text__text_vec__encoding', 'preprocessor__text__text_vec__input', 'preprocessor__text__text_vec__lowercase', 'preprocessor__text__text_vec__max_df', 'preprocessor__text__text_vec__max_features', 'preprocessor__t

In [46]:
lgbm_tune = RandomizedSearchCV(
    model, 
    param_distributions = lgbm_grid,
    cv = 5,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    n_iter = 10,
    verbose = 1,
    random_state = 42
)

In [47]:
lgbm_tune.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1445
[LightGBM] [Info] Number of data points in the train set: 19148, number of used features: 38
[LightGBM] [Info] Start training from score 151.810111
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1452
[LightGBM] [Info] Number of data points in the train set: 19148, number of used features: 38
[LightGBM] [Info] Start training from score 152.350219
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enou

In [48]:
lgbm_tune.best_params_

{'preprocessor__text__text_vec__max_features': 10,
 'lgbm__num_leaves': 7,
 'lgbm__n_estimators': 200,
 'lgbm__max_depth': 5,
 'lgbm__learning_rate': 0.03}

In [None]:
pd.DataFrame(lgbm_tune.cv_results_)