## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

# from pycaret.regression import *

# from feature_engine.encoding import RareLabelEncoder

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the training CSV into the working DataFrame.
train_csv = "Data/train.csv"
df = pd.read_csv(train_csv)

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,34226.0,34226.0,34226.0,34226.0,34226.0,34226.0,34226.0,27218.0,34226.0,34226.0
mean,19084830.0,67755720.0,40.728848,-73.952119,152.00412,7.055747,23.206597,1.380055,7.176416,113.523257
std,10980780.0,78760040.0,0.054633,0.046084,234.475161,20.703031,44.328567,1.707888,32.959383,131.915784
min,2539.0,2438.0,40.50641,-74.24285,0.0,1.0,0.0,0.01,1.0,0.0
25%,9509256.0,7858210.0,40.68993,-73.983017,69.0,1.0,1.0,0.19,1.0,0.0
50%,19759860.0,30878850.0,40.7229,-73.955675,105.0,3.0,5.0,0.71,1.0,46.0
75%,29219020.0,107434400.0,40.763157,-73.93612,175.0,5.0,23.0,2.03,2.0,230.0
max,36487240.0,274321300.0,40.91306,-73.7169,10000.0,1250.0,629.0,58.5,327.0,365.0


In [4]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
df.describe(include='object')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,room_type,last_review
count,34217,34212,34226,34226,34226,27218
unique,33704,9151,5,217,3,1675
top,Hillside Hotel,Michael,Manhattan,Williamsburg,Entire home/apt,2019-06-23
freq,15,309,15146,2720,17807,981


In [5]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34226 entries, 0 to 34225
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              34226 non-null  int64  
 1   name                            34217 non-null  object 
 2   host_id                         34226 non-null  int64  
 3   host_name                       34212 non-null  object 
 4   neighbourhood_group             34226 non-null  object 
 5   neighbourhood                   34226 non-null  object 
 6   latitude                        34226 non-null  float64
 7   longitude                       34226 non-null  float64
 8   room_type                       34226 non-null  object 
 9   price                           34226 non-null  int64  
 10  minimum_nights                  34226 non-null  int64  
 11  number_of_reviews               34226 non-null  int64  
 12  last_review                     

In [6]:
# Parse the review dates into datetime64 values and store them back in place.
last_review_dates = pd.to_datetime(df['last_review'])
df['last_review'] = last_review_dates

In [7]:
# Count the missing values in every column.
df.isna().sum()

id                                   0
name                                 9
host_id                              0
host_name                           14
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       7008
reviews_per_month                 7008
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [8]:
# Store the target as float64, then list the dtype of every column.
price_as_float = df['price'].astype("float64")
df['price'] = price_as_float
df.dtypes

id                                         int64
name                                      object
host_id                                    int64
host_name                                 object
neighbourhood_group                       object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

In [None]:
# Build the modelling frame: remove the listed columns, then discard every
# remaining column that contains at least one missing value.
unused_columns = ['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review']
df_filtered = (
    df
    .drop(columns=unused_columns)
    .dropna(axis=1)
)


In [None]:
# Initialise the regression experiment on the filtered frame with `price` as target.
# The result is bound to `experiment` instead of `setup`: rebinding `setup`
# would shadow the function itself, so re-running this cell would fail.
# NOTE(review): `setup` presumably comes from `pycaret.regression`, whose import
# is commented out in the imports cell; re-enable it before running this cell.
experiment = setup(data = df_filtered, target = "price", combine_rare_levels=True)

In [None]:
# Compare candidate regressors with 5 folds, ranked by RMSE, and display the result.
# NOTE(review): `compare_models` presumably comes from `pycaret.regression`,
# whose import is commented out in the imports cell — confirm before running.
models = compare_models(fold = 5, sort='rmse')
models

In [9]:
# Columns left out of the feature matrix (the target is removed separately below).
non_feature_cols = ['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review']

# Discard rows without a target once, up front, so X and y are derived from the
# same rows. Calling .dropna() on y alone would leave X and y with different
# lengths if `price` ever contained missing values.
labelled = df.dropna(subset=['price'])

X = (
    labelled
    .drop(columns=['price'] + non_feature_cols)
    .dropna(axis=1)  # drop any feature column that still has missing values
)
y = labelled['price']

# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Names of the numeric feature columns, used by the numeric preprocessing branch.
num_cols = X.select_dtypes(include = np.number).columns.to_list()

In [10]:
# Categorical branch: one-hot encoding; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
pipe_cat = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Numeric branch: power transform with the estimator's default settings.
pipe_num = Pipeline([('yeo', PowerTransformer())])

In [11]:
# Route the numeric columns through pipe_num and the two categorical columns
# through pipe_cat.
cat_cols = ['neighbourhood_group', 'room_type']
preprocess = ColumnTransformer([
    ('num', pipe_num, num_cols),
    ('cat', pipe_cat, cat_cols),
])

In [12]:
# Full model: preprocessing followed by a LightGBM regressor with default settings.
model_steps = [
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor()),
]
model = Pipeline(model_steps)

In [13]:
# Fit the pipeline on the training split; Pipeline.fit returns the pipeline
# itself, so `lgbm` is the same fitted object as `model`.
model.fit(X_train, y_train)
lgbm = model

# Predict prices for the hold-out split.
lgbm_pred = model.predict(X_test)

  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


In [None]:
# Report the hold-out RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in recent
# scikit-learn releases and removed in later ones; np.sqrt works on all versions.
print(f"RMSE for LGBM: {np.sqrt(mean_squared_error(y_test, lgbm_pred))}")

## Model hyperparameter optimization

In [None]:
# Candidate hyperparameter values for the search. The `lgbm__` prefix targets
# the pipeline step named 'lgbm'.
lgbm_grid = dict(
    lgbm__num_leaves=[7, 14, 21],
    lgbm__learning_rate=[0.1, 0.03, 0.003],
    lgbm__max_depth=[-1, 3, 5],
    lgbm__n_estimators=[200, 500, 1000],
)

In [None]:
# Randomised search over lgbm_grid: 3-fold CV scored by negative RMSE, with
# training scores kept and a fixed seed so the sampled candidates are repeatable.
search_options = dict(
    param_distributions=lgbm_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=3,
    random_state=42,
)
lgbm_tune = RandomizedSearchCV(model, **search_options)

In [None]:
# Run the hyperparameter search on the full feature matrix and target.
# NOTE(review): X and y include the rows of the X_test/y_test hold-out, so any
# later evaluation of the tuned model on X_test would be optimistic; confirm
# whether fitting on X_train/y_train is intended instead.
lgbm_tune.fit(X,y)

In [None]:
# Display the best pipeline found by the search.
lgbm_tune.best_estimator_