## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

# from pycaret.regression import *

from feature_engine.encoding import RareLabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, PowerTransformer, MinMaxScaler
)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
df = pd.read_csv("../Data/airbnb_imputed.csv")

In [4]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0
mean,19089880.0,67761590.0,40.72884,-73.952123,152.040699,7.058213,23.20493,1.156207,7.180516,113.557862
std,10980800.0,78768300.0,0.054628,0.046084,234.546984,20.709473,44.327434,1.593658,32.970576,131.911958
min,2539.0,2438.0,40.50641,-74.24285,0.0,1.0,0.0,0.01,1.0,0.0
25%,9512642.0,7858259.0,40.68992,-73.98302,69.0,1.0,1.0,0.12,1.0,0.0
50%,19763330.0,30871980.0,40.7229,-73.95568,105.0,3.0,5.0,0.5,1.0,46.0
75%,29229940.0,107434400.0,40.763148,-73.936123,175.0,5.0,23.0,1.59,2.0,230.0
max,36487240.0,274321300.0,40.91306,-73.7169,10000.0,1250.0,629.0,58.5,327.0,365.0


In [None]:
df.describe(include='object')

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.query("price == 0")
df = df[df['price']>0]

In [None]:
df['last_review'] = pd.to_datetime(df['last_review'])

In [None]:
df['price'] = df['price'].astype("float64")
df['name'] = df['name'].astype("str")
df.dtypes

In [None]:
df['year'] = df.last_review.dt.year
df['month'] = df.last_review.dt.month



In [None]:
df_filtered = df\
    .drop(['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review'], axis = 1)\
    .dropna(axis = 1)


In [36]:
X = df.drop(['price', 'host_id', 'host_name', 'id', 'last_review'],axis = 1)
y = df['price']

num_cols = X.select_dtypes(include = np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude = np.number).columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X_train[cat_cols].tail())   

                                                   name neighbourhood_group  \
16855      Gorgeous midtown apartment near Times square           Manhattan   
6267   Your NY home for Quality time , Fully Equipped !            Brooklyn   
11286                       A sunny healthy big bedroom              Queens   
860                                     Bedstuy Bedstay            Brooklyn   
15800               Spacious Flat in HeArt of Bushwick!              Queens   

            neighbourhood        room_type  
16855      Hell's Kitchen  Entire home/apt  
6267             Bushwick  Entire home/apt  
11286            Elmhurst     Private room  
860    Bedford-Stuyvesant  Entire home/apt  
15800           Ridgewood  Entire home/apt  


In [47]:
pipe_cat = Pipeline(
    steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

pipe_num = Pipeline(
    steps = [
    ('scale', MinMaxScaler())
    ]
)

pipe_text = Pipeline(
    steps = [
        ('text_vec', TfidfVectorizer(analyzer='word',
                                     max_features=25,
                                     ngram_range=(1,1)))        
    ]
)


In [52]:
preprocess = ColumnTransformer(
    transformers = [
    #('num', pipe_num, num_cols),
    ('text', pipe_text, ['name']),
    #('cat', pipe_cat, cat_cols)
    ],
    remainder='passthrough'
)
    
preprocess.fit_transform(X_train)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 23935

In [50]:
from sklearn.ensemble import RandomForestRegressor
model = Pipeline(steps = [
    ('rare', RareLabelEncoder(tol=0.03, variables=['neighbourhood'])),
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor())
    #('rf', RandomForestRegressor(n_estimators=800))
])

In [51]:
model_fit = model.fit(X_train, y_train) 
model_fit.score(X_test, y_test)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 23935

In [None]:
model_pred = model_fit.predict(X_test)

model_pred 

In [None]:
print(f"RMSE for LGBM: {np.sqrt(mean_squared_error(y_test, model_pred))},\
\nMAE for LGBM: {mean_absolute_error(y_test,model_pred)}")

## Otimização dos parâmetros do modelo

In [None]:
lgbm_grid = {
    'lgbm__num_leaves': [7, 14, 21],
    'lgbm__learning_rate': [0.1, 0.03, 0.001],
    'lgbm__max_depth': [-1, 3, 5],
    'lgbm__n_estimators': [200, 500, 1000],
    'preprocessor__text_vec__max_features': [10,20,50]
}

In [None]:
lgbm_tune = RandomizedSearchCV(
    model, 
    param_distributions = lgbm_grid,
    cv = 5,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    n_iter = 15,
    verbose = 3,
    random_state = 42
)

In [None]:
lgbm_tune.fit(X_train,y_train)

In [None]:
lgbm_tune.best_estimator_

In [None]:
lgbm_tune.best_score_

In [None]:
pd.DataFrame(lgbm_tune.cv_results_)