## NYC Airbnb Price Prediction





In [None]:
import numpy as np 
import pandas as pd

from feature_engine.encoding import RareLabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, PowerTransformer, MinMaxScaler
)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)

from sklearn.metrics import (
    mean_squared_log_error, r2_score
)


In [None]:
# Read the Airbnb listings CSV; every later cell works off this frame.
listings_csv = "../Data/airbnb_imputed.csv"
df = pd.read_csv(listings_csv)

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().transpose()

In [None]:
# Same summary restricted to object-dtype columns, transposed for readability.
df.describe(include="object").transpose()

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Keep only listings with a strictly positive price below 800.
# - price > 0 is required because the target is later built with np.log(price).
# - the 800 upper bound is presumably an outlier cut-off -- TODO confirm the rationale.
# .copy() materialises an independent frame: the following cells assign new
# columns to `df`, and doing that on the direct result of a filter triggers
# pandas' SettingWithCopyWarning.
df = df.query("price > 0 & price < 800").copy()

In [None]:
# Convert the review dates to datetime so the `.dt` accessor can be used on them.
last_review_parsed = pd.to_datetime(df['last_review'])
df['last_review'] = last_review_parsed

In [None]:
# Force explicit dtypes: price as float64, listing name as generic Python objects.
target_dtypes = {'price': "float64", 'name': object}
for column, dtype in target_dtypes.items():
    df[column] = df[column].astype(dtype)

df.dtypes

In [None]:
# Derive calendar features from the last review date.
review_date = df['last_review']
df['year'] = review_date.dt.year
df['month'] = review_date.dt.month

# Spot-check a few random rows of the new columns.
df[['year', 'month']].sample(5)

In [None]:
# Features: everything except the target, the id/host columns and the raw review
# date (its year and month were extracted into separate columns beforehand).
dropped_cols = ['price', 'host_id', 'host_name', 'id', 'last_review']
X = df.drop(columns=dropped_cols)

# Target: natural log of the price, so all downstream predictions are log-prices.
y = np.log(df['price'])

# Column groups by dtype; `name` is excluded from the categoricals because it is
# handled as free text by its own pipeline.
num_cols = X.select_dtypes(include=np.number).columns.to_list()
cat_cols = (
    X.select_dtypes(exclude=np.number)
    .drop(columns='name')
    .columns.to_list()
)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

print(X_train[num_cols].tail())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Categorical columns: dense one-hot encoding; categories not seen during fit
# are ignored at transform time instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pipe_cat = Pipeline([('encoder', one_hot)])

# Numeric columns: min-max scaling. Only defined here; whether it is used
# depends on how the ColumnTransformer is wired.
pipe_num = Pipeline([('scale', MinMaxScaler())])

# Listing name: word-level bag of words with English stop words removed and the
# vocabulary capped at 30 terms.
name_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    max_features=30,
)
pipe_text = Pipeline([('text_vec', name_vectorizer)])

# Disabled preview of the resulting document-term matrix:
#pd.DataFrame(pipe_text.fit_transform(X_train['name']).toarray(), columns=pipe_text.get_feature_names_out())

In [None]:

# Route each column group to its sub-pipeline. The numeric pipeline is currently
# disabled, so numeric columns reach the model unchanged via `remainder`.
column_routes = [
    #('num', pipe_num, num_cols),
    ('text', pipe_text, 'name'),
    ('cat', pipe_cat, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_routes, remainder='passthrough')

# Preview the transformed training matrix with its generated feature names.
train_matrix = preprocess.fit_transform(X_train)
pd.DataFrame(train_matrix, columns=preprocess.get_feature_names_out())

In [28]:

# End-to-end LightGBM pipeline:
#   1. group infrequent `neighbourhood` labels (frequency threshold tol=0.03),
#   2. apply the column-wise preprocessing,
#   3. fit a LightGBM regressor with default hyper-parameters.
rare_neighbourhoods = RareLabelEncoder(tol=0.03, variables=['neighbourhood'])
model = Pipeline([
    ('rare', rare_neighbourhoods),
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor()),
])


In [29]:
# Fit on the training split, then display the pipeline's default score on the
# hold-out split (the target is log-price, so the score is on that scale).
model_fit = model.fit(X_train, y_train)
holdout_score = model_fit.score(X_test, y_test)
holdout_score


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002877 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1511
[LightGBM] [Info] Number of data points in the train set: 23711, number of used features: 58
[LightGBM] [Info] Start training from score 4.704162


0.6583243034578063

In [None]:
# Hold-out predictions. The model was trained on np.log(price), so these values
# are log-prices, not prices.
y_pred = model_fit.predict(X_test)
y_pred

In [None]:
# y_test / y_pred are log-prices (the target was built as np.log(price)).
# mean_squared_log_error applies its own log1p internally, so feeding it
# log-prices would measure a log-of-log error. Undo the log first so that RMSLE
# is computed on actual prices.
price_true = np.exp(y_test)
price_pred = np.exp(y_pred)
rmsle = np.sqrt(mean_squared_log_error(price_true, price_pred))

# R2 is intentionally left on the log-price scale the model was trained on.
r2 = r2_score(y_test, y_pred)

print(f"RMSLE for LGBM: {rmsle},\nR2 for LGBM: {r2}")

In [None]:
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor, AdaBoostRegressor
)
from xgboost import XGBRegressor

# Alternative regressors, all with library-default hyper-parameters, evaluated
# with the same rare-label grouping and preprocessing as the LightGBM pipeline.
models = [
    ('huber', HuberRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('ada', AdaBoostRegressor()),
    ('xgb', XGBRegressor())
]

# The loop variable is deliberately NOT named `model`: that name holds the
# LightGBM pipeline which the hyper-parameter search further down still needs.
# Rebinding it here would hand a bare XGBRegressor to the search on a
# top-to-bottom run.
for name, estimator in models:
    pipe = Pipeline(steps=[
        ('rare', RareLabelEncoder(tol=0.03, variables=['neighbourhood'])),
        ('preprocessor', preprocess),
        (name, estimator)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Targets and predictions are log-prices. R2 stays on that scale; RMSLE is
    # computed on prices (np.exp undoes the log) because mean_squared_log_error
    # applies log1p itself. np.exp also keeps the inputs positive, so a negative
    # log-price prediction cannot make the metric raise.
    r2 = r2_score(y_test, preds)
    rmsle = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(preds)))
    print(f"""{name} R2: {r2}
          {name} RMSLE: {rmsle}""")
    

## Otimização dos parâmetros do modelo

In [None]:
# Search space for the LightGBM pipeline. Keys follow scikit-learn's
# `<step>__<param>` convention; the last entry reaches into the text
# vectorizer nested inside the preprocessor.
lgbm_grid = dict(
    lgbm__num_leaves=[7, 14, 21],
    lgbm__learning_rate=[0.1, 0.03, 0.001],
    lgbm__max_depth=[-1, 3, 5],
    lgbm__n_estimators=[200, 500, 1000],
    preprocessor__text__text_vec__max_features=[10, 20, 50, 150],
)

# List every tunable parameter name to double-check the keys above.
model.get_params().keys()

In [30]:
# Randomized search over `lgbm_grid`: 10 sampled candidates x 5-fold CV.
# The target passed to fit() is log(price), so plain (negated) MSE on that
# target is already a squared *log* error on prices. The previous
# 'neg_mean_squared_log_error' scorer applied a second log on top of it.
lgbm_tune = RandomizedSearchCV(
    model,
    param_distributions = lgbm_grid,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True,
    n_iter = 10,
    verbose = 1,
    random_state = 42  # fixes which candidates get sampled
)

In [31]:
# Run the cross-validated search on the training split only; the hold-out split
# is not passed to the search.
lgbm_tune.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1446
[LightGBM] [Info] Number of data points in the train set: 18968, number of used features: 38
[LightGBM] [Info] Start training from score 4.702090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1446
[LightGBM] [Info] Number of data points in the train set: 18969, number of used features: 38
[LightGBM] [Info] Start training from score 4.703432
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002185 seconds.
You can set `force_row_

In [43]:
from pathlib import Path
from joblib import dump

# Best pipeline found by the search.
best_lgbm = lgbm_tune.best_estimator_

# Make sure the output directory exists so dump() does not fail on a fresh
# checkout. The path is passed to dump() as the original string so the
# displayed return value is unchanged.
model_path = '../models/best_lgbm.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(best_lgbm, model_path)

['../models/best_lgbm.pkl']

In [None]:
# Mean cross-validated score of the best candidate. The search uses a `neg_*`
# scorer, so this is a negated error: values closer to 0 are better.
lgbm_tune.best_score_