## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

from pycaret.regression import *

from feature_engine.encoding import RareLabelEncoder

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)
from sklearn.metrics import mean_squared_error, r2_score

ValueError: Sklearn version error

In [None]:
# Load the training CSV for the price-prediction task into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Data/train.csv")

In [None]:
# Display pandas' summary statistics for df as the cell output.
df.describe()

In [None]:
# Print pandas' concise summary of df (columns, non-null counts, dtypes).
df.info()

In [None]:
# Parse the raw last_review values into pandas datetimes, in place on df.
df["last_review"] = df["last_review"].pipe(pd.to_datetime)

In [None]:
# Display the last_review column as the cell output.
df['last_review']

In [None]:
# Count the missing values in each column of df.
df.isnull().sum()

In [None]:
# Store the target column as 64-bit floats.
df["price"] = df["price"].astype(np.float64)

# Display the dtype of every column as the cell output.
df.dtypes

In [None]:
# Modelling frame for PyCaret: remove the listed identifier, free-text and
# date columns, then discard every remaining column that has a missing value.
df_filtered = (
    df
    .drop(columns=['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review'])
    .dropna(axis="columns")
)


In [None]:
# Initialise the PyCaret regression experiment on the filtered frame, with
# "price" as the target.
#
# The result is bound to `experiment`, not `setup`: assigning to `setup` would
# overwrite the PyCaret function brought in by `from pycaret.regression
# import *`, so running this cell a second time would call the previous
# return value instead of the function and fail.
#
# NOTE(review): `combine_rare_levels` looks like a PyCaret 2.x argument (3.x
# appears to use `rare_to_value`) -- confirm against the installed version.
experiment = setup(data=df_filtered, target="price", combine_rare_levels=True)

In [None]:
# Cross-validate PyCaret's candidate regressors with 5 folds, ranking the
# score grid by RMSE.
models = compare_models(
    sort='rmse',
    fold=5,
)

# Display whatever compare_models returned as the cell output.
models

In [None]:
# Identifier, free-text and date columns that are not used as features.
non_feature_cols = ['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review']

# Remove rows with a missing target *before* deriving X and y. Dropping NaNs
# from y alone would leave X with more rows than y, and train_test_split
# rejects inputs of inconsistent length.
labelled = df.dropna(subset=['price'])

# Features: everything except the target and the non-feature columns. Any
# remaining column that still contains a missing value is dropped entirely.
X = labelled.drop(columns=['price'] + non_feature_cols).dropna(axis=1)
y = labelled['price']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Names of the numeric feature columns.
num_cols = X.select_dtypes(include=np.number).columns.to_list()

In [None]:
# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
pipe_cat = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: PowerTransformer with its default settings.
pipe_num = Pipeline([
    ('yeo', PowerTransformer()),
])

In [None]:
# Route the numeric columns through pipe_num and the two categorical columns
# through pipe_cat.
preprocess = ColumnTransformer([
    ('num', pipe_num, num_cols),
    ('cat', pipe_cat, ['neighbourhood_group', 'room_type']),
])

In [None]:
# End-to-end estimator: column preprocessing followed by a LightGBM regressor
# with default hyper-parameters. The step names ('preprocessor', 'lgbm') are
# the prefixes used when addressing nested parameters.
model = Pipeline([
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor()),
])

In [None]:
# Fit the pipeline on the training split. Pipeline.fit returns the pipeline
# itself, so `lgbm` is another name for the now-fitted `model`.
model.fit(X_train, y_train)
lgbm = model

# Predict prices for the hold-out rows.
lgbm_pred = lgbm.predict(X_test)

In [None]:
# Report the hold-out RMSE. The root is taken explicitly with np.sqrt because
# the `squared=False` argument of mean_squared_error is deprecated in recent
# scikit-learn releases and removed in newer ones; this form works on both.
print(f"RMSE for LGBM: {np.sqrt(mean_squared_error(y_test, lgbm_pred))}")

## Model hyperparameter optimization

In [None]:
# Candidate hyper-parameter values for the search. Each key carries the
# "lgbm__" prefix so it addresses the pipeline step named 'lgbm'.
lgbm_grid = dict(
    lgbm__num_leaves=[7, 14, 21],
    lgbm__learning_rate=[0.1, 0.03, 0.003],
    lgbm__max_depth=[-1, 3, 5],
    lgbm__n_estimators=[200, 500, 1000],
)

In [None]:
# Randomised hyper-parameter search over lgbm_grid for the full pipeline:
# 3-fold cross-validation, scored by negative RMSE (higher is better), with
# training scores recorded alongside validation scores and a fixed seed for
# reproducible sampling.
lgbm_tune = RandomizedSearchCV(
    estimator=model,
    param_distributions=lgbm_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    random_state=42,
    return_train_score=True,
    verbose=3,
)

In [None]:
# Run the search on the training split only. Fitting on the full X, y would
# let the hold-out rows in X_test influence hyper-parameter selection, so a
# later evaluation on X_test would no longer be on unseen data.
lgbm_tune.fit(X_train, y_train)

In [None]:
# Display the search's best_estimator_ attribute as the cell output.
lgbm_tune.best_estimator_