## NYC Airbnb Price Prediction





In [1]:
import numpy as np 
import pandas as pd

# from pycaret.regression import *

from feature_engine.encoding import RareLabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
OneHotEncoder, PowerTransformer, FunctionTransformer
)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    KFold, RandomizedSearchCV, train_test_split
)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
df = pd.read_csv("Data/airbnb_imputed.csv")

In [3]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0,34202.0
mean,19089880.0,67761590.0,40.72884,-73.952123,152.040699,7.058213,23.20493,1.157316,7.180516,113.557862
std,10980800.0,78768300.0,0.054628,0.046084,234.546984,20.709473,44.327434,1.593326,32.970576,131.911958
min,2539.0,2438.0,40.50641,-74.24285,0.0,1.0,0.0,0.01,1.0,0.0
25%,9512642.0,7858259.0,40.68992,-73.98302,69.0,1.0,1.0,0.12,1.0,0.0
50%,19763330.0,30871980.0,40.7229,-73.95568,105.0,3.0,5.0,0.51,1.0,46.0
75%,29229940.0,107434400.0,40.763148,-73.936123,175.0,5.0,23.0,1.59,2.0,230.0
max,36487240.0,274321300.0,40.91306,-73.7169,10000.0,1250.0,629.0,58.5,327.0,365.0


In [4]:
df.describe(include='object')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,room_type,last_review
count,34202,34202,34202,34202,34202,34202
unique,33689,9149,5,217,3,1675
top,Hillside Hotel,Michael,Manhattan,Williamsburg,Entire home/apt,2019-06-23
freq,15,309,15134,2719,17801,1213


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34202 entries, 0 to 34201
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              34202 non-null  int64         
 1   name                            34202 non-null  object        
 2   host_id                         34202 non-null  int64         
 3   host_name                       34202 non-null  object        
 4   neighbourhood_group             34202 non-null  object        
 5   neighbourhood                   34202 non-null  object        
 6   latitude                        34202 non-null  float64       
 7   longitude                       34202 non-null  float64       
 8   room_type                       34202 non-null  object        
 9   price                           34202 non-null  float64       
 10  minimum_nights                  34202 non-null  int64         
 11  nu

In [18]:
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [4]:
df.query("price == 0")
df = df[df['price']>0]

In [5]:
df['last_review'] = pd.to_datetime(df['last_review'])

In [6]:
df['price'] = df['price'].astype("float64")
df['name'] = df['name'].astype("str")
df.dtypes

id                                         int64
name                                      object
host_id                                    int64
host_name                                 object
neighbourhood_group                       object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

In [45]:
df['year'] = df.last_review.dt.year
df['month'] = df.last_review.dt.month

In [None]:
df_filtered = df\
    .drop(['host_id', 'host_name', 'neighbourhood', 'name', 'id', 'last_review'], axis = 1)\
    .dropna(axis = 1)


In [None]:
setup = setup(data = df_filtered, target = "price", combine_rare_levels=True)

In [None]:
models = compare_models(fold = 5, sort='rmse')
models

In [47]:
X = df.drop(['price', 'host_id', 'host_name', 'id', 'last_review'], axis = 1)
y = df['price']

num_cols = X.select_dtypes(include = np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude = np.number).drop(['name'], axis=1).columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [48]:
pipe_cat = Pipeline(steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

pipe_num = Pipeline(steps = [
    ('yeo', PowerTransformer())
])

In [49]:
preprocess = ColumnTransformer([
    ('num', pipe_num, num_cols),
    #('rare', RareLabelEncoder(max_n_categories=10), 'neighbourhood'),
    ('text_vec', TfidfVectorizer(), 'name'),
    ('cat', pipe_cat, ['neighbourhood_group', 'room_type'])
])

In [50]:
model = Pipeline(steps = [
    ('preprocessor', preprocess),
    ('lgbm', LGBMRegressor())
])

In [51]:
lgbm = model.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_test)

  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [36]:
print(f"RMSE for LGBM: {np.sqrt(mean_squared_error(y_test, lgbm_pred))},\
\nMAE for LGBM: {mean_absolute_error(y_test,lgbm_pred)}")

RMSE for LGBM: 211.08163229926868,
MAE for LGBM: 63.273356957172346


## Otimização dos parâmetros do modelo

In [15]:
lgbm_grid = {
    'lgbm__num_leaves': [7, 14, 21],
    'lgbm__learning_rate': [0.1, 0.03, 0.001],
    'lgbm__max_depth': [-1, 3, 5],
    'lgbm__n_estimators': [200, 500, 1000],
    'preprocessor__text_vec__max_features': [10,20,50]
}

In [38]:
lgbm_tune = RandomizedSearchCV(
    model, 
    param_distributions = lgbm_grid,
    cv = 5,
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    n_iter = 15,
    verbose = 3,
    random_state = 42
)

In [39]:
lgbm_tune.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-122.569, test=-233.958) total time=   1.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-114.231, test=-254.147) total time=   2.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-112.903, test=-259.614) total time=   2.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-124.312, test=-182.890) total time=   1.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-119.775, test=-187.831) total time=   1.9s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-168.036, test=-223.221) total time=   0.9s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-160.206, test=-252.952) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-159.711, test=-255.445) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-176.541, test=-171.472) total time=   1.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-172.008, test=-178.650) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-191.533, test=-218.558) total time=   2.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-183.397, test=-249.954) total time=   1.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-181.494, test=-254.215) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-202.610, test=-168.940) total time=   1.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-198.765, test=-186.288) total time=   3.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=20;, score=(train=-222.943, test=-224.198) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=20;, score=(train=-214.641, test=-257.385) total time=   1.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=20;, score=(train=-212.271, test=-262.817) total time=   1.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=20;, score=(train=-233.090, test=-178.228) total time=   1.6s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=20;, score=(train=-230.910, test=-189.121) total time=   1.6s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=1000, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-217.852, test=-220.447) total time=   2.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=1000, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-209.758, test=-254.302) total time=   2.6s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=1000, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-206.704, test=-259.708) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=1000, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-228.258, test=-174.047) total time=   2.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=1000, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-225.997, test=-185.553) total time=   2.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-222.823, test=-224.634) total time=   1.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-213.963, test=-258.336) total time=   1.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-211.823, test=-263.208) total time=   1.5s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-232.868, test=-178.842) total time=   1.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=14, preprocessor__text_vec__max_features=20;, score=(train=-230.646, test=-189.715) total time=   1.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-221.711, test=-222.871) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-213.040, test=-255.955) total time=   2.5s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-211.077, test=-260.914) total time=   2.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-231.776, test=-176.110) total time=   2.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-229.342, test=-187.674) total time=   2.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-194.218, test=-219.262) total time=   1.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-183.912, test=-248.505) total time=   1.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-184.672, test=-254.150) total time=   1.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-204.643, test=-166.759) total time=   1.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.03, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=21, preprocessor__text_vec__max_features=10;, score=(train=-200.341, test=-179.521) total time=   1.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-187.608, test=-220.864) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-176.447, test=-249.770) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-174.294, test=-254.962) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-196.096, test=-168.726) total time=   0.9s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.1, lgbm__max_depth=-1, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-189.624, test=-180.204) total time=   0.9s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.03, lgbm__max_depth=3, lgbm__n_estimators=200, lgbm__num_leaves=14, preprocessor__text_vec__max_features=50;, score=(train=-210.151, test=-218.198) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.03, lgbm__max_depth=3, lgbm__n_estimators=200, lgbm__num_leaves=14, preprocessor__text_vec__max_features=50;, score=(train=-201.899, test=-251.297) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.03, lgbm__max_depth=3, lgbm__n_estimators=200, lgbm__num_leaves=14, preprocessor__text_vec__max_features=50;, score=(train=-199.962, test=-255.173) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.03, lgbm__max_depth=3, lgbm__n_estimators=200, lgbm__num_leaves=14, preprocessor__text_vec__max_features=50;, score=(train=-220.556, test=-171.111) total time=   0.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.03, lgbm__max_depth=3, lgbm__n_estimators=200, lgbm__num_leaves=14, preprocessor__text_vec__max_features=50;, score=(train=-218.070, test=-181.806) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.1, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-184.069, test=-224.302) total time=   1.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.1, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-179.753, test=-249.850) total time=   1.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.1, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-172.151, test=-257.128) total time=   1.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.1, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-193.585, test=-168.609) total time=   1.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.1, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=10;, score=(train=-192.493, test=-182.549) total time=   1.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-221.713, test=-222.872) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-213.104, test=-255.725) total time=   2.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-211.077, test=-260.914) total time=   2.1s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-231.776, test=-176.110) total time=   2.0s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=3, lgbm__n_estimators=1000, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-229.343, test=-187.675) total time=   1.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-225.039, test=-225.836) total time=   1.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-216.108, test=-259.112) total time=   1.5s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-214.431, test=-263.792) total time=   1.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-235.073, test=-180.017) total time=   1.4s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=5, lgbm__n_estimators=500, lgbm__num_leaves=7, preprocessor__text_vec__max_features=50;, score=(train=-232.926, test=-190.489) total time=   1.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-227.786, test=-229.523) total time=   1.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-219.228, test=-262.125) total time=   1.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-217.641, test=-266.501) total time=   1.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-237.816, test=-183.931) total time=   1.3s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.001, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=21, preprocessor__text_vec__max_features=50;, score=(train=-235.601, test=-194.142) total time=   1.2s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 1/5] END lgbm__learning_rate=0.03, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=7, preprocessor__text_vec__max_features=20;, score=(train=-206.407, test=-216.444) total time=   0.8s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 2/5] END lgbm__learning_rate=0.03, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=7, preprocessor__text_vec__max_features=20;, score=(train=-198.473, test=-250.843) total time=   0.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 3/5] END lgbm__learning_rate=0.03, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=7, preprocessor__text_vec__max_features=20;, score=(train=-195.955, test=-254.573) total time=   0.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 4/5] END lgbm__learning_rate=0.03, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=7, preprocessor__text_vec__max_features=20;, score=(train=-216.049, test=-167.934) total time=   0.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[CV 5/5] END lgbm__learning_rate=0.03, lgbm__max_depth=-1, lgbm__n_estimators=200, lgbm__num_leaves=7, preprocessor__text_vec__max_features=20;, score=(train=-214.656, test=-179.968) total time=   0.7s


  out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [40]:
lgbm_tune.best_estimator_

In [41]:
lgbm_tune.best_score_

-213.6394921368345

In [42]:
pd.DataFrame(lgbm_tune.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__text_vec__max_features,param_lgbm__num_leaves,param_lgbm__n_estimators,param_lgbm__max_depth,param_lgbm__learning_rate,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.065219,0.457989,0.207682,0.009255,10,21,1000,-1,0.1,"{'preprocessor__text_vec__max_features': 10, '...",...,-223.688142,32.477958,13,-122.569327,-114.230902,-112.902704,-124.311545,-119.775339,-118.757964,4.498492
1,0.83952,0.102549,0.128528,0.003772,10,21,200,-1,0.1,"{'preprocessor__text_vec__max_features': 10, '...",...,-216.348118,35.638959,6,-168.035974,-160.205847,-159.710738,-176.540518,-172.008218,-167.300259,6.572976
2,2.033438,0.447034,0.176899,0.013364,50,21,500,5,0.03,"{'preprocessor__text_vec__max_features': 50, '...",...,-215.59094,33.812618,5,-191.533195,-183.39726,-181.494215,-202.609543,-198.765413,-191.559925,8.269963
3,1.695555,0.176197,0.149116,0.003598,20,21,500,5,0.001,"{'preprocessor__text_vec__max_features': 20, '...",...,-222.349773,34.407237,11,-222.942864,-214.641413,-212.271487,-233.090351,-230.909508,-222.771125,8.355642
4,2.254412,0.175176,0.1757,0.00705,20,14,1000,5,0.001,"{'preprocessor__text_vec__max_features': 20, '...",...,-218.811531,34.770052,8,-217.852439,-209.758255,-206.704027,-228.25819,-225.996546,-217.713891,8.53596
5,1.375245,0.038341,0.139121,0.002924,20,14,500,-1,0.001,"{'preprocessor__text_vec__max_features': 20, '...",...,-222.947197,34.426212,12,-222.82349,-213.963153,-211.822786,-232.8685,-230.646071,-222.4248,8.494955
6,2.148173,0.177182,0.184695,0.014488,50,21,1000,3,0.001,"{'preprocessor__text_vec__max_features': 50, '...",...,-220.704846,34.47834,10,-221.711496,-213.039792,-211.077288,-231.776252,-229.341658,-221.389297,8.33408
7,0.981439,0.046052,0.157311,0.003876,10,21,500,5,0.03,"{'preprocessor__text_vec__max_features': 10, '...",...,-213.639492,35.355738,1,-194.217681,-183.91184,-184.672008,-204.643038,-200.34135,-193.557183,8.262401
8,0.801342,0.022414,0.133125,0.005032,10,7,500,-1,0.1,"{'preprocessor__text_vec__max_features': 10, '...",...,-214.905366,35.19184,3,-187.607532,-176.447394,-174.29445,-196.095881,-189.624107,-184.813873,8.232652
9,0.785951,0.040275,0.12093,0.002605,50,14,200,3,0.03,"{'preprocessor__text_vec__max_features': 50, '...",...,-215.517014,34.549409,4,-210.150859,-201.899352,-199.962196,-220.555924,-218.070247,-210.127716,8.28101
