# 0.0. IMPORTS

In [1]:
import re
import pickle

import pandas            as pd
import numpy             as np
import seaborn           as sns
import matplotlib.pyplot as plt
import xgboost           as xgb
import lightgbm          as lgb

from sklearn.preprocessing   import MinMaxScaler
from sklearn.ensemble        import RandomForestRegressor
from sklearn.metrics         import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing   import LabelEncoder
from sklearn.linear_model    import LinearRegression, Lasso
from category_encoders       import TargetEncoder
from sklearn.preprocessing   import OneHotEncoder
from category_encoders.count import CountEncoder


from sklearn               import model_selection   as ms
from sklearn               import ensemble          as en

  from pandas import MultiIndex, Int64Index


## 0.1. Aux Functions

In [2]:
def settings():
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25,12]
    plt.rcParams['font.size'] = 24
    plt.rcParams['figure.dpi'] = 100
    sns.set()


def ml_error( model_name, ytest, yhat ):
    mae = mean_absolute_error( ytest, yhat )
    mape = mean_absolute_percentage_error( ytest, yhat )
    rmse = np.sqrt( mean_squared_error( ytest, yhat ) )
    
    return pd.DataFrame( {'Model name': model_name,
                          'MAE': mae,
                          'MAPE': mape,
                          'RMSE': rmse }, index=[0] )

In [3]:
settings()

## 0.2. Reading Data

In [4]:
df_raw=pd.read_csv('work/treino.csv')
df_test=pd.read_csv('work/teste.csv')

# 1.0. DESCRIÇÃO DOS DADOS

In [5]:
df1=df_raw.copy()

## 1.1. Dimensão dos Dados

In [6]:
df1.shape

(39446, 29)

## 1.2. NA Check

In [7]:
df1.isna().sum()

ID                                                                0
num_fotos                                                       237
marca                                                             0
modelo                                                            0
versao                                                            0
ano_de_fabricacao                                                 0
ano_modelo                                                        0
odometro                                                          0
cambio                                                            0
num_portas                                                        0
tipo                                                              0
blindado                                                          0
cor                                                               0
tipo_vendedor                                                     0
cidade_vendedor                                 

### 1.2.1. Fillout NA

In [8]:
# # num_fotos - considerar nan como 0 fotos > ordinal encoding
df1['num_fotos']=df1['num_fotos'].fillna(0)

## 1.3. Tipos dos Dados

In [9]:
df1.dtypes

ID                                                             object
num_fotos                                                     float64
marca                                                          object
modelo                                                         object
versao                                                         object
ano_de_fabricacao                                               int64
ano_modelo                                                    float64
odometro                                                      float64
cambio                                                         object
num_portas                                                      int64
tipo                                                           object
blindado                                                       object
cor                                                            object
tipo_vendedor                                                  object
cidade_vendedor     

### 1.3.1. Change data types

In [10]:
df1['num_fotos']=df1['num_fotos'].astype(int)
df1['ano_modelo']=df1['ano_modelo'].astype(int)

# 2.0. FEATURE ENGINEERING

In [11]:
df2 = df1.copy()

In [12]:
# ano de fabricacao one hot
bin1 = [1985, 2000]
bin2 = [2000, 2005]
bin3 = [2005, 2010]
bin4 = [2010, 2015]
bin5 = [2015, 2020]
bin6 = [2020, 2025]
bins = [bin1, bin2, bin3, bin4, bin5, bin6]

year_bins = []

for i in range(df2.shape[0]):
    tmp = [0 for i in range(len(bins))]
    ye = df2['ano_de_fabricacao'][i]
    for j in range(len(bins)):
        if ye >= bins[j][0] and ye < bins[j][1]:
            tmp[j] = 1
    year_bins.append(tmp)
year_bins = np.asarray(year_bins)
new_df = pd.DataFrame(year_bins, columns=['Year_Bin'+str(i+1) for i in range(len(bins))])
df2 = pd.concat( [df2, new_df], axis=1 )
# df2 = df2.drop(columns=['ano_de_fabricacao'])

# categoria marca
popular_baixo_padrao = ['FIAT','SUZUKI','CHEVROLET','SMART','HYUNDAI','LIFAN','SSANGYONG','RENAULT','DODGE','ALFA ROMEO',
                        'CITROËN','CHRYSLER','BRM','EFFA']

popular_alto_padrao = ['JEEP','SUBARU','FORD','KIA','CHERY','PEUGEOT','VOLKSWAGEN','NISSAN','JAC','HONDA','MITSUBISHI']

luxo = ['VOLVO','LEXUS','MERCEDES-BENZ','FERRARI','AUDI','TOYOTA','IVECO','MINI','TROLLER']

superluxo = ['PORSCHE','RAM','LAMBORGHINI','JAGUAR','LAND ROVER','MASERATI','BMW']

df2['categoria_marca'] = df2['marca'].apply( lambda x: 'popular_baixo_padrao' if x in popular_baixo_padrao else
                                                       'popular_alto_padrao' if x in popular_alto_padrao else
                                                       'luxo' if x in luxo else
                                                       'superluxo' if x in superluxo 
                                                        else 'nao_identificado' )
# cilindrada
cilindradas = []

for i in range(len(df2)):
    try:
        cilindrada = re.search( "[0-9]{1}.[0-9]{1}", df2['versao'][i] )[0]
    except:
        cilindrada = "desconhecida"
        
    cilindradas.append( cilindrada )
    
df_cilindradas = pd.DataFrame (cilindradas, columns = ['cilindrada'])
df2 = pd.concat( [df2,df_cilindradas],axis=1)

# turbo
turbo_list = []

for i in range(len(df2)):
    try:
        turbo = re.search( "TURBO", df2['versao'][i] )[0]
    except:
        turbo = "NÃO TURBO"
        
    turbo_list.append( turbo )
    
df_turbo = pd.DataFrame (turbo_list, columns = ['turbo'])
df2 = pd.concat( [df2,df_turbo],axis=1)

# 4x4
offroad_list = []

for i in range(len(df2)):
    try:
        offroad = re.search( "4X4", df2['versao'][i] )[0]
    except:
        offroad = "NÃO 4x4"
        
    offroad_list.append( offroad )
    
df_offroad = pd.DataFrame (offroad_list, columns = ['offroad'])
df2 = pd.concat( [df2,df_offroad],axis=1)

# combustivel
df2['combustivel'] = df2['versao'].apply( lambda x: re.search( "GASOLINA", x )[0] if re.search( "GASOLINA", x ) is not None else 
                                                    re.search( "FLEX", x )[0] if re.search( "FLEX", x ) is not None else
                                                    re.search( "HYBRID", x )[0] if re.search( "HYBRID", x ) is not None else
                                                    re.search( "DIESEL", x )[0] if re.search( "DIESEL", x ) is not None else
                                                    re.search( "ELECTIRC", x )[0] if re.search( "ELECTIRC", x ) is not None else
                                                    re.search( "ELÉTRICO", x )[0] if re.search( "ELÉTRICO", x ) is not None else
                                                    "HYBRID" if re.search( "HÍBRIDO", x ) is not None else
                                                    re.search( "GÁS", x )[0] if re.search( "GÁS", x ) is not None else "DESCONHECIDO")

In [13]:
df2['cilindrada'] = df2['cilindrada'].apply( lambda x: "0" if x == "desconhecida" else x )
df2['cilindrada'] = df2['cilindrada'].astype(float)
# df2['faixa_cilindrada'] = df2['cilindrada'].apply( lambda x: "ate_1.0" if x <= 1.0 else
#                                                 "ate_1.6" if x <= 1.6 else
#                                                 "ate_2.0" if x <= 2.0 else
#                                                 "ate_2.5" if x<=2.5 else
#                                                 "ate_3.0" if x<=3.0 else
#                                                 "ate_5.0" if x<=5.0 else
#                                                 "mais_de_5.0" if x >5.0 else "desconhecido" )
# df2 = df2.drop(columns=['cilindrada'])

# kilometragem
km = []
for i in range(len(df2)):
    if df2['odometro'][i] / ( 2022 - df2['ano_de_fabricacao'][i] ) < 10000:
        kilometragem = "pouco_usado"
    if df2['odometro'][i] / ( 2022 - df2['ano_de_fabricacao'][i] ) <= 20000:
        kilometragem = "na_media"
    else:
        kilometragem = "muito_usado"
        
    km.append(kilometragem)
    
df_km = pd.DataFrame (km, columns = ['km'])
df2 = pd.concat( [df2,df_km],axis=1)

df2['antigo_atual'] = df2['ano_de_fabricacao'].apply( lambda x: "antigo" if x < 2011 else "novo" )

# valvulas
df2['valvulas'] = df2['versao'].apply( lambda x: re.search( "[0-9]{2}V", x )[0] if re.search( "[0-9]{2}V", x ) is not None else 
                                                    re.search( "[0-9]{1}V", x )[0] if re.search( "[0-9]{1}V", x ) is not None
                                                    else "DESCONHECIDO" )

  if df2['odometro'][i] / ( 2022 - df2['ano_de_fabricacao'][i] ) < 10000:
  if df2['odometro'][i] / ( 2022 - df2['ano_de_fabricacao'][i] ) <= 20000:


# 3.0. FILTRAGEM DE VARIÁVEIS

In [14]:
df3 = df2.copy()

## 3.1. Filtragem das Linhas

In [15]:
# # substituindo ano de fabricacao caso a diferença para o ano modelo seja superior a 1 ano
# df3['dif_ano'] = df3.apply(lambda x: x['ano_modelo'] - x['ano_de_fabricacao'], axis=1)
# df3[df3['dif_ano'] > 1].count()

# df3['ano_de_fabricacao'] = df3.apply(lambda x: (x['ano_modelo']-1) if x['dif_ano'] > 1 else x['ano_de_fabricacao'], axis=1)

## 3.2. Seleção das Colunas

In [16]:
# df3 = df3.drop( columns=['elegivel_revisao','attr_veiculo_alienado'])

# 4.0. EDA

In [17]:
df4 = df3.copy()

# 5.0. DATA PREPARATION

In [18]:
df5 = df4.copy()

## 5.1. Encoding

In [19]:
# colunas pra dropar: id
df5 = df5.drop( columns=['ID','ano_de_fabricacao'] )

# binario (0/1): blindado, troca, elegivel_revisao, attr_veiculo_aceita_troca, attr_veiculo_único_dono,
# attr_veiculo_todas_as_revisões_feitas_pela_concessionária, attr_veiculo_ipva_pago, attr_veiculo_licenciado
# attr_veiculo_ipva_pago, attr_veiculo_licenciado, attr_veiculo_garantia_de_fábrica
df5['offroad'] = df5['offroad'].apply( lambda x: 1 if x == '4X4' else 0 )
df5['turbo'] = df5['turbo'].apply( lambda x: 1 if x == 'TURBO' else 0 )
df5['blindado'] = df5['blindado'].apply( lambda x: 0 if x == "false" else 1 if x=="true" else 0 )
df5['troca'] = df5['troca'].apply( lambda x: 0 if x == "false" else 1 if x=="true" else 0 )
#df5['elegivel_revisao'] = df5['elegivel_revisao'].apply( lambda x: 0 if x == "false" else 1 if x=="true" else 0 )
# df5['entrega_delivery'] = df5['entrega_delivery'].apply( lambda x: 0 if x == "false" else 1 if x=="true" else 0 )
# df5['attr_veiculo_aceita_troca'] = df5['attr_veiculo_aceita_troca'].apply( lambda x: 0 if x == "Aceita troca" else 0 )
# df5['attr_veiculo_único_dono'] = df5['attr_veiculo_único_dono'].apply( lambda x: 0 if x == "Único dono" else 0 )
# df5['attr_veiculo_todas_as_revisões_feitas_pela_concessionária'] = df5['attr_veiculo_todas_as_revisões_feitas_pela_concessionária'].apply( lambda x: 0 if x == "Todas as revisões feitas pela concessionária" else 0 )
# df5['attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro'] = df5['attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro'].apply( lambda x: 0 if x == "Todas as revisões feitas pela agenda do carro" else 0 )
# df5['attr_veiculo_ipva_pago'] = df5['attr_veiculo_ipva_pago'].apply( lambda x: 0 if x == "IPVA pago" else 0 )
# df5['attr_veiculo_licenciado'] = df5['attr_veiculo_licenciado'].apply( lambda x: 0 if x == "Licenciado" else 0 )
# df5['attr_veiculo_garantia_de_fábrica'] = df5['attr_veiculo_garantia_de_fábrica'].apply( lambda x: 0 if x == "Garantia de fábrica" else 0 )
df5 = df5.drop( columns = ['attr_veiculo_aceita_troca','attr_veiculo_único_dono','attr_veiculo_todas_as_revisões_feitas_pela_concessionária',
                           'attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro','attr_veiculo_ipva_pago','attr_veiculo_licenciado',
                           'attr_veiculo_garantia_de_fábrica','attr_veiculo_alienado','elegivel_revisao'] )

# dummies: marca, cambio, tipo, tipo_vendedor
encoder = OneHotEncoder(handle_unknown='ignore')

df_cambio = pd.DataFrame(encoder.fit_transform(df5[['cambio']]).toarray())
df_cambio.columns = encoder.get_feature_names_out()
df5 = df5.join(df_cambio)
pickle.dump( encoder, open( 'cambio_encoding', 'wb' ) )

df_tipo_vendedor = pd.DataFrame(encoder.fit_transform(df5[['tipo_vendedor']]).toarray())
df_tipo_vendedor.columns = encoder.get_feature_names_out()
df5 = df5.join(df_tipo_vendedor)
# df5['tipo_vendedor'] = ohe.fit_transform(df5[['tipo_vendedor']])
pickle.dump( encoder, open( 'tipo_vendedor_encoding', 'wb' ) )

df_tipo_anuncio = pd.DataFrame(encoder.fit_transform(df5[['tipo_anuncio']]).toarray())
df_tipo_anuncio.columns = encoder.get_feature_names_out()
df5 = df5.join(df_tipo_anuncio)
# df5['tipo_anuncio'] = ohe.fit_transform(df5[['tipo_anuncio']])
pickle.dump( encoder, open( 'tipo_anuncio_encoding', 'wb' ) )

df_categoria_marca = pd.DataFrame(encoder.fit_transform(df5[['categoria_marca']]).toarray())
df_categoria_marca.columns = encoder.get_feature_names_out()
df5 = df5.join(df_categoria_marca)
# df5['categoria_marca'] = ohe.fit_transform(df5[['categoria_marca']])
pickle.dump( encoder, open( 'categoria_marca_encoding', 'wb' ) )

# df_km = pd.DataFrame(encoder.fit_transform(df5[['km']]).toarray())
# df_km.columns = encoder.get_feature_names_out()
# df5 = df5.join(df_km)
# df5['categoria_marca'] = ohe.fit_transform(df5[['categoria_marca']])
# pickle.dump( encoder, open( 'categoria_marca_encoding', 'wb' ) )

df_combustivel = pd.DataFrame(encoder.fit_transform(df5[['combustivel']]).toarray())
df_combustivel.columns = encoder.get_feature_names_out()
df5 = df5.join(df_combustivel)
# df5['combustivel'] = ohe.fit_transform(df5[['combustivel']])
df5 = df5.drop( columns=['cambio','tipo_vendedor','tipo_anuncio','categoria_marca','combustivel',
                         'km'] )
pickle.dump( encoder, open( 'combustivel_encoding', 'wb' ) )

# frequency encoder/label encoder: modelo, versao
# fe_modelo = df5.groupby('modelo').size() / len(df5)
# df5.loc[:,'modelo'] = df5['modelo'].map(fe_modelo)

# fe_versao = df5.groupby('versao').size() / len(df5)
# df5.loc[:,'versao'] = df5['versao'].map(fe_versao)
te = TargetEncoder()
# le = LabelEncoder()
# df5['modelo'] = le.fit_transform(df5['modelo'])
# # df5['modelo'] = te.fit_transform(df5['modelo'], df5['preco'])
# pickle.dump( le, open( 'modelo_encoding', 'wb' ) )

df5['versao'] = te.fit_transform(df5['versao'], df5['preco'])
pickle.dump( te, open( 'versao_encoding', 'wb' ) )
# df5['versao'] = le.fit_transform(df5['versao'])

# # fe_marca = df5.groupby('marca').size() / len(df5)
# # df5.loc[:,'marca'] = df5['marca'].map(fe_marca)

# # fe_cor = df5.groupby('cor').size() / len(df5)
# # df5.loc[:,'cor'] = df5['cor'].map(fe_cor)

# # fe_tipo = df5.groupby('tipo').size() / len(df5)
# # df5.loc[:,'tipo'] = df5['tipo'].map(fe_tipo)

# label encoder
# le = LabelEncoder()
df5['marca'] = te.fit_transform(df5['marca'],df5['preco'])
pickle.dump( te, open( 'marca_encoding', 'wb' ) )
df5['cor'] = te.fit_transform(df5['cor'],df5['preco'])
pickle.dump( te, open( 'cor_encoding', 'wb' ) )
df5['tipo'] = te.fit_transform(df5['tipo'],df5['preco'])
pickle.dump( te, open( 'tipo_encoding', 'wb' ) )
df5['cidade_vendedor'] = te.fit_transform(df5['cidade_vendedor'],df5['preco'])
pickle.dump( te, open( 'cidade_vendedor_encoding', 'wb' ) )
df5['estado_vendedor'] = te.fit_transform(df5['estado_vendedor'],df5['preco'])
pickle.dump( te, open( 'estado_vendedor_encoding', 'wb' ) )

# cols = ['marca','cor','tipo','cidade_vendedor','estado_vendedor','versao','modelo']
# ce = CountEncoder(cols=cols, return_df=True)
# df5[cols] = te.fit_transform(df5[cols], df5['preco'])
# pickle.dump( ce, open( 'ce_encoder', 'wb' ) )

df5['modelo'] = te.fit_transform(df5['modelo'], df5['preco'])
pickle.dump( te, open( 'modelo_encoding_te', 'wb' ) )

# df5['cambio'] = te.fit_transform(df5['cambio'],df5['preco'])
# df5['tipo_anuncio'] = te.fit_transform(df5['tipo_anuncio'],df5['preco'])
# df5['tipo_vendedor'] = te.fit_transform(df5['tipo_vendedor'],df5['preco'])
# df5['categoria_marca'] = te.fit_transform(df5['categoria_marca'],df5['preco'])
# df5['combustivel'] = te.fit_transform(df5['combustivel'],df5['preco'])
# df5['km'] = te.fit_transform(df5['km'],df5['preco'])
df5['antigo_atual'] = te.fit_transform(df5['antigo_atual'],df5['preco'])
pickle.dump( te, open( 'antigo_atual_encoding', 'wb' ) )

df5['valvulas'] = te.fit_transform(df5['valvulas'],df5['preco'])
pickle.dump( te, open( 'valvulas_encoding', 'wb' ) )



## 5.2. Normalização

## 5.3. Rescaling

In [20]:
# mms = MinMaxScaler()

# df5['num_fotos'] = mms.fit_transform( df5[['num_fotos']].values )
# # df5['ano_modelo'] = mms.fit_transform( df5[['ano_modelo']].values )
# df5['odometro'] = mms.fit_transform( df5[['odometro']].values )
# df5['num_portas'] = mms.fit_transform( df5[['num_portas']].values )

## 5.4. Transformação

In [21]:
df5['preco'] = np.log1p(df5['preco'])

# 6.0. FEATURE SELECTION

In [22]:
df6 = df5.copy()

## 6.1. Manual Selection

## 6.2. Feature Importancia

In [23]:
# # model
# forest = en.ExtraTreesRegressor( n_estimators=250, random_state=0, n_jobs=-1 )

# # training
# x_train_fselection = df6.drop( ['preco'], axis=1 )
# y_train_fselection = df6['preco'].values
# forest.fit( x_train_fselection, y_train_fselection )

In [24]:
# importances = forest.feature_importances_
# std = np.std( [tree.feature_importances_ for tree in forest.estimators_], axis=0 )
# indices = np.argsort( importances )[::-1]

# # print the feature ranking
# df = pd.DataFrame()

# print( 'Feature Ranking:\n' )
# for i, j in zip( x_train_fselection,forest.feature_importances_ ):
#     aux = pd.DataFrame( {'feature': i, 'importance': j}, index=[0] )
#     df = pd.concat( [df, aux], axis=0 )
    
# print( df.sort_values( 'importance', ascending=False ) ) 

# # plot the impurity-based feature importances of the forest
# # plt.figure()
# # plt.title( 'Feature importances' )
# # plt.bar( range( x_train_fselection.shape[1] ), importances[indices], color='r', yerr=std[indices], align='center' )
# # plt.xticks( range(x_train_fselection.shape[1]), indices )
# # plt.xlim( [-1, x_train_fselection.shape[1]] )
# # # plt.show()

In [25]:
# cols = ['blindado','troca','combustivel_DESCONHECIDO']

## 6.3. Boruta Selection

# 7.0. MACHINE LEARNING MODELLING

In [26]:
df7 = df6.copy()

X = df7.drop(columns=["preco"])
# X = df7.drop(columns=cols)
Y = df7['preco'].copy()
# Y = df6['preco'].copy()

X_train, X_val, y_train, y_val = ms.train_test_split( X, Y, test_size=0.2, random_state=42 )

## 7.1. Linear Regression

In [27]:
# model training
lr = LinearRegression().fit( X_train, y_train )

# prediction
yhat_lr = lr.predict( X_val )

# performance (error)
lr_results = ml_error( 'Linear Regression', np.expm1( y_val ) , np.expm1( yhat_lr ) )
lr_results

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Linear Regression,29534.946712,0.228926,54773.260422


### 7.1.1. Cross Validation

## 7.3. Random Forest Regressor

In [28]:
# model
rf = RandomForestRegressor( n_estimators=150, n_jobs=-1, random_state=42 ).fit( X_train, y_train )

# prediction
yhat_rf = rf.predict( X_val )

# performance
rf_results = ml_error( 'Random Forest Regressor', np.expm1(y_val), np.expm1(yhat_rf) )
rf_results

KeyboardInterrupt: 

### 7.3.1. Cross Validation

## 7.5. XGBoost Regressor

In [None]:
# model
model_xgb = xgb.XGBRegressor( n_estimators=400 ).fit( X_train, y_train )

# prediction
yhat_xgb = model_xgb.predict( X_val )

# performance
model_xgb_results = ml_error( 'XGBoost Regressor',  np.expm1(y_val), np.expm1(yhat_xgb) )
model_xgb_results

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Unnamed: 0,Model name,MAE,MAPE,RMSE
0,XGBoost Regressor,25984.094792,0.204358,43125.821164


### 7.5.1. Cross Validation

## 7.6. LightGBM

In [29]:
# model
model_lgb = lgb.LGBMRegressor( n_jobs=-1, random_state=42, subsample_freq=1, max_bin=500, n_estimators = 768, max_depth = 18, learning_rate = 0.02391, num_leaves = 68, min_child_samples = 6, subsample = 0.959787, colsample_bytree = 0.6179 ).fit( X_train, y_train )

# prediction
yhat_lgb = model_lgb.predict( X_val )

# performance
model_lgb_results = ml_error( 'LightGBM Regressor',  np.expm1(y_val), np.expm1(yhat_lgb) )
model_lgb_results

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,LightGBM Regressor,24964.16048,0.195114,42172.434674


In [30]:
from sklearn.model_selection import KFold, cross_val_score
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.callbacks import CheckpointSaver

# Search space for hyper parameters
search_space = [Integer( 2, 700, name = 'max_bin'),
                Integer( 100, 1500, name = 'n_estimators'), 
                Integer(1, 20, name = 'max_depth'), 
                Real(0.001, 0.1, 'log-uniform', name = 'learning_rate'),
                Integer(2, 128, name = 'num_leaves'),
                Integer(1, 100, name = 'min_child_samples'),
                Real(0.05, 1.0, name = 'subsample'),
                Real(0.15, 1.0, name = 'colsample_bytree')]

# model definition
lgbm_model = lgb.LGBMRegressor(n_jobs=-1, random_state=42, subsample_freq=1)

# applying cross-validation into tunning
@use_named_args(search_space)
def model_eval( **params ):
	
    lgbm_model.set_params(**params)
    print(lgbm_model)
    kf = KFold(n_splits=10, random_state=42, shuffle=True)  
    ft_result = cross_val_score(lgbm_model, X, np.expm1(Y), scoring='neg_mean_absolute_error', cv=kf.split(X), n_jobs=-1, verbose=0)*-1
    return np.mean(ft_result)

# create checkpoints
checkpoint_callback = CheckpointSaver('lgbm_ft.pkl')

# return results and run bayesian optimize
result = gp_minimize( model_eval, search_space, n_calls = 100, 
                      n_initial_points = 10, verbose=True, n_jobs=-1, 
                      random_state= 42, callback=[checkpoint_callback])
result

Iteration No: 1 started. Evaluating function at random point.
LGBMRegressor(colsample_bytree=0.4336523194681686,
              learning_rate=0.01562069367563987, max_bin=558, max_depth=16,
              min_child_samples=11, n_estimators=357, num_leaves=58,
              random_state=42, subsample=0.48628644736757387, subsample_freq=1)
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 14.1211
Function value obtained: 25387.2070
Current minimum: 25387.2070
Iteration No: 2 started. Evaluating function at random point.
LGBMRegressor(colsample_bytree=0.6748592831835591,
              learning_rate=0.027796975515266827, max_bin=102, max_depth=2,
              min_child_samples=1, n_estimators=1011, num_leaves=120,
              random_state=42, subsample=0.9926009813266569, subsample_freq=1)
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 9.7084
Function value obtained: 26392.6489
Current minimum: 25387.2070
Iteration No: 3 started. Evaluating function 

KeyboardInterrupt: 

In [None]:
# join dfs
Full_X_train = pd.concat( [X_train, X_val], axis=0 )
Full_Y_train = pd.concat( [y_train, y_val], axis=0 )

# model
model_lgb_full = lgb.LGBMRegressor( n_jobs=-1, random_state=42, subsample_freq=1, max_bin=500, n_estimators = 768, max_depth = 18, learning_rate = 0.02391, num_leaves = 68, min_child_samples = 6, subsample = 0.959787, colsample_bytree = 0.6179 ).fit( Full_X_train, Full_Y_train )

# saving trained model
pickle.dump( model_lgb_full, open( '/Users/mathe/Repos_ComunidadeDS/mobility_cars_hackday/mobility_cars_lgb.pkl', 'wb' ) )