# Main Notebook - Modelo LightGBM - Rossmann (Kaggle)
Disclaimer: los resultados que arroja esta notebook al correrla en Anaconda son distintos (e inferiores) a los mostrados en Kaggle, porque aquellas corridas se realizaron en Google Colab.

## Import de los módulos necesarios

In [4]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Create a TensorFlow (compat.v1) interactive session before any other work is done.
# allow_growth is switched on so that, presumably, GPU memory is claimed on demand
# instead of being reserved up front — TODO confirm against the TensorFlow docs.
config = ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): `session` is not referenced again in the visible cells (the model trained
# below is LightGBM) — this setup may be a leftover; verify before removing it.
session = InteractiveSession(config=config)

import pandas as pd
import numpy as np
import datetime
from pandas_summary import DataFrameSummary



## Load data procesada utilizando las notebooks entregadas

In [9]:
# Pre-processed feather files: `df` holds the labelled rows (it is split into
# train/validation later) and `df_test` holds the rows to predict for the submission.
train_path = 'train_normalized_data.fth'
test_path = 'test_normalized_data.fth'

df = pd.read_feather(train_path)
df_test = pd.read_feather(test_path)

In [10]:
# Columns handed to LightGBM as categorical features.
# The order matters: it fixes the column order of the feature matrices built later.
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
    'CompetitionMonthsOpen', 'Promo2Weeks',
    'StoreType', 'Assortment', 'PromoInterval',
    'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State',
    'Week', 'Events',
    'Promo_fw', 'Promo_bw',
    'StateHoliday_bool_fw', 'StateHoliday_bool_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
]

# Columns treated as continuous (numeric) features.
contin_vars = [
    'CompetitionDistance',
    'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Precipitationmm',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
    'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h',
    'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday_bool', 'BeforeStateHoliday_bool',
    'Promo', 'SchoolHoliday', 'StateHoliday_bool',
]

In [11]:
# Time-based hold-out: rows dated before the cut-off go to training, rows on or after it
# go to validation. Then select the feature columns for each data set.
val_start = datetime.datetime(2015, 7, 1)
df_train = df.loc[df['Date'] < val_start]
df_val = df.loc[df['Date'] >= val_start]

n_train, n_val = len(df_train), len(df_val)
# NOTE(review): the value printed after "porcentaje" is the TRAINING share of the rows
# (a fraction in [0, 1]), not the validation share — the label is easy to misread.
print(f'Cantidad en val: {n_val}, porcentaje: {n_train/(n_train + n_val)}')

y_out_columns = ['Sales']
feature_columns = cat_vars + contin_vars
X_train = df_train[feature_columns]
X_val = df_val[feature_columns]
X_test = df_test[feature_columns]

Cantidad en val: 30188, porcentaje: 0.9642465458145908


In [12]:
# Sanity check: display the (rows, columns) shape of the training and validation feature frames.
X_train.shape, X_val.shape

((814150, 40), (30188, 40))

In [13]:
# Scale the regression target. `log_output` selects between a log transform and a
# z-score standardization; the prediction cell has to invert the same transform.
log_output = True

if log_output:
    # Log scale: log(Sales) divided by the largest log value found in `df`.
    # The per-column maximum is taken with DataFrame.max(axis=0) instead of
    # np.max(DataFrame): on pandas >= 2.0 the latter reduces over both axes and
    # returns a scalar, which has no `.values` attribute.
    # NOTE(review): the maximum is computed over `df` (training + validation rows) —
    # confirm that using validation rows for the scaling constant is intended.
    max_log_y = np.log(df[y_out_columns]).max(axis=0).values
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    # Standardization with the mean and standard deviation of the training rows only.
    y_mean = df_train[y_out_columns].mean().values
    y_std = df_train[y_out_columns].std().values
    y_train = (df_train[y_out_columns].values - y_mean)/y_std
    y_val = (df_val[y_out_columns].values - y_mean)/y_std

## Data from hyper opt - Iterations

In [15]:
# Load the per-iteration log of the hyper-parameter search and display it
# ordered from the lowest to the highest loss.
hyperopt_log_path = '09-lightGBM-hyp-search_iterations.csv'
iterations_from_hyperopt = pd.read_csv(hyperopt_log_path)
iterations_from_hyperopt.sort_values(by='loss')

Unnamed: 0.1,Unnamed: 0,loss,iteration,max_depth,learning_rate,reg_lambda,num_leaves,n_estimators
34,34,0.112966,34,460.0,0.050612,17.0,55.0,1150.0
78,78,0.113528,78,580.0,0.102991,21.0,75.0,1150.0
60,60,0.113648,60,560.0,0.055221,19.0,65.0,1200.0
39,39,0.113735,39,440.0,0.050151,14.0,50.0,1150.0
44,44,0.113890,44,500.0,0.050844,20.0,55.0,1150.0
...,...,...,...,...,...,...,...,...
84,84,0.123025,84,540.0,0.023014,25.0,80.0,1050.0
0,0,0.124011,0,460.0,0.021323,35.0,55.0,1100.0
94,94,0.124650,94,580.0,0.020936,8.0,70.0,1100.0
33,33,0.124939,33,420.0,0.018817,9.0,50.0,1100.0


In [16]:
# Show the best hyper-parameter set found by the search: the row with the lowest loss.
# The row label is looked up with idxmin() instead of being hard-coded, so the cell
# stays correct if the search log is regenerated.
iterations_from_hyperopt.loc[iterations_from_hyperopt['loss'].idxmin(), :]

Unnamed: 0         34.000000
loss                0.112966
iteration          34.000000
max_depth         460.000000
learning_rate       0.050612
reg_lambda         17.000000
num_leaves         55.000000
n_estimators     1150.000000
Name: 34, dtype: float64

## Model

In [17]:
from lightgbm import LGBMRegressor

In [18]:
# The values below are the final ones, which gave the best private score in the competition.
# They are a manual modification of the parameters returned by the optimizer.
# The changes were made to regularize: raise reg_lambda, lower n_estimators, raise the
# learning rate, etc.
# (An earlier `min_child_samples = 5` assignment was dead code — it was overwritten
# before the model was built — so only the effective value is kept.)
n_estimators = 1150
learning_rate = 0.050612
max_depth = 460
num_leaves = 55
min_child_samples = 200  # other value noted by the author: 194
reg_lambda = 20          # other values noted by the author: 10, 0.0
reg_alpha = 1.000000
colsample_bytree = 0.519264
min_child_weight = 0.000000

model = LGBMRegressor(
    min_child_samples=min_child_samples,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    num_leaves=num_leaves,
    reg_lambda=reg_lambda,
    reg_alpha=reg_alpha,
    colsample_bytree=colsample_bytree,
    min_child_weight=min_child_weight,
    n_jobs=8,
)

In [19]:
# Keyword arguments forwarded to `model.fit`.
# NOTE(review): `early_stopping_rounds` and `verbose` are accepted by the LightGBM
# version that produced the recorded output; newer releases are believed to expect
# callbacks instead — confirm against the installed version before upgrading.
fit_params = dict(
    early_stopping_rounds=100,              # give up after 100 rounds without improvement
    eval_metric='l2',
    eval_set=[(X_val, y_val.reshape(-1))],  # validation target flattened to 1-D
    eval_names=['valid'],
    verbose=100,                            # report the metric every 100 rounds
    feature_name='auto',                    # that's actually the default
    categorical_feature=cat_vars,
)

In [21]:
# Train the regressor on the training split; the target is flattened to 1-D first.
# The validation set and the remaining options come from `fit_params`.
train_target = y_train.ravel()
model.fit(X_train, train_target, **fit_params)

New categorical_feature is ['Assortment', 'CompetitionMonthsOpen', 'CompetitionOpenSinceYear', 'Day', 'DayOfWeek', 'Events', 'Month', 'Promo2SinceYear', 'Promo2Weeks', 'PromoInterval', 'Promo_bw', 'Promo_fw', 'SchoolHoliday_bw', 'SchoolHoliday_fw', 'State', 'StateHoliday', 'StateHoliday_bool_bw', 'StateHoliday_bool_fw', 'Store', 'StoreType', 'Week', 'Year']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid's l2: 0.000200194	valid's l2: 0.000200194
[200]	valid's l2: 0.000148531	valid's l2: 0.000148531
[300]	valid's l2: 0.000131198	valid's l2: 0.000131198
[400]	valid's l2: 0.000121904	valid's l2: 0.000121904
[500]	valid's l2: 0.000118979	valid's l2: 0.000118979
[600]	valid's l2: 0.000115269	valid's l2: 0.000115269
[700]	valid's l2: 0.000112147	valid's l2: 0.000112147
[800]	valid's l2: 0.000109999	valid's l2: 0.000109999
[900]	valid's l2: 0.000108506	valid's l2: 0.000108506
[1000]	valid's l2: 0.000107384	valid's l2: 0.000107384
[1100]	valid's l2: 0.000106561	valid's l2: 0.000106561
Did not meet early stopping. Best iteration is:
[1148]	valid's l2: 0.000106303	valid's l2: 0.000106303


LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.519264, importance_type='split',
              learning_rate=0.050612, max_depth=460, min_child_samples=200,
              min_child_weight=0.0, min_split_gain=0.0, n_estimators=1150,
              n_jobs=8, num_leaves=55, objective=None, random_state=None,
              reg_alpha=1.0, reg_lambda=20, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

# Análisis de los resultados

## Métrica

$$
\textrm{RMSPE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$

In [22]:
# Predictions on the training split are skipped to keep the notebook fast.
# The model outputs scaled targets, so the scaling chosen by `log_output`
# is inverted here to get back to sales units.
scaled_val = model.predict(X_val, verbose=1)
scaled_test = model.predict(X_test, verbose=1)

if log_output:
    # Inverse of log(y) / max_log_y
    y_pred = np.exp(scaled_val*max_log_y)
    y_pred_test = np.exp(scaled_test*max_log_y)
else:
    # Inverse of (y - y_mean) / y_std
    y_pred = scaled_val*y_std + y_mean
    y_pred_test = scaled_test*y_std + y_mean

In [23]:
# Validation score: root of the mean squared relative error between the
# predicted and the actual sales of the validation split.
actual_sales = df_val['Sales'].values
relative_error = (actual_sales - y_pred)/actual_sales
val_score = np.sqrt((relative_error**2).sum()/len(y_pred))
print(val_score)

0.1130517279702061


# Submit a la competencia

In [24]:
# Build the submission from the sample file, replacing its Sales column with the
# test-set predictions.
sample_csv = pd.read_csv('sample_submission.csv')
sample_csv['Sales'] = y_pred_test

# The output file name records the main settings and the validation score of this run.
sample_csv.to_csv(f'submision_lightgbm_{log_output}-{min_child_samples}-{n_estimators}-{learning_rate}-SCORE-{val_score}.csv', index=False)

# Preview of the submission. It has to be the last expression of the cell to be
# rendered; in the middle of the cell the result was silently discarded.
sample_csv.head()