# Imports

In [2]:
import pandas as pd
!pip install pandas_summary
from pandas_summary import DataFrameSummary
import numpy as np
import datetime
!pip install isoweek
from isoweek import Week
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
!pip install sklearn_pandas
from sklearn_pandas import DataFrameMapper
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate, Dense, BatchNormalization, Activation, LeakyReLU, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras import optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical

Collecting pandas_summary
  Downloading pandas_summary-0.0.7-py2.py3-none-any.whl (5.2 kB)
Installing collected packages: pandas-summary
Successfully installed pandas-summary-0.0.7
Collecting isoweek
  Downloading isoweek-1.3.3-py2.py3-none-any.whl (7.1 kB)
Installing collected packages: isoweek
Successfully installed isoweek-1.3.3


# Funciones de preprocesamiento

In [3]:
def get_missing_columns(df):
    """Return the names of the columns of ``df`` that have missing data.

    A column is reported when the ``count`` row of
    ``df.describe(include='all')`` is lower than the number of rows.
    """
    counts_per_column = df.describe(include='all').loc['count']
    has_gaps = counts_per_column < len(df)
    return list(df.columns[has_gaps])

In [4]:
def join_df(left, right, left_on, right_on=None, join_test = False, test = pd.DataFrame()):
    """Left-join ``right`` onto ``left`` (and optionally onto ``test``).

    Args:
        left: frame that keeps all of its rows.
        right: frame whose columns are added.
        left_on: key column(s) in ``left`` (and in ``test``).
        right_on: key column(s) in ``right``; defaults to ``left_on``.
        join_test: when it compares equal to ``True``, ``test`` is merged
            with ``right`` in the same way.
        test: second left-hand frame, only used when ``join_test`` is set.

    Returns:
        The merged ``left`` frame, or the tuple
        ``(merged_left, merged_test)`` when ``join_test`` is set.
        Overlapping column names coming from ``right`` get a ``_y`` suffix.
    """
    right_key = left_on if right_on is None else right_on

    def _left_join(frame):
        # Same merge settings for every left-hand frame.
        return frame.merge(right, how='left', left_on=left_on,
                           right_on=right_key, suffixes=("", "_y"))

    merged_train = _left_join(left)
    if not (join_test == True):
        return merged_train
    return merged_train, _left_join(test)

In [5]:
def add_datepart(df):
    """Add ``Year``, ``Month``, ``Week`` and ``Day`` columns derived from ``df.Date``.

    The frame is modified in place: ``Date`` is converted with
    ``pd.to_datetime`` and the four new columns are written to ``df``.
    Nothing is returned.

    ``Week`` is the ISO week number. It is read through
    ``dt.isocalendar()`` because the ``dt.week`` accessor was removed in
    pandas 2.0.
    """
    df["Date"] = pd.to_datetime(df["Date"])
    dates = df["Date"].dt
    df["Year"] = dates.year
    df["Month"] = dates.month
    iso_week = dates.isocalendar().week
    # isocalendar() yields a nullable UInt32 column; cast to a plain numpy
    # dtype so that downstream ``.values`` calls give numeric arrays
    # (float when missing dates force NaN, int otherwise).
    df["Week"] = iso_week.astype("float64") if iso_week.isna().any() else iso_week.astype("int64")
    df["Day"] = dates.day
    

# Funciones de preprocesamiento de duraciones

In [6]:
class elapsed(object):
    """Stateful helper that measures the time since a flag was last set.

    Rows are expected to be fed one by one through :meth:`get`. The state
    is reset whenever ``row.Store`` differs from the previously seen store.
    """

    def __init__(self, fld):
        # fld: name of the row field that is tested for truthiness.
        self.fld = fld
        # Date of the last row (for the current store) whose flag was set;
        # NaT until one is seen.
        self.last = pd.to_datetime(np.nan)
        self.last_store = 0

    def get(self, row):
        """Return ``row.Date`` minus the date of the last flagged row.

        The result is NaT while no flagged row has been seen for the
        current store.
        """
        store_changed = row.Store != self.last_store
        if store_changed:
            # New store: forget the previous store's history.
            self.last, self.last_store = pd.to_datetime(np.nan), row.Store
        flag_is_set = row[self.fld]
        if flag_is_set:
            self.last = row.Date
        delta = row.Date - self.last
        return delta

In [7]:
def add_elapsed(df, fld, prefix):
    """Write a ``prefix + fld`` column with the time elapsed since ``fld`` was set.

    Rows are visited in their current order through ``df.apply`` and fed
    to an ``elapsed`` tracker; ``df`` is modified in place.
    """
    tracker = elapsed(fld)
    elapsed_per_row = df.apply(tracker.get, axis=1)
    df[prefix+fld] = elapsed_per_row

# Funciones Baselines

In [8]:
def get_metric(sales, sales_):
    """Root mean squared percentage error between ``sales`` and ``sales_``.

    ``sales`` holds the true values and is used as the denominator of the
    relative error; the squared errors are summed and divided by
    ``len(sales)``.

    NOTE(review): a later cell of this file defines another ``get_metric``
    whose first argument is a DataFrame with a ``Sales`` column.
    """
    relative_error = (sales - sales_) / sales
    mean_squared_error = (relative_error ** 2).sum() / len(sales)
    return np.sqrt(mean_squared_error)

In [9]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error built from Keras backend ops.

    The error is relative to ``y_true``, so entries where ``y_true`` is 0
    divide by zero.
    """
    pct_error = (y_true - y_pred) / y_true
    mean_sq_pct_error = K.mean(K.square(pct_error))
    return K.sqrt(mean_sq_pct_error)

In [10]:
def get_mean_by_column(column, sales_str):
    """Baseline: predict ``sales_str`` with the mean of its ``column`` group.

    Group means are computed on the module-level ``df_train`` using only
    rows with ``sales_str`` > 0. The metric obtained on ``df_train`` and on
    the module-level ``df_val`` is printed.

    Returns:
        ``(group_means_dict, group_mean_list)``: the mapping from group
        value to mean, and the same means as a list in group order.
    """
    group_means_dict = {}
    for col_value, group_df in df_train.groupby(column):
        positive_sales = group_df.loc[group_df[sales_str] > 0, sales_str]
        group_means_dict[col_value] = positive_sales.mean()
    # Group keys are unique, so the dict values are in the order the
    # groups were visited.
    group_mean_list = list(group_means_dict.values())

    train_pred = df_train[column].apply(group_means_dict.get)
    print('Train:', get_metric(df_train[sales_str], train_pred))
    val_pred = df_val[column].apply(group_means_dict.get)
    print('Val:', get_metric(df_val[sales_str], val_pred))
    return group_means_dict, group_mean_list

In [11]:
def get_keras_LR(X_columns, hidden_units=1):
    """Build and compile a linear-regression style Keras model.

    One ``Input`` is created per entry of ``X_columns``; its width is taken
    from the module-level ``X_train[i]``. With ``hidden_units == 1`` the
    model is a single linear Dense unit. With more units, a ReLU hidden
    layer is followed by a 1-unit output layer.

    Args:
        X_columns: names of the input columns, in the order of ``X_train``.
        hidden_units: number of units of the first Dense layer.

    Returns:
        The model compiled with Adam (learning rate 1e-4), MSE loss and the
        ``rmspe`` / ``mse`` metrics.
    """
    activation = 'relu' if hidden_units > 1 else 'linear'
    inputs = [
        Input(shape=(X_train[i].shape[1],), name=f"{col}_input")
        for i, col in enumerate(X_columns)
    ]
    # The original only concatenated when there was more than one input,
    # so a single input is fed to the Dense layer directly.
    features = Concatenate()(inputs) if len(X_columns) > 1 else inputs[0]
    dense_out = Dense(hidden_units, name='Dense', activation=activation)(features)
    if hidden_units > 1:
        dense_out = Dense(1, name='Dense_out')(dense_out)
    model = Model(inputs, dense_out)
    # `learning_rate` replaces the deprecated `lr` keyword and matches the
    # other compile calls in this file.
    model.compile(optimizers.Adam(learning_rate=0.0001), loss='mse', metrics=[rmspe, 'mse'])
    return model


In [12]:
def get_embedings_NN(X_columns, hidden_units = 20, activation = 'relu'):
    """Build and compile a one-hidden-layer network with optional embeddings.

    Every column gets a 1-wide ``Input``. Columns present in the
    module-level ``embed_outs_dict`` go through an ``Embedding`` (input
    size = number of unique values in the module-level ``X_train[i]``,
    output size = ``embed_outs_dict[col]``) followed by ``Flatten``. The
    other columns are used as they are.

    Args:
        X_columns: names of the input columns, in the order of ``X_train``.
        hidden_units: units of the hidden Dense layer.
        activation: activation of the hidden Dense layer.

    Returns:
        The model compiled with Adam (learning rate 1e-4), MSE loss and the
        ``rmspe`` / ``mse`` metrics.
    """
    embed_outs = []
    inputs = []
    for i, col in enumerate(X_columns):
        inp = Input(shape=(1,), name=f"{col}_input")
        inputs.append(inp)
        if col in embed_outs_dict:
            embed_out = Embedding(len(np.unique(X_train[i])), embed_outs_dict[col], name=f"{col}_embedding", mask_zero=False)(inp)
            embed_outs.append(Flatten(name=f"{col}_flatten")(embed_out))
        else:
            embed_outs.append(inp)

    # With a single column use its tensor directly. The original read the
    # loop variable `out`, which is undefined when that column has no
    # embedding.
    features = Concatenate()(embed_outs) if len(X_columns) > 1 else embed_outs[0]
    dense_out = Dense(hidden_units, activation=activation)(features)
    out = Dense(1)(dense_out)
    model = Model(inputs, out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(optimizers.Adam(learning_rate=0.0001), loss='mse', metrics=[rmspe, 'mse'])
    return model

In [13]:
def plot_embed(layer_name, cat_names):
    """Scatter-plot the first two embedding dimensions of a layer.

    The weights are read from layer ``layer_name`` of the module-level
    ``model``; both axes are negated and each point ``i`` is annotated with
    ``cat_names[i]``. The weight matrix shape is printed.

    NOTE(review): a later cell of this file defines another ``plot_embed``
    that takes the model as its first argument.
    """
    weights = model.get_layer(layer_name).get_weights()[0]
    print(weights.shape)
    xs = -weights[:, 0]
    ys = -weights[:, 1]
    plt.figure(figsize=(8,8))
    plt.scatter(xs, ys)
    for idx, label in enumerate(cat_names):
        plt.annotate(label, (xs[idx], ys[idx]), xytext = (-5, 8), textcoords = 'offset points')

# Función Embedding

In [14]:
# Builds the Embedding branch of every categorical variable.
def get_cat_vars_model(cat_vars, uniques, cat_var_dict):
    """Create one ``Input -> Embedding -> Flatten`` branch per categorical variable.

    Args:
        cat_vars: names of the categorical variables.
        uniques: mapping whose ``uniques[name][0]`` is used as the
            Embedding input size.
        cat_var_dict: mapping from variable name to embedding output size.

    Returns:
        ``(inputs, flattened_embeddings)``, two lists aligned with
        ``cat_vars``.
    """
    cat_var_inputs = []
    cat_vars_embed_outs = []
    for name in cat_vars:
        # Each variable enters the network through its own 1-wide input.
        single_input = Input(shape=(1,), name=f"{name}_input")
        vocab_size = uniques[name][0]
        embed_dim = cat_var_dict[name]
        embedded = Embedding(vocab_size, embed_dim, name=f'{name}_Embed')(single_input)
        # Flatten the embedding output so the branches can be concatenated later.
        flattened = Flatten(name=f"{name}_flat")(embedded)
        cat_var_inputs.append(single_input)
        cat_vars_embed_outs.append(flattened)
    return cat_var_inputs, cat_vars_embed_outs

In [15]:
# Builds the Embedding branch of every categorical variable (one extra index).
def get_cat_vars_model2(cat_vars, uniques, cat_var_dict):
    """Create one ``Input -> Embedding -> Flatten`` branch per categorical variable.

    Same construction as ``get_cat_vars_model`` except that the Embedding
    input size is ``uniques[name][0] + 1``.

    Args:
        cat_vars: names of the categorical variables.
        uniques: mapping whose ``uniques[name][0]`` plus one is used as the
            Embedding input size.
        cat_var_dict: mapping from variable name to embedding output size.

    Returns:
        ``(inputs, flattened_embeddings)``, two lists aligned with
        ``cat_vars``.
    """
    cat_var_inputs = []
    cat_vars_embed_outs = []
    for name in cat_vars:
        # Each variable enters the network through its own 1-wide input.
        single_input = Input(shape=(1,), name=f"{name}_input")
        vocab_size = uniques[name][0]+1
        embed_dim = cat_var_dict[name]
        embedded = Embedding(vocab_size, embed_dim, name=f'{name}_Embed')(single_input)
        # Flatten the embedding output so the branches can be concatenated later.
        flattened = Flatten(name=f"{name}_flat")(embedded)
        cat_var_inputs.append(single_input)
        cat_vars_embed_outs.append(flattened)
    return cat_var_inputs, cat_vars_embed_outs

# Función creación capas de entrada variables continuas

In [16]:
def get_cont_vars_input(contin_vars, dense_layer=False):
    """Create the input tensors of the continuous variables.

    Args:
        contin_vars: names of the continuous variables.
        dense_layer: when true, each input goes through its own 1-unit
            linear Dense layer; otherwise the input tensor itself is used
            as the output.

    Returns:
        ``(inputs, outputs)``, two lists aligned with ``contin_vars``.
    """
    cont_vars_inputs = []
    cont_vars_outputs = []
    for cont_var in contin_vars:
        # Each variable enters the network through its own 1-wide input.
        cont_var_in = Input(shape=(1,), name=f"{cont_var}_input")
        cont_vars_inputs.append(cont_var_in)
        if dense_layer:
            # The Dense layer needs its own name: the original reused
            # "<var>_input", which is already the Input layer's name.
            cont_var_out = Dense(1, name=f"{cont_var}_dense", activation = 'linear')(cont_var_in)
            cont_vars_outputs.append(cont_var_out)
        else:
            cont_vars_outputs.append(cont_var_in)
    return cont_vars_inputs, cont_vars_outputs

# Creación de modelo

In [17]:
def MLP_layers(embeddings, contin_inputs, l2_lambda, kernel_in, cant_capas, cant_neuronas = (1,), activation_func = ("linear",), alpha = 0.3):
    """Stack the hidden Dense layers on top of all concatenated inputs.

    Args:
        embeddings: list of flattened embedding tensors.
        contin_inputs: list of continuous-variable tensors.
        l2_lambda: L2 kernel regularisation factor of every Dense layer.
        kernel_in: kernel initializer of every Dense layer.
        cant_capas: number of hidden layers. At least one layer is always
            built, as in the original implementation.
        cant_neuronas: units per layer, indexed by layer.
        activation_func: activation per layer, indexed by layer. The
            special value ``"LeakyReLU"`` inserts a ``LeakyReLU`` layer.
        alpha: LeakyReLU slope. Either a single number shared by all
            layers or a sequence indexed by layer.

    Returns:
        The output tensor of the last hidden layer.
    """
    def _alpha_for(layer_idx):
        # The default alpha is a scalar, which the original code tried to
        # index; accept both scalar and per-layer sequences.
        return alpha if np.ndim(alpha) == 0 else alpha[layer_idx]

    # All inputs are merged into a single tensor.
    x = Concatenate(name='All_Concatenate')(embeddings + contin_inputs)

    for layer_idx in range(max(cant_capas, 1)):
        x = Dense(cant_neuronas[layer_idx], kernel_initializer=kernel_in, kernel_regularizer=l2(l2_lambda))(x)
        if activation_func[layer_idx] == "LeakyReLU":
            x = LeakyReLU(alpha=_alpha_for(layer_idx))(x)
        else:
            x = Activation(activation_func[layer_idx])(x)

    return x

In [18]:
def crear_MLP(add_customers, cat_var_inputs, cont_vars_inputs, layers, output_activation):
    """Attach the output head(s) to ``layers`` and return the Keras model.

    A 1-unit ``Sales`` output is always created. When ``add_customers`` is
    true, a second 1-unit ``Customers`` output is added. The model inputs
    are ``cat_var_inputs + cont_vars_inputs``.
    """
    outputs = [Dense(1, name='Sales', activation=output_activation)(layers)]
    if add_customers:
        customers_head = Dense(1, name='Customers', activation=output_activation)(layers)
        outputs.append(customers_head)
    all_inputs = cat_var_inputs + cont_vars_inputs
    return Model(all_inputs, outputs)

# Funciones de métricas

In [19]:
def rmspe(y_true, y_pred):
    """Keras metric: root mean squared percentage error.

    The error is measured relative to ``y_true``, so entries where
    ``y_true`` is 0 divide by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    squared = K.square(relative_error)
    return K.sqrt(K.mean(squared))

In [20]:
def get_metric(df, sales_):
    """Root mean squared percentage error of ``sales_`` against ``df['Sales']``.

    ``df['Sales']`` holds the true values and is the denominator of the
    relative error.
    """
    actual = df['Sales']
    relative_error = (actual - sales_) / actual
    return np.sqrt((relative_error ** 2).mean())

# Entrenamiento y obtención del modelo

In [21]:
def obtener_x_y(df_train,df_val,df_test,all_vars,add_customers=True,log_output=False):
    """Build the model inputs and the scaled targets.

    Args:
        df_train, df_val, df_test: frames containing the ``all_vars``
            columns; train and val must also contain ``Sales`` (and
            ``Customers`` when ``add_customers`` is true).
        all_vars: feature column names; each becomes one model input.
        add_customers: include ``Customers`` as a second target.
        log_output: scale targets as ``log(y) / max(log(y))``, with the
            maximum taken per column over train and val together. When
            false, targets are divided by the per-column maximum of the
            training set. Note that ``log`` of a 0 target is ``-inf``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_norm)``. The ``X_*``
        values are lists with one ``(n, 1)`` array per variable and the
        ``y_*`` values are lists with one ``(n, 1)`` array per target.
        ``y_norm`` is ``[max_log_y]`` when ``log_output`` is true and
        ``[y_mean, y_std, y_max]`` (train statistics) otherwise.
    """
    def _split_features(frame):
        # One (n, 1) array per variable, in the order of all_vars.
        return np.hsplit(frame[all_vars].values, len(all_vars))

    X_train = _split_features(df_train)
    X_val = _split_features(df_val)
    X_test = _split_features(df_test)

    y_out_columns = ['Sales', 'Customers'] if add_customers else ['Sales']

    if log_output:
        # Logarithmic scale.
        df = pd.concat([df_train, df_val], axis=0)
        # Column-wise maximum. `np.max(DataFrame)` returns a scalar on
        # pandas >= 2.0, so use the DataFrame method explicitly.
        max_log_y = np.log(df[y_out_columns]).max().values
        y_train = np.log(df_train[y_out_columns].values)/max_log_y
        y_val = np.log(df_val[y_out_columns].values)/max_log_y
        y_norm = [max_log_y]
    else:
        # Max scaling; mean and std are only returned, not applied.
        y_mean = df_train[y_out_columns].mean().values
        y_std = df_train[y_out_columns].std().values
        y_max = df_train[y_out_columns].max().values
        y_train = df_train[y_out_columns].values/y_max
        y_val = df_val[y_out_columns].values/y_max
        y_norm = [y_mean, y_std, y_max]

    y_train = np.hsplit(y_train, y_train.shape[1])
    y_val = np.hsplit(y_val, y_val.shape[1])

    return X_train,X_val,X_test,y_train,y_val,y_norm

In [22]:
def entrenar_MLP(X_train,X_val,y_train,y_val,all_vars,model,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',model_chosen='bestmodel.hdf5',epochs = 20,batch_size = 256,verbose=2):
    """Compile and fit ``model``, checkpointing the best epoch to ``model_chosen``.

    The checkpoint monitors ``val_Sales_mse`` when ``add_customers`` is
    true and ``val_loss`` otherwise, and only the best model is saved.
    ``all_vars`` and ``log_output`` are accepted but not used here.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        same model instance that was passed in.
    """
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, metrics=metrics, loss=loss)

    monitored = 'val_Sales_mse' if add_customers else 'val_loss'
    checkpoint = ModelCheckpoint(model_chosen, monitor=monitored, verbose=verbose, save_best_only=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[checkpoint],
        verbose=verbose,
    )

    return history,model

In [23]:
def entrenar_MLP2(X_train,X_val,y_train,y_val,all_vars,model,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',model_chosen='bestmodel.hdf5',epochs = 20,batch_size = 256,verbose=2):
    """Variant of ``entrenar_MLP`` that prints numbered progress markers.

    The markers 1-5 are printed around each step (3 only when
    ``add_customers`` is true). The checkpoint monitors ``val_Sales_mse``
    when ``add_customers`` is true and ``val_loss`` otherwise.
    ``all_vars`` and ``log_output`` are accepted but not used here.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        same model instance that was passed in.
    """
    print(1)
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, metrics=metrics, loss=loss)
    print(2)
    if add_customers:
        print(3)
        monitored = 'val_Sales_mse'
    else:
        monitored = 'val_loss'
    checkpoint = ModelCheckpoint(model_chosen, monitor=monitored, verbose=verbose, save_best_only=True)

    print(4)
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[checkpoint],
        verbose=verbose,
    )
    print(5)
    return history,model

# Obtención de métricas

In [24]:
def metricas_MLP(X_val, y_val,model,lr=0.001,metrics=['mse', rmspe],loss='mse',model_chosen='bestmodel.hdf5'):
    """Evaluate the weights stored in ``model_chosen`` on a validation set.

    The model is recompiled with the given optimizer settings, metrics and
    loss, the weights are loaded from ``model_chosen``, and the result of
    ``model.evaluate(..., return_dict=True)`` is returned.
    """
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, metrics=metrics, loss=loss)
    model.load_weights(model_chosen)
    return model.evaluate(X_val, y_val,return_dict = True)

In [25]:
def crossval_score_MLP(df,df_test,all_vars,model,k=5,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',epochs = 20,batch_size = 256,verbose=2):
    """Forward-chaining monthly validation over 2015.

    For every month ``i`` in ``range(8-k, 8)``, rows dated before the
    first day of month ``i`` are the training set and rows dated within
    month ``i`` are the validation set. The model is trained with
    ``entrenar_MLP``, the best checkpoint is evaluated with
    ``metricas_MLP`` (called with its default lr / metrics / loss), and
    the Sales RMSPE of the fold is collected.

    The same ``model`` object is used for every fold; nothing in this
    function re-initialises it between folds.

    Args:
        df: frame with a ``Date`` column plus features and targets.
        df_test: test frame, only passed through to ``obtener_x_y``.
        all_vars: feature column names.
        k: number of folds (validation months ending in July); at most 7.
        Remaining arguments are forwarded to ``obtener_x_y`` /
        ``entrenar_MLP``.

    Returns:
        List with the validation Sales RMSPE of each fold.

    Raises:
        ValueError: if ``k`` is greater than 7 (the first validation month
            would fall before January).
    """
    if k > 7:
        raise ValueError(f"k must be at most 7 (validation months are within Jan-Jul 2015), got {k}")

    # NOTE(review): assumes Keras drops the output-name prefix from metric
    # names for single-output models - confirm with the installed version.
    rmspe_key = 'Sales_rmspe' if add_customers else 'rmspe'

    metrics_cross = []

    for i in range(8-k,8):
        fold_start = datetime.datetime(2015, i, 1)
        fold_end = datetime.datetime(2015, i+1, 1)
        df_cross_train = df[df.Date < fold_start]
        # Both conditions are evaluated on `df` itself, so the mask always
        # matches the frame being filtered.
        df_cross_val = df[(df.Date >= fold_start) & (df.Date < fold_end)]

        print(f'Fechas en df_train:, {min(df_cross_train.Date)}, - {max(df_cross_train.Date)}')
        print(f'Fechas en df_val:, {min(df_cross_val.Date)}, - {max(df_cross_val.Date)}')

        X_cross_train,X_cross_val,X_cross_test,y_cross_train,y_cross_val,y_norm = obtener_x_y(df_cross_train,df_cross_val,df_test,all_vars,add_customers=add_customers,log_output=log_output)

        model_chosen=f'modelos generados/bestmodel_add_customers_{add_customers}_log_output_{log_output}_lr_{lr}_batch_size_{batch_size}_crossval_{i}.hdf5'

        history,model_cross = entrenar_MLP(X_cross_train,X_cross_val,y_cross_train,y_cross_val,all_vars,model=model,add_customers=add_customers,log_output=log_output,lr=lr,metrics=metrics,loss=loss,model_chosen=model_chosen,epochs = epochs,batch_size = batch_size,verbose=verbose)

        metric_cross = metricas_MLP(X_cross_val,y_cross_val,model_cross,model_chosen=model_chosen)
        metrics_cross.append(metric_cross[rmspe_key])

    return metrics_cross

In [26]:
def crossval_score_MLP_nuestro(df,df_test,all_vars,model,k=5,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',epochs = 20,batch_size = 256,verbose=2):
    """Forward-chaining monthly validation over 2015 ("nuestro" checkpoints).

    Same procedure as ``crossval_score_MLP``; the only difference is that
    checkpoints are written under the ``bestmodel_nuestro_...`` file name.

    For every month ``i`` in ``range(8-k, 8)``, rows dated before the
    first day of month ``i`` are the training set and rows dated within
    month ``i`` are the validation set. The same ``model`` object is used
    for every fold; nothing in this function re-initialises it between
    folds.

    Args:
        df: frame with a ``Date`` column plus features and targets.
        df_test: test frame, only passed through to ``obtener_x_y``.
        all_vars: feature column names.
        k: number of folds (validation months ending in July); at most 7.
        Remaining arguments are forwarded to ``obtener_x_y`` /
        ``entrenar_MLP``.

    Returns:
        List with the validation Sales RMSPE of each fold.

    Raises:
        ValueError: if ``k`` is greater than 7 (the first validation month
            would fall before January).
    """
    if k > 7:
        raise ValueError(f"k must be at most 7 (validation months are within Jan-Jul 2015), got {k}")

    # NOTE(review): assumes Keras drops the output-name prefix from metric
    # names for single-output models - confirm with the installed version.
    rmspe_key = 'Sales_rmspe' if add_customers else 'rmspe'

    metrics_cross = []

    for i in range(8-k,8):
        fold_start = datetime.datetime(2015, i, 1)
        fold_end = datetime.datetime(2015, i+1, 1)
        df_cross_train = df[df.Date < fold_start]
        # Both conditions are evaluated on `df` itself, so the mask always
        # matches the frame being filtered.
        df_cross_val = df[(df.Date >= fold_start) & (df.Date < fold_end)]

        print(f'Fechas en df_train:, {min(df_cross_train.Date)}, - {max(df_cross_train.Date)}')
        print(f'Fechas en df_val:, {min(df_cross_val.Date)}, - {max(df_cross_val.Date)}')

        X_cross_train,X_cross_val,X_cross_test,y_cross_train,y_cross_val,y_norm = obtener_x_y(df_cross_train,df_cross_val,df_test,all_vars,add_customers=add_customers,log_output=log_output)

        model_chosen=f'modelos generados/bestmodel_nuestro_add_customers_{add_customers}_log_output_{log_output}_lr_{lr}_batch_size_{batch_size}_crossval_{i}.hdf5'

        history,model_cross = entrenar_MLP(X_cross_train,X_cross_val,y_cross_train,y_cross_val,all_vars,model=model,add_customers=add_customers,log_output=log_output,lr=lr,metrics=metrics,loss=loss,model_chosen=model_chosen,epochs = epochs,batch_size = batch_size,verbose=verbose)

        metric_cross = metricas_MLP(X_cross_val,y_cross_val,model_cross,model_chosen=model_chosen)
        metrics_cross.append(metric_cross[rmspe_key])

    return metrics_cross

In [27]:
def prom_metrics(df,df_test,all_vars,model,k=5,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',epochs = 20,batch_size = 256):
    """Re-evaluate the saved cross-validation checkpoints and average them.

    For every month ``i`` in ``range(8-k, 8)`` the validation split of
    that month is rebuilt, the checkpoint
    ``modelos generados/bestmodel_..._crossval_{i}.hdf5`` is loaded through
    ``metricas_MLP`` (called with its default lr / metrics / loss) and the
    Sales RMSPE and Sales loss are collected. No training happens here.

    Returns:
        ``(metrics_cross, loss_cross, mean_metrics_cross, mean_loss_cross)``:
        the per-fold RMSPE and loss lists and their means. The means are
        NaN when no fold was evaluated.
    """
    # NOTE(review): assumes Keras reports un-prefixed 'rmspe' / 'loss' keys
    # for single-output models - confirm with the installed version.
    rmspe_key = 'Sales_rmspe' if add_customers else 'rmspe'
    loss_key = 'Sales_loss' if add_customers else 'loss'

    metrics_cross = []
    loss_cross = []
    for i in range(8-k,8):
        # Split the frame by date.
        fold_start = datetime.datetime(2015, i, 1)
        fold_end = datetime.datetime(2015, i+1, 1)
        df_cross_train = df[df.Date < fold_start]
        # Both conditions are evaluated on `df` itself, so the mask always
        # matches the frame being filtered.
        df_cross_val = df[(df.Date >= fold_start) & (df.Date < fold_end)]

        # Scale the targets and build X_val / y_val.
        X_train,X_val,X_test,y_train,y_val,y_norm = obtener_x_y(df_cross_train,df_cross_val,df_test,all_vars,add_customers=add_customers,log_output=log_output)
        models_chosen=f'modelos generados/bestmodel_add_customers_{add_customers}_log_output_{log_output}_lr_{lr}_batch_size_{batch_size}_crossval_{i}.hdf5'

        # RMSPE and loss of this fold's checkpoint.
        metric_cross = metricas_MLP(X_val,y_val,model,model_chosen=models_chosen)
        metrics_cross.append(metric_cross[rmspe_key])
        loss_cross.append(metric_cross[loss_key])

    # Average once after the loop, so the names are bound even when the
    # loop body never runs.
    mean_metrics_cross = np.mean(metrics_cross) if metrics_cross else float('nan')
    mean_loss_cross = np.mean(loss_cross) if loss_cross else float('nan')

    return metrics_cross, loss_cross, mean_metrics_cross, mean_loss_cross

In [28]:
def prom_metrics_nuestro(df,df_test,all_vars,model,k=5,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',epochs = 20,batch_size = 256):
    """Re-evaluate the saved "nuestro" cross-validation checkpoints and average them.

    Same procedure as ``prom_metrics``; the only difference is that the
    checkpoints are read from the ``bestmodel_nuestro_...`` file names.

    For every month ``i`` in ``range(8-k, 8)`` the validation split of
    that month is rebuilt, the checkpoint is loaded through
    ``metricas_MLP`` (called with its default lr / metrics / loss) and the
    Sales RMSPE and Sales loss are collected. No training happens here.

    Returns:
        ``(metrics_cross, loss_cross, mean_metrics_cross, mean_loss_cross)``:
        the per-fold RMSPE and loss lists and their means. The means are
        NaN when no fold was evaluated.
    """
    # NOTE(review): assumes Keras reports un-prefixed 'rmspe' / 'loss' keys
    # for single-output models - confirm with the installed version.
    rmspe_key = 'Sales_rmspe' if add_customers else 'rmspe'
    loss_key = 'Sales_loss' if add_customers else 'loss'

    metrics_cross = []
    loss_cross = []
    for i in range(8-k,8):
        # Split the frame by date.
        fold_start = datetime.datetime(2015, i, 1)
        fold_end = datetime.datetime(2015, i+1, 1)
        df_cross_train = df[df.Date < fold_start]
        # Both conditions are evaluated on `df` itself, so the mask always
        # matches the frame being filtered.
        df_cross_val = df[(df.Date >= fold_start) & (df.Date < fold_end)]

        # Scale the targets and build X_val / y_val.
        X_train,X_val,X_test,y_train,y_val,y_norm = obtener_x_y(df_cross_train,df_cross_val,df_test,all_vars,add_customers=add_customers,log_output=log_output)
        models_chosen=f'modelos generados/bestmodel_nuestro_add_customers_{add_customers}_log_output_{log_output}_lr_{lr}_batch_size_{batch_size}_crossval_{i}.hdf5'

        # RMSPE and loss of this fold's checkpoint.
        metric_cross = metricas_MLP(X_val,y_val,model,model_chosen=models_chosen)
        metrics_cross.append(metric_cross[rmspe_key])
        loss_cross.append(metric_cross[loss_key])

    # Average once after the loop, so the names are bound even when the
    # loop body never runs.
    mean_metrics_cross = np.mean(metrics_cross) if metrics_cross else float('nan')
    mean_loss_cross = np.mean(loss_cross) if loss_cross else float('nan')

    return metrics_cross, loss_cross, mean_metrics_cross, mean_loss_cross

# Obtención de predicciones

In [29]:
def predic_MLP(df_test,X_test,y_norm,model,add_customers=True,log_output=False,lr=0.001,metrics=['mse', rmspe],loss='mse',model_chosen='bestmodel.hdf5',verbose=2):
    """Predict Sales for the test set and undo the target scaling.

    The model is recompiled, the weights in ``model_chosen`` are loaded
    and the predictions are mapped back to the original scale using
    ``y_norm`` as produced by ``obtener_x_y``:

    * ``log_output=True``: ``y_norm = [max_log_y]`` and
      ``sales = exp(pred * max_log_y[0])``.
    * ``log_output=False``: ``y_norm = [y_mean, y_std, y_max]`` and
      ``sales = pred * y_max[0]``, because ``obtener_x_y`` scales the
      targets by dividing by ``y_max``.

    Predictions of rows with ``df_test['Open'] == 0`` are set to 0.
    ``verbose`` is accepted but ``model.predict`` is always called with
    ``verbose=1``, as in the original implementation.

    Returns:
        1-D array with one Sales prediction per row of ``df_test``.
    """
    model.compile(optimizer=Adam(learning_rate=lr), metrics=metrics, loss=loss)
    model.load_weights(model_chosen)

    raw_pred = model.predict(X_test, verbose=1)
    # With the Customers head the prediction is indexed per output and
    # Sales is the first one, as in the original indexing.
    sales_scaled = raw_pred[0][:, 0] if add_customers else raw_pred[:, 0]

    if log_output:
        max_log_y = y_norm[0]
        y_pred_test = np.exp(sales_scaled * max_log_y[0])
    else:
        # Invert the y / y_max scaling applied in obtener_x_y. The
        # original two-output branch used `* std + mean`, which does not
        # match that scaling.
        y_max = y_norm[2]
        y_pred_test = sales_scaled * y_max[0]

    # Closed stores sell nothing; use a positional boolean mask.
    closed_mask = (df_test['Open'] == 0).values
    y_pred_test[closed_mask] = 0

    return y_pred_test

# Funciones de ploteo

In [30]:
def plot_embed(model,layer_name, cat_names):
    """Scatter-plot the first two embedding dimensions of a layer of ``model``.

    The first weight array of layer ``layer_name`` is read, its shape is
    printed, and its first two columns are plotted with both axes negated.
    Point ``i`` is annotated with ``cat_names[i]``.
    """
    weights = model.get_layer(layer_name).get_weights()[0]
    print(weights.shape)
    xs = -weights[:, 0]
    ys = -weights[:, 1]
    plt.figure(figsize=(8,8))
    plt.scatter(xs, ys)
    for idx, label in enumerate(cat_names):
        plt.annotate(label, (xs[idx], ys[idx]), xytext = (-5, 8), textcoords = 'offset points')