# Model Training

### Document structure:
1.   Evaluation system settings
2.   Utility functions
3.   Input Data pre-processing
4.   Dictionaries generation
5.   Define Encoder-Decoder LSTM Architecture
6.   Evaluation metrics and functions
7.   Saving the trained models, Encoder and Decoder.




In [2]:
import re
import json
import string
import numpy as np
import pandas as pd
from string import digits
import matplotlib.pyplot as plt
from google.colab import drive
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model
from keras import optimizers

# Mount Google Drive so that everything under `home_dir` (dataset, dictionaries,
# saved models) is reachable from the Colab runtime.
drive.mount('/content/drive')

##########################
# CONFIGURATION SETTINGS #
# Version tag used as a filename prefix for the saved dictionaries and models.
config_number_str = "v7"
home_dir = "/content/drive/My Drive/Current Works/UBC Research Period/Training Folder/"

# Encoder-Decoder LSTM Architecture Configuration
units_number = 512          # hidden units of both the encoder and the decoder LSTM
encoder_dropout = 0.2
decoder_dropout = 0.2
embedding_size = 128        # output dimension of both Embedding layers
learning_rate = 0.001
epochs_number = 300
batch_size = 32
validation_split = 0.2      # fraction of the training data held out by model.fit

##########################

# Files loading
file_dir = home_dir + "Datasets/5_Fold_Cross_Validation_time_series/5_train_time_series.txt"
# A multi-character separator is interpreted by pandas as a regular expression,
# which only the Python parser engine supports. Requesting that engine
# explicitly avoids the ParserWarning raised when pandas has to fall back to it.
lines = pd.read_table(
    file_dir,
    sep="___",
    engine="python",
    names=['geo', 'year', 'title', 'uom', 'min_value', 'max_value', 'inp', 'out'],
)

Using TensorFlow backend.


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive




In [0]:
#####################
# UTILITY FUNCTIONS #
#####################

def sentences_pre_processing(lines):
    """Tokenize the punctuation of every caption and wrap it in boundary tokens.

    Each string in the ``out`` column is rewritten as follows:

    1. Quotes are removed and punctuation marks are replaced by word tokens
       (``COMMA``, ``COLON``, ...). A period closes the current sequence and
       opens a new one (``_SEQ_END SEQ_START_``).
    2. The caption is wrapped as
       ``CAP_START_ SEQ_START_ <caption> _SEQ_END _CAP_END``.
    3. Runs of whitespace are collapsed to single spaces.
    4. The empty ``SEQ_START_ _SEQ_END`` pair produced by a caption that ends
       with a period is removed.

    Args:
        lines: pandas DataFrame with a string column named ``out``.

    Returns:
        The same DataFrame object; its ``out`` column is modified in place.
    """
    # Defining the chars to be replaced and the relative substitute value.
    replacing_dictionary = {"'"  : '',
                            "\"" : '',
                            ','  : ' COMMA ',
                            ':'  : ' COLON ',
                            ";"  : ' SEMICOLON ',
                            '('  : ' S_R_BRACKET ',
                            ')'  : ' E_R_BRACKET ',
                            '.'  : ' _SEQ_END SEQ_START_ ',
                           }

    def _tokenize_caption(caption):
        # Replacing all the chars and tokens within the 'replacing_dictionary'.
        for char, token in replacing_dictionary.items():
            caption = caption.replace(char, token)

        # Adding tokens at the start and at the end of the caption.
        caption = 'CAP_START_ SEQ_START_ ' + caption + ' _SEQ_END _CAP_END'

        # The substitutes are padded with spaces, so a caption ending in '.'
        # yields "SEQ_START_  _SEQ_END" (two spaces) once the suffix is added.
        # Collapsing whitespace first lets the cleanup pattern below match;
        # downstream code splits on whitespace, so other tokens are unaffected.
        caption = ' '.join(caption.split())

        # Drop the empty trailing sequence opened by a final period.
        return caption.replace(' _SEQ_END SEQ_START_ _SEQ_END _CAP_END', ' _SEQ_END _CAP_END')

    lines.out = lines.out.apply(_tokenize_caption)

    return lines

# Persist a dictionary on disk as a JSON object
def save_dictionary(data, save_dir):
    """Write `data` to the file at `save_dir` as pretty-printed JSON.

    Keys are emitted in sorted order with a four-space indent, and a
    confirmation message is printed once the file has been written.
    """
    with open(save_dir, 'w') as json_file:
        json.dump(data, json_file, indent=4, sort_keys=True)
    print("Dictionary correctly saved!")

# Read a dictionary back from a JSON file
def load_dictionary(load_dir):
    """Parse the JSON file at `load_dir` and return the decoded object.

    JSON object keys are always strings, so a dictionary that was saved with
    integer keys comes back keyed by their string form.
    """
    with open(load_dir, 'r') as json_file:
        return json.load(json_file)

In [0]:
### ### ### ### ### ### ### ### ### ###
# PRE PROCESSING THE INPUT DATA       #
### ### ### ### ### ### ### ### ### ###

# Tokenize the punctuation of the 'out' captions and wrap them in the
# CAP_START_/SEQ_START_ ... _SEQ_END/_CAP_END boundary tokens.
# NOTE: sentences_pre_processing() rewrites the 'out' column of the frame it is
# given and returns that same frame, so running this cell a second time wraps
# the already-wrapped captions again. Reload `lines` before re-running it.
lines = sentences_pre_processing(lines)


In [5]:
### ### ### ### ### ### ### ### ### ####
# GENERATING THE TRAINING DICTIONARIES #
### ### ### ### ### ### ### ### ### ####

# Split every caption into tokens once; the token lists are reused to build the
# vocabulary, to size the decoder arrays and to fill them.
tokenized_captions = [caption.split() for caption in lines.out]

# Vocabulary of all distinct caption tokens, sorted so that the word -> index
# mapping is the same on every run.
all_out_words = set()
for caption_tokens in tokenized_captions:
    all_out_words.update(caption_tokens)
target_words = sorted(all_out_words)
# Word indices start at 1, so index 0 is never assigned to a word and doubles
# as the padding value of `decoder_input_data`; hence the extra slot.
num_decoder_tokens = len(all_out_words) + 1
target_token_index = {word: i for i, word in enumerate(target_words, 1)}

# Reverse-lookup token index to decode sequences back to something readable.
reverse_target_char_index = {i: word for word, i in target_token_index.items()}

# The decoder time axis has to hold the longest caption. It was previously
# sized by the vocabulary size, which allocated a (samples x vocab x vocab)
# one-hot tensor, made the LSTM unroll over mostly-padding timesteps and raised
# an IndexError for any caption with more tokens than the vocabulary has words.
# NOTE: fewer all-zero padding steps are now averaged into the loss, so the
# reported loss values are not comparable with runs that used the old shape.
max_decoder_seq_length = max((len(tokens) for tokens in tokenized_captions), default=0)

# Initializing the encoder/decoder arrays
decoder_input_data = np.zeros((len(lines.out), max_decoder_seq_length), dtype='float32')
decoder_target_data = np.zeros((len(lines.out), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
# 12 values per input row -- presumably one per month; TODO confirm against the dataset.
encoder_input_data = np.zeros((len(lines.inp), 12), dtype='float32')

for idx, row in enumerate(lines.inp):
    # The last element of the split is discarded -- presumably an empty string
    # left by a trailing space in the dataset; verify against the input file.
    for month, value in enumerate(row.split(" ")[:-1]):
        encoder_input_data[idx, month] = float(value)

for i, caption_tokens in enumerate(tokenized_captions):
    for t, word in enumerate(caption_tokens):
        token_id = target_token_index[word]
        decoder_input_data[i, t] = token_id
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start token.
            decoder_target_data[i, t - 1, token_id] = 1.

print(reverse_target_char_index)
# Save dictionaries about training set.
# JSON object keys are strings, so the integer keys of
# `reverse_target_char_index` are stored (and later loaded) as strings.
save_dictionary(target_token_index, home_dir + "Dictionaries/" + config_number_str + "_target_token_index.json")
save_dictionary(reverse_target_char_index, home_dir + "Dictionaries/"  + config_number_str + "_reverse_target_char_index.json")

{1: '$', 2: '-1', 3: '-11', 4: '-2', 5: '-3', 6: '-32', 7: '-4', 8: '-48', 9: '0', 10: '1', 11: '10', 12: '100', 13: '101', 14: '102', 15: '103', 16: '104', 17: '105', 18: '108', 19: '11', 20: '110', 21: '12', 22: '13', 23: '15', 24: '16', 25: '19', 26: '2', 27: '20', 28: '2016', 29: '2018', 30: '2019', 31: '21', 32: '24', 33: '25', 34: '27', 35: '28', 36: '3', 37: '30', 38: '33', 39: '34', 40: '35', 41: '36', 42: '37', 43: '38', 44: '4', 45: '40', 46: '41', 47: '42', 48: '43', 49: '44', 50: '45', 51: '47', 52: '48', 53: '50', 54: '52', 55: '53', 56: '56', 57: '59', 58: '6', 59: '60', 60: '61', 61: '62', 62: '63', 63: '64', 64: '65', 65: '66', 66: '69', 67: '7', 68: '70', 69: '71', 70: '73', 71: '74', 72: '75', 73: '77', 74: '79', 75: '80', 76: '81', 77: '85', 78: '88', 79: '89', 80: '9', 81: '90', 82: '91', 83: '92', 84: '93', 85: '94', 86: '95', 87: '96', 88: '97', 89: '98', 90: '99', 91: 'CAP_START_', 92: 'COLON', 93: 'COMMA', 94: 'E_R_BRACKET', 95: 'SEMICOLON', 96: 'SEQ_START_', 97

In [6]:
### ### ### ### ### ### ### ### #### 
# ENCODER-DECODER MODEL DEFINITION # 
### ### ### ### ### ### ### ### #### 

# To change the architecture configuration, edit the configuration section
# at the beginning of this document.

### ### ### ### ### ### ###
# ENCODER MODEL STRUCTURE #
### ### ### ### ### ### ###
# Variable-length sequence of values; each value is used as an embedding index.
encoder_inputs = Input(shape=(None,))
# NOTE(review): input_dim is hard-coded to 101 -- presumably the encoder values
# are integers in the range 0..100; confirm against the dataset.
en_x=  Embedding(101, embedding_size)(encoder_inputs)
encoder = LSTM(units_number, dropout=encoder_dropout, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# Discard the `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
# Stand-alone encoder (inputs -> final LSTM states). It is built from the same
# layer objects as the training model below and is saved later for inference.
encoder_model = Model(encoder_inputs, encoder_states)

### ### ### ### ### ### ###
# DECODER MODEL STRUCTURE #
### ### ### ### ### ### ###
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
# The layer objects (`dex`, `decoder_lstm`, `decoder_dense`) are kept in
# variables because the sampling model further down calls them again.
dex = Embedding(num_decoder_tokens, embedding_size)
final_dex= dex(decoder_inputs)
decoder_lstm = LSTM(units_number, dropout=decoder_dropout,return_sequences=True, return_state=True)
# The decoder's own final states are not needed while training.
decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)
# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

### ### ### ### ### ### ### ### ### #
# ENCODER - DECODER MODEL STRUCTURE #
### ### ### ### ### ### ### ### ### #
# Training model: (encoder input, decoder input) -> next-token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

optimizer = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# Categorical cross-entropy matches the one-hot encoded `decoder_target_data`.
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()


# NOTE(review): this import sits in the middle of the cell; the other imports
# are grouped in the first cell of the notebook.
from keras.callbacks import EarlyStopping
# NOTE(review): `earlyStop` is created here but never passed to fit() because
# the `callbacks` argument below is commented out, so training always runs for
# the full `epochs_number` epochs.
earlyStop=EarlyStopping(monitor="val_loss",verbose=2,mode='min',patience=5, restore_best_weights=True)

# Fit the model
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs_number,
          validation_split=validation_split)
          #callbacks=[earlyStop])

# Create sampling model
# The inference decoder receives the LSTM states as explicit inputs and returns
# the updated states next to the token probabilities, so they can be passed
# back in on the next call.
decoder_state_input_h = Input(shape=(units_number,))
decoder_state_input_c = Input(shape=(units_number,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Calling the same layer objects again reuses the weights trained above.
final_dex2= dex(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    12928       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 128)    108672      input

In [0]:
### ### ### ### ### ### ### #
# SAVING THE TRAINED MODELS #
### ### ### ### ### ### ### #

# Both inference models are written to the "Models/" folder, each file name
# prefixed with the configuration version tag.
model_path_prefix = home_dir + "Models/" + config_number_str
for trained_model, file_suffix in ((encoder_model, '_encoder_model.h5'),
                                   (decoder_model, '_decoder_model.h5')):
    trained_model.save(model_path_prefix + file_suffix)