# Model Evaluation

### Document structure:
1.   Evaluation system settings
2.   Utility functions
3.   Loading the required files
4.   Select dataset to use for the evaluation
5.   Encoding the input data
6.   Evaluation metrics and functions
7.   Sentence Generation Evaluation
8.   Caption Generation Evaluation





In [4]:
import re
import string
import json
import numpy as np
import pandas as pd
from string import digits
import matplotlib.pyplot as plt
from google.colab import drive
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model
from keras.models import load_model
import operator
import random

# Mount Google Drive: every path used below is built from `home_dir`,
# which lives under /content/drive.
drive.mount('/content/drive')

##############################
# EVALUATION SYSTEM SETTINGS #
##############################

# General settings
# Prefix of the model and dictionary file names loaded by the following cells
# (e.g. "<config_number_str>_encoder_model.h5").
config_number_str = "v7"
# Root folder containing the "Models/", "Dictionaries/", "Datasets/" and
# "Results/" sub-folders read and written by this notebook.
home_dir = "/content/drive/My Drive/Current Works/UBC Research Period/Training Folder/"

# Generation Configuration Settings #
## Set to True if you want to evaluate the system on the test dataset
## Set to False if you want to evaluate the system on a part of the train dataset
## NOTE(review): the dataset-selection cell has no code path that builds the
## list of series IDs for the False case - confirm before using it.
test_evaluation = True
## Set to True to evaluate the system on every series listed in
## "Datasets/final_captions_collection.xlsx"; this flag is checked first by
## the dataset-selection cell, so it takes precedence over `test_evaluation`.
full_inference = False


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#####################
# UTILITY FUNCTIONS #
#####################

# Randomly draw "numb" values (a count, not a percentage) from the
# "ID_Series" column of the input dataset, without replacement.
def validation_set(dataset, numb):
    """Return `numb` randomly drawn entries of ``dataset["ID_Series"]``.

    Each draw uses ``random.choice`` on the values still available and then
    removes the drawn value from that pool, so a list position is never
    drawn twice. The input dataset itself is not modified.

    Args:
        dataset: object whose ``["ID_Series"].values`` can be turned into a
            list (a pandas DataFrame in this notebook).
        numb: number of values to draw.

    Returns:
        list with the drawn values, in drawing order.
    """
    available_ids = list(dataset["ID_Series"].values)
    drawn_ids = []
    for _ in range(numb):
        drawn = random.choice(available_ids)
        drawn_ids.append(drawn)
        available_ids.remove(drawn)
    return drawn_ids

# Load a dictionary as JSON Object
def load_dictionary(load_dir):
    """Read the JSON file at `load_dir` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON documents are
    expected to use) instead of relying on the platform default, so
    non-ASCII tokens are decoded the same way on every machine.

    Args:
        load_dir: path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (a dict for the
        dictionaries used in this notebook).
    """
    with open(load_dir, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [6]:
### ### ### ### ### ### ### ##
# LOADING THE REQUIRED FILES #
### ### ### ### ### ### ### ##

# File names of the encoder / decoder models for the selected configuration
encoder_filename = config_number_str + "_encoder_model.h5"
decoder_filename = config_number_str + "_decoder_model.h5"

# Loading the encoder and the decoder models
encoder_model = load_model(home_dir + "Models/" + encoder_filename)
decoder_model = load_model(home_dir + "Models/" + decoder_filename)

# "target token index" dictionary generated during the training process:
# the values are converted to int.
target_token_index = {
    token: int(token_id)
    for token, token_id in load_dictionary(home_dir + "Dictionaries/" + config_number_str + "_target_token_index.json").items()
}

# "reverse target char" dictionary generated during the training process:
# JSON object keys are always strings, so they are converted back to int.
reverse_target_char_index = {
    int(token_id): token
    for token_id, token in load_dictionary(home_dir + "Dictionaries/"  + config_number_str + "_reverse_target_char_index.json").items()
}





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.










In [0]:
### ### ### ### ### ##
# EVALUATION DATASET #
### ### ### ### ### ##

# The generation cells look every series up in `full_dataset`, so the complete
# captions collection is loaded once, whichever evaluation mode is selected
# (it used to be defined only in the test-evaluation branch).
full_dataset = pd.read_excel(home_dir + "Datasets/final_captions_collection.xlsx")

if full_inference:
    # Evaluation on every series of the captions collection
    dataset = full_dataset
elif test_evaluation:
    # Evaluation on the test dataset
    dataset = pd.read_excel(home_dir + "Datasets/5_Fold_Cross_Validation_time_series/4_test_time_series.xlsx")
else:
    # No selection logic exists for this mode: fail here with a clear message
    # instead of a NameError on `choosen_list` in a later cell.
    raise NotImplementedError(
        "Evaluation on a part of the train dataset is not implemented in this cell: "
        "set test_evaluation or full_inference to True."
    )

# IDs of the series to evaluate (missing IDs are discarded)
unique_IDs = dataset["ID_Series"].unique()
choosen_list = unique_IDs[~np.isnan(unique_IDs)]

# Result containers, initialised for every evaluation mode
orig_captions, orig_sentences = [], []
output_captions, output_sentences = [], []
output_results = {}

In [8]:

### ### ### ### ### ### ### ### ### ##
# EVALUATION LIBRARIES AND FUNCTION  #
### ### ### ### ### ### ### ### ### ##
!pip install py-rouge
import nltk
import rouge
nltk.download('punkt')

def print_results(idx, output_sequence, original_sequence):
    """Print the generated text and its references for one time series.

    Args:
        idx: identifier shown in the header line.
        output_sequence: generated caption / sentence.
        original_sequence: reference caption(s) / sentence(s).
    """
    header_rule = '------------------------------------------------------------------------------------------------'
    print('-- #', idx, header_rule)
    labelled_values = (
        ("Output caption: ", output_sequence),
        ("Original captions: ", original_sequence),
    )
    for label, value in labelled_values:
        print(label, value)
    # Blank lines separating consecutive reports
    print('\n')
    
def set_vocabulary(current_df):
    """Build the placeholder-token -> value mapping for one time series.

    The values are taken from the first row of `current_df`; the year is
    stored as a string, the other values are stored as read.

    Args:
        current_df: rows describing a single series, with the "Year", "Geo",
            "About" and "UOM" columns.

    Returns:
        dict with the keys "TKN_Year", "TKN_Geo", "TKN_About" and "TKN_UOM"
        (in this order).
    """
    token_columns = (
        ("TKN_Year", "Year"),
        ("TKN_Geo", "Geo"),
        ("TKN_About", "About"),
        ("TKN_UOM", "UOM"),
    )
    dtkn_vocabulary = {token: current_df[column].values[0] for token, column in token_columns}
    # The year is the only value that gets converted to text
    dtkn_vocabulary["TKN_Year"] = str(dtkn_vocabulary["TKN_Year"])
    return dtkn_vocabulary

def detokenization(input_sentence):
    """Turn a tokenized caption back into readable, capitalized text.

    Steps, in order:
      1. every special token (punctuation names, bracket markers, sequence
         and caption start/end markers) is replaced by its plain-text form,
         following the insertion order of `replacing_dictionary`;
      2. runs of spaces are collapsed to a single space;
      3. letters are upper-cased at the start of the text, after ". ", "? "
         or "! ", and around a "." placed between two word characters.

    Args:
        input_sentence: tokenized text, e.g.
            "CAP_START_ SEQ_START_ the graph rises _SEQ_END _CAP_END".

    Returns:
        The detokenized string (it may end with a trailing space).
    """
    # Plain text (key) that replaces each token (value). The order matters:
    # the combined start markers must be removed before the single
    # "SEQ_START_" entry is processed, and the last two entries clean up the
    # doubled periods left by the previous replacements.
    replacing_dictionary = {","  : ' COMMA ', 
                            ":"  : ' COLON ', 
                            ";"  : ' SEMICOLON ',
                            "("  : "S_R_BRACKET ", 
                            ")"  : " E_R_BRACKET", 
                            "."  : " _SEQ_END", 
                            ""  : "CAP_START_ SEQ_START_ ", 
                            "  "  : "_CAP_END",
                            " " : "SEQ_START_",
                            ". " : ". . ",
                            ".  " : ".  ."}

    # Replacing all the chars and tokens within the 'replacing_dictionary'
    for repl_idx in replacing_dictionary:
        input_sentence = input_sentence.replace(replacing_dictionary[repl_idx], repl_idx)

    # Collapse the runs of spaces produced by the replacements above
    while "  " in input_sentence:
        input_sentence = input_sentence.replace("  ", " ")

    # Capitalization pattern. Every fragment is a raw string so that "\." and
    # "\w" reach the regex engine without being treated as (invalid) string
    # escapes; the resulting pattern is the same as before.
    capitalization_pattern = (
        r"(\A\w)|"                  # first character of the text
        r"(?<!\.\w)([\.?!] )\w|"    # ". ", "? " or "! " followed by a word char
        r"\w(?:\.\w)|"              # word char, ".", word char
        r"(?<=\w\.)\w"              # word char right after "<word char>."
    )
    input_sentence = re.sub(capitalization_pattern, lambda x: x.group().upper(), input_sentence)

    return input_sentence

def denormalization(input_sequence, input_min, input_max):
    """Map the normalized integers of a caption back to the series scale.

    The caption is split on single spaces. Every token that is a plain
    integer, optionally followed by one trailing ".", is read as a
    percentage of the [input_min, input_max] range and replaced by
    ``round(value / 100 * (input_max - input_min) + input_min, 2)``; the
    trailing "." is kept. All the other tokens are left untouched.

    The conversion is done token by token rather than with substring
    replacement on the whole caption, so a value that has already been
    denormalized (e.g. "100.0") can no longer be matched again by a later
    token such as "100.". Numeric tokens at the very beginning or end of the
    caption are converted like any other.

    Args:
        input_sequence: caption containing the normalized values.
        input_min: minimum of the original time series.
        input_max: maximum of the original time series.

    Returns:
        The caption with the numeric tokens denormalized.
    """
    def _denormalize_token(token):
        # Returns the denormalized form of `token`, or `token` itself when
        # it is not a plain integer (optionally followed by ".").
        has_period = token.endswith(".")
        number_text = token[:-1] if has_period else token
        try:
            value = int(number_text)
        except ValueError:
            # Ordinary word, lone ".", or empty token from repeated spaces
            return token
        if str(value) != number_text:
            # Non-canonical spellings ("+5", "007", ...) are left as they are
            return token
        rescaled = (value / 100 * (input_max - input_min)) + input_min
        return str(round(rescaled, 2)) + ("." if has_period else "")

    return " ".join(_denormalize_token(token) for token in input_sequence.split(" "))
              
    
def prepare_results(metric, p, r, f, evaluation_results):
    """Store and format the scores of one metric as percentages.

    Args:
        metric: metric name, used as key in `evaluation_results`.
        p, r, f: precision, recall and F1 score as fractions.
        evaluation_results: dict updated in place with the rounded values.

    Returns:
        tuple ``(line, evaluation_results)`` where `line` is the
        tab-separated, printable summary of the three scores.
    """
    precision_pct, recall_pct, f1_pct = 100.0 * p, 100.0 * r, 100.0 * f
    evaluation_results[metric] = {
        "Precision" : round(precision_pct, 2),
        "Recall" : round(recall_pct, 2),
        "F1-Score" : round(f1_pct, 2),
    }
    summary_line = '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
        metric, 'P', precision_pct, 'R', recall_pct, 'F1', f1_pct)
    return summary_line, evaluation_results


def rouge_evaluation(all_hypothesis, all_references, aggregators=('Avg', 'Best')):
    """Print the ROUGE-N (N = 1..4), ROUGE-L and ROUGE-W scores.

    Args:
        all_hypothesis: list of generated texts.
        all_references: list with, for each hypothesis, its reference text(s).
        aggregators: aggregation modes to report. 'Avg' and 'Best' aggregate
            the scores over the references; any other value (e.g.
            'Individual') reports each hypothesis against each single
            reference.

    Returns:
        None: the scores are only printed.
    """
    for aggregator in aggregators:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                            max_n=4,
                            limit_length=True,
                            length_limit=100,
                            length_limit_type='words',
                            apply_avg=apply_avg,
                            apply_best=apply_best,
                            alpha=0.5, # Default F1_score
                            weight_factor=1.2,
                            stemming=True)

        scores = evaluator.get_scores(all_hypothesis, all_references)
        evaluation_results = {}

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        # prepare_results returns (line, results): unpack it
                        # before concatenating, a tuple cannot be added to '\t'.
                        pr, evaluation_results = prepare_results(metric, results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id], evaluation_results)
                        print('\t' + pr)
                print()
            else:
                pr, evaluation_results = prepare_results(metric, results['p'], results['r'], results['f'], evaluation_results)
                print(pr)



Collecting py-rouge
[?25l  Downloading https://files.pythonhosted.org/packages/9c/1d/0bdbaf559fb7afe32308ebc84a2028600988212d7eb7fb9f69c4e829e4a0/py_rouge-1.1-py3-none-any.whl (56kB)
[K     |█████▊                          | 10kB 18.2MB/s eta 0:00:01[K     |███████████▌                    | 20kB 1.8MB/s eta 0:00:01[K     |█████████████████▎              | 30kB 2.4MB/s eta 0:00:01[K     |███████████████████████         | 40kB 1.7MB/s eta 0:00:01[K     |████████████████████████████▉   | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.9MB/s 
[?25hInstalling collected packages: py-rouge
Successfully installed py-rouge-1.1
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def BS_generator(input_seq, mode, k):
  """Generate `k` alternative captions for one input series with beam search.

  The input is encoded once with `encoder_model`; `decoder_model` is then
  called one token at a time for each of the `k` beams, for a fixed number
  of 80 steps (there is no early stop on an end-of-caption token). The score
  of a beam is the product of the probabilities of its tokens.

  Args:
      input_seq: encoder input for a single time series.
      mode: "sentence" keeps only the text up to the first "." of every
          alternative; any other value keeps the text up to the last ".".
      k: beam width, i.e. number of alternatives kept at each step and
          returned.

  Returns:
      list of `k` detokenized strings, each ending with ".", in the
      iteration order of the final beam dictionary.
  """

  # Beam Search parameter
  k = k

  # Encoding the input sequence and saving the states value
  # (presumably the list [h, c], since it is concatenated to [target_seq]
  # when calling the decoder below - TODO confirm)
  start_state = encoder_model.predict(input_seq)

  # Initializing the first input token
  start_token = 'CAP_START_'

  # Initializing bm dictionary and states vector
  # bm maps (beam index, tokens generated so far) -> score of the beam;
  # states_values[i] holds the decoder states of the i-th beam.
  bm = {}
  states_values = []
  for i in range(0, k):
      bm[i, start_token] = 1
      states_values.append(start_state)

  cont = 0 
  while(cont < 80):
      # Candidates of this step: k entries per beam, stored beam after beam
      # in three parallel lists (text, probability, decoder states).
      to_predict = []
      temp_search_values = []
      temp_search_probs = []
      temp_search_states = []
      
      # Token fed to the decoder for every beam: the start token at the
      # first step, the last generated token afterwards.
      if(cont == 0):
        to_predict = [start_token] * k 
      else:
        for sent in bm.keys():
            to_predict.append(sent[1].split()[-1]) # or maybe [-2] ???
      for idx, word in enumerate(to_predict): 
          target_seq = np.zeros((1,1))
          target_seq[0, 0] = target_token_index[word]
          # Predicting the probability for the next output tokens
          output_tokens, h, c = decoder_model.predict([target_seq] + states_values[idx])
          
          # Creating a list of tokens ordered by their probability values.
          # Only the k most probable tokens are kept.
          indexed = list(enumerate(output_tokens[0, -1, :]))
          ordered_predictions = list(reversed(sorted(indexed, key=operator.itemgetter(1))))[0:k]
          top_values = [reverse_target_char_index[i] for i, v in ordered_predictions]
          top_probs = [v for i, v in ordered_predictions]
          
          for append_idx in range(0, k):
              temp_search_values.append(top_values[append_idx])
              temp_search_probs.append(top_probs[append_idx])
              temp_search_states.append([h,c])
          
      # Turn every candidate into a full hypothesis: prepend the text of its
      # beam and multiply its probability by the score of the beam. This
      # relies on bm being iterated in the same order used to fill the lists.
      curr_prob_idx = -1
      for bm_key_idx, bm_key_value in enumerate(bm.keys()):
        for prob_idx in range(0, k):
            curr_prob_idx = curr_prob_idx + 1
            temp_search_probs[curr_prob_idx] = bm[bm_key_value] * temp_search_probs[curr_prob_idx]
            temp_search_values[curr_prob_idx] = bm_key_value[1] + " " + temp_search_values[curr_prob_idx]
      
      # Indexes of the k best hypotheses, best first
      temp_search_probs = np.array(temp_search_probs)
      top_indexes = temp_search_probs.argsort()[-k:][::-1]
      
      bm = {}
      states_values = []
      
      if(cont == 0):
        # At the first step all the beams are identical, so the first k
        # candidates (the top-k tokens of the first beam) are taken as they
        # are, which avoids selecting k copies of the same hypothesis.
        for subs_idx in range(0, k):
            bm[subs_idx, temp_search_values[subs_idx]] = temp_search_probs[subs_idx]
            states_values.append(temp_search_states[subs_idx])
      else:
        # Keep the k hypotheses with the highest score
        for subs_idx in range(0, k):
            bm[subs_idx, temp_search_values[top_indexes[subs_idx]]] = temp_search_probs[top_indexes[subs_idx]]
            states_values.append(temp_search_states[top_indexes[subs_idx]])
      cont = cont + 1
     
  
  alternatives_captions = []
  # Select the K captions within the bm
  # Each one is detokenized and cut after its last "."
  for key, value in bm.items():
    splitted_capt = detokenization(key[1]).split(".")[:-1]
    splt_capt = '.'.join(splitted_capt)
    splt_capt = splt_capt + "."
    alternatives_captions.append(splt_capt)

  # Remove extra words after final "." char during sentence generation
  # (only the text up to the first "." is kept)
  if(mode == "sentence"):
    for idx, sentence in enumerate(alternatives_captions):
      alternatives_captions[idx] = sentence.split(".")[0] + "."

  return alternatives_captions

In [10]:

###################################
# Generating the output sentences  #
###################################
output_sentences = []
orig_sentences = []
results_to_save = {}
sentences_to_save = ["S#1", "S#2", "S#3"]
for idx in choosen_list:
    # Rows of the full dataset describing the series with ID_Series == idx
    current_df = full_dataset[full_dataset["ID_Series"] == idx]

    # Reference sentences: every caption of the series split on ".",
    # dropping the piece that follows the last "."
    orig_sent_to_add = [
        piece
        for caption in list(current_df["Caption"].values)
        for piece in caption.split(".")[:-1]
    ]
    orig_sentences.append(orig_sent_to_add)

    # Input sequence - time series values - taken from the first row
    input_seq_df = current_df[current_df["ID_Series"] == idx].reset_index()
    input_seq = np.array([input_seq_df.iloc[0,9:21].values])

    # Placeholder tokens -> values of the current time series
    dtkn_vocabulary = set_vocabulary(current_df)

    # Generating the output sentences
    # BS = Beam Search implementation
    alternatives_sentences = BS_generator(input_seq, "sentence", 3)

    # Range of the original series, used to denormalize the values
    series_min = current_df["min_time_series"].values[0]
    series_max = current_df["max_time_series"].values[0]

    decoded_alt_sentences = []
    for decoded_sentence in alternatives_sentences:
        # Denormalize the values, then fill in the placeholder tokens
        filled_sentence = denormalization(str(decoded_sentence), series_min, series_max)
        for tkn, tkn_value in dtkn_vocabulary.items():
            filled_sentence = filled_sentence.replace(tkn, tkn_value)
        decoded_alt_sentences.append(filled_sentence)

    # The first alternative is the one used for the evaluation
    decoded_dtknzd_sentence = decoded_alt_sentences[0]
    output_sentences.append(decoded_dtknzd_sentence)

    # Print out the results
    print_results(idx, decoded_dtknzd_sentence, orig_sent_to_add)
    # Save the K alternative outputs, one per label
    results_to_save[str(idx)] = {
        label: decoded_alt_sentences[position]
        for position, label in enumerate(sentences_to_save)
    }

print("\n##############################")
print("##### SENTENCE EVALUATION #####")
print("##############################\n")

# Saving the results JSON file
with open(home_dir + "Results/Sentences_outputs.json", "w") as write_file:
    json.dump(results_to_save, write_file)

# Rouge metric between list of output detokenized sentences and original sentences
rouge_evaluation(output_sentences, orig_sentences)

-- # 83 ------------------------------------------------------------------------------------------------
Output caption:  The graph illustrates the Total vehicles entering Canada in Canada in 2016.
Original captions:  ['the line chart shows the total number of vehicles entering canada during 2016', " it's clearly possible to see how during the summer months the amount of car entering canada is way higher than the rest of the year, reaching the maximum value during the month of august with around 3400000 cars", "as it's shown in the graph, the number of vehicles entering canada during 2016 has been much higher during the summer compared with the other months of the year", ' in particular, the maximum number of vehicles entering canada in 2016 has been registered during august', ' at the same time, the minimum value during february', 'the number of total vehicles entering canada in 2016 is depicted in this graph', ' during january approximately 2100000 vehicles enter', ' these numbers sl

In [11]:
###################################
# Generating the output captions  #
###################################
output_captions = []
orig_captions = []
results_to_save = {}
captions_to_save = ["C#1", "C#2", "C#3"]
for idx in choosen_list:
    # Rows of the full dataset describing the series with ID_Series == idx
    current_df = full_dataset[full_dataset["ID_Series"] == idx]

    # Reference captions associated with the series
    orig_captions.append(list(current_df["Caption"].values))

    # Input sequence - time series values - taken from the first row
    input_seq_df = current_df[current_df["ID_Series"] == idx].reset_index()
    input_seq = np.array([input_seq_df.iloc[0,9:21].values])

    # Placeholder tokens -> values of the current time series
    dtkn_vocabulary = set_vocabulary(current_df)

    # Generating the output captions (beam search)
    alternatives_captions = BS_generator(input_seq, "caption", 3)

    # Range of the original series, used to denormalize the values
    series_min = current_df["min_time_series"].values[0]
    series_max = current_df["max_time_series"].values[0]

    decoded_alt_captions = []
    for decoded_caption in alternatives_captions:
        # Denormalize the values, then fill in the placeholder tokens
        filled_caption = denormalization(str(decoded_caption), series_min, series_max)
        for tkn, tkn_value in dtkn_vocabulary.items():
            filled_caption = filled_caption.replace(tkn, tkn_value)
        decoded_alt_captions.append(filled_caption)

    # The first alternative is the one used for the evaluation
    decoded_dtknzd_caption = decoded_alt_captions[0]
    output_captions.append(decoded_dtknzd_caption)

    # Print out the results
    print_results(idx, decoded_dtknzd_caption, list(current_df["Caption"].values))
    # Save the K alternative outputs, one per label
    results_to_save[str(idx)] = {
        label: decoded_alt_captions[position]
        for position, label in enumerate(captions_to_save)
    }

print("\n##############################")
print("##### CAPTION EVALUATION #####")
print("##############################\n")

# Saving the results JSON file
with open(home_dir + "Results/Captions_outputs.json", "w") as write_file:
    json.dump(results_to_save, write_file)

# Rouge metric between list of output detokenized captions and original captions
rouge_evaluation(output_captions, orig_captions)


-- # 83 ------------------------------------------------------------------------------------------------
Output caption:  The graph illustrates the Total vehicles entering Canada in Canada in 2016. The first few montes till may the graph stays quite constant at a low level of about 2013443.76 Number,besides a small peak in march up to nearly 2367090.76 Number. The graph starts to increase continusly from may to july. Afterwards it increase rapidly till the maxumum value in september.
Original captions:  ["the line chart shows the total number of vehicles entering canada during 2016. it's clearly possible to see how during the summer months the amount of car entering canada is way higher than the rest of the year, reaching the maximum value during the month of august with around 3400000 cars.", "as it's shown in the graph, the number of vehicles entering canada during 2016 has been much higher during the summer compared with the other months of the year. in particular, the maximum numbe