In [1]:
# Importing the required libraries 
import pandas as pd
import matplotlib.pyplot as plt
import ast
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from models.ngrams.word_ngrams import NGramModel
from tools.utils import is_other
from tools.utils import printStatus
from tools.utils import merge_dictionaries
import sys

WORD_LEVEL_DICTIONARIES_PATH = "./dictionaries/word-level/"
n = 2

In [2]:
# States
states = ['lang1', 'lang2']

# Observations
observations = [['this', 'is', 'a', 'test', 'seq']]

# Start probability
start_probability = {'lang1': 0.5, 'lang2': 0.5}

# Transition probability
transition_probability = {
   'lang1' : {'lang1': 0.8, 'lang2': 0.2},
   'lang2' : {'lang1': 0.2, 'lang2': 0.8}
}

In [2]:
# Emission probability (our probability dictionaries)
printStatus("Getting dictionaries...")
probability_en_df = pd.read_csv(WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_en.csv', encoding='utf-16')
probability_en_dict = probability_en_df.set_index('word')['probability'].to_dict()

probability_es_df = pd.read_csv(WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_es.csv', encoding='utf-16')
probability_es_dict = probability_es_df.set_index('word')['probability'].to_dict()
printStatus("Dictionaries ready!")

data_en = {'lang1': probability_en_dict}
data_es = {'lang2': probability_es_dict}
emission_probability = merge_dictionaries(data_en, data_es)

[22:16:07] Getting dictionaries...
[22:16:08] Dictionaries ready!


In [3]:
#Executing the Viterbi Algorithm
predicted_tags = []                #intializing the predicted tags
for x in range(len(observations)):   # for each tokenized sentence in the test data
  s = observations[x]
  #storing_values is a dictionary which stores the required values
  #ex: storing_values = {step_no.:{state1:[previous_best_state,value_of_the_state]}}                
  storing_values = {}              
  for q in range(len(s)):
    step = s[q]
    #for the starting word of the sentence
    if q == 1:                
      storing_values[q] = {}
      for t in states:
        #this is applied since we do not know whether the word in the test data is present in train data or not
        try:
          storing_values[q][t] = ['<s>',emission_probability[t][step]]
        #if word is not present in the train data but present in test data we assign a very low probability of 0.0001
        except:
          storing_values[q][t] = ['<s>',0.0001]#*train_emission_prob[t][step]]

    #if the word is not at the start of the sentence
    if q>1:
      storing_values[q] = {}
      previous_states = list(storing_values[q-1].keys())   # loading the previous states
      current_states  = tags_of_tokens[step]               # loading the current states
      #calculation of the best previous state for each current state and then storing
      #it in storing_values
      for t in current_states:                             
        temp = []
        for pt in previous_states:                         
          try:
            temp.append(storing_values[q-1][pt][1]*[pt][t]*emission_probability[t][step])
          except:
            temp.append(storing_values[q-1][pt][1]*0.0001)
        max_temp_index = temp.index(max(temp))
        best_pt = previous_states[max_temp_index]
        storing_values[q][t]=[best_pt,max(temp)]

  #Backtracing to extract the best possible tags for the sentence
  pred_tags = []
  total_steps_num = storing_values.keys()
  last_step_num = max(total_steps_num)
  for bs in range(len(total_steps_num)):
    step_num = last_step_num - bs
    if step_num == last_step_num:
      pred_tags.append('</s>')
      pred_tags.append(storing_values[step_num]['<s>'][0])
    if step_num<last_step_num and step_num>0:
      pred_tags.append(storing_values[step_num][pred_tags[len(pred_tags)-1]][0])
  predicted_tags.append(list(reversed(pred_tags)))


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)

In [None]:
print(predicted_tags)

In [None]:
#Calculating the accuracy based on tagging each word in the test data.
right = 0 
wrong = 0
for i in range(len(observations)):
  gt = test_tags[i]
  pred = predicted_tags[i]
  for h in range(len(gt)):
    if gt[h] == pred[h]:
      right = right+1
    else:
      wrong = wrong +1 

print('Accuracy on the test data is: ',right/(right+wrong))
print('Loss on the test data is: ',wrong/(right+wrong))