In [124]:
import re
import os
import numpy as np
import copy

In [125]:
# Defining Helper variables to be used later

labelList = ["O", "B-positive", "B-negative", "B-neutral", "I-positive", "I-negative", "I-neutral"]
labelDict = {   
        "START": 0,
        "O": 0,
        "B-positive": 0,
        "B-negative": 0,
        "B-neutral": 0,
        "I-positive": 0,
        "I-negative": 0,
        "I-neutral": 0,
        "END": 0
    }

NUMBER_OF_LABELS = len(labelList)

# Initialise a random number generator with a fixed seed for reproducible results and deterministic behavior
rng = np.random.default_rng(1004519 + 1004103 + 1004555)
folderPath = os.path.abspath(os.getcwd())

In [126]:
# Helper functions to read and parse data
def readFile(filePath: str):
    with open(filePath, "r", encoding="utf-8") as f:
        return f.readlines()
    
def processFile(file: list):
    return [word[:len(word)-1] for word in file]

def getAllUniqueTokens(input_data):
    # Might want to somehow ensure that this order stays consistent between runs
    return list(set(item.split()[0] for item in input_data if item))

### Part 1

1. Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):
<br>
$$
e(x|y) = \frac{{\text{{Count}}(y \rightarrow x)}}{{\text{{Count}}(y)}}
$$



In [127]:
# Calculating Emissions Function

def calcCountofEachWord(file: list, labelDict_in: dict):
    tokenDict_out = {}

    for i in range(len(file)):
        if file[i] != "":
            l = file[i].split()
            token = l[0]
            label = l[1]
            key = (token, label)
            if label in labelDict_in:
                labelDict_in[label] += 1

            else:
                labelDict_in[label] = 1
                

            if key in tokenDict_out:
                tokenDict_out[key] += 1
            else:
                tokenDict_out[key] = 1
    return tokenDict_out, labelDict_in

def calcEmission(tokenDict_in: dict, labelDict_in: dict, uniqueTokensList_in: list, k: float = 1.0):
    emissionDict_out = {}

    for key,val in labelDict_in.items():
        inner_dict = {}
        if val != 0:
            for token, label in tokenDict_in.keys():
                if token in uniqueTokensList_in:
                    e = tokenDict_in[(token, label)] / (labelDict_in[key] + k)
                    inner_dict[token] = e
                else:
                    e = k / (labelDict_in[label] + k)
                    inner_dict[token] = e
        emissionDict_out[key] = inner_dict
            

    # for token, label in tokenDict_in.keys():
    #     if token in uniqueTokensList_in:
    #         e = tokenDict_in[(token, label)] / (labelDict_in[label] + k)
    #         emissionDict_out[(token, label)] = e

    #     else:
    #         e = k / (labelDict_in[label] + k)
    #         emissionDict_out[(token, label)] = e
    return emissionDict_out

In [128]:
# Calling the functions

# Defining the filePath for the training dataset
EsTrainFilePath = os.path.join(folderPath, "../Data/ES/train")

# Processing the file to separate line by line
trainData = processFile(readFile(filePath=EsTrainFilePath))
uniqueTokensList = getAllUniqueTokens(trainData)

# Calculating the count of each token to the label
tokenDict, labelDict = calcCountofEachWord(trainData, labelDict)

# Calculating the emission value for each unique token
emissionsDict= calcEmission(tokenDict, labelDict, uniqueTokensList)
print(emissionsDict)


### Section 3
EsTestInFilePath = os.path.join(folderPath, "../Data/ES/dev.in")
EsTestOutputFilePath = os.path.join(folderPath, "../Data/ES/dev.p2.out")

{'START': {}, 'O': {'Estuvimos': 0.00020664003306240529, 'hace': 0.0008954401432704229, 'poco': 0.0018942003030720485, 'mi': 0.0024796803967488635, 'pareja': 0.00044772007163521146, 'y': 0.00017220002755200442, 'yo': 3.444000551040088e-05, 'comiendo': 0.00034440005510400884, 'resultó': 0.00013776002204160352, 'todo': 3.444000551040088e-05, 'muy': 6.888001102080176e-05, 'bien': 3.444000551040088e-05, ',': 0.00010332001653120264, 'tanto': 0.0013431602149056344, 'la': 3.444000551040088e-05, 'comida': 0.000585480093676815, 'el': 3.444000551040088e-05, 'vino': 3.444000551040088e-05, 'trato': 3.444000551040088e-05, 'decoración': 6.888001102080176e-05, '…': 3.444000551040088e-05, 'nos': 0.005028240804518529, 'gustó': 0.0003788400606144097, 'mucho': 3.444000551040088e-05, '.': 3.444000551040088e-05, 'Por': 0.0010332001653120264, 'poner': 0.00017220002755200442, 'algún': 0.00020664003306240529, 'pero': 3.444000551040088e-05, 'quizá': 0.00013776002204160352, 'jamón': 3.444000551040088e-05, 'no':

2. Set k to 1, implement this fix into your function for computing the emission parameters

$$
e(x|y) = \begin{cases}
\frac{{\text{{Count}}(y \rightarrow x)}}{{\text{{Count}}(y)+k}}, & \text{{if the word token }} x \text{{ appears in the training set}} \\
\frac{k}{{\text{{Count}}(y)+k}}, & \text{{if word token }} x \text{{ is the special token \#UNK\#}}
\end{cases}
