In [141]:
import re
import os
import numpy as np
import copy

In [142]:
# Defining Helper variables to be used later

labelList = ["O", "B-positive", "B-negative", "B-neutral", "I-positive", "I-negative", "I-neutral"]
labelDictInit = {   
        "START": 0,
        "O": 0,
        "B-positive": 0,
        "B-negative": 0,
        "B-neutral": 0,
        "I-positive": 0,
        "I-negative": 0,
        "I-neutral": 0,
        "END": 0
    }

NUMBER_OF_LABELS = len(labelList)

# Initialise a random number generator with a fixed seed for reproducible results and deterministic behavior
rng = np.random.default_rng(1004519 + 1004103 + 1004555)

# Defining the filePath for the datasets
folderPath = os.path.abspath(os.getcwd())

EsTrainFilePath = os.path.join(folderPath, "../Data/ES/train")
EsTrain1FilePath = os.path.join(folderPath, "../Data/ES/train1")
EsDevInFilePath = os.path.join(folderPath, "../Data/ES/dev.in")
EsDevOutFilePath = os.path.join(folderPath, "../Data/ES/dev.out")
EsPredOutputFilePath = os.path.join(folderPath, "../Data/ES/dev.p1.out")

RuTrainFilePath = os.path.join(folderPath, "../Data/RU/train")
RuDevInFilePath = os.path.join(folderPath, "../Data/RU/dev.in")
RuDevOutFilePath = os.path.join(folderPath, "../Data/RU/dev.out")
RuPredOutputFilePath = os.path.join(folderPath, "../Data/RU/dev.p1.out")

ES = [
    EsTrainFilePath,
    labelDictInit,
    EsDevInFilePath,
    EsDevOutFilePath,
    EsPredOutputFilePath]

RU = [
    RuTrainFilePath,
    labelDictInit,
    RuDevInFilePath,
    RuDevOutFilePath,
    RuPredOutputFilePath]

languages = [ES, RU]


In [143]:
# Helper functions to read and parse data
def readFile(filePath: str):
    with open(filePath, "r", encoding="utf-8") as f:
        return f.readlines()
    
def processFile(file: list):
    return [word[:len(word)-1] for word in file]

def getAllUniqueTokens(input_data):
    # Might want to somehow ensure that this order stays consistent between runs
    return list(set(item.split(" ")[0] for item in input_data))


### Part 1

1. Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):
<br>
$$
e(x|y) = \frac{{\text{{Count}}(y \rightarrow x)}}{{\text{{Count}}(y)}}
$$

2. Set k to 1, implement this fix into your function for computing the emission parameters

$$
e(x|y) = \begin{cases}
\frac{{\text{{Count}}(y \rightarrow x)}}{{\text{{Count}}(y)+k}}, & \text{{if the word token }} x \text{{ appears in the training set}} \\
\frac{k}{{\text{{Count}}(y)+k}}, & \text{{if word token }} x \text{{ is the special token \#UNK\#}}
\end{cases}




In [144]:
# Calculating Emissions Function

def calcCountofEachWord(file: list, labelDict_in: dict):
    tokenDict_out = {}

    for i in range(len(file)):
        if file[i] != "":
            l = file[i].split()
            token = l[0]
            label = l[1]
            key = (token, label)
            if label in labelDict_in:
                labelDict_in[label] += 1

            else:
                labelDict_in[label] = 1
                

            if key in tokenDict_out:
                tokenDict_out[key] += 1
            else:
                tokenDict_out[key] = 1
    return tokenDict_out, labelDict_in

def calcEmission(tokenDict_in: dict, labelDict_in: dict, uniqueTokensList_in: list, k: float = 1.0):
    emissionDict_out = {}
    unknownDict = {}

    for token, label in tokenDict_in.keys():
        if token not in emissionDict_out:
            emissionDict_out[token] = {}  # Create an empty inner dictionary for the token
        if labelDict_in[label] != 0:
            e = tokenDict_in[(token, label)] / (labelDict_in[label] + k)
            emissionDict_out[token][label] = e  # Update the emission value for the specific label

    for key, labelCount in labelDict_in.items():
        if key not in ["START", "END"]:
            # Creating entry for unknown words
            unknownToken = "#UNK"
            e = k / (labelCount + k)
            unknownDict[key] = e
            emissionDict_out[unknownToken] = unknownDict

    return emissionDict_out

def getLabel(tokenInput: str, uniqueTokensList_in: list, emissionsDict_in: dict):
    if tokenInput in uniqueTokensList_in:
        x = max(emissionsDict_in[tokenInput], key=emissionsDict_in[tokenInput].get)
        # print('x -> {tokenInput} Current max probability is: ', x)
        return x
    else:
        y = max(emissionsDict_in["#UNK"], key=emissionsDict_in["#UNK"].get)
        # print(f'y -> {tokenInput} Current max probability is: {y}')
        return y

def calcSentimentAnalysis(tokenList: list, trainedData: dict, uniqueTokensList_in: list):

    predictedTokenList = []

    for token in tokenList:
        if token:
            if token in trainedData:
                predictedTokenList.append(token + " " + getLabel(token, uniqueTokensList_in, trainedData))
            else:
                predictedTokenList.append(token + " " + getLabel(token, uniqueTokensList_in, trainedData))
        else:
            predictedTokenList.append("")
    return predictedTokenList


def evalModel(predictedTokenFilePath: str, actualTokenFilePath: str):
    
    # Reading both files
    predictedTokenFile = processFile(readFile(predictedTokenFilePath))
    actualTokenFile = processFile(readFile(actualTokenFilePath))

    # print("\n=============Processed Files=============")
    # print(f"Predicted Token File: {predictedTokenFile}")
    # print(f"Actual Token File: {actualTokenFile}")
    # print("=============Processed Files=============\n")

    # Extracting the predicted and actual labels
    predictedLabels = [line.split()[1] for line in predictedTokenFile if line]
    actualLabels = [line.split()[1] for line in actualTokenFile if line]

    # print("\n=============Label Lists=============")
    # print(f"Predicted Labels: {predictedLabels}")
    # print(f"Actual Labels: {actualLabels}")
    # print("=============Label Lists=============\n")
    
    # Checking if the shape of the label lists are the same
    assert len(predictedLabels) == len(actualLabels) 
    
    # Precision
    correctPredictions = 0
    # Calculate metrics for each label
    totalPredicted = len(predictedLabels)
    totalActual = len(actualLabels)

    for i in range(totalPredicted):    
        if predictedLabels[i] == actualLabels[i]:
            correctPredictions += 1

    precision = round(correctPredictions/len(predictedLabels), 5) if (totalPredicted) != 0 else 0

    # Recall
    recall = round(correctPredictions/len(actualLabels), 5) if (totalActual) != 0 else 0

    # F1
    f1 = round(2 * ((precision * recall) / (precision + recall)), 5) if ((totalPredicted) != 0 ) and ((totalActual) != 0) else 0

    print("\n=============Evaluation Metrics=============")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)
    print("=============Evaluation Metrics=============\n\n")

    return precision, recall, f1
    
    

In [145]:
def predictAndWrite(
        trainFilePath: str, 
        labelDictIn: dict, 
        devInFilePath: str, 
        devOutFilePath: str,
        predOutputFilePath: str,
    ):
    # Processing the file to separate line by line
    trainData = processFile(readFile(filePath=trainFilePath))
    uniqueTokensList = getAllUniqueTokens(trainData)

    # Calculating the count of each token to the label
    tokenDict, labelDictOut = calcCountofEachWord(trainData, labelDictIn)

    # print(f"Token Dict is: \n{tokenDict}")
    # print(f"Label Dict is: \n{labelDictOut}")

    # Calculating the emission value for each unique token
    emissionsDict = calcEmission(tokenDict, labelDictOut, uniqueTokensList)

    # Reading test file
    testInData = processFile(readFile(devInFilePath))

    # Predicting test data
    predictedLabels = calcSentimentAnalysis(testInData, emissionsDict, uniqueTokensList)

    # Writing to file
    with open(predOutputFilePath, "w+", encoding="utf-8") as file:
        for line in predictedLabels:
            file.write(line + "\n")

    # Calculating Precision
    precision, recall, f1 = evalModel(predOutputFilePath, devOutFilePath)

    return precision, recall, f1


In [146]:
# Running the function
for language in languages:
    predictAndWrite(
        language[0],
        language[1],
        language[2],
        language[3],
        language[4]
    )


Precision:  0.6308
Recall:  0.6308
F1:  0.6308



Precision:  0.64228
Recall:  0.64228
F1:  0.64228




In [147]:
correctFile = processFile(readFile(os.path.join(folderPath, "correct_dev.p1.out")))
ourFile = processFile(readFile(os.path.join(folderPath, "../Data/ES/dev.p1.out")))

out = []

for i in range(len(correctFile)):
    if correctFile[i] != ourFile[i]:
        out.append( (correctFile[i], ourFile[i]))
print(out)
print(len(out))

[]
0


In [192]:
# Dict to count the number of START and END
endStatesDict = {
    'START': {'count': 0},
    'STOP': {'count': 0}
}

for label in labelList:
    endStatesDict[label] = {'count': 0, 'tokenList': []}

# Dict -> Keys = Label, Value = Dict of Label: Count 
# => Counts the number of Transitions from Label x -> y
transitionDict = {}
transitionDict['START'] = {}
for label in labelList: 
    transitionDict['START'][label] = 0
transitionDict['START']['STOP'] = 0
for label1 in labelList:
    transitionDict[label1] = {}
    for label2 in labelList:
        transitionDict[label1][label2] = 0
    transitionDict[label1]['STOP'] = 0

# Reading the file
file = processFile(readFile(EsTrain1FilePath))
prevLabel = "START"

# print(file)

for line in file:

    try:
        token, label = line.split(" ")
        # Blank => Start and End of sentence
        if prevLabel == "START":
            currLabel = label

            endStatesDict["START"]['count'] += 1
            transitionDict["START"][currLabel] += 1
            endStatesDict[currLabel]["count"] += 1
            endStatesDict[currLabel]["tokenList"].append(token)

            prevLabel = currLabel
        else:
            currLabel = label

            transitionDict[prevLabel][currLabel] += 1
            endStatesDict[currLabel]["count"] += 1
            endStatesDict[currLabel]["tokenList"].append(token)

            prevLabel = currLabel    
    except:
        # STOP
        currLabel = "STOP"

        endStatesDict["STOP"]["count"] += 1
        transitionDict[prevLabel]["STOP"] += 1

        # Preparing to transition from STOP to START
        prevLabel = "START"

endStatesDict["STOP"]["count"] += 1
print(endStatesDict)


calcTransitionsDict = {}

for state in transitionDict.keys():
    calcTransitionsDict[state] = {}
    for nextState in transitionDict[state].keys():
        try: calcTransitionsDict[state][nextState] = round(transitionDict[state][nextState] / endStatesDict[state]['count'], 5)
        except: calcTransitionsDict[state][nextState] = 0.0
 
calcTransitionsDict



{'START': {'count': 4}, 'STOP': {'count': 4}, 'O': {'count': 27, 'tokenList': ['Estuvimos', 'hace', 'poco', 'mi', 'pareja', 'y', 'yo', 'comiendo', 'y', 'resultó', 'la', 'tan', 'acogedora', 'da', 'una', 'sensación', 'de', 'bienestar', 'decoración', 'difícil', 'de', 'conseguir', 'en', 'otros', 'lugares', 'decoración', '.']}, 'B-positive': {'count': 3, 'tokenList': ['decoración', 'decoración', 'decoración']}, 'B-negative': {'count': 0, 'tokenList': []}, 'B-neutral': {'count': 0, 'tokenList': []}, 'I-positive': {'count': 0, 'tokenList': []}, 'I-negative': {'count': 0, 'tokenList': []}, 'I-neutral': {'count': 0, 'tokenList': []}}


{'START': {'O': 0.75,
  'B-positive': 0.25,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.0},
 'O': {'O': 0.81481,
  'B-positive': 0.07407,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.07407},
 'B-positive': {'O': 0.66667,
  'B-positive': 0.0,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.33333},
 'B-negative': {'O': 0.0,
  'B-positive': 0.0,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.0},
 'B-neutral': {'O': 0.0,
  'B-positive': 0.0,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.0},
 'I-positive': {'O': 0.0,
  'B-positive': 0.0,
  'B-negative': 0.0,
  'B-neutral': 0.0,
  'I-positive': 0.0,
  'I-negative': 0.0,
  'I-neutral': 0.0,
  'STOP': 0.0