<h1 style="text-align: center;"> Assignment-2 (Task-1) </h1>
<h4 style="text-align: center;">  Name: Shatansh Patnaik </h4>
<h4 style="text-align: center;"> Roll No: 20MA20067 </h4>

### Importing the essential libraries needed in the project.

In [1]:
from collections import defaultdict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import numpy as np
import csv

### Parsing function for parsing the Training as well as Test Data

In [2]:
def parseData(filePath):
    data = []
    listOfSentences = []
    sentIdList = []

    with open(filePath, 'r') as file:
        sentId = ""

        for line in file:
            line = line.strip()
            if line.startswith("# sent_id"):
                newLine = line.split("=")
                newSentence = newLine[-1].strip()
                sentId = newSentence
                sentIdList.append(sentId)

            elif line.startswith("# text"):
                newLine = line.split("=")
                newSentence = newLine[-1].strip()
                listOfSentences.append(newSentence)


            elif line and not line.startswith("#"):
                divisions = line.split()
                if len(divisions) >= 5:
                    data.append([sentId, divisions[0], divisions[1], divisions[3]])


    return data, listOfSentences, sentIdList

### Functions for calculation the word given tag and tag given previous tag probabilities

In [3]:
def calculateTagProbs(data):
    tagsAfterTagsCounts = defaultdict(lambda : defaultdict(int))
    tagCounts = defaultdict(int)
    tagProbs = defaultdict(lambda : defaultdict(float))

    for i in range(len(data)-1):
        currentTag = data[i][3]
        nextTag = data[i+1][3]
        tagsAfterTagsCounts[currentTag][nextTag] += 1
        tagCounts[currentTag] += 1

    for current, nextDict in tagsAfterTagsCounts.items():
        for nextT, count in nextDict.items():
            tagProbs[current][nextT] = count / tagCounts[current]

    return tagProbs

def calculateWordTagProbs(data):
    vocabularySet = set([])
    wordTagCounts = defaultdict(lambda: defaultdict(int))
    wordTagProbs = defaultdict(lambda: defaultdict(float))
    tagCounts = defaultdict(int)
    wordCounts = defaultdict(int)

    for i in range(len(data)):
        tag = data[i][3]
        word = data[i][2]
        wordTagCounts[tag][word] += 1
        tagCounts[tag]+=1
        vocabularySet.add(word)

    tagCnts = len(tagCounts)

    for tag, words in wordTagCounts.items():
        for word, count in words.items():
            wordTagProbs[tag][word] = (count) / (tagCounts[tag])

    return wordTagProbs, tagCnts, vocabularySet, wordTagCounts, tagCounts

### The Viterbi Algorithm which takes in a sentence as an input  would return an array of predicted POS Tags and the total probability

In [4]:
def viterbi(sentence, tagProbs, wordTagProbs, tagCnts):
    v = [{1: "a"}]
    pathFollowed = {}

    # Base Case:
    for tag in wordTagProbs.keys():
        v[0][tag] = (1/tagCnts)*wordTagProbs[tag][sentence[0]]
        pathFollowed[tag] =[tag]

    # Recursive case
    for t in range(1, len(sentence)):
        v.append({})
        newpath = {}

        for tag in wordTagProbs.keys():
            (prob, state) = max((v[t-1][prevTag] * tagProbs[prevTag][tag] * wordTagProbs[tag][sentence[t]], prevTag)
                                for prevTag in wordTagProbs.keys())
            v[t][tag] = prob
            newpath[tag] = pathFollowed[state] + [tag]

        pathFollowed = newpath


    n = len(sentence) - 1
    (prob, state) = max((v[n][tag], tag) for tag in wordTagProbs.keys())

    return [prob, pathFollowed[state]]

### Functions for getting the corresponding tags, and their corresponding sentences, performing Smoothing and getting a Smoothing Count

In [5]:
def getCorrespondingTags(data):
    dictData = {}
    for smallArr in data:
        key = smallArr[0]
        value = smallArr[3]
        if key in dictData:
            dictData[key].append(value)
        else:
            dictData[key] = [value]

    result = []
    for key, values in dictData.items():
        result.append(values)

    return result

def getCorrespondingSentence(data):
    dictData = {}
    for smallArr in data:
        key = smallArr[0]
        value = smallArr[2]
        if key in dictData:
            dictData[key].append(value)
        else:
            dictData[key] = [value]

    result = []
    for key, values in dictData.items():
        result.append(values)

    return result

def getSmoothingCount(data):
    smoothingCount = 0
    for i in range(len(data)):
        if wordTagProbs[data[i][3]][data[i][2]] == 0:
            smoothingCount+=1
    return smoothingCount

def smoothenTransitionProbability(data, tagProbs, wordTagProbs, countTags, countWords, vocabularySet):
    v = len(vocabularySet)
    for i in range(len(data)):
        wordTagProbs[data[i][3]][data[i][2]] = (countWords[data[i][3]][data[i][2]] + 1) / (v + countTags[data[i][3]])

    for tag, words in wordTagProbs.items():
        for word, prob in words.items():
            wordTagProbs[tag][word] = (countWords[tag][word] + 1) / (v + countTags[tag])

    return wordTagProbs

### Calculation of Accuracy, Precision, Recall and F1 Score of the Model

In [6]:
def calculateEvaluationMetrics(sentences, tagProbs, wordTagProbs, getTags, tagCnts):
    results = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        result = viterbi(sentence, tagProbs, wordTagProbs, tagCnts)[1]
        results.append(result)

    xFlat = [item for sublist in getTags for item in sublist]
    yFlat = [item for sublist in results for item in sublist]

    accuracy = accuracy_score(xFlat, yFlat)
    precision = precision_score(xFlat, yFlat, average='weighted', zero_division=1)
    recall = recall_score(xFlat, yFlat, average='weighted', zero_division=1)
    f1 = f1_score(xFlat, yFlat, average='weighted')

    return accuracy, precision, recall, f1, results

### Training Data Metrics Evaluation

In [7]:
rawTrainingData, trainSentences, sentIdList = parseData("./NLP2/train.txt")
trainingData = [sublist for sublist in rawTrainingData if sublist[1].isdigit()]

tagProbs = calculateTagProbs(trainingData)
wordTagProbs, tagCnts, vocabularySet, wordTagCounts, tagCounts = calculateWordTagProbs(trainingData)

getTags = getCorrespondingTags(trainingData)
smoothingCount = getSmoothingCount(trainingData)
trainingSentences = getCorrespondingSentence(trainingData)
# wordTagProbs = smoothenTransitionProbability(trainingData, tagProbs, wordTagProbs, tagCounts, wordTagCounts)

trainingAccuracy, trainingPrecision, trainingRecall, trainingF1, trainingResults = calculateEvaluationMetrics(trainingSentences, tagProbs, wordTagProbs, getTags, tagCnts)

print(f"The Smoothing Count for the training data is {smoothingCount}")
print(f"The training accuracy is {trainingAccuracy*100} %")
print(f"The training precision is {trainingPrecision*100} %")
print(f"The training recall is {trainingRecall*100} %")
print(f"The training F1 Score is {trainingF1*100} %")

The Smoothing Count for the training data is 0
The training accuracy is 98.1268448784794 %
The training precision is 98.15698556577934 %
The training recall is 98.1268448784794 %
The training F1 Score is 98.13385851813972 %


### Testing Data Metrics Evaluation

In [8]:
rawTestData, testSentences, listOfTestSentIds = parseData("./NLP2/test.txt")
testData = [sublist for sublist in rawTestData if sublist[1].isdigit()]

getTags = getCorrespondingTags(testData)
smoothingCount = getSmoothingCount(testData)
testSentences = getCorrespondingSentence(testData)
wordTagProbs = smoothenTransitionProbability(testData, tagProbs, wordTagProbs, tagCounts, wordTagCounts, vocabularySet)

testAccuracy, testPrecision, testRecall, testF1, testResults = calculateEvaluationMetrics(testSentences, tagProbs, wordTagProbs, getTags, tagCnts)

print(f"The Smoothing Count for the test data is {smoothingCount}")
print(f"The training accuracy is {testAccuracy*100} %")
print(f"The training precision is {testPrecision*100} %")
print(f"The training recall is {testRecall*100} %")
print(f"The training F1 Score is {testF1*100} %")

The Smoothing Count for the test data is 509
The training accuracy is 86.91460055096418 %
The training precision is 87.99260138411482 %
The training recall is 86.91460055096418 %
The training F1 Score is 86.4908720924295 %


In [9]:
print(len(trainingResults))

2000


In [10]:
print(len(testResults))

100


In [11]:
for i in range(len(trainingResults)):
    for j in range(len(trainingResults[i])):
        print(f"{sentIdList[i]}\t{trainingData[j][1]}\t{trainingSentences[i][j]}\t{trainingResults[i][j]}")

GUM_academic_art-1	1	Aesthetic	JJ
GUM_academic_art-1	2	Appreciation	NN
GUM_academic_art-1	3	and	CC
GUM_academic_art-1	4	Spanish	NNP
GUM_academic_art-1	5	Art	NNP
GUM_academic_art-1	6	:	:
GUM_academic_art-2	1	Insights	NNS
GUM_academic_art-2	2	from	IN
GUM_academic_art-2	3	Eye	NN
GUM_academic_art-2	4	-	HYPH
GUM_academic_art-2	5	Tracking	NN
GUM_academic_art-3	1	Claire	NNP
GUM_academic_art-3	2	Bailey	NNP
GUM_academic_art-3	3	-	HYPH
GUM_academic_art-3	4	Ross	NNP
GUM_academic_art-3	5	claire.bailey-ross@port.ac.uk	NNP
GUM_academic_art-3	6	University	NNP
GUM_academic_art-3	1	of	IN
GUM_academic_art-3	2	Portsmouth	NNP
GUM_academic_art-3	3	,	,
GUM_academic_art-3	4	United	NNP
GUM_academic_art-3	5	Kingdom	NNP
GUM_academic_art-4	1	Andrew	NNP
GUM_academic_art-4	2	Beresford	NNP
GUM_academic_art-4	3	a.m.beresford@durham.ac.uk	NNP
GUM_academic_art-4	4	Durham	NNP
GUM_academic_art-4	5	University	NNP
GUM_academic_art-4	6	,	,
GUM_academic_art-4	1	United	NNP
GUM_academic_art-4	2	Kingdom	NNP
GUM_academic_art-5	

GUM_academic_huh-25	1	Extracts	NNPS
GUM_academic_huh-25	2	2	CD
GUM_academic_huh-25	3	and	CC
GUM_academic_huh-25	4	3	CD
GUM_academic_huh-25	5	show	VBP
GUM_academic_huh-25	6	structurally	RB
GUM_academic_huh-25	1	identical	JJ
GUM_academic_huh-25	2	sequences	NNS
GUM_academic_huh-25	3	in	IN
GUM_academic_huh-25	4	two	CD
GUM_academic_huh-25	5	other	JJ
GUM_academic_huh-25	1	languages	NNS
GUM_academic_huh-25	2	:	:
GUM_academic_huh-25	3	Siwu	NNP
GUM_academic_huh-25	4	,	,
GUM_academic_huh-25	5	a	DT
GUM_academic_huh-25	6	Kwa	NNP
GUM_academic_huh-25	7	language	NN
GUM_academic_huh-25	8	spoken	VBN
GUM_academic_huh-25	9	in	IN
GUM_academic_huh-25	10	Ghana	NNP
GUM_academic_huh-25	11	,	,
GUM_academic_huh-25	1	and	CC
GUM_academic_huh-25	2	Lao	NNP
GUM_academic_huh-25	3	,	,
GUM_academic_huh-25	4	a	DT
GUM_academic_huh-25	5	Tai	NNP
GUM_academic_huh-25	6	-	HYPH
GUM_academic_huh-25	7	Kadai	NNP
GUM_academic_huh-25	8	language	NN
GUM_academic_huh-25	1	spoken	VBN
GUM_academic_huh-25	2	in	IN
GUM_academic_huh-25	3	La

GUM_academic_implicature-10	5	.	.
GUM_academic_implicature-11	1	Future	JJ
GUM_academic_implicature-11	2	studies	NNS
GUM_academic_implicature-11	3	need	VBP
GUM_academic_implicature-11	4	to	TO
GUM_academic_implicature-11	5	include	VB
GUM_academic_implicature-11	6	more	JJR
GUM_academic_implicature-11	1	participants	NNS
GUM_academic_implicature-11	2	and	CC
GUM_academic_implicature-11	3	more	JJR
GUM_academic_implicature-11	4	items	NNS
GUM_academic_implicature-11	5	to	TO
GUM_academic_implicature-11	1	increase	VB
GUM_academic_implicature-11	2	power	NN
GUM_academic_implicature-11	3	.	.
GUM_academic_implicature-12	1	Future	JJ
GUM_academic_implicature-12	2	studies	NNS
GUM_academic_implicature-12	3	should	MD
GUM_academic_implicature-12	4	also	RB
GUM_academic_implicature-12	5	include	VB
GUM_academic_implicature-12	6	a	DT
GUM_academic_implicature-12	1	larger	JJR
GUM_academic_implicature-12	2	age	NN
GUM_academic_implicature-12	3	range	NN
GUM_academic_implicature-12	4	of	IN
GUM_academic_implicature-1

GUM_academic_thrones-14	7	(	-LRB-
GUM_academic_thrones-14	8	Benioff	NNP
GUM_academic_thrones-14	1	&	CC
GUM_academic_thrones-14	2	Weiss	NNP
GUM_academic_thrones-14	3	,	,
GUM_academic_thrones-14	4	2013	CD
GUM_academic_thrones-14	5	)	-RRB-
GUM_academic_thrones-14	6	.	.
GUM_academic_thrones-15	1	Readers	NNS
GUM_academic_thrones-15	2	of	IN
GUM_academic_thrones-15	3	the	DT
GUM_academic_thrones-15	4	book	NN
GUM_academic_thrones-15	5	series	NN
GUM_academic_thrones-15	6	had	VBD
GUM_academic_thrones-15	1	long	RB
GUM_academic_thrones-15	2	anticipated	VBN
GUM_academic_thrones-15	3	and	CC
GUM_academic_thrones-15	4	dreaded	VBN
GUM_academic_thrones-15	5	the	DT
GUM_academic_thrones-15	1	events	NNS
GUM_academic_thrones-15	2	of	IN
GUM_academic_thrones-15	3	the	DT
GUM_academic_thrones-15	4	“	``
GUM_academic_thrones-15	5	Red	NNP
GUM_academic_thrones-15	6	Wedding	NNP
GUM_academic_thrones-15	7	”	''
GUM_academic_thrones-15	8	,	,
GUM_academic_thrones-15	9	while	IN
GUM_academic_thrones-15	10	fans	NNS
GUM_acade

GUM_academic_thrones-28	3	,	,
GUM_academic_thrones-28	4	in	IN
GUM_academic_thrones-28	5	order	NN
GUM_academic_thrones-28	6	to	TO
GUM_academic_thrones-28	7	extend	VB
GUM_academic_thrones-28	8	its	PRP$
GUM_academic_thrones-28	1	application	NN
GUM_academic_thrones-28	2	to	IN
GUM_academic_thrones-28	3	the	DT
GUM_academic_thrones-28	4	broader	JJR
GUM_academic_thrones-28	5	context	NN
GUM_academic_thrones-28	6	of	IN
GUM_academic_thrones-28	7	modern	JJ
GUM_academic_thrones-28	8	-	HYPH
GUM_academic_thrones-28	1	day	NN
GUM_academic_thrones-28	2	consumers	NNS
GUM_academic_thrones-28	3	.	.
GUM_academic_thrones-29	1	The	DT
GUM_academic_thrones-29	2	current	JJ
GUM_academic_thrones-29	3	study	NN
GUM_academic_thrones-29	4	will	MD
GUM_academic_thrones-29	5	contribute	VB
GUM_academic_thrones-29	6	to	IN
GUM_academic_thrones-29	1	the	DT
GUM_academic_thrones-29	2	development	NN
GUM_academic_thrones-29	3	of	IN
GUM_academic_thrones-29	4	further	RBR
GUM_academic_thrones-29	5	qualitative	JJ
GUM_academic_throne

GUM_bio_galois-44	5	cenotaph	NN
GUM_bio_galois-44	6	in	IN
GUM_bio_galois-44	7	his	PRP$
GUM_bio_galois-44	8	honour	NN
GUM_bio_galois-44	9	was	VBD
GUM_bio_galois-44	10	erected	VBN
GUM_bio_galois-44	11	beside	IN
GUM_bio_galois-44	1	the	DT
GUM_bio_galois-44	2	graves	NNS
GUM_bio_galois-44	3	of	IN
GUM_bio_galois-44	4	his	PRP$
GUM_bio_galois-44	5	relatives	NNS
GUM_bio_galois-44	6	.	.
GUM_bio_galois-44	7	[	-LRB-
GUM_bio_galois-44	8	24	CD
GUM_bio_galois-44	1	]	-RRB-
GUM_bio_goode-1	1	Matthew	NNP
GUM_bio_goode-1	2	Goode	NNP
GUM_bio_goode-2	1	Matthew	NNP
GUM_bio_goode-2	2	William	NNP
GUM_bio_goode-2	3	Goode	NNP
GUM_bio_goode-2	4	(	-LRB-
GUM_bio_goode-2	5	born	VBN
GUM_bio_goode-2	6	3	CD
GUM_bio_goode-2	1	April	NNP
GUM_bio_goode-2	2	1978	CD
GUM_bio_goode-2	3	)	-RRB-
GUM_bio_goode-2	4	is	VBZ
GUM_bio_goode-2	5	an	DT
GUM_bio_goode-2	1	English	JJ
GUM_bio_goode-2	2	character	NN
GUM_bio_goode-2	3	actor	NN
GUM_bio_goode-2	4	.	.
GUM_bio_goode-3	1	He	PRP
GUM_bio_goode-3	2	made	VBD
GUM_bio_goode-3	3	his	PRP$

GUM_bio_goode-11	2	the	DT
GUM_bio_goode-11	3	critically	RB
GUM_bio_goode-11	4	acclaimed	VBN
GUM_bio_goode-11	5	British	JJ
GUM_bio_goode-11	1	mini-serial	NN
GUM_bio_goode-11	2	Dancing	NNP
GUM_bio_goode-11	3	on	IN
GUM_bio_goode-11	4	the	DT
GUM_bio_goode-11	5	Edge	NNP
GUM_bio_goode-11	6	,	,
GUM_bio_goode-11	7	as	IN
GUM_bio_goode-11	8	music	NN
GUM_bio_goode-11	9	journalist	NN
GUM_bio_goode-11	10	Stanley	NNP
GUM_bio_goode-11	11	Mitchell	NNP
GUM_bio_goode-11	1	,	,
GUM_bio_goode-11	2	for	IN
GUM_bio_goode-11	3	which	WDT
GUM_bio_goode-11	4	he	PRP
GUM_bio_goode-11	5	earned	VBD
GUM_bio_goode-11	6	a	DT
GUM_bio_goode-11	7	nomination	NN
GUM_bio_goode-11	8	for	IN
GUM_bio_goode-11	1	Best	JJS
GUM_bio_goode-11	2	Actor	NN
GUM_bio_goode-11	3	in	IN
GUM_bio_goode-11	4	a	DT
GUM_bio_goode-11	5	Miniseries	NN
GUM_bio_goode-11	6	or	CC
GUM_bio_goode-11	7	a	DT
GUM_bio_goode-11	8	Television	NNP
GUM_bio_goode-11	1	Film	NNP
GUM_bio_goode-11	2	at	IN
GUM_bio_goode-11	3	the	DT
GUM_bio_goode-11	4	Satellite	NNP
GUM_bio_go

GUM_bio_moreau-32	1	her	PRP$
GUM_bio_moreau-32	2	20s	NNS
GUM_bio_moreau-32	3	,	,
GUM_bio_moreau-32	4	was	VBD
GUM_bio_moreau-32	5	already	RB
GUM_bio_moreau-32	6	one	CD
GUM_bio_moreau-32	7	of	IN
GUM_bio_moreau-32	8	leading	VBG
GUM_bio_moreau-32	1	actresses	NNS
GUM_bio_moreau-32	2	in	IN
GUM_bio_moreau-32	3	the	DT
GUM_bio_moreau-32	4	theatre	NN
GUM_bio_moreau-32	5	's	POS
GUM_bio_moreau-32	6	troupe	NN
GUM_bio_moreau-32	7	.	.
GUM_bio_moreau-32	8	[	-LRB-
GUM_bio_moreau-32	1	2	CD
GUM_bio_moreau-32	2	]	-RRB-
GUM_bio_moreau-33	1	After	IN
GUM_bio_moreau-33	2	1949	CD
GUM_bio_moreau-33	3	,	,
GUM_bio_moreau-33	4	she	PRP
GUM_bio_moreau-33	5	began	VBD
GUM_bio_moreau-33	6	appearing	VBG
GUM_bio_moreau-33	1	in	IN
GUM_bio_moreau-33	2	films	NNS
GUM_bio_moreau-33	3	with	IN
GUM_bio_moreau-33	4	small	JJ
GUM_bio_moreau-33	5	parts	NNS
GUM_bio_moreau-33	1	but	CC
GUM_bio_moreau-33	2	continued	VBD
GUM_bio_moreau-33	3	primarily	RB
GUM_bio_moreau-33	4	active	JJ
GUM_bio_moreau-33	5	in	IN
GUM_bio_moreau-33	6	the	DT
GU

GUM_bio_nida-27	2	,	,
GUM_bio_nida-27	3	though	IN
GUM_bio_nida-27	4	it	PRP
GUM_bio_nida-27	5	is	VBZ
GUM_bio_nida-27	6	the	DT
GUM_bio_nida-27	7	most	RBS
GUM_bio_nida-27	8	well	RB
GUM_bio_nida-27	9	-	HYPH
GUM_bio_nida-27	10	known	VBN
GUM_bio_nida-27	11	.	.
GUM_bio_nida-28	1	Nida	NNP
GUM_bio_nida-28	2	's	POS
GUM_bio_nida-28	3	dynamic	JJ
GUM_bio_nida-28	4	-	HYPH
GUM_bio_nida-28	5	equivalence	NN
GUM_bio_nida-28	6	theory	NN
GUM_bio_nida-28	1	is	VBZ
GUM_bio_nida-28	2	often	RB
GUM_bio_nida-28	3	held	VBN
GUM_bio_nida-28	4	in	IN
GUM_bio_nida-28	5	opposition	NN
GUM_bio_nida-28	1	to	IN
GUM_bio_nida-28	2	the	DT
GUM_bio_nida-28	3	views	NNS
GUM_bio_nida-28	4	of	IN
GUM_bio_nida-28	5	philologists	NNS
GUM_bio_nida-28	6	who	WP
GUM_bio_nida-28	7	maintain	VBP
GUM_bio_nida-28	8	that	IN
GUM_bio_nida-28	9	an	DT
GUM_bio_nida-28	10	understanding	NN
GUM_bio_nida-28	11	of	IN
GUM_bio_nida-28	1	the	DT
GUM_bio_nida-28	2	source	NN
GUM_bio_nida-28	3	text	NN
GUM_bio_nida-28	4	(	-LRB-
GUM_bio_nida-28	5	ST	NNP
GUM_bio_ni

GUM_conversation_christmas-166	3	take	VB
GUM_conversation_christmas-166	4	it	PRP
GUM_conversation_christmas-166	5	.	.
GUM_conversation_christmas-167	1	No	UH
GUM_conversation_christmas-167	2	,	,
GUM_conversation_christmas-167	3	it	PRP
GUM_conversation_christmas-167	4	's	VBZ
GUM_conversation_christmas-167	5	not	RB
GUM_conversation_christmas-167	6	too	RB
GUM_conversation_christmas-167	1	small	JJ
GUM_conversation_christmas-167	2	.	.
GUM_conversation_christmas-168	1	It	PRP
GUM_conversation_christmas-168	2	's	VBZ
GUM_conversation_christmas-168	3	perfect	JJ
GUM_conversation_christmas-168	4	.	.
GUM_conversation_christmas-169	1	Is	VBZ
GUM_conversation_christmas-169	2	it	PRP
GUM_conversation_christmas-169	3	too	RB
GUM_conversation_christmas-169	4	big	JJ
GUM_conversation_christmas-169	5	?	.
GUM_conversation_christmas-170	1	Hm-m	UH
GUM_conversation_christmas-170	2	.	.
GUM_conversation_christmas-171	1	Get	VB
GUM_conversation_christmas-171	2	you	PRP
GUM_conversation_christmas-171	3	some	DT
GUM_conve

GUM_conversation_court-41	5	writing	VBG
GUM_conversation_court-41	6	,	,
GUM_conversation_court-41	1	or	CC
GUM_conversation_court-41	2	verbal	JJ
GUM_conversation_court-41	3	?	.
GUM_conversation_court-42	1	Completely	RB
GUM_conversation_court-42	2	verbal	JJ
GUM_conversation_court-42	3	.	.
GUM_conversation_court-43	1	I	PRP
GUM_conversation_court-43	2	've	VBP
GUM_conversation_court-43	3	been	VBN
GUM_conversation_court-43	4	working	VBG
GUM_conversation_court-43	5	with	IN
GUM_conversation_court-43	6	him	PRP
GUM_conversation_court-43	1	for	IN
GUM_conversation_court-43	2	like	IN
GUM_conversation_court-43	3	six	CD
GUM_conversation_court-43	4	years	NNS
GUM_conversation_court-43	5	.	.
GUM_conversation_court-44	1	Uh	UH
GUM_conversation_court-44	2	,	,
GUM_conversation_court-44	3	upon	IN
GUM_conversation_court-44	4	a	DT
GUM_conversation_court-44	5	reve-	UH
GUM_conversation_court-44	6	uh	UH
GUM_conversation_court-44	1	—	:
GUM_conversation_court-44	2	uh	UH
GUM_conversation_court-44	3	receiving	VBG
GUM

In [12]:
for i in range(len(testResults)):
    for j in range(len(testResults[i])):
        print(f"{listOfTestSentIds[i]}\t{testData[j][1]}\t{testSentences[i][j]}\t{testResults[i][j]}")

GUM_academic_discrimination-1	1	The	DT
GUM_academic_discrimination-1	2	prevalence	NN
GUM_academic_discrimination-1	3	of	IN
GUM_academic_discrimination-1	4	discrimination	NN
GUM_academic_discrimination-1	5	across	IN
GUM_academic_discrimination-1	6	racial	JJ
GUM_academic_discrimination-1	7	groups	NNS
GUM_academic_discrimination-1	8	in	IN
GUM_academic_discrimination-1	9	contemporary	JJ
GUM_academic_discrimination-1	10	America	NNS
GUM_academic_discrimination-1	11	:	:
GUM_academic_discrimination-2	1	Results	NNS
GUM_academic_discrimination-2	2	from	IN
GUM_academic_discrimination-2	3	a	DT
GUM_academic_discrimination-2	4	nationally	RB
GUM_academic_discrimination-2	5	representative	JJ
GUM_academic_discrimination-2	6	sample	NN
GUM_academic_discrimination-2	7	of	IN
GUM_academic_discrimination-2	8	adults	NNS
GUM_academic_discrimination-3	1	Introduction	NN
GUM_academic_discrimination-3	2	.	.
GUM_academic_discrimination-4	1	Personal	JJ
GUM_academic_discrimination-4	2	experiences	NNS
GUM_academic_dis

GUM_academic_eegimaa-18	1	’	NNP
GUM_academic_eegimaa-18	2	,	,
GUM_academic_eegimaa-18	3	is	VBZ
GUM_academic_eegimaa-18	4	the	DT
GUM_academic_eegimaa-18	5	name	NN
GUM_academic_eegimaa-18	6	given	VBN
GUM_academic_eegimaa-18	7	by	IN
GUM_academic_eegimaa-18	8	Jóola	NNP
GUM_academic_eegimaa-18	1	Kaasa	NNP
GUM_academic_eegimaa-18	2	speakers	NNS
GUM_academic_eegimaa-18	1	who	WP
GUM_academic_eegimaa-18	2	are	VBP
GUM_academic_eegimaa-18	3	more	DT
GUM_academic_eegimaa-18	4	familiar	NN
GUM_academic_eegimaa-18	5	with	IN
GUM_academic_eegimaa-18	6	the	DT
GUM_academic_eegimaa-18	7	village	NN
GUM_academic_eegimaa-18	8	of	IN
GUM_academic_eegimaa-18	9	Seleki	NNP
GUM_academic_eegimaa-18	10	/	NNP
GUM_academic_eegimaa-18	11	Sállagi	NNP
GUM_academic_eegimaa-18	12	’	NNP
GUM_academic_eegimaa-18	13	than	IN
GUM_academic_eegimaa-18	14	all	DT
GUM_academic_eegimaa-18	15	other	JJ
GUM_academic_eegimaa-18	16	villages	NNS
GUM_academic_eegimaa-18	17	of	IN
GUM_academic_eegimaa-18	18	Mof-Ávvi	NNP
GUM_academic_eegimaa-18	

## Generating a TSV File according to the format specified in the Problem Statement

In [13]:
output_file = "viterbi_predictions_train.tsv"
with open(output_file, "w", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["Sent_ID", "Data", "Sentence", "Result"])
    for i in range(len(trainingResults)):
        for j in range(len(trainingResults[i])):
            writer.writerow([sentIdList[i], trainingData[j][1], trainingSentences[i][j], trainingResults[i][j]])

In [14]:
output_file = "viterbi_predictions_test.tsv"
with open(output_file, "w", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["Sent_ID", "Data", "Sentence", "Result"])
    for i in range(len(testResults)):
        for j in range(len(testResults[i])):
            writer.writerow([listOfTestSentIds[i], testData[j][1], testSentences[i][j], testResults[i][j]])