<h1 style="text-align: center;"> Assignment-2 (Task-1) </h1>
<h4 style="text-align: center;">  Name: Shatansh Patnaik </h4>
<h4 style="text-align: center;"> Roll No: 20MA20067 </h4>

### Importing the essential libraries needed in the project.

In [13]:
from collections import defaultdict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import numpy as np
import csv

### Parsing function for parsing the Training as well as Test Data

In [14]:
def parseData(filePath):
    data = []
    listOfSentences = []
    sentIdList = []

    with open(filePath, 'r') as file:
        sentId = ""

        for line in file:
            line = line.strip()
            if line.startswith("# sent_id"):
                newLine = line.split("=")
                newSentence = newLine[-1].strip()
                sentId = newSentence
                sentIdList.append(sentId)

            elif line.startswith("# text"):
                newLine = line.split("=")
                newSentence = newLine[-1].strip()
                listOfSentences.append(newSentence)


            elif line and not line.startswith("#"):
                divisions = line.split()
                if len(divisions) >= 5:
                    data.append([sentId, divisions[0], divisions[1], divisions[3]])


    return data, listOfSentences, sentIdList

### Functions for calculation the word given tag and tag given previous tag probabilities

In [15]:
def calculateTagProbs(data):
    tagsAfterTagsCounts = defaultdict(lambda : defaultdict(int))
    tagCounts = defaultdict(int)
    tagProbs = defaultdict(lambda : defaultdict(float))

    for i in range(len(data)-1):
        currentTag = data[i][3]
        nextTag = data[i+1][3]
        tagsAfterTagsCounts[currentTag][nextTag] += 1
        tagCounts[currentTag] += 1

    for current, nextDict in tagsAfterTagsCounts.items():
        for nextT, count in nextDict.items():
            tagProbs[current][nextT] = count / tagCounts[current]

    return tagProbs

def calculateWordTagProbs(data):
    vocabularySet = set([])
    wordTagCounts = defaultdict(lambda: defaultdict(int))
    wordTagProbs = defaultdict(lambda: defaultdict(float))
    tagCounts = defaultdict(int)
    wordCounts = defaultdict(int)

    for i in range(len(data)):
        tag = data[i][3]
        word = data[i][2]
        wordTagCounts[tag][word] += 1
        tagCounts[tag]+=1
        vocabularySet.add(word)

    tagCnts = len(tagCounts)

    for tag, words in wordTagCounts.items():
        for word, count in words.items():
            wordTagProbs[tag][word] = (count) / (tagCounts[tag])

    return wordTagProbs, tagCnts, vocabularySet, wordTagCounts, tagCounts

### The Viterbi Algorithm which takes in a sentence as an input  would return an array of predicted POS Tags and the total probability

In [16]:
def viterbi(sentence, tagProbs, wordTagProbs, tagCnts):
    v = [{1: "a"}]
    pathFollowed = {}

    # Base Case:
    for tag in wordTagProbs.keys():
        v[0][tag] = (1/tagCnts)*wordTagProbs[tag][sentence[0]]
        pathFollowed[tag] =[tag]

    # Recursive case
    for t in range(1, len(sentence)):
        v.append({})
        newpath = {}

        for tag in wordTagProbs.keys():
            (prob, state) = max((v[t-1][prevTag] * tagProbs[prevTag][tag] * wordTagProbs[tag][sentence[t]], prevTag)
                                for prevTag in wordTagProbs.keys())
            v[t][tag] = prob
            newpath[tag] = pathFollowed[state] + [tag]

        pathFollowed = newpath


    n = len(sentence) - 1
    (prob, state) = max((v[n][tag], tag) for tag in wordTagProbs.keys())

    return [prob, pathFollowed[state]]

### Functions for getting the corresponding tags, and their corresponding sentences, performing Smoothing and getting a Smoothing Count

In [17]:
def getCorrespondingTags(data):
    dictData = {}
    for smallArr in data:
        key = smallArr[0]
        value = smallArr[3]
        if key in dictData:
            dictData[key].append(value)
        else:
            dictData[key] = [value]

    result = []
    for key, values in dictData.items():
        result.append(values)

    return result

def getCorrespondingSentence(data):
    dictData = {}
    for smallArr in data:
        key = smallArr[0]
        value = smallArr[2]
        if key in dictData:
            dictData[key].append(value)
        else:
            dictData[key] = [value]

    result = []
    for key, values in dictData.items():
        result.append(values)

    return result

def getSmoothingCount(data):
    smoothingCount = 0
    for i in range(len(data)):
        if wordTagProbs[data[i][3]][data[i][2]] == 0:
            smoothingCount+=1
    return smoothingCount

def smoothenTransitionProbability(data, tagProbs, wordTagProbs, countTags, countWords, vocabularySet):
    v = len(vocabularySet)
    for i in range(len(data)):
        wordTagProbs[data[i][3]][data[i][2]] = (countWords[data[i][3]][data[i][2]] + 1) / (v + countTags[data[i][3]])

    for tag, words in wordTagProbs.items():
        for word, prob in words.items():
            wordTagProbs[tag][word] = (countWords[tag][word] + 1) / (v + countTags[tag])

    return wordTagProbs

### Calculation of Accuracy, Precision, Recall and F1 Score of the Model

In [18]:
def calculateEvaluationMetrics(sentences, tagProbs, wordTagProbs, getTags, tagCnts):
    results = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        result = viterbi(sentence, tagProbs, wordTagProbs, tagCnts)[1]
        results.append(result)

    xFlat = [item for sublist in getTags for item in sublist]
    yFlat = [item for sublist in results for item in sublist]

    accuracy = accuracy_score(xFlat, yFlat)
    precision = precision_score(xFlat, yFlat, average='weighted', zero_division=1)
    recall = recall_score(xFlat, yFlat, average='weighted', zero_division=1)
    f1 = f1_score(xFlat, yFlat, average='weighted')

    return accuracy, precision, recall, f1, results

### Training Data Metrics Evaluation

In [19]:
rawTrainingData, trainSentences, sentIdList = parseData("./NLP2/train.txt")
trainingData = [sublist for sublist in rawTrainingData if sublist[1].isdigit()]

tagProbs = calculateTagProbs(trainingData)
wordTagProbs, tagCnts, vocabularySet, wordTagCounts, tagCounts = calculateWordTagProbs(trainingData)

getTags = getCorrespondingTags(trainingData)
smoothingCount = getSmoothingCount(trainingData)
trainingSentences = getCorrespondingSentence(trainingData)
# wordTagProbs = smoothenTransitionProbability(trainingData, tagProbs, wordTagProbs, tagCounts, wordTagCounts)

trainingAccuracy, trainingPrecision, trainingRecall, trainingF1, trainingResults = calculateEvaluationMetrics(trainingSentences, tagProbs, wordTagProbs, getTags, tagCnts)

print(f"The Smoothing Count for the training data is {smoothingCount}")
print(f"The training accuracy is {trainingAccuracy*100} %")
print(f"The training precision is {trainingPrecision*100} %")
print(f"The training recall is {trainingRecall*100} %")
print(f"The training F1 Score is {trainingF1*100} %")

The Smoothing Count for the training data is 0
The training accuracy is 98.1268448784794 %
The training precision is 98.15698556577934 %
The training recall is 98.1268448784794 %
The training F1 Score is 98.13385851813972 %


### Testing Data Metrics Evaluation

In [20]:
rawTestData, testSentences, listOfTestSentIds = parseData("./NLP2/test.txt")
testData = [sublist for sublist in rawTestData if sublist[1].isdigit()]

getTags = getCorrespondingTags(testData)
smoothingCount = getSmoothingCount(testData)
testSentences = getCorrespondingSentence(testData)
wordTagProbs = smoothenTransitionProbability(testData, tagProbs, wordTagProbs, tagCounts, wordTagCounts, vocabularySet)

testAccuracy, testPrecision, testRecall, testF1, testResults = calculateEvaluationMetrics(testSentences, tagProbs, wordTagProbs, getTags, tagCnts)

print(f"The Smoothing Count for the test data is {smoothingCount}")
print(f"The training accuracy is {testAccuracy*100} %")
print(f"The training precision is {testPrecision*100} %")
print(f"The training recall is {testRecall*100} %")
print(f"The training F1 Score is {testF1*100} %")

The Smoothing Count for the test data is 509
The training accuracy is 86.91460055096418 %
The training precision is 87.99260138411482 %
The training recall is 86.91460055096418 %
The training F1 Score is 86.4908720924295 %


In [10]:
print(len(trainingResults))

2000


In [11]:
print(len(testResults))

100


In [12]:
for i in range(len(trainingResults)):
    for j in range(len(trainingResults[i])):
        print(f"{sentIdList[i]}\t{trainingData[j][1]}\t{trainingSentences[i][j]}\t{trainingResults[i][j]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
GUM_conversation_christmas-64	3	've	VB
GUM_conversation_christmas-64	4	seen	VBN
GUM_conversation_christmas-64	5	my	PRP$
GUM_conversation_christmas-64	6	other	JJ
GUM_conversation_christmas-64	1	one	NN
GUM_conversation_christmas-64	2	.	.
GUM_conversation_christmas-65	1	My	PRP$
GUM_conversation_christmas-65	2	other	JJ
GUM_conversation_christmas-65	3	one	NN
GUM_conversation_christmas-65	4	is	VBZ
GUM_conversation_christmas-65	5	tearing	VBG
GUM_conversation_christmas-65	6	,	,
GUM_conversation_christmas-65	1	and	CC
GUM_conversation_christmas-65	2	falling	VBG
GUM_conversation_christmas-65	3	apart	RB
GUM_conversation_christmas-65	4	.	.
GUM_conversation_christmas-66	1	Oh	UH
GUM_conversation_christmas-66	2	yeah	UH
GUM_conversation_christmas-66	3	?	.
GUM_conversation_christmas-67	1	I	PRP
GUM_conversation_christmas-67	2	've	VBP
GUM_conversation_christmas-67	3	only	RB
GUM_conversation_christmas-67	4	had	VBD
GUM_conversation_christmas-6

In [None]:
for i in range(len(testResults)):
    for j in range(len(testResults[i])):
        print(f"{listOfTestSentIds[i]}\t{testData[j][1]}\t{testSentences[i][j]}\t{testResults[i][j]}")

GUM_academic_discrimination-1	1	The	DT
GUM_academic_discrimination-1	2	prevalence	NN
GUM_academic_discrimination-1	3	of	IN
GUM_academic_discrimination-1	4	discrimination	NN
GUM_academic_discrimination-1	5	across	IN
GUM_academic_discrimination-1	6	racial	JJ
GUM_academic_discrimination-1	7	groups	NNS
GUM_academic_discrimination-1	8	in	IN
GUM_academic_discrimination-1	9	contemporary	JJ
GUM_academic_discrimination-1	10	America	NNS
GUM_academic_discrimination-1	11	:	:
GUM_academic_discrimination-2	1	Results	NNS
GUM_academic_discrimination-2	2	from	IN
GUM_academic_discrimination-2	3	a	DT
GUM_academic_discrimination-2	4	nationally	RB
GUM_academic_discrimination-2	5	representative	JJ
GUM_academic_discrimination-2	6	sample	NN
GUM_academic_discrimination-2	7	of	IN
GUM_academic_discrimination-2	8	adults	NNS
GUM_academic_discrimination-3	1	Introduction	NN
GUM_academic_discrimination-3	2	.	.
GUM_academic_discrimination-4	1	Personal	JJ
GUM_academic_discrimination-4	2	experiences	NNS
GUM_academic_dis

## Generating a TSV File according to the format specified in the Problem Statement

In [None]:
output_file = "viterbi_predictions_train.tsv"
with open(output_file, "w", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["Sent_ID", "Data", "Sentence", "Result"])
    for i in range(len(trainingResults)):
        for j in range(len(trainingResults[i])):
            writer.writerow([sentIdList[i], trainingData[j][1], trainingSentences[i][j], trainingResults[i][j]])

In [None]:
output_file = "viterbi_predictions_test.tsv"
with open(output_file, "w", newline="") as tsvfile:
    writer = csv.writer(tsvfile, delimiter="\t")
    writer.writerow(["Sent_ID", "Data", "Sentence", "Result"])
    for i in range(len(testResults)):
        for j in range(len(testResults[i])):
            writer.writerow([listOfTestSentIds[i], testData[j][1], testSentences[i][j], testResults[i][j]])