In [15]:
try:
    import nltk
except ImportError:
    print("NLTK library not installed, installing now...")
    %pip install nltk
    import nltk

from nltk.tag.perceptron import PerceptronTagger
from nltk.tag.mapping import map_tag
# Fetch the NLTK data packages this notebook relies on. Presumably these back
# PerceptronTagger(load=True) (pretrained model) and map_tag(..., 'universal', ...)
# (tagset mapping) -- confirm against the installed NLTK version.
# The captured output shows the download is skipped when the package is already up to date.
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\oscarros\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\oscarros\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# Functions for reading and formatting the data

In [16]:
def ReadDocument(filename, encoding=None):
    """
    Reads a corpus file made of whitespace-separated ``word_TAG`` tokens.

    Each token is split at its LAST underscore, so a word that itself contains
    underscores (e.g. ``well_known_JJ``) is kept intact instead of crashing the
    unpacking. Words are lowercased; tags are returned exactly as written.

    Args:
        filename: Path of the corpus file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list of ``(word, posTag)`` tuples in file order.

    Raises:
        ValueError: If a token contains no underscore, i.e. has no tag.
    """
    dataSet = []
    with open(filename, 'r', encoding=encoding) as file:
        # split() without arguments breaks on any run of whitespace (spaces,
        # newlines, tabs) and never yields empty strings.
        tokens = file.read().split()

    for token in tokens:
        # rpartition splits on the last '_' only; separator is '' when absent.
        word, separator, posTag = token.rpartition('_')
        if not separator:
            raise ValueError(f"Token {token!r} has no '_' separating word and tag")
        dataSet.append((word.lower(), posTag))
    return dataSet

def ReadTagTranslator(filename):
    """
    Builds a lookup table from Brown tags to universal tags.

    The file is read line by line; empty lines are ignored. On every other
    line the text before the first tab becomes the key and the text after the
    last tab becomes the value.
    """
    mapping = {}
    with open(filename, 'r') as handle:
        lines = handle.read().split('\n')

    for entry in lines:
        if entry == '':
            continue
        fields = entry.split('\t')
        # Taking the first and last field covers the regular two-column line
        # as well as lines that split into a different number of fields.
        mapping[fields[0]] = fields[-1]
    return mapping

def ConvertTags(dataSet, tagTranslator):
    """
    Returns a new list of (word, tag) pairs with each tag looked up in
    tagTranslator. Tags that have no entry in the translator are kept as-is.
    """
    return [
        (word, tagTranslator[posTag] if posTag in tagTranslator else posTag)
        for word, posTag in dataSet
    ]

def SplitData(dataSet, splitRatio=0.8):
    """
    Cuts the dataset in two by position: the leading splitRatio share is the
    training part and the remainder is the test part. Order is preserved and
    nothing is shuffled.
    """
    cut = int(len(dataSet) * splitRatio)
    return dataSet[:cut], dataSet[cut:]

# Implement perceptron tagger

In [17]:
def RunTagger(dataSet):
    """
    Tags the words of the dataset with a pretrained perceptron tagger.

    The reference tags are dropped first, so only the bare words (in their
    original order, as one flat list) are handed to the tagger. Returns
    whatever the tagger's tag() call produces for that list.
    """
    pretrained = PerceptronTagger(load=True)
    return pretrained.tag([word for word, _ in dataSet])

def TranslateAssignedTags(assignedTags, source='en-ptb'):
    """
    Maps every tag in a list of (word, tag) pairs from the `source` tagset to
    the universal tagset and returns the resulting list of pairs.
    """
    return [
        (word, map_tag(source, 'universal', tag))
        for word, tag in assignedTags
    ]

def ComputeAccuracy(dataSet, assignedTags):
    """
    Computes the fraction of positions where the assigned tag equals the
    reference tag.

    The two sequences are compared position by position; only the tags are
    compared, the words are ignored. The denominator is always
    ``len(dataSet)``, so if ``assignedTags`` is shorter, the unmatched
    reference entries count as incorrect.

    Args:
        dataSet: Sequence of ``(word, tag)`` pairs holding the reference tags.
        assignedTags: Sequence of ``(word, tag)`` pairs produced by the tagger.

    Returns:
        The accuracy as a float between 0 and 1, or 0.0 when ``dataSet`` is empty.
    """
    totalNumberOfWords = len(dataSet)
    if totalNumberOfWords == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = sum(
        1
        for (_, tag), (_, assignedTag) in zip(dataSet, assignedTags)
        if tag == assignedTag
    )
    return correct / totalNumberOfWords

# Run the functions

In [18]:
# Read file and preprocess data
# ReadDocument returns (word, tag) pairs with every word lowercased.
dataSet = ReadDocument("BrownCorpus.txt")
brownToUniversalMappings = ReadTagTranslator("BrownToUniversalTagMap.txt")
# Reference tags are mapped through the Brown-to-universal table; tags without an entry stay unchanged.
dataSet = ConvertTags(dataSet, brownToUniversalMappings)
# Positional split with the default 0.8 ratio (no shuffling). trainingData is not used in this cell.
trainingData, testData = SplitData(dataSet)

# Run tagger and compute accuracy
# NOTE(review): the tagger receives the whole test split as one flat list of
# lowercased words; this may affect the reported accuracy -- confirm it is intended.
taggedTestData = RunTagger(testData)
# Tagger output is translated from the default 'en-ptb' source tagset to universal tags
# so it can be compared with the converted reference tags.
translatedTestData = TranslateAssignedTags(taggedTestData)
taggingAccuracy = ComputeAccuracy(testData, translatedTestData)

print(f'Accuracy of pretrained perceptron')
print(f' > Test data set: {100*taggingAccuracy:.2f} %')

Accuracy of pretrained perceptron
 > Test data set: 87.55 %
