# POS Tagger

Import modules

In [26]:
import nltk
from nltk import bigrams, trigrams

import numpy as np

Use the Brown Corpus.

In [None]:
from nltk.corpus import brown

Get the tokenized and tagged with Universal POS sentences from the "news" category.

In [5]:
brown_words = brown.tagged_words(categories='news', tagset='universal')

Show the tagset.

In [6]:
tags = set([tag for (token,tag) in brown_words])
tags

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

Build a baseline tagger, that assigns the most frequent tag.

In [15]:
tags_freq = nltk.ConditionalFreqDist(brown_words)

In [19]:
tags_freq['the']

FreqDist({'DET': 5580})

In [20]:
tags_freq['the'].max()

'DET'

Get the sentences

In [22]:
brown_sents = brown.tagged_sents(categories='news', tagset='universal')

In [23]:
brown_sents[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

Split the corpus into train + test.

In [24]:
test_corpus = brown_sents[:1000]
train_corpus = brown_sents[1000:]

# Baseline
Define a baseline tagger, which assigns the most frequent tag to each token in the corpus.

In [104]:
from nltk import TaggerI
class BaselineTagger(TaggerI):
    """
    A tagger that assigns the most frequent tag to each token.
    """
    def __init__(self, freqdist):
        self.freqdist = freqdist
        
    def tag(self, tokens):
        return [(tok, self.freqdist[tok].max() if self.freqdist[tok] else 'NOUN') for tok in tokens]

In [105]:
basetagger = BaselineTagger(tags_freq)

Try it on the first sentence

In [106]:
basetagger.tag([pair[0] for pair in test_corpus[0]])

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

Measure accuracy:

In [107]:
round(basetagger.evaluate(test_corpus), 4) * 100

96.92

# Unigram tagger

In [108]:
from nltk import UnigramTagger
unigram_tagger = UnigramTagger(train_corpus) 

# Trigram tagger

Train the trigram tagger

In [109]:
from nltk import TrigramTagger
trigram_tagger = TrigramTagger(train_corpus, backoff=unigram_tagger)

Tag a sentence

In [110]:
trigram_tagger.tag([pair[0] for pair in test_corpus[0]])

[('The', 'DET'),
 ('Fulton', None),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'ADJ'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', None),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

Test accuracy:

In [111]:
round(trigram_tagger.evaluate(test_corpus), 4) * 100

86.27

# Naive Bayes

In [61]:
from nltk import ClassifierBasedPOSTagger
nb_tagger = ClassifierBasedPOSTagger(train=train_corpus)

Evaluate the tagger

In [113]:
round(nb_tagger.evaluate(test_corpus), 4) * 100

92.01

# SENNA

In [52]:
from nltk.tag import SennaTagger

In [94]:
senna = SennaTagger('/project/piqasso/tools/senna')

In [95]:
senna.tag('What is the airspeed of an unladen swallow ?'.split()) 

[('What', 'WP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('airspeed', 'NN'),
 ('of', 'IN'),
 ('an', 'DT'),
 ('unladen', 'NN'),
 ('swallow', 'NN'),
 ('?', '.')]

Evaluate SENNA.
We must use the PTB tag set

In [66]:
test_corpus_ptb = brown_sents = brown.tagged_sents(categories='news')[:1000]

In [75]:
senna.tag([pair[0] for pair in test_corpus[0]])

[('The', 'DT'),
 ('Fulton', 'NNP'),
 ('County', 'NNP'),
 ('Grand', 'NNP'),
 ('Jury', 'NNP'),
 ('said', 'VBD'),
 ('Friday', 'NNP'),
 ('an', 'DT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NNP'),
 ('recent', 'JJ'),
 ('primary', 'JJ'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'DT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'IN'),
 ('any', 'DT'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]