### POS Tagging

In [10]:
# Combined walkthrough of three baseline POS taggers, each scored against the
# tagged sentences of the Brown corpus 'news' category.
import nltk                        # the nltk.* calls below need the package itself bound
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# --- The Default tagger: tags every token as 'NN' ---
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
# Only the last expression of a cell is displayed, so intermediate results are
# kept in variables instead of being computed and thrown away.
default_tagged_tokens = default_tagger.tag(tokens)
# NOTE(review): newer NLTK releases reportedly deprecate evaluate() in favour of
# accuracy() - confirm against the installed version before switching.
default_accuracy = default_tagger.evaluate(brown_tagged_sents)

# --- The Regexp tagger: picks a tag from the surface form of the word ---
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagged_sample = regexp_tagger.tag(brown_sents[3])
regexp_accuracy = regexp_tagger.evaluate(brown_tagged_sents)

# --- The Lookup tagger: most frequent tag for each of the 100 most common words ---
fd = nltk.FreqDist(brown.words(categories='news'))                     # word frequencies
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))  # tag frequencies per word
most_freq_words = fd.most_common(100)
likely_tags = {word: cfd[word].max() for (word, _) in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))  # 'NN' for words outside the model
baseline_tagger.evaluate(brown_tagged_sents)

0.5817769556656125

#### Default tagger

In [5]:
# Baseline tagger: assigns the tag 'NN' to every token it is given.
default_tagger = nltk.DefaultTagger('NN')

# Tokenize a sample sentence and show the (token, tag) pairs the tagger yields.
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [6]:
# Score the all-'NN' default tagger against the tagged Brown 'news' sentences.
default_tagger.evaluate(brown_tagged_sents)


0.13089484257215028

#### Regex tagger

In [7]:
# (regex, tag) pairs for the regular-expression tagger. The catch-all '.*'
# entry is kept last so that it only applies when no earlier pattern matches.
patterns = [
    # gerunds
    (r'.*ing$', 'VBG'),
    # simple past
    (r'.*ed$', 'VBD'),
    # 3rd singular present
    (r'.*es$', 'VBZ'),
    # modals
    (r'.*ould$', 'MD'),
    # possessive nouns
    (r'.*\'s$', 'NN$'),
    # plural nouns
    (r'.*s$', 'NNS'),
    # cardinal numbers
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    # nouns (default)
    (r'.*', 'NN'),
]

In [8]:
# Build a tagger from the (regex, tag) pairs in `patterns`.
regexp_tagger = nltk.RegexpTagger(patterns)
# Tag one sample sentence. This is not the cell's last expression, so its
# result is not displayed.
regexp_tagger.tag(brown_sents[3])
# Score the tagger against the tagged sentences; as the final expression,
# this value is the cell's output.
regexp_tagger.evaluate(brown_tagged_sents)

0.20186168625812995

#### Lookup tagger

In [11]:
# Frequency of each word, and of each tag per word, in the Brown 'news' category.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

# Lookup table: for each of the 100 most common words, its most frequent tag.
most_freq_words = fd.most_common(100)
likely_tags = {word: cfd[word].max() for word, _ in most_freq_words}

# Unigram tagger driven by the lookup table, with an all-'NN' tagger as backoff.
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags,
    backoff=nltk.DefaultTagger('NN'),
)
baseline_tagger.evaluate(brown_tagged_sents)

0.5817769556656125