# Week04 - Tagging

In [13]:
import nltk
from nltk.corpus import brown

In [3]:
text = nltk.word_tokenize("And now for something competelty different")
nltk.pos_tag(text)

"""
CC: Coordinating Conjunction
RB: adverbs
IN: preposition
NN: noun
JJ: adjective

Help:
>> nltk.help.upenn_tagset('RB')
"""

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('competelty', 'NN'),
 ('different', 'JJ')]

In [65]:
# Homonyms: dufferent words that are spelled the same, but have different meanings.
text = nltk.word_tokenize('they refuse to permit use to obtain the refuse permit.')
nltk.pos_tag(text)


[('they', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('use', 'NN'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN'),
 ('.', '.')]

In [5]:
# NLTK - Tagger
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

# find all contents "w_1 w w_2"
# finds all words w' that appear in the same context "w_1 w' w_2"

man time day year car moment world house family child country boy
state job place way war girl work word


In [7]:
# Tagged Corpora
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)
print(tagged_token[0])
print(tagged_token[1])

# This allows us to isolate the two components and use each easily

('fly', 'NN')
fly
NN


In [69]:
# Steps to Token, Tag Tuples
sent = '''​The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN​
    other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC​
    Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS​
    said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB​
    accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT​
    interest/NN of/IN both/ABX governments/NNS ''/'' ./.​'''

[nltk.tag.str2tuple(t) for t in sent.split()]


[('\u200bThe', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN\u200b'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC\u200b'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS\u200b'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB\u200b'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT\u200b'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.\u200b')]

In [73]:
print(nltk.corpus.brown.tagged_words()[:5])
print(nltk.corpus.brown.tagged_words(tagset='universal')[:5])

print(
"""
Universal target provides a more general and consistent way of representing part-of-speech information across different languages and corpora.
""")

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL')]
[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN')]

Universal target provides a more general and consistent way of representing part-of-speech information across different languages and corpora.



In [91]:
# news article (most common tags)
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

In [96]:
word_tag_pairs = list(nltk.bigrams(brown_news_tagged))
# observation
#word_tag_pairs[10: 12]
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']

In [103]:
# most common verbs in news text
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)

[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']


['is',
 'said',
 'was',
 'are',
 'be',
 'has',
 'have',
 'will',
 'says',
 'would',
 'were',
 'had',
 'been',
 'could',
 "'s",
 'can',
 'do',
 'say',
 'make',
 'may',
 'did',
 'rose',
 'made',
 'does',
 'expected',
 'buy',
 'take',
 'get',
 'might',
 'sell',
 'added',
 'sold',
 'help',
 'including',
 'should',
 'reported',
 'according',
 'pay',
 'compared',
 'being',
 'fell',
 'began',
 'based',
 'used',
 'closed',
 "'re",
 'want',
 'see',
 'took',
 'yield',
 'offered',
 'set',
 'priced',
 'approved',
 'come',
 'noted',
 'cut',
 'ended',
 'found',
 'increased',
 'become',
 'think',
 'named',
 'go',
 'trying',
 'proposed',
 'received',
 'growing',
 'declined',
 'held',
 'give',
 'came',
 'use',
 'put',
 'making',
 'continue',
 'raise',
 'estimated',
 'called',
 'paid',
 'designed',
 'going',
 'expects',
 'seeking',
 'must',
 'plans',
 'wo',
 'increasing',
 'saying',
 'got',
 'owns',
 'trading',
 'acquired',
 'gained',
 'fined',
 'reached',
 'holding',
 'announced',
 'filed',
 'became',


In [104]:
# studying the word 'often'
brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))

[',',
 '.',
 'accomplished',
 'analytically',
 'appear',
 'apt',
 'associated',
 'assuming',
 'became',
 'become',
 'been',
 'began',
 'call',
 'called',
 'carefully',
 'chose',
 'classified',
 'colorful',
 'composed',
 'contain',
 'differed',
 'difficult',
 'encountered',
 'enough',
 'equate',
 'extremely',
 'found',
 'happens',
 'have',
 'ignored',
 'in',
 'involved',
 'more',
 'needed',
 'nightly',
 'observed',
 'of',
 'on',
 'out',
 'quite',
 'represent',
 'responsible',
 'revamped',
 'seclude',
 'set',
 'shortened',
 'sing',
 'sounded',
 'stated',
 'still',
 'sung',
 'supported',
 'than',
 'to',
 'when',
 'work']

In [110]:
brown_lrnd_tagged = nltk.corpus.brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 


In [112]:
# POS ambiguities
brown_news_tagged = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist(
    (word.lower(), tag) for (word, tag) in brown_news_tagged
)

for word in sorted(data.conditions()):
    if len(data[word]) > 3:
        tags = [tag for (tag, _) in data[word].most_common()]
        print(word, ' '.join(tags))

best ADJ ADV VERB NOUN
close ADV ADJ VERB NOUN
open ADJ VERB NOUN ADV
present ADJ ADV NOUN VERB
that ADP DET PRON ADV


In [113]:
# Loopup tagger
brown_tagger_sents = brown.tagged_sents(categories='news')
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common()
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagger_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  baseline_tagger.evaluate(brown_tagger_sents)


0.9349006503968017