# Part of Speech Tagging

Part of Speech = Word classes = lexical categories

POS tagging: Process a sequence of words and attach the part of speech tag to each word

- Define what a tagset is.
- What are the most common tags?
- What are the most common tagsets, and how do they differ in level of detail?

In [2]:
import nltk

# Split the sentence into tokens; punctuation such as the final "." becomes its own token.
text = nltk.word_tokenize("And now for something completely different.")
# Attach a part-of-speech tag to every token, giving a list of (token, tag) pairs.
tagged_text = nltk.pos_tag(text)
tagged_text

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ'),
 ('.', '.')]

In [12]:
# Print the description and example words for the 'RB' tag.
nltk.help.upenn_tagset('RB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


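An example showing how POS tags disambiguate words that can serve more than one grammatical function: in the sentence below, *refuse* and *permit* each appear once as a verb and once as a noun.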

In [13]:
# "refuse" and "permit" each occur twice in this sentence; the tagger is expected to
# give the two occurrences different tags.
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
tagged_text = nltk.pos_tag(text)
tagged_text


[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [24]:
# Wrap the lower-cased Brown corpus tokens in an nltk.Text so that its
# context-based helpers can be used.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
# Text.similar() prints the words that occur in the same contexts as 'woman'
# and returns None, so its result is deliberately not bound to a variable.
text.similar('woman')

man time day year car moment world family house country child boy
state job way war girl place word work


## Tagged corpora


In [3]:
# The Brown corpus as a flat sequence of (word, tag) pairs.
tagged_words = nltk.corpus.brown.tagged_words()
tagged_words[0:20]  # preview the first 20 pairs

[(u'The', u'AT'),
 (u'Fulton', u'NP-TL'),
 (u'County', u'NN-TL'),
 (u'Grand', u'JJ-TL'),
 (u'Jury', u'NN-TL'),
 (u'said', u'VBD'),
 (u'Friday', u'NR'),
 (u'an', u'AT'),
 (u'investigation', u'NN'),
 (u'of', u'IN'),
 (u"Atlanta's", u'NP$'),
 (u'recent', u'JJ'),
 (u'primary', u'NN'),
 (u'election', u'NN'),
 (u'produced', u'VBD'),
 (u'``', u'``'),
 (u'no', u'AT'),
 (u'evidence', u'NN'),
 (u"''", u"''"),
 (u'that', u'CS')]

In [4]:
# The same tagged data grouped by sentence: each sentence is a list of (word, tag) pairs.
tagged_sentences = nltk.corpus.brown.tagged_sents()
tagged_sentences[0:1]  # preview the first sentence only

[[(u'The', u'AT'),
  (u'Fulton', u'NP-TL'),
  (u'County', u'NN-TL'),
  (u'Grand', u'JJ-TL'),
  (u'Jury', u'NN-TL'),
  (u'said', u'VBD'),
  (u'Friday', u'NR'),
  (u'an', u'AT'),
  (u'investigation', u'NN'),
  (u'of', u'IN'),
  (u"Atlanta's", u'NP$'),
  (u'recent', u'JJ'),
  (u'primary', u'NN'),
  (u'election', u'NN'),
  (u'produced', u'VBD'),
  (u'``', u'``'),
  (u'no', u'AT'),
  (u'evidence', u'NN'),
  (u"''", u"''"),
  (u'that', u'CS'),
  (u'any', u'DTI'),
  (u'irregularities', u'NNS'),
  (u'took', u'VBD'),
  (u'place', u'NN'),
  (u'.', u'.')]]

Each corpus uses its own set of tag categories; the universal tagset is a simplified tagset that can be requested instead, as shown below.

In [8]:
# Request the Brown corpus with the 'universal' tagset instead of its default tags.
# NOTE: this rebinds `tagged_words`; the previously loaded sequence is no longer reachable by that name.
tagged_words = nltk.corpus.brown.tagged_words(tagset='universal')
tagged_words[0:20]  # preview the first 20 pairs

[(u'The', u'DET'),
 (u'Fulton', u'NOUN'),
 (u'County', u'NOUN'),
 (u'Grand', u'ADJ'),
 (u'Jury', u'NOUN'),
 (u'said', u'VERB'),
 (u'Friday', u'NOUN'),
 (u'an', u'DET'),
 (u'investigation', u'NOUN'),
 (u'of', u'ADP'),
 (u"Atlanta's", u'NOUN'),
 (u'recent', u'ADJ'),
 (u'primary', u'NOUN'),
 (u'election', u'NOUN'),
 (u'produced', u'VERB'),
 (u'``', u'.'),
 (u'no', u'DET'),
 (u'evidence', u'NOUN'),
 (u"''", u'.'),
 (u'that', u'ADP')]

In [12]:
from nltk.corpus import brown 

# Restrict to the 'news' category and count how often each universal tag occurs.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word,tag) in brown_news_tagged)
tag_fd.most_common() # most common POS tags in the news part of the Brown corpus

[(u'NOUN', 30640),
 (u'VERB', 14399),
 (u'ADP', 12355),
 (u'.', 11928),
 (u'DET', 11389),
 (u'ADJ', 6706),
 (u'ADV', 3349),
 (u'CONJ', 2717),
 (u'PRON', 2535),
 (u'PRT', 2264),
 (u'NUM', 2166),
 (u'X', 106)]

In [24]:
# Pair every (word, tag) item of the news text with the item that follows it.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
# Tags that most commonly precede a noun: keep the tag of the first item of a
# pair whenever the second item is tagged NOUN.
noun_preceders = [
    prev_tag
    for ((prev_word, prev_tag), (next_word, next_tag)) in word_tag_pairs
    if next_tag == 'NOUN'
]
fdist = nltk.FreqDist(noun_preceders)
fdist.most_common()

[(u'NOUN', 7959),
 (u'DET', 7363),
 (u'ADJ', 4758),
 (u'ADP', 3781),
 (u'.', 2795),
 (u'VERB', 1842),
 (u'CONJ', 938),
 (u'NUM', 894),
 (u'ADV', 186),
 (u'PRT', 94),
 (u'PRON', 19),
 (u'X', 11)]

In [26]:
# Count every distinct (word, tag) pair in the news text.
word_tag_fd = nltk.FreqDist(brown_news_tagged)
# Most common verbs: walk the pairs from most to least frequent, keep the word
# of each pair tagged VERB, and show the first 20.
[word for ((word, tag), _count) in word_tag_fd.most_common() if tag == 'VERB'][:20]

[u'is',
 u'was',
 u'be',
 u'said',
 u'will',
 u'are',
 u'has',
 u'had',
 u'have',
 u'were',
 u'would',
 u'been',
 u'made',
 u'can',
 u'could',
 u'get',
 u'may',
 u'did',
 u'do',
 u'should']

In [32]:
# Condition on the word: the (word, tag) pairs give, for each word, a count of the tags it carries.
tagCfd = nltk.ConditionalFreqDist(brown_news_tagged)
tagCfd['play'].most_common()  # tags observed for "play", most frequent first

[(u'VERB', 30), (u'NOUN', 11)]

In [36]:
# Swap each pair to (tag, word) so the tag becomes the condition: for each tag, count its words.
wordCfd = nltk.ConditionalFreqDist((tag,word) for (word,tag) in brown_news_tagged)
wordCfd['VERB']  # frequency distribution of the words tagged VERB

FreqDist({u'is': 732, u'was': 717, u'be': 526, u'said': 402, u'will': 388, u'are': 328, u'has': 300, u'had': 279, u'have': 265, u'were': 252, ...})

In [40]:
# Load the (word, tag) pairs of NLTK's treebank corpus and count the tags seen for each word.
wsj = nltk.corpus.treebank.tagged_words()
cfd = nltk.ConditionalFreqDist(wsj)
cfd['play'].most_common()  # tags observed for "play" in this corpus, most frequent first

[(u'VB', 3), (u'VBP', 1), (u'NN', 1)]

In [49]:
# Words that occur in the corpus with both the VBD (past tense) and the
# VBN (past participle) tag, in sorted order.
sorted(
    w for w in cfd.conditions()
    if all(verb_tag in cfd[w] for verb_tag in ('VBD', 'VBN'))
)

[u'Asked',
 u'accelerated',
 u'accepted',
 u'accused',
 u'acquired',
 u'added',
 u'adopted',
 u'advanced',
 u'advised',
 u'agreed',
 u'aimed',
 u'alleged',
 u'allowed',
 u'announced',
 u'anticipated',
 u'applied',
 u'approved',
 u'argued',
 u'asked',
 u'assumed',
 u'assured',
 u'attached',
 u'attempted',
 u'attributed',
 u'awarded',
 u'backed',
 u'banned',
 u'barred',
 u'believed',
 u'blamed',
 u'boosted',
 u'bought',
 u'bribed',
 u'brought',
 u'built',
 u'called',
 u'carried',
 u'caused',
 u'changed',
 u'chastised',
 u'cited',
 u'climbed',
 u'closed',
 u'collapsed',
 u'committed',
 u'compared',
 u'complained',
 u'completed',
 u'concluded',
 u'condemned',
 u'confirmed',
 u'consented',
 u'contained',
 u'continued',
 u'contributed',
 u'controlled',
 u'covered',
 u'created',
 u'cut',
 u'damaged',
 u'decided',
 u'declared',
 u'deemed',
 u'denied',
 u'described',
 u'determined',
 u'developed',
 u'died',
 u'disclosed',
 u'discovered',
 u'discussed',
 u'dismissed',
 u'dominated',
 u'dropped',

In [53]:
# Position of a ("kicked", VBD) pair in the corpus.
idx1 = wsj.index(('kicked','VBD'))
wsj[idx1-4:idx1+4]  # the four pairs before the match, the match itself, and the three after

[(u'While', u'IN'),
 (u'program', u'NN'),
 (u'trades', u'NNS'),
 (u'swiftly', u'RB'),
 (u'kicked', u'VBD'),
 (u'in', u'IN'),
 (u',', u','),
 (u'a', u'DT')]

In [56]:
# Position of a ("kicked", VBN) pair in the corpus.
idx2 = wsj.index(('kicked','VBN'))
wsj[idx2-4:idx2+4]  # the four pairs before the match, the match itself, and the three after

[(u'head', u'NN'),
 (u'of', u'IN'),
 (u'state', u'NN'),
 (u'has', u'VBZ'),
 (u'kicked', u'VBN'),
 (u'off', u'RP'),
 (u'an', u'DT'),
 (u'issue', u'NN')]

In [73]:
def process(sentence):
    """Return the first "verb TO verb" phrase found in a tagged sentence.

    sentence is a sequence of (word, tag) pairs. Every run of three
    consecutive pairs is examined in order. The first run whose outer tags
    both start with 'V' and whose middle tag is exactly 'TO' is returned as
    the three words joined by single spaces. Only the first match in the
    sentence is reported; None is returned when no run matches.
    """
    for (head, head_tag), (link, link_tag), (tail, tail_tag) in nltk.trigrams(sentence):
        if not head_tag.startswith('V'):
            continue
        if link_tag != 'TO':
            continue
        if not tail_tag.startswith('V'):
            continue
        return " ".join((head, link, tail))
    return None


# Run process() over every tagged Brown sentence. Each sentence contributes
# either its first "verb TO verb" phrase or None when it has no such phrase.
phrases = list(map(process, brown.tagged_sents()))
# NOTE: None is counted like any other value, so the sentences without a match
# show up as a single None entry in the distribution.
phraseFd = nltk.FreqDist(phrases)
phraseFd.most_common()

[(None, 53489),
 (u'trying to get', 13),
 (u'want to see', 10),
 (u'going to get', 10),
 (u'trying to make', 10),
 (u'want to go', 9),
 (u'wanted to know', 7),
 (u'going to take', 7),
 (u'wanted to go', 6),
 (u'trying to find', 6),
 (u'want to make', 6),
 (u'want to know', 6),
 (u'tried to make', 6),
 (u'got to get', 6),
 (u'got to know', 5),
 (u'used to say', 5),
 (u'expect to find', 5),
 (u'going to tell', 5),
 (u'want to leave', 5),
 (u'like to see', 5),
 (u'like to think', 5),
 (u'permitted to operate', 5),
 (u'want to talk', 5),
 (u'surprised to find', 4),
 (u'wanted to get', 4),
 (u'try to keep', 4),
 (u'failed to show', 4),
 (u'began to talk', 4),
 (u'going to kill', 4),
 (u'allowed to stand', 4),
 (u'began to feel', 4),
 (u'entitled to sue', 4),
 (u'designed to provide', 4),
 (u'used to measure', 4),
 (u'wanted to take', 4),
 (u'like to make', 4),
 (u'come to see', 4),
 (u'trying to keep', 4),
 (u'used to describe', 4),
 (u'want to buy', 4),
 (u'like to know', 4),
 (u'refused t