In [1]:
import nltk
import re

# Question 01

In [71]:
# define sentences
sent1 = 'I/PRP booked/VB a/DT flight/NNP from/IN Toronto/NNP to/TO Calgary/NNP ./.'
sent2 = 'Does/VBZ this/DT flight/NNP serve/VB complementary/JJ drinks/NNS ?/?'
sent3 = 'I/PRP have/VBP a/DT friend/NN living/VB in/IN Toronto/NNP ./.'

# changes to tag tuples
tags1 = [nltk.tag.str2tuple(w) for w in sent1.split()]
tags2 = [nltk.tag.str2tuple(w) for w in sent2.split()]
tags3 = [nltk.tag.str2tuple(w) for w in sent3.split()]

# get the tagset of the Penn Treebank
treebank_tagset = nltk.corpus.treebank.tagged_words()

# get the conditional freq dist between word and tag
cfd = nltk.ConditionalFreqDist(treebank_tagset)

# find tagging erros compared to the Penn Treebank tagset
for i, tag_tuple in enumerate([tags1, tags2, tags3]):
    for word, tag in tag_tuple:
        # if word cannot be specified in the Penn Treebank
        if not cfd[word].most_common():
            print(f"The word '{word}' is not found in the Penn Treebank tagset.")
            continue
        # check the correct tag in Penn Treebank
        corrent_tag = cfd[word].most_common()[0][0]
        # check the error
        if corrent_tag != tag:
            print(f"Error in {word}/{tag} (sent{i+1}); Correct tag is {corrent_tag}.")

Error in booked/VB (sent1); Correct tag is VBD.
The word 'flight' is not found in the Penn Treebank tagset.
The word 'Calgary' is not found in the Penn Treebank tagset.
The word 'Does' is not found in the Penn Treebank tagset.
The word 'flight' is not found in the Penn Treebank tagset.
The word 'complementary' is not found in the Penn Treebank tagset.
Error in ?/? (sent2); Correct tag is ..
Error in living/VB (sent3); Correct tag is NN.


# Question 02

In [80]:
# Train a unigram tagger based on the brown 'adventure'
#  freq dist
fd = nltk.FreqDist(nltk.corpus.brown.words(categories='adventure'))
#  conditional freq dist
cfd = nltk.ConditionalFreqDist(nltk.corpus.brown.tagged_words(categories='adventure'))
#  most frequency words (top 200)
most_freq_words = fd.most_common(200)
#  define likely tags
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
#  create tag model
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

# Get some new texts
new_sent1 = nltk.corpus.brown.sents(categories='humor')[5]
new_sent2 = nltk.corpus.brown.sents(categories='learned')[10]

# apply the tag model to new texts
print('Humor: ')
print(baseline_tagger.tag(new_sent1))
print('')

print('Learned: ')
print(baseline_tagger.tag(new_sent2))


print("""
- We can observe that some words do not have a tag.      
- The reason why a tag is not assigned in some words is that those words are not found in the trained word-tag pairs.
- In this case, for example, the word "Finally" does not exist in the adventure category of the broan text.
""")

Humor: 
[('Finally', None), (',', ','), ('at', 'IN'), ('Ye', None), ('Olde', None), ('Gasse', None), ('Filling', None), ('Station', None), ('on', 'IN'), ('Avocado', None), ('Avenue', None), (',', ','), ('they', 'PPSS'), ('learned', None), ('that', 'CS'), ('their', 'PP$'), ('man', 'NN'), (',', ','), ('having', None), ('paused', None), ('to', 'TO'), ('get', 'VB'), ('oil', None), ('for', 'IN'), ('his', 'PP$'), ('car', None), (',', ','), ('had', 'HVD'), ('asked', None), ('about', 'IN'), ('the', 'AT'), ('route', None), ('to', 'TO'), ('San', None), ('Diego', None), ('.', '.')]

Learned: 
[('Of', None), ('the', 'AT'), ('remaining', None), ('planets', None), (',', ','), ('only', 'RB'), ('Mars', None), ('and', 'CC'), ('Saturn', None), ('have', 'HV'), ('been', 'BEN'), ('observed', None), ('as', 'CS'), ('radio', None), ('sources', None), (',', ','), ('and', 'CC'), ('not', '*'), ('very', 'QL'), ('much', 'AP'), ('information', None), ('is', 'BEZ'), ('available', None), ('.', '.')]

- We can observe

# Question 03

In [2]:
# load brown corpos tags
tags = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')

# (a): Which nouns are more common in their plural form
#  define freq dist of nouns whose format is plural 
plural_nouns_fd = nltk.FreqDist([word for (word, tag) in tags if tag == 'NOUN' and re.search(r'^[A-Za-z]+s$', word)])
print("More common plural form nouns (show the first 20 items): ")
print(plural_nouns_fd.most_common(20))
print('')

# (b): Which word has the greatest number of distinct tags?
#  define freq dist of tags conditioned by each word
cfd = nltk.ConditionalFreqDist(tags)
#  get the word the greatest number of distinct
word = max(cfd.conditions(), key=lambda w: len(cfd[w]))
print("The word whose distinct tags are the largest: ")
print(f"Word: '{word}', Tags: {cfd[word].__repr__()}")
print("Therefore, the word 'that' has 4 distinct tags: Adposition, Pronoun, Determiner(Article), and Adverb")
print('')

# (c): List tags in order of decreasing frequency
tags_dec_freq = list(nltk.FreqDist([tag for (word, tag) in tags]))
print("Return the 20 most frequent tags: ")
print(tags_dec_freq[:20])
print("")

# (d): Which tags are nouns most commonly found after? What do these tags represent?
tags_after_noun_freq = nltk.FreqDist([b[1] for (a, b) in nltk.bigrams(tags) if a[1] == 'NOUN'])
print("Return the 5 most common tags followed by noun: ")
print(tags_after_noun_freq.most_common(5))
print("They reprecent Noun, Period, Adposition, Verb, Conjunctions")
print('')

More common plural form nouns (show the first 20 items): 
[('years', 102), ('members', 69), ('sales', 51), ('Dallas', 49), ('Texas', 48), ('Miss', 46), ('business', 42), ('months', 42), ('days', 38), ('States', 38), ('Laos', 38), ('James', 37), ('schools', 37), ('Maris', 36), ('laws', 30), ('runs', 30), ('bonds', 29), ('funds', 28), ('countries', 28), ('series', 25)]

The word whose distinct tags are the largest: 
Word: 'that', Tags: FreqDist({'ADP': 545, 'PRON': 128, 'DET': 124, 'ADV': 5})
Therefore, the word 'that' has 4 distinct tags: Adposition, Pronoun, Determiner(Article), and Adverb

Return the 20 most frequent tags: 
['NOUN', 'VERB', 'ADP', '.', 'DET', 'ADJ', 'ADV', 'CONJ', 'PRON', 'PRT', 'NUM', 'X']

Return the 5 most common tags followed by noun: 
[('NOUN', 7959), ('.', 7732), ('ADP', 6519), ('VERB', 4192), ('CONJ', 1457)]
They reprecent Noun, Period, Adposition, Verb, Conjunctions

