# Natural Language Processing

## Exercise Sheet 5

In [4]:
#imports for all exercises
import nltk
import nltk.corpus as cp
from collections import defaultdict
from operator import itemgetter

### Exercise 1

Produce a sorted list of tags used in the Brown corpus, removing duplicates. Do the same for the universal part-of-speech tagset.

In [5]:
# brown tagset
sorted(set( t for (w,t) in cp.brown.tagged_words() ))

["'",
 "''",
 '(',
 '(-HL',
 ')',
 ')-HL',
 '*',
 '*-HL',
 '*-NC',
 '*-TL',
 ',',
 ',-HL',
 ',-NC',
 ',-TL',
 '--',
 '---HL',
 '.',
 '.-HL',
 '.-NC',
 '.-TL',
 ':',
 ':-HL',
 ':-TL',
 'ABL',
 'ABN',
 'ABN-HL',
 'ABN-NC',
 'ABN-TL',
 'ABX',
 'AP',
 'AP$',
 'AP+AP-NC',
 'AP-HL',
 'AP-NC',
 'AP-TL',
 'AT',
 'AT-HL',
 'AT-NC',
 'AT-TL',
 'AT-TL-HL',
 'BE',
 'BE-HL',
 'BE-TL',
 'BED',
 'BED*',
 'BED-NC',
 'BEDZ',
 'BEDZ*',
 'BEDZ-HL',
 'BEDZ-NC',
 'BEG',
 'BEM',
 'BEM*',
 'BEM-NC',
 'BEN',
 'BEN-TL',
 'BER',
 'BER*',
 'BER*-NC',
 'BER-HL',
 'BER-NC',
 'BER-TL',
 'BEZ',
 'BEZ*',
 'BEZ-HL',
 'BEZ-NC',
 'BEZ-TL',
 'CC',
 'CC-HL',
 'CC-NC',
 'CC-TL',
 'CC-TL-HL',
 'CD',
 'CD$',
 'CD-HL',
 'CD-NC',
 'CD-TL',
 'CD-TL-HL',
 'CS',
 'CS-HL',
 'CS-NC',
 'CS-TL',
 'DO',
 'DO*',
 'DO*-HL',
 'DO+PPSS',
 'DO-HL',
 'DO-NC',
 'DO-TL',
 'DOD',
 'DOD*',
 'DOD*-TL',
 'DOD-NC',
 'DOZ',
 'DOZ*',
 'DOZ*-TL',
 'DOZ-HL',
 'DOZ-TL',
 'DT',
 'DT$',
 'DT+BEZ',
 'DT+BEZ-NC',
 'DT+MD',
 'DT-HL',
 'DT-NC',
 'DT-TL',
 'D

In [6]:
# universal tagset
sorted(set( t for (w,t) in cp.brown.tagged_words(tagset='universal') ))

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

### Exercise 2

Write a program to process the Brown Corpus using the universal part-of-speech tagset to find out which nouns are more common in their plural form than in their singular form. Only consider regular plurals formed with the "-s" suffix. Print an alphabetically sorted list of the nouns together with the frequencies for the singular and plural forms, one per line. 


In [14]:
brown_tagged = cp.brown.tagged_words(tagset='universal')

nouns = [ w.lower() for (w,t) in brown_tagged if t == 'NOUN' and w.isalpha() ]
fdist = nltk.FreqDist(nouns)

# idea: have a dict with key= singular word, value= list of freq (singular, plural)
noun_dict = defaultdict(lambda: [0,0])

for (w, freq) in fdist.most_common():
    # considered plural
    if w.endswith('s'):
        sing_w = w[:-1]
        noun_dict[sing_w][1] = freq
    # singular
    else:
        noun_dict[w][0] = freq

# clear out nouns without a plural, as we search ones with high freq of plurals
cleared_noun_dict = { w:l for w,l in noun_dict.items() if l[1] != 0 and w }

for (w, l) in sorted(cleared_noun_dict.items()):
    print('{}(s)  -  singlar: {}   plural: {}'.format(w, l[0], l[1]))



abba(s)  -  singlar: 0   plural: 2
abberation(s)  -  singlar: 0   plural: 1
abbreviation(s)  -  singlar: 1   plural: 1
abdomini(s)  -  singlar: 0   plural: 1
abernathy(s)  -  singlar: 2   plural: 1
aberration(s)  -  singlar: 3   plural: 5
abilitie(s)  -  singlar: 0   plural: 13
abnormalitie(s)  -  singlar: 0   plural: 1
abolitionist(s)  -  singlar: 1   plural: 4
aborigine(s)  -  singlar: 7   plural: 8
abortion(s)  -  singlar: 6   plural: 1
abram(s)  -  singlar: 0   plural: 1
abruptnes(s)  -  singlar: 0   plural: 1
abscesse(s)  -  singlar: 0   plural: 3
absence(s)  -  singlar: 53   plural: 3
absolute(s)  -  singlar: 1   plural: 3
absolutenes(s)  -  singlar: 0   plural: 2
absorption(s)  -  singlar: 12   plural: 2
abstract(s)  -  singlar: 1   plural: 4
abstractednes(s)  -  singlar: 0   plural: 1
abstraction(s)  -  singlar: 16   plural: 7
abstractionist(s)  -  singlar: 0   plural: 3
abstractor(s)  -  singlar: 0   plural: 1
abstrusenesse(s)  -  singlar: 0   plural: 1
absurditie(s)  -  singl

### Exercise 3

Find out which word has the greatest number of distinct tags in the Brown corpus using the original tagset. Without using the `most_common` function, print a list of the tags together with the frequencies for the word, sorted by frequency from highest to lowest, one per line.



In [26]:
brown_tagged = cp.brown.tagged_words()

cfd = nltk.ConditionalFreqDist( (w.lower(), t) for (w,t) in brown_tagged )

# idea: create dict with key: word, value: list of tuples(tag, frequency) 
tag_dict = {}

for w in cfd.conditions():
    tag_dict[w] = sorted( ((t,f) for (t,f) in cfd[w].items()), key=itemgetter(1), reverse=True )

for (k,v) in tag_dict.items():
    print(k + ': ')
    for (t, f) in v:
        print(' ' + t + ' - ' + str(f))

the: 
 AT - 69013
 AT-TL - 675
 AT-HL - 253
 AT-NC - 26
 NIL - 3
 AT-TL-HL - 1
fulton: 
 NP-TL - 10
 NP - 7
county: 
 NN-TL - 83
 NN - 71
 NN-HL - 1
grand: 
 JJ - 30
 JJ-TL - 15
 FW-JJ-TL - 3
jury: 
 NN - 63
 NN-TL - 3
 NN-HL - 1
said: 
 VBD - 1747
 VBN - 213
 VBN-HL - 1
friday: 
 NR - 55
 NR-TL - 4
 NP - 1
an: 
 AT - 3706
 AT-HL - 13
 CC - 12
 AT-TL - 6
 NIL - 2
 AT-NC - 1
investigation: 
 NN - 43
 NN-TL - 8
of: 
 IN - 35028
 IN-TL - 1166
 IN-HL - 207
 IN-TL-HL - 6
 IN-NC - 3
 NIL - 2
atlanta's: 
 NP$ - 4
recent: 
 JJ - 178
 JJ-HL - 1
primary: 
 JJ - 79
 NN - 15
 JJ-HL - 2
election: 
 NN - 74
 NN-TL - 3
produced: 
 VBN - 62
 VBD - 28
``: 
 `` - 8837
no: 
 AT - 1805
 RB - 180
 QL - 137
 AT-TL - 8
 AT-HL - 6
 RB-NC - 3
evidence: 
 NN - 203
 VB - 1
'': 
 '' - 8789
that: 
 CS - 6464
 DT - 2260
 WPS - 1654
 WPO - 135
 QL - 56
 DT-NC - 6
 DT-TL - 5
 WPS-TL - 3
 WPS-NC - 3
 CS-NC - 2
 WPS-HL - 2
 CS-HL - 1
 DT-HL - 1
 NIL - 1
 WPO-NC - 1
any: 
 DTI - 1328
 QL - 12
 DTI-HL - 2
 DTI-TL - 1
 RB

### Exercise 4

Tabulate the frequencies of the universal tags that precede nouns in the Brown Corpus. 

In [27]:
brown_tagged = cp.brown.tagged_words(tagset='universal')

tags = [ a[1] for (a, b) in nltk.bigrams(brown_tagged) if b[1] == ('NOUN')]

fdist = nltk.FreqDist(tags)
fdist.tabulate()

  DET   ADJ  NOUN   ADP     .  VERB  CONJ   NUM   ADV   PRT  PRON     X 
85845 54653 41309 37418 20084 17851  9294  5668  1851  1068   440    77 


### Exercise 5

Write a function `ambiguous(tagged_text)` that returns the number of ambiguous word types as well as the number of all word types in a tagged text. A word type is ambiguous if it is tagged with at least two different tags. Use the function to print both values as well as the percentage of ambiguous word types for the Brown Corpus both for the original and the universal tagset.

In [29]:
def ambiguous(tagged_text):
    cfd = nltk.ConditionalFreqDist( (w.lower(),t) for (w,t) in tagged_text )

    amb_words = [ w for w in cfd.conditions() if len(cfd[w]) > 1 ]
    words = set( w for w,t in tagged_text)

    percentage = round( len(amb_words) / len(words) * 100, 2)

    print('Word types: {} - Ambiguous word types: {} - Percentage: {}%'.format(len(words), len(amb_words), percentage))


print('With brown tagset:') 
ambiguous(cp.brown.tagged_words())

print('With universal tagset:') 
ambiguous(cp.brown.tagged_words(tagset='universal'))


With brown tagset:
Word types: 56057 - Ambiguous word types: 9580 - Percentage: 17.09%
With universal tagset:
Word types: 56057 - Ambiguous word types: 3408 - Percentage: 6.08%


### Exercise 6

Write code to search the Brown Corpus to answer the following questions:

a) produce an alphabetically sorted list of the distinct words tagged as `MD`  
b) identify words that can be plural nouns or third person singular verbs  
c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form `IN+AT+NN`, separated by semicolons


In [126]:
brown_tagged = cp.brown.tagged_words()

In [127]:
# a)
sorted( set( w for w,t in brown_tagged if t == 'MD'))

['Can',
 'Could',
 'May',
 'Might',
 'Must',
 'Ought',
 'Shall',
 'Should',
 'Will',
 'Would',
 "c'n",
 'can',
 'colde',
 'could',
 'dare',
 'kin',
 'maht',
 'mai',
 'may',
 'maye',
 'mayst',
 'might',
 'must',
 'need',
 'ought',
 'shall',
 'should',
 'shuld',
 'shulde',
 'wil',
 'will',
 'wilt',
 'wod',
 'wold',
 'wolde',
 'would']

In [128]:
# b)
set( w for w,t in brown_tagged if t == 'NNS' or t == 'VBZ')

{'favors',
 'specifications',
 'greenhouses',
 'shibboleths',
 'sells',
 'overshoots',
 'Figs.',
 'ballots',
 'tepees',
 'bronchi',
 'brutalities',
 'suppers',
 'inventions',
 'Circumstances',
 'jogs',
 'slants',
 'tonalities',
 'lay-offs',
 'contracts',
 'meats',
 'predictors',
 'remains',
 'fits',
 'seventies',
 'nuisances',
 'burlesques',
 'prospers',
 'torrents',
 'habitants',
 'Doctors',
 'inducements',
 'necessities',
 '$1.7',
 'invaders',
 'punks',
 'circles',
 'pinnings',
 'entertainments',
 'rationalizations',
 'rallies',
 'abnormalities',
 'microwaves',
 'islands',
 'communiques',
 'earnings',
 'dregs',
 'Highways',
 'larvae',
 'pp.',
 'photoelectrons',
 'Invitations',
 'Elections',
 'grandmothers',
 'props',
 '$28',
 'admires',
 'policies',
 'babies',
 'setbacks',
 'marts',
 'discrepancies',
 'swallows',
 'ologies',
 'majors',
 'Costs',
 'combines',
 'shipyards',
 'initials',
 'publicists',
 'italics',
 'howls',
 'furs',
 'suppositions',
 'textures',
 'hollows',
 'explanatio

In [131]:
# c)
sorted(set( f'{w1};{w2};{w3}' for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(brown_tagged) if t1 == 'IN' and t2 == 'AT' and t3 == 'NN'))

["'bout;the;saddle",
 "'ceptin';the;light",
 'About;the;murder',
 'Above;the;tongue',
 'Across;the;bay',
 'Across;the;bridge',
 'Across;the;front',
 'Across;the;road',
 'Across;the;street',
 'Across;the;table',
 'Across;the;way',
 'After;a;conversation',
 'After;a;day',
 'After;a;dinner',
 'After;a;flood',
 'After;a;minute',
 'After;a;moment',
 'After;a;pause',
 'After;a;reception',
 'After;a;roundup',
 'After;a;sort',
 'After;a;supper',
 'After;a;time',
 'After;a;while',
 'After;a;year',
 'After;an;earthquake',
 'After;every;money',
 'After;every;session',
 'After;the;collapse',
 'After;the;demise',
 'After;the;diagnosing',
 'After;the;game',
 'After;the;meal',
 'After;the;meeting',
 'After;the;pegboard',
 'After;the;soil',
 'After;the;spate',
 'After;the;storm',
 'After;the;war',
 'Against;the;ruin',
 'Along;the;way',
 'Amid;a;shortage',
 'Amid;the;crackle',
 'Among;the;policy',
 'Around;the;billiard',
 'Around;the;table',
 'As;a;result',
 'At;a;ceremony',
 'At;a;minimum',
 'At;a;nod

### Exercise 7

Write a function `prec_adv(word, text)` that returns an alphabetically sorted list of distinct adverbs that precede `word` in `text`. Use this function to find out which adverbs precede the words "love", "like", and "prefer" in the Brown corpus. 

In [30]:
def prec_adv(word, text):
    text_tagged = text.tagged_words(tagset='universal')

    words = set( a[0] for (a, b) in nltk.bigrams(text_tagged) if a[1] == 'ADV' and b[0] == word )
    return sorted(words)

for w in ['love', 'like', 'prefer']:
    advs = prec_adv(w, cp.brown)
    print('Searching preceding adverbs for \'{}\':\n{}\n'.format(w, advs))

Searching preceding adverbs for 'love':
['always', 'dearly', 'just', 'not']

Searching preceding adverbs for 'like':
['How', 'Jist', 'Just', 'Kinda', 'More', 'Not', 'abreast', 'almost', 'alone', 'always', 'around', 'by', 'close', 'deceptively', 'even', 'exactly', 'gloriously', 'here', 'increasingly', 'jist', 'just', 'less', 'more', 'much', 'not', 'often', 'particularly', 'quick', 'quite', 'rather', 'remarkably', 'roughly', 'simply', 'so', 'somewhat', 'sure', 'there', 'together', 'wildly', 'yet']

Searching preceding adverbs for 'prefer':
['generally', 'much', 'not', 'spontaneously']

