# Natural Language Processing

## Exercise Sheet 6

In [1]:
#imports for all exercises
import nltk
from nltk.classify import apply_features, accuracy
from nltk.corpus import names, ppattach, senseval, movie_reviews
import random
from statistics import mean
from pickle import load

### Exercise 1

Write a name gender classifier using the Names Corpus, the `apply_features` function, shuffling, and a test set of 500 instances. Use the following features:

a) first letter;  
b) last letter;  
c) last two letters;  
d) length;  
e) for each letter one feature, which is true if the name contains the letter.

Use the `NaiveBayesClassifier`, calculate the accuracy, and display the 10 most informative features.


In [None]:
def gender_features(word):
    features = {}
  
    features['first_letter'] = word[0].lower()
    features['last_letter'] = word[-1].lower()
    features['last_two_lettes'] = word[-2:].lower()
    features['length'] = len(word)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["has({})".format(letter)] = (letter in word.lower())
    
    return features
        

In [None]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
accuracy(classifier, test_set)

In [None]:
classifier.show_most_informative_features(10)

### Exercise 2

The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. Using this dataset, build a `NaiveBayesClassifier` that predicts the correct sense tag for a given instance for the word "hard":

Use the preceding and following word as features. They can be calculated by retrieving the position of the word "hard" as `p=inst.position` and then accessing `inst.context[p-1]` and `inst.context[p+1]`.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.

In [None]:
def features(inst):
    pos = inst.position
    prev = inst.context[pos-1][0]
    next = inst.context[pos+1][0]

    return {
        'prev_word': prev,
        'next_word': next
    }

In [None]:
instances = senseval.instances('hard.pos')
labeled_instances = [(inst, inst.senses) for inst in instances]

In [None]:
size = int(len(labeled_instances) * 0.1)
accuracy_memory = []

for i in range(10):
    random.shuffle(labeled_instances)
    train_set = apply_features(features, labeled_instances[size:])
    test_set = apply_features(features, labeled_instances[:size])

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    acc = accuracy(classifier, test_set)
    accuracy_memory.append(acc)
    print( 'Accuracy of run {}: {}'.format(i+1, acc) )


avg = mean(accuracy_memory)
print( 'Overall Accuracy of 10 runs: {}'.format(avg) )

### Exercise 3

The synonyms "strong" and "powerful" pattern differently. Use the tagged Brown corpus with the universal tagset to first list the nouns which follow "strong" vs. "powerful". Write for this a function `next_noun(word, tagged_text)` which returns the list of nouns that follow `word` in the `tagged_text`. Build then a `NaiveBayesClassifier` that predicts when each word should be used by using the function `apply_features` and the following noun as single feature.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.


In [None]:
def next_noun(word, tagged_text):
  return [ b[0] for (a, b) in nltk.bigrams(tagged_text) if a[0] == word and b[1] == 'NOUN' ]

In [None]:
strong_nouns = next_noun('strong', nltk.corpus.brown.tagged_words(tagset='universal'))
powerful_nouns = next_noun('powerful', nltk.corpus.brown.tagged_words(tagset='universal'))

strong_nouns_tagged = [ (n, 'strong') for n in strong_nouns ]
powerful_nouns_tagged = [ (n, 'powerful') for n in powerful_nouns ]

combined_nouns_tagged = strong_nouns_tagged + powerful_nouns_tagged

In [None]:
def word_feature(word):
    return {'word': word}

In [None]:
size = int(len(combined_nouns_tagged) * 0.1)
accuracy_cache = []

for i in range(10):

    random.shuffle(combined_nouns_tagged)
    train_set = apply_features(word_feature, combined_nouns_tagged[size:])
    test_set = apply_features(word_feature, combined_nouns_tagged[:size])

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    acc = accuracy(classifier, test_set)
    accuracy_cache.append(acc)
    print('Accuracy in round {}: {}'.format(i+1, acc))

mean_acc = mean(accuracy_cache)
print('Mean Accuracy of 10 tries: {}'.format(mean_acc))

### Exercise 4

Based on the Movie Reviews document classifier discussed in this chapter, build a new `NaiveBayesClassifier`. Tag first the Movie Reviews Corpus using the combined tagger from the previous chapter stored in `t2.pkl`. Filter the tagged words to contain only words for the tags `['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']` as well as only alphabetic tokens with at least three characters. Convert the words to lowercase. Use the most common 5000 words as `word_features` in the function `document_features`. 

Run 10 iterations by reshuffling the instances and printing the accuracy and 5 most informative features for each iteration. Finally, print the average accuracy.
    

In [2]:
# imports tagger for tokenized text from chapter 5
input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()

In [3]:
# creates documents with list of tagged words and movie review
documents_tagged = [(tagger.tag(movie_reviews.words(fileid)), category)
                    for category in movie_reviews.categories()
                    for fileid in movie_reviews.fileids(category)
                    ]

In [4]:
target_tags = ['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']

# filter out words without tags from target_tags, words that are not alphabetic and shorter than 3 chars
# also converts words to lowercase and discards tags, as they are not needed anymore
documents_filtered = []
for (tag_words, cat) in documents_tagged:
    words_temp = []
    for(w, tag) in tag_words:
        if w.isalpha() and len(w) > 2 and tag in target_tags:
            words_temp.append(w.lower())
    documents_filtered.append((words_temp, cat))

In [5]:
# find 5000 most common words to use as features
fdist = nltk.FreqDist( w for (words, cat) in documents_filtered for w in words )
word_features = list(fdist)[:5000]

In [6]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [11]:
size = int(len(documents_filtered) * 0.1)
accuracy_memory = []

for i in range(10):
    random.shuffle(documents_filtered)
    train_set = apply_features(document_features, documents_filtered[size:])
    test_set = apply_features(document_features, documents_filtered[:size])

    classifier = nltk.NaiveBayesClassifier.train(train_set)

    acc = accuracy(classifier, test_set)
    accuracy_memory.append(acc)
    print( '\nAccuracy of run {}: {}'.format(i+1, acc) )
    # print( '5 Most informative features: {}\n'.format(classifier.show_most_informative_features(5)) )
    classifier.show_most_informative_features(5)

Accuracy of run 1: 0.8
Most Informative Features
     contains(insulting) = True              neg : pos    =     16.3 : 1.0
     contains(ludicrous) = True              neg : pos    =     14.6 : 1.0
        contains(seagal) = True              neg : pos    =     13.7 : 1.0
   contains(outstanding) = True              pos : neg    =     10.5 : 1.0
         contains(sucks) = True              neg : pos    =     10.2 : 1.0
Accuracy of run 2: 0.82
Most Informative Features
   contains(magnificent) = True              pos : neg    =     20.4 : 1.0
         contains(sucks) = True              neg : pos    =     16.3 : 1.0
     contains(maintains) = True              pos : neg    =     13.0 : 1.0
   contains(outstanding) = True              pos : neg    =     12.6 : 1.0
       contains(tribute) = True              pos : neg    =     11.0 : 1.0


In [10]:
avg = mean(accuracy_memory)
print( 'Overall Accuracy of 10 runs: {}'.format(avg) )

Overall Accuracy of 10 runs: 0.8200000000000001


### Exercise 5

The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the training corpus is encoded as a `PPAttachment` object:

    from nltk.corpus import ppattach
    ppattach.attachments('training')
    
        [PPAttachment(sent='0', verb='join', noun1='board',
            prep='as', noun2='director', attachment='V'),
        PPAttachment(sent='1', verb='is', noun1='chairman',
            prep='of', noun2='N.V.', attachment='N'),
        ...]

    inst = ppattach.attachments('training')[1]
    (inst.noun1, inst.prep, inst.noun2)
    
        ('chairman', 'of', 'N.V.')

In the same way, `ppattach.attachments('test')` accesses the test instances. Select only the instances where `inst.attachment` is `'N'`:

In [None]:
nattach = [inst for inst in ppattach.attachments('training')
               if inst.attachment == 'N']

Using this sub-corpus, build a `NaiveBayesClassifier` that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers", the classifier should predict the preposition "of". 

Write for this purpose a function `prepare_featuresets(subcorpus)`, where `subcorpus` is either the string "training" or "test" to return the training set or the test set. 

Print the achieved accuracy as well as the result of `classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })`.

In [None]:

def preposition_features(att):
    return ({'noun1': att.noun1, 'noun2': att.noun2}, att.prep)
    
def prepare_featuresets(subcorpus):
    nattach = [inst for inst in ppattach.attachments(subcorpus)
               if inst.attachment == 'N']
    
    return apply_features(preposition_features, nattach)

In [None]:
train_set = prepare_featuresets('training')
test_set = prepare_featuresets('test')

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
print( 'Accuracy: {}'.format( accuracy(classifier, test_set) ) )
print( classifier.classify({'noun1': 'team', 'noun2': 'researchers'}) )