In [23]:
import nltk
import random
from nltk.corpus import movie_reviews, twitter_samples, names, nps_chat
from random import shuffle

In [30]:
def gender_features(word):
    return{'last_letter': word[-1]}

labeled_names = ([(name, 'male') for name in names.words('male.txt')]
                 + [(name, 'female') for name in names.words('female.txt')])

shuffle(labeled_names)

In [31]:
from nltk.classify import apply_features

train_set = apply_features(gender_features, labeled_names[500:])

test_set = apply_features(gender_features, labeled_names[:500])

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [32]:
print('Erin is a ' + classifier.classify(gender_features('Erin')))
print('Josh is a ' + classifier.classify(gender_features('Josh')))

Erin is a male
Josh is a female


In [33]:
print(nltk.classify.accuracy(classifier,test_set))

0.764


In [170]:
print(classifier.show_most_informative_features(10))

Most Informative Features
             last_letter = 'k'              male : female =     46.8 : 1.0
             last_letter = 'a'            female : male   =     35.4 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'o'              male : female =      9.2 : 1.0
             last_letter = 'm'              male : female =      8.8 : 1.0
             last_letter = 'r'              male : female =      7.1 : 1.0
             last_letter = 'w'              male : female =      5.1 : 1.0
None


In [191]:
errors = []

for (name, tag) in labeled_names[:500]:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append([tag, guess, name])

In [60]:
for (tag, guess, name) in sorted(errors[:10]):
    print('correct = {:<8} guessed = {:<8s} name = {:<30}'.format(tag, guess, name))

correct = female   guessed = male     name = Lilian                        
correct = female   guessed = male     name = Tamar                         
correct = female   guessed = male     name = Winnifred                     
correct = male     guessed = female   name = Andrea                        
correct = male     guessed = female   name = Davie                         
correct = male     guessed = female   name = Emmery                        
correct = male     guessed = female   name = Jaime                         
correct = male     guessed = female   name = Orbadiah                      
correct = male     guessed = female   name = Shay                          
correct = male     guessed = female   name = Tanney                        


In [34]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [36]:
train_set2 = apply_features(gender_features, train_names)
devtest_set = apply_features(gender_features, devtest_names)
test_set2 = apply_features(gender_features, test_names)

In [37]:
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

In [39]:
print('Erin is a ' + classifier2.classify(gender_features('Erin')))

Erin is a male


In [40]:
print(nltk.classify.accuracy(classifier2, devtest_set))

0.769


In [41]:
classifier2.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     36.9 : 1.0
             last_letter = 'k'              male : female =     26.2 : 1.0
             last_letter = 'f'              male : female =     15.4 : 1.0
             last_letter = 'p'              male : female =     12.0 : 1.0
             last_letter = 'd'              male : female =      9.1 : 1.0
             last_letter = 'o'              male : female =      8.7 : 1.0
             last_letter = 'v'              male : female =      8.6 : 1.0
             last_letter = 'm'              male : female =      7.6 : 1.0
             last_letter = 'r'              male : female =      6.3 : 1.0
             last_letter = 'g'              male : female =      5.3 : 1.0


In [42]:
nltk.classify.accuracy(classifier2, test_set2)

0.762

In [57]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

In [58]:
for (tag, guess, name) in errors[:10]:
    print('correct = {:<8} guess = {:<8s} name = {:<30}'.format(tag, guess, name))

correct = male     guess = female   name = Orbadiah                      
correct = male     guess = female   name = Jaime                         
correct = female   guess = male     name = Winnifred                     
correct = male     guess = female   name = Emmery                        
correct = male     guess = female   name = Shay                          
correct = male     guess = female   name = Andrea                        
correct = female   guess = male     name = Lilian                        
correct = male     guess = female   name = Tanney                        
correct = male     guess = female   name = Davie                         
correct = female   guess = male     name = Tamar                         


In [45]:
#refining feature function
def gender_feature2(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

In [46]:
train_set3 = apply_features(gender_feature2, train_names)
devtest_set2 = apply_features(gender_feature2, devtest_names)
test_set3 = apply_features(gender_feature2, test_names)
Classifier = nltk.classify.NaiveBayesClassifier.train(train_set3)

In [47]:
nltk.classify.accuracy(Classifier, devtest_set2)

0.791

In [48]:
Classifier.show_most_informative_features(10)

Most Informative Features
                 suffix2 = 'na'           female : male   =     86.6 : 1.0
                 suffix2 = 'la'           female : male   =     67.3 : 1.0
                 suffix2 = 'ia'           female : male   =     44.9 : 1.0
                 suffix1 = 'a'            female : male   =     36.9 : 1.0
                 suffix2 = 'us'             male : female =     34.6 : 1.0
                 suffix2 = 'ra'           female : male   =     32.8 : 1.0
                 suffix2 = 'sa'           female : male   =     28.0 : 1.0
                 suffix2 = 'ta'           female : male   =     27.7 : 1.0
                 suffix1 = 'k'              male : female =     26.2 : 1.0
                 suffix2 = 'do'             male : female =     25.2 : 1.0


In [49]:
nltk.classify.accuracy(Classifier, test_set3)

0.79

In [52]:
error = []
for (name, tag) in devtest_names:
    guess = Classifier.classify(gender_feature2(name))
    if guess != tag:
        error.append((tag, guess, name))

In [55]:
for (tag, guess, name) in error[:10]:
    print('correct = {:<20} guess = {:<20} name = {}'.format(tag, guess, name))

correct = male                 guess = female               name = Orbadiah
correct = male                 guess = female               name = Jaime
correct = female               guess = male                 name = Winnifred
correct = male                 guess = female               name = Emmery
correct = male                 guess = female               name = Andrea
correct = female               guess = male                 name = Lilian
correct = male                 guess = female               name = Davie
correct = female               guess = male                 name = Tamar
correct = female               guess = male                 name = Conney
correct = male                 guess = female               name = Zebedee


Document classification

In [83]:
documents = [(list(movie_reviews.words(fileid)), category) for fileid in movie_reviews.fileids()
                  for category in movie_reviews.categories()]

random.shuffle(documents)

In [129]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)

In [128]:
#words = set(w.lower() for w in movie_reviews.words())
#print(sorted(list(words)))

In [144]:
def generate_feature(document):
    document_words = set(document)
    feature = {}
    for word in word_features:
        feature['contais({})'.format(word)]=(word in document_words)
    return feature

In [151]:
data_length = len(documents)

train_set = apply_features(generate_feature, documents[:int(data_length*.9)])
test_set = apply_features(generate_feature, documents[int(data_length*.9):])

print(len(documents))
print(len(train_set))
print(len(test_set))

4000
3600
400


In [152]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [153]:
classifier.classify(generate_feature(movie_reviews.words(fileids = 'neg/cv000_29416.txt')))

'neg'

Dialogue classification

In [154]:
posts = nltk.corpus.nps_chat.xml_posts()

def dialogue_act_features(post):
     features = {}
     for word in nltk.word_tokenize(post):
         features['contains({})'.format(word.lower())] = True
     return features

In [155]:
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
print(featuresets[:2])

random.shuffle(featuresets)

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))

[({'contains(now)': True, 'contains(im)': True, 'contains(left)': True, 'contains(with)': True, 'contains(this)': True, 'contains(gay)': True, 'contains(name)': True}, 'Statement'), ({'contains(:)': True, 'contains(p)': True}, 'Emotion')]
0.6335227272727273


In [156]:
classifier.show_most_informative_features(10)


Most Informative Features
             contains(>) = True            Other : System =    536.2 : 1.0
          contains(part) = True           System : Statem =    442.0 : 1.0
            contains(no) = True           nAnswe : System =    441.6 : 1.0
            contains(hi) = True            Greet : System =    435.5 : 1.0
         contains(empty) = True            Other : System =    396.3 : 1.0
           contains(brb) = True              Bye : Statem =    312.3 : 1.0
          contains(nope) = True           nAnswe : Statem =    255.8 : 1.0
           contains(are) = True           whQues : System =    210.8 : 1.0
           contains(yes) = True           yAnswe : System =    202.2 : 1.0
          contains(what) = True           whQues : Greet  =    194.1 : 1.0


Evaluation

In [157]:
from nltk.metrics.scores import (precision, recall, f_measure)
import collections

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print(refsets['Emotion'])
print(testsets['Emotion'])

{515, 517, 1031, 1039, 535, 26, 1050, 31, 37, 41, 564, 64, 69, 585, 587, 588, 87, 97, 110, 115, 119, 637, 133, 647, 144, 146, 662, 666, 671, 163, 676, 165, 680, 169, 177, 708, 710, 205, 209, 726, 224, 739, 232, 753, 760, 761, 260, 772, 782, 790, 792, 793, 290, 298, 307, 315, 831, 330, 851, 854, 855, 345, 349, 868, 870, 359, 360, 365, 373, 889, 394, 401, 407, 409, 414, 930, 935, 424, 936, 937, 944, 437, 962, 964, 453, 967, 969, 472, 473, 476, 480, 1004, 1005, 494, 1015, 504, 1017, 1020}
{515, 260, 517, 1031, 394, 782, 1039, 144, 662, 407, 790, 409, 792, 793, 1049, 1050, 414, 31, 290, 163, 930, 37, 935, 680, 41, 169, 298, 177, 307, 310, 64, 962, 708, 69, 453, 710, 964, 585, 967, 587, 588, 205, 969, 209, 851, 726, 854, 855, 345, 473, 476, 889, 224, 480, 870, 359, 232, 360, 1004, 365, 110, 494, 760, 753, 1005, 115, 1015, 504, 761, 1020, 637, 767}


In [158]:
print(refsets.keys())
    
print("Precision: ", precision(refsets['Emotion'], testsets['Emotion']))

print("Recall: ", recall(refsets['Emotion'], testsets['Emotion']))
    
print("F Score: ", f_measure(refsets['Emotion'], testsets['Emotion']))

dict_keys(['ynQuestion', 'whQuestion', 'Greet', 'Bye', 'Statement', 'System', 'Emphasis', 'Accept', 'Emotion', 'Continuer', 'Reject', 'Clarify', 'yAnswer', 'nAnswer', 'Other'])
Precision:  0.9583333333333334
Recall:  0.7040816326530612
F Score:  0.8117647058823528


Confusion Matrix

In [160]:
answer = [tag for (document, tag) in test_set]

print(len(answer))
print(answer[:20])

1056
['ynQuestion', 'whQuestion', 'Greet', 'Bye', 'Statement', 'System', 'Statement', 'System', 'Greet', 'Emphasis', 'Statement', 'Statement', 'Statement', 'Statement', 'whQuestion', 'Statement', 'Statement', 'Statement', 'Statement', 'Accept']


In [163]:
guesses = []
for (document, tag) in test_set:
    guess_this = classifier.classify(document)
    guesses.append(guess_this)

print(len(guesses))
print(guesses[:30])

1056
['Other', 'whQuestion', 'Greet', 'Bye', 'nAnswer', 'System', 'nAnswer', 'System', 'Greet', 'whQuestion', 'Greet', 'Statement', 'Statement', 'Statement', 'whQuestion', 'Statement', 'Other', 'Clarify', 'Clarify', 'yAnswer', 'System', 'System', 'Other', 'Greet', 'Clarify', 'System', 'Statement', 'ynQuestion', 'nAnswer', 'System']


In [164]:
cm = nltk.ConfusionMatrix(answer, guesses)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

           |                                  w      y                      |
           |      S                           h      n                      |
           |      t                           Q      Q      E               |
           |      a                    E      u      u      m               |
           |      t      S             m      e      e      p      A      R |
           |      e      y      G      o      s      s      h      c      e |
           |      m      s      r      t      t      t      a      c      j |
           |      e      t      e      i      i      i      s      e      e |
           |      n      e      e      o      o      o      i      p      c |
           |      t      m      t      n      n      n      s      t      t |
-----------+----------------------------------------------------------------+
 Statement | <14.6%>  0.2%   0.5%   0.2%   0.7%   1.3%   1.0%   0.4%   2.0% |
    System |      . <21.2%>     .      .   0.1%      .   0.4%   