# Classifying names

In [None]:
import nltk

from nltk.corpus import names

# Load the two word lists of the NLTK names corpus.
female_names = names.words('female.txt')
male_names = names.words('male.txt')
print('female names: ' + str(len(female_names)))
print('male names: ' + str(len(male_names)))

# Number of entries in the male list that also occur in the female list.
# Membership is tested against a set so each lookup is constant-time instead
# of a scan over the whole female list.
female_name_set = set(female_names)
print(len([w for w in male_names if w in female_name_set]))


In [None]:
# Pair every name with the corpus file it came from and its final character,
# count those pairs per file, and plot the resulting distributions.
last_letter_by_file = (
    (corpus_file, entry[-1])
    for corpus_file in names.fileids()
    for entry in names.words(corpus_file)
)
cfd = nltk.ConditionalFreqDist(last_letter_by_file)
cfd.plot()

## First try
### Extracting a feature from each name: its last letter

In [None]:
def gender_features(word):
    """Return a one-entry feature dict holding the final character of `word`.

    The character is stored under the key 'last_letter' exactly as it appears
    in `word` (no case folding). An empty `word` raises IndexError.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

gender_features('Shrek')

### Create a labelled dataset

In [None]:
import random

# Fixed seed for the shuffle below, so the order of labeled_names -- and with
# it every slice taken from that list later on -- is identical on each
# Restart & Run All.
SHUFFLE_SEED = 42

# One (name, label) pair per corpus entry: all male names, then all female.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

# A dedicated Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

### Extract features and train a classifier

In [None]:
# Convert each labelled name into a (feature dict, label) pair.
featuresets = []
for (n, gender) in labeled_names:
    featuresets.append((gender_features(n), gender))

# The first 500 pairs are held out for testing; the rest are used to fit a
# Naive Bayes classifier.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

### Test

In [None]:
# Classify the name 'Neo' from the features gender_features extracts for it.
neo_features = gender_features('Neo')
classifier.classify(neo_features)

In [None]:
# Classify the name 'Trinity' from the features gender_features extracts for it.
trinity_features = gender_features('Trinity')
classifier.classify(trinity_features)

In [None]:
# Score the classifier on the held-out test_set and print the result.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# Show the 5 most informative features of the trained classifier.
classifier.show_most_informative_features(5)

## Second Try
### Extracting more features

In [None]:
def gender_features2(name):
    """Build a letter-based feature dict for `name`.

    The returned dict contains:
      - "first_letter" / "last_letter": the first and last character of
        `name`, each lower-cased;
      - "count(x)": how many times x occurs in the lower-cased name, for
        every x in a-z;
      - "has(x)": True/False, whether x occurs in the lower-cased name, for
        every x in a-z.

    Raises IndexError when `name` is empty (it has no first or last character).
    """
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    # Lower-case the whole name once and reuse it for all 26 letters.
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

### Test

In [None]:
# Repeat the experiment with the richer gender_features2 representation,
# using the same split: first 500 pairs for testing, the rest for training.
featuresets = []
for (n, gender) in labeled_names:
    featuresets.append((gender_features2(n), gender))
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
features2_accuracy = nltk.classify.accuracy(classifier, test_set)
print(features2_accuracy)

(This gives lower accuracy than the single last-letter feature used before.)

### Introduce a dev-set

In [None]:
# Cut the shuffled names into three non-overlapping slices:
# the first 500 for the final test, the next 1000 for dev-testing,
# and everything after that for training.
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [None]:
# Featurise each split with gender_features, fit on the training split only,
# and print the accuracy measured on the dev-test split.
train_set = [(gender_features(name), label) for (name, label) in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
devtest_set = [(gender_features(name), label) for (name, label) in devtest_names]
test_set = [(gender_features(name), label) for (name, label) in test_names]
devtest_accuracy = nltk.classify.accuracy(classifier, devtest_set)
print(devtest_accuracy)

### Error analysis

In [None]:
# Predict a label for every dev-test name and keep the rows where the
# prediction disagrees with the true label, as (true label, guess, name).
predictions = [(tag, classifier.classify(gender_features(name)), name)
               for (name, tag) in devtest_names]
errors = [row for row in predictions if row[1] != row[0]]

# List the mistakes in sorted order, one aligned line per name.
for row in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(*row))

### New features: one- and two-letter suffixes

In [None]:
def gender_features(word):
    """Return the last one and last two characters of `word` as features.

    The result has the keys 'suffix1' and 'suffix2'. Slicing is used, so a
    word shorter than the suffix length yields whatever characters exist
    (an empty string for an empty word) instead of raising.
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [None]:
# Rebuild the training and dev-test sets with the current gender_features,
# retrain the Naive Bayes classifier, and print its dev-test accuracy.
train_set = []
for (n, gender) in train_names:
    train_set.append((gender_features(n), gender))
devtest_set = []
for (n, gender) in devtest_names:
    devtest_set.append((gender_features(n), gender))
classifier = nltk.NaiveBayesClassifier.train(train_set)
suffix_accuracy = nltk.classify.accuracy(classifier, devtest_set)
print(suffix_accuracy)

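## A better dataset?
Before reading in a dataset of names from the Social Security Database, look at the probability the current classifier assigns to each label for a single name: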

In [None]:
# Print the probability the classifier assigns to each label for 'Neo'.
dist = classifier.prob_classify(gender_features('Neo'))
for label in dist.samples():
    label_probability = dist.prob(label)
    print("%s: %f" % (label, label_probability))

Read in a dataset of names from the Social Security Database, which is about 10x bigger.

In [None]:
import csv
import glob
from collections import Counter

# Per-sex tallies: ssn_name_dict[sex][name] accumulates the count column for
# that name over every file read below.
ssn_name_dict = {'M' : Counter(), 'F' : Counter()}

# Every file matching the pattern is parsed as comma-separated rows. The code
# reads column 0 as the name, column 1 as the sex key ('M' or 'F') and
# column 2 as an integer count.
ssnfiles = glob.glob("names/yob*.txt")
for ssnfile in ssnfiles:
    with open(ssnfile) as csvfile:
        ssnreader = csv.reader(csvfile, delimiter=',')
        for row in ssnreader:
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # instead of failing on row[1].
                continue
            ssn_name_dict[row[1]][row[0]] += int(row[2])

# Number of distinct names seen for each sex.
print('Female names: ' + str(len(ssn_name_dict['F'])))
print('Male names: ' + str(len(ssn_name_dict['M'])))

In [None]:
import random

# Fixed seed for the shuffle below, so the splits taken from
# labeled_ssn_names are identical on each Restart & Run All.
SSN_SHUFFLE_SEED = 42

# One (name, label) pair per distinct name and sex. The names are sorted
# first because the Counter's key order follows the order in which the files
# were read, which would otherwise make the seeded shuffle start from a
# different list on different machines.
labeled_ssn_names = ([(name, 'male') for name in sorted(ssn_name_dict['M'].keys())] +
                 [(name, 'female') for name in sorted(ssn_name_dict['F'].keys())])

# A dedicated Random instance keeps the global random state untouched.
random.Random(SSN_SHUFFLE_SEED).shuffle(labeled_ssn_names)

### Train a classifier on the dataset

In [None]:
# Split the shuffled SSN names: everything from index 1500 on for training,
# indices 500-1499 for dev-testing, and the first 500 for the final test.
train_names, devtest_names, test_names = (
    labeled_ssn_names[1500:],
    labeled_ssn_names[500:1500],
    labeled_ssn_names[:500],
)

In [None]:
# Featurise the three SSN splits with gender_features, fit Naive Bayes on the
# training split, and print the accuracy on the dev-test split.
train_set = [(gender_features(ssn_name), label) for (ssn_name, label) in train_names]
devtest_set = [(gender_features(ssn_name), label) for (ssn_name, label) in devtest_names]
test_set = [(gender_features(ssn_name), label) for (ssn_name, label) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
ssn_accuracy = nltk.classify.accuracy(classifier, devtest_set)
print(ssn_accuracy)

In [None]:
# Same three splits, featurised with gender_features2 instead.
train_set2 = []
for (n, gender) in train_names:
    train_set2.append((gender_features2(n), gender))
devtest_set2 = []
for (n, gender) in devtest_names:
    devtest_set2.append((gender_features2(n), gender))
test_set2 = []
for (n, gender) in test_names:
    test_set2.append((gender_features2(n), gender))

# Fit a second Naive Bayes classifier and print its dev-test accuracy.
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)
ssn_accuracy2 = nltk.classify.accuracy(classifier2, devtest_set2)
print(ssn_accuracy2)

In [None]:
# Print the probability the classifier assigns to each label for 'Abigail'.
dist = classifier.prob_classify(gender_features('Abigail'))
for label in dist.samples():
    label_probability = dist.prob(label)
    print("%s: %f" % (label, label_probability))

In [None]:
# Size of the current training set.
print(len(train_set))

In [None]:
# Count the names that appear under both 'M' and 'F' in the SSN data.
shared_names = set(ssn_name_dict['M'].keys()).intersection(ssn_name_dict['F'].keys())
print(len(shared_names))
print("done")

In [None]:
# Fit a decision tree on the same training set and print its accuracy on the
# dev-test set.
dt_classifier = nltk.DecisionTreeClassifier.train(train_set)
dt_accuracy = nltk.classify.accuracy(dt_classifier, devtest_set)
print(dt_accuracy)

In [None]:
# Fit a maximum-entropy classifier on the same training set and print its
# accuracy on the dev-test set.
me_classifier = nltk.MaxentClassifier.train(train_set)
me_accuracy = nltk.classify.accuracy(me_classifier, devtest_set)
print(me_accuracy)

## Scikit-learn

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Toy data: twelve two-component rows in x, with one class label per row in y.
x = np.array([
    [-3, 7], [1, 5], [1, 2], [-2, 0],
    [2, 3], [-4, 0], [-1, 1], [1, 1],
    [-2, 2], [2, 7], [-4, 1], [-2, 7],
])
y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

# Build a Gaussian Naive Bayes model and fit it on the toy data.
model = GaussianNB()
model.fit(x, y)

# Predict the labels of two query points.
query_points = [[1, 2], [3, 4]]
predicted = model.predict(query_points)