# Gender Identification With A Decision Tree

In [47]:
### Set up NLTK and make sure the names corpus is available
import nltk

# Download names if they don't exist locally.
# The lookup of 'corpora/names' is attempted first; only when it fails with
# LookupError is the 'names' package fetched via nltk.download.
try:
    nltk.data.find('corpora/names')
except LookupError:
    nltk.download('names')
    
    
from nltk.corpus import names
import random


In [49]:
def gender_features(word):
    """Build the feature dict used to classify a name by gender.

    Args:
        word: The name to featurize, as a string.

    Returns:
        A dict with the single key 'last_letter' mapped to the final
        character of ``word``, or to '' when ``word`` is empty.
    """
    # Slicing (rather than indexing with [-1]) keeps an empty name from
    # raising IndexError; for any non-empty string the result is identical.
    return {'last_letter': word[-1:]}


In [50]:
# Fixed seed so the shuffle -- and therefore the train/devtest/test split and
# every accuracy figure computed from it -- is the same on each run.
SPLIT_SEED = 42

# Compile all names into a list of (name, gender) pairs
all_names = ([(name, 'male') for name in names.words('male.txt')] +
            [(name, 'female') for name in names.words('female.txt')])

# Randomize the entire list with a dedicated, seeded generator so the global
# `random` state is left untouched
random.Random(SPLIT_SEED).shuffle(all_names)


# Setup the train, devtest and test sets: the first 500 shuffled names are
# used for training, the next 500 for devtest, and everything after that for
# the final test
train_names, devtest_names, test_names = all_names[0:500], all_names[500:1000], all_names[1000:]

# Convert each (name, gender) pair into a (feature dict, gender) pair
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]




In [51]:
# Fit a decision tree to the (features, label) pairs of the training split
classifier = nltk.classify.DecisionTreeClassifier.train(
    train_set
)

# Display the text rendering of the fitted classifier
print(classifier)


last_letter=a? ........................................ female
last_letter=b? ........................................ male
last_letter=c? ........................................ male
last_letter=d? ........................................ male
last_letter=e? ........................................ female
last_letter=f? ........................................ male
last_letter=g? ........................................ female
last_letter=h? ........................................ male
last_letter=i? ........................................ female
last_letter=k? ........................................ male
last_letter=l? ........................................ male
last_letter=m? ........................................ male
last_letter=n? ........................................ male
last_letter=o? ........................................ male
last_letter=p? ........................................ male
last_letter=r? ........................................ male
last_letter=s? .

In [53]:
# Score the already-trained classifier on the data it was trained on
# (an in-sample figure, not a held-out estimate)
print(nltk.classify.accuracy(classifier, train_set))


0.77


In [52]:
# Collect every devtest name whose predicted gender differs from its label
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue
    errors.append((tag, guess, name))

# List the mistakes in sorted order, one aligned row per name
for tag, guess, name in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))


0.77
correct=female   guess=male     name=Adah                          
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Ardys                         
correct=female   guess=male     name=Averil                        
correct=female   guess=male     name=Avivah                        
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Bert                          
correct=female   guess=male     name=Beryl                         
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Brynn                         
correct=female   guess=male     name=Carilyn                       
correct=female   guess=male     name=Carin                         
correct=female   guess=male     name=Caro                          
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Chriss

In [54]:
# Report accuracy on the devtest split first, then on the final test split
for held_out_set in (devtest_set, test_set):
    print(nltk.classify.accuracy(classifier, held_out_set))


0.762
0.7534562211981567
