# Gender Identification With A Decision Tree

In [16]:
### Initialize a classifier
import re
import pandas as pd
import random
import nltk

# Download names if they don't exist locally
try:
    nltk.data.find('corpora/names')
except LookupError:
    nltk.download('names')
    
    
from nltk.corpus import names


### Define Different Features for the Decision Tree Classifiers

In [13]:
# Guessing the gender from the last letter of a name
def gender_features(word):
    return {'last_letter': word[-1] }

# Guess the gender from the first/last letter and counting letters
def gender_features2(name):
    features = {}
    features['firstletter'] = name[0].lower()
    features['lastletter'] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count (%s)' % letter] = name.lower().count(letter)
        features['has (%s)' % letter] = (letter in name.lower())
    return features

# Just a predictably bad guesser
def bad_feature(word):
    return {'bleah' : 1}


def gender_features3(name):
    features = {}
    features['firstletter'] = name[0].lower()
    features['lastletter'] = name[-1].lower()
    features['.*arry$'] = re.match(r".*arry$", name)
    features['.*b[ea]rt$'] = re.match(".*b[ea]rt$", name)
    features['.*ie$'] = re.match(".*ie$", name)
    features['Sch'] = re.match(r"Sch", name)
    features['Pam'] = re.match(r"Pam", name)
    features['V.*a$'] = re.match(r"V.*a$", name)
    features['M.*l$'] = re.match(r"M.*l$", name) 
    
    return features


# create a list of feature function
gender_functions = [gender_features, 
                    gender_features2, 
                    bad_feature, 
                    gender_features3
                   ]


### Setup Various Testing Sets

In [20]:
# Compile all names into a list
all_names = ([(name, 'male') for name in names.words('male.txt')] +
            [(name, 'female') for name in names.words('female.txt')])

# randomize the entire list
random.shuffle(all_names)


# Setup the train, devtest and test sets
train_names, devtest_names, test_names = all_names[0:500], all_names[500:1000], all_names[1000:]


### Try All of the Feature Functions

In [21]:
# Iterate through the different feature functions 
# and compare their accuracy

results_list = []

# For each feature function, try to classify the test sets
for fn in gender_functions:
    train_set = [(fn(n), g) for (n,g) in train_names]
    devtest_set = [(gender_features2(n), g) for (n,g) in devtest_names]
    test_set = [(gender_features2(n), g) for (n,g) in test_names]
    
    # make a classifier from the training set
    classifier = nltk.classify.DecisionTreeClassifier.train(train_set)

    # Print the classifier logic
    #print(classifier)

    # Get the accuracies
    accuracy_train = nltk.classify.accuracy(classifier, train_set)
    accuracy_devtest = nltk.classify.accuracy(classifier, devtest_set)
    accuracy_test = nltk.classify.accuracy(classifier, test_set)

    results_list.append([fn.__name__, accuracy_train, accuracy_devtest, accuracy_test])


results_df = pd.DataFrame(results_list,
                          columns=['Function', 
                                   'Training Accuracy', 
                                   'Devtest Accuracy',
                                   'Test Accuracy',
                                   ])

results_df.head()


Unnamed: 0,Function,Training Accuracy,Devtest Accuracy,Test Accuracy
0,gender_features,0.79,0.362,0.370968
1,gender_features2,0.97,0.684,0.720334
2,bad_feature,0.628,0.638,0.629032
3,gender_features3,0.906,0.716,0.727823


In [6]:
# Get the errors from using the devtest_set
#errors = []
#for (name, tag) in devtest_names:
#    guess = classifier.classify(gender_features(name))
#    if guess != tag:
#        errors.append( (tag, guess, name) )

#for (tag, guess, name) in sorted(errors):
#    print( 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
