# Predicting Gender based upon name of a person

In [1]:
import nltk as nltk
from nltk.corpus import names
import random
import numpy as np
from collections import Counter

## Design Single Feature extracted from a name
**example gender_features('Shrek') = {'last_letter': 'k'}**

In [2]:
def gender_features(word):
    return {
            'last_letter': word[-1],
            }

male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]

labeled_names = male_names + female_names
mean_acc = 0
accArr = []
for split_count in [4000, 3000, 2000, 1000, 500]:
    for _ in range(100):
        random.shuffle(labeled_names)
        featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
        train_set, test_set = featuresets[split_count:], featuresets[:split_count]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(classifier, test_set)*100
        accArr.append(acc)
        mean_acc += acc/100
    print("sample size - Train: {}, Test:  {}".format(len(train_set),len(test_set)))
    print("Classifier accuracy: {:.2f}%,   IQR accuracy:  {:.2f} : {:.2f}".format(np.mean(accArr),
                                                                            np.percentile(accArr, 25),
                                                                            np.percentile(accArr, 75)))

ans1 = classifier.classify(gender_features('Mark'))
ans2 = classifier.classify(gender_features('Precilla'))

print("Mark is:", ans1)
print("Precilla is:", ans2)

classifier.show_most_informative_features()

# print(nltk.classify.accuracy(classifier, test_set))







sample size - Train: 3944, Test:  4000
Classifier accuracy: 75.97%,   IQR accuracy:  75.62 : 76.28
sample size - Train: 4944, Test:  3000
Classifier accuracy: 75.98%,   IQR accuracy:  75.62 : 76.31
sample size - Train: 5944, Test:  2000
Classifier accuracy: 76.00%,   IQR accuracy:  75.60 : 76.40
sample size - Train: 6944, Test:  1000
Classifier accuracy: 75.96%,   IQR accuracy:  75.50 : 76.45
sample size - Train: 7444, Test:  500
Classifier accuracy: 75.99%,   IQR accuracy:  75.46 : 76.55
Mark is: male
Precilla is: female
Most Informative Features
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'k'              male : female =     32.1 : 1.0
             last_letter = 'v'              male : female =     16.6 : 1.0
             last_letter = 'f'              male : female =     14.8 : 1.0
             last_letter = 'p'              male : female =     12.7 : 1.0
             last_letter = 'd'              male : female =      9.4 : 

### Design set of features extracted from a name

In [3]:
def gender_features(word):
    return {
            'last_2_letters': word[-2:],
            'last_3_letters': word[-3:],
            # 'last_vowel': (word[-1].lower() in 'aeiou'),
            }

male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]

labeled_names = male_names + female_names
mean_acc = 0
accArr = []
for split_count in [4000, 3000, 2000, 1000, 500]:
    for _ in range(100):
        random.shuffle(labeled_names)
        featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
        train_set, test_set = featuresets[split_count:], featuresets[:split_count]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(classifier, test_set)*100
        accArr.append(acc)
        mean_acc += acc/100
    print("sample size - Train: {}, Test:  {}".format(len(train_set),len(test_set)))
    print("Classifier accuracy: {:.2f}%,   IQR accuracy:  {:.2f} : {:.2f}".format(np.mean(accArr),
                                                                            np.percentile(accArr, 25),
                                                                            np.percentile(accArr, 75)))

ans1 = classifier.classify(gender_features('Mark'))
ans2 = classifier.classify(gender_features('Precilla'))

print("Mark is:", ans1)
print("Precilla is:", ans2)

# print density of 100 most important feature categories for trained classifier
feature_list = []
for feature, _ in classifier.most_informative_features(100):
    feature_list.append(feature)
print(Counter(feature_list))

classifier.show_most_informative_features()







sample size - Train: 3944, Test:  4000
Classifier accuracy: 79.30%,   IQR accuracy:  79.02 : 79.66
sample size - Train: 4944, Test:  3000
Classifier accuracy: 79.49%,   IQR accuracy:  79.12 : 79.88
sample size - Train: 5944, Test:  2000
Classifier accuracy: 79.55%,   IQR accuracy:  79.12 : 80.05
sample size - Train: 6944, Test:  1000
Classifier accuracy: 79.58%,   IQR accuracy:  79.10 : 80.10
sample size - Train: 7444, Test:  500
Classifier accuracy: 79.65%,   IQR accuracy:  79.03 : 80.20
Mark is: male
Precilla is: female
Counter({'last_2_letters': 50, 'last_3_letters': 50})
Most Informative Features
          last_2_letters = 'na'           female : male   =     99.6 : 1.0
          last_2_letters = 'la'           female : male   =     73.9 : 1.0
          last_2_letters = 'ia'           female : male   =     40.2 : 1.0
          last_2_letters = 'sa'           female : male   =     35.8 : 1.0
          last_2_letters = 'us'             male : female =     27.8 : 1.0
          last_2_