# **Name Gender Classifier**

In [1]:
def gender_features(word):
    """Return the feature dict for a name: its final character only.

    Args:
        word: The name to featurise (a string).

    Returns:
        A dict with the single key 'last_letter'. The value is the last
        character of ``word``, or '' when ``word`` is empty.
    """
    # Slice instead of index so an empty string yields '' rather than IndexError.
    return {'last_letter': word[-1:]}

In [2]:
# Quick check of the feature extractor on a single name.
gender_features('Shrek')

{'last_letter': 'k'}

In [3]:
import nltk
from nltk.corpus import names
# Make sure the 'names' corpus is available locally before reading it.
nltk.download('names')
# Preview the first 20 entries of the male name list.
names.words('male.txt')[:20]

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


['Aamir',
 'Aaron',
 'Abbey',
 'Abbie',
 'Abbot',
 'Abbott',
 'Abby',
 'Abdel',
 'Abdul',
 'Abdulkarim',
 'Abdullah',
 'Abe',
 'Abel',
 'Abelard',
 'Abner',
 'Abraham',
 'Abram',
 'Ace',
 'Adair',
 'Adam']

In [4]:
# Label every corpus name with the gender of the file it came from.
# All entries from male.txt come first, followed by those from female.txt.
namesgender = [
    (name, label)
    for label, filename in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(filename)
]

In [5]:
# Total number of labelled names (male and female lists combined).
len(namesgender)

7944

In [6]:
# First 20 labelled pairs; the list starts with the male.txt entries.
namesgender[:20]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male'),
 ('Abbott', 'male'),
 ('Abby', 'male'),
 ('Abdel', 'male'),
 ('Abdul', 'male'),
 ('Abdulkarim', 'male'),
 ('Abdullah', 'male'),
 ('Abe', 'male'),
 ('Abel', 'male'),
 ('Abelard', 'male'),
 ('Abner', 'male'),
 ('Abraham', 'male'),
 ('Abram', 'male'),
 ('Ace', 'male'),
 ('Adair', 'male'),
 ('Adam', 'male')]

In [7]:
# Last 20 labelled pairs. A negative slice is used instead of a hard-coded
# start index so this keeps showing the tail if the corpus size changes.
namesgender[-20:]

[('Zena', 'female'),
 ('Zenia', 'female'),
 ('Zia', 'female'),
 ('Zilvia', 'female'),
 ('Zita', 'female'),
 ('Zitella', 'female'),
 ('Zoe', 'female'),
 ('Zola', 'female'),
 ('Zonda', 'female'),
 ('Zondra', 'female'),
 ('Zonnya', 'female'),
 ('Zora', 'female'),
 ('Zorah', 'female'),
 ('Zorana', 'female'),
 ('Zorina', 'female'),
 ('Zorine', 'female'),
 ('Zsa Zsa', 'female'),
 ('Zsazsa', 'female'),
 ('Zulema', 'female'),
 ('Zuzana', 'female')]

In [8]:
import random
# Seed the generator so the shuffle -- and therefore every train/dev/test
# split and accuracy figure derived from it -- is the same on each
# Restart & Run All.
random.seed(42)
# Shuffles in place: namesgender is no longer in corpus order after this cell.
random.shuffle(namesgender)
namesgender[:20]

[('Apostolos', 'male'),
 ('Viv', 'female'),
 ('Daffy', 'male'),
 ('Rainer', 'male'),
 ('Magdalena', 'female'),
 ('Ossie', 'male'),
 ('Nicoli', 'female'),
 ('Scotty', 'male'),
 ('Johann', 'male'),
 ('Milo', 'male'),
 ('Virgie', 'male'),
 ('Courtney', 'male'),
 ('Dede', 'female'),
 ('Christophe', 'male'),
 ('Roddy', 'male'),
 ('Karleen', 'female'),
 ('Malia', 'female'),
 ('Nitin', 'female'),
 ('Eran', 'female'),
 ('Janine', 'female')]

In [9]:
# Pair each name's last-letter feature dict with its gender label.
featuresets = [(gender_features(name), gender) for name, gender in namesgender]
featuresets[:20]

[({'last_letter': 's'}, 'male'),
 ({'last_letter': 'v'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'e'}, 'female')]

In [10]:
# Hold out the first 500 feature sets for testing; train on the rest.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Accuracy of the last-letter model on the held-out set.
print(nltk.classify.accuracy(classifier, test_set))

0.758


In [11]:
# Classify a single name with the trained last-letter model.
classifier.classify(gender_features('Neo'))

'male'

In [12]:
# Classify a second name with the same model.
classifier.classify(gender_features('Trinity'))

'female'

In [13]:
# Ask for up to 20 of the classifier's most informative features; the output
# lists each last-letter value with the ratio between the two labels.
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     33.0 : 1.0
             last_letter = 'k'              male : female =     31.5 : 1.0
             last_letter = 'v'              male : female =     16.5 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'd'              male : female =     10.3 : 1.0
             last_letter = 'p'              male : female =      9.9 : 1.0
             last_letter = 'm'              male : female =      9.2 : 1.0
             last_letter = 'o'              male : female =      8.3 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'z'              male : female =      5.6 : 1.0
             last_letter = 'g'              male : female =      5.5 : 1.0
             last_letter = 'w'              male : female =      5.1 : 1.0
             last_letter = 's'              male : female =      4.2 : 1.0

# Choosing Good Features

In [14]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Features produced:
      * "firstletter" / "lastletter": the lower-cased first and last
        character of ``name`` ('' for each when ``name`` is empty).
      * "count(x)": how many times letter x occurs, for each x in a-z.
      * "has(x)": whether letter x occurs at all, for each x in a-z.

    Args:
        name: The name to featurise (a string).

    Returns:
        A dict with 2 + 2 * 26 = 54 entries.
    """
    # Lower-case once up front instead of twice per letter inside the loop.
    lowered = name.lower()
    features = {}
    # Slices instead of indexes so an empty name yields '' rather than IndexError.
    features["firstletter"] = name[:1].lower()
    features["lastletter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features

In [15]:
# Build the richer feature dict for one name and count its entries.
features = gender_features2('Shrek')
len(features)

54

In [16]:
# Display the full feature dict computed for 'Shrek'.
features

{'firstletter': 's',
 'lastletter': 'k',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 1,
 'has(k)': True,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 1,
 'has(s)': True,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [17]:
# Pair each name's richer feature dict with its gender label.
featuresets2 = [(gender_features2(n), g) for (n,g) in namesgender]
# Show only the first entry: echoing the whole list would dump one 54-key
# dict per name (thousands of them) into the notebook output.
featuresets2[:1]

[({'firstletter': 'a',
   'lastletter': 's',
   'count(a)': 1,
   'has(a)': True,
   'count(b)': 0,
   'has(b)': False,
   'count(c)': 0,
   'has(c)': False,
   'count(d)': 0,
   'has(d)': False,
   'count(e)': 0,
   'has(e)': False,
   'count(f)': 0,
   'has(f)': False,
   'count(g)': 0,
   'has(g)': False,
   'count(h)': 0,
   'has(h)': False,
   'count(i)': 0,
   'has(i)': False,
   'count(j)': 0,
   'has(j)': False,
   'count(k)': 0,
   'has(k)': False,
   'count(l)': 1,
   'has(l)': True,
   'count(m)': 0,
   'has(m)': False,
   'count(n)': 0,
   'has(n)': False,
   'count(o)': 3,
   'has(o)': True,
   'count(p)': 1,
   'has(p)': True,
   'count(q)': 0,
   'has(q)': False,
   'count(r)': 0,
   'has(r)': False,
   'count(s)': 2,
   'has(s)': True,
   'count(t)': 1,
   'has(t)': True,
   'count(u)': 0,
   'has(u)': False,
   'count(v)': 0,
   'has(v)': False,
   'count(w)': 0,
   'has(w)': False,
   'count(x)': 0,
   'has(x)': False,
   'count(y)': 0,
   'has(y)': False,
   'count(z

In [18]:
# Print the first five names next to their feature dicts, one blank-padded
# record per name (the label is not needed here).
for name, _label in namesgender[:5]:
    print(name, gender_features2(name), '\n')

Apostolos {'firstletter': 'a', 'lastletter': 's', 'count(a)': 1, 'has(a)': True, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 0, 'has(h)': False, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 0, 'has(k)': False, 'count(l)': 1, 'has(l)': True, 'count(m)': 0, 'has(m)': False, 'count(n)': 0, 'has(n)': False, 'count(o)': 3, 'has(o)': True, 'count(p)': 1, 'has(p)': True, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 2, 'has(s)': True, 'count(t)': 1, 'has(t)': True, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False} 

Viv {'firstletter': 'v', 'lastletter': 'v', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0,

In [19]:
# Same 500-item hold-out as before, this time using the richer feature sets.
test_set = featuresets2[:500]
train_set = featuresets2[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [20]:
# Three-way split of the labelled names:
#   [0, 500)     -> test
#   [500, 1500)  -> dev-test (used for error analysis)
#   [1500, end)  -> training
test_names, devtest_names, train_names = (
    namesgender[:500],
    namesgender[500:1500],
    namesgender[1500:],
)

In [21]:
# Featurise each split with the last-letter extractor, in the order
# train, dev-test, test.
train_set, devtest_set, test_set = (
    [(gender_features(name), gender) for name, gender in subset]
    for subset in (train_names, devtest_names, test_names)
)
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Evaluate on the dev-test split rather than the test split.
print(nltk.classify.accuracy(classifier, devtest_set))

0.747


In [22]:
def geterrors(devtest):
    """Collect the names that the current classifier labels incorrectly.

    Uses the module-level ``classifier`` together with ``gender_features``.

    Args:
        devtest: Iterable of (name, tag) pairs.

    Returns:
        A list of (tag, guess, name) tuples, one for each pair whose
        predicted label differs from ``tag``, in input order.
    """
    predictions = (
        (tag, classifier.classify(gender_features(name)), name)
        for (name, tag) in devtest
    )
    return [
        (tag, guess, name)
        for (tag, guess, name) in predictions
        if guess != tag
    ]
# Collect the misclassified dev-test names and count them.
errors = geterrors(devtest_names)
len(errors)

253

In [23]:
def printerrors(errors):
    """Print one aligned line per misclassification, in sorted order.

    Args:
        errors: Iterable of (tag, guess, name) triples.
    """
    row_format = 'correct=%-8s guess=%-8s name=%-30s'
    ordered = sorted(errors)
    for tag, guess, name in ordered:
        print(row_format % (tag, guess, name))

In [24]:
# List every dev-test misclassification for manual inspection.
printerrors(errors)

correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Abigael                       
correct=female   guess=male     name=Adriaens                      
correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Averil                        
correct=female   guess=male     name=Bert                          
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Blair                         
correct=female   guess=male     name=Bo                            
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Bryn       