In [1]:
#Importing libraries
import random
import nltk
from nltk.corpus import names

In [2]:
# Build (name, gender) pairs from the two word lists of the NLTK names corpus
# (male.txt and female.txt). All male names come first, then all female names.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)

# Shuffle so that slices of the list mix both genders. The RNG is seeded first
# so the shuffle -- and every split / accuracy figure derived from it -- is
# repeatable from a fresh kernel instead of changing on each run.
random.seed(42)
random.shuffle(labeled_names)

In [3]:
# Feature extractor: builds a dictionary of the features used to classify a given name
def gender_features(word):
    """Return the feature dict for the string ``word``: its final character.

    Slicing (``word[-1:]``) instead of indexing (``word[-1]``) gives the same
    value for every non-empty string, but yields '' rather than raising
    IndexError when ``word`` is empty.
    """
    return {'last_letter': word[-1:]}

In [4]:
# Demo: run the feature extractor on a sample name and display the resulting feature dict
gender_features('subham')

{'last_letter': 'm'}

In [5]:
# Pair each name's feature dict with its label, e.g. [({'last_letter': 'm'}, 'male'), ...]
featuresets = [
    (gender_features(name), label)
    for name, label in labeled_names
]

In [6]:
# Hold out the first 500 feature sets for testing; train on everything after them
test_set = featuresets[:500]
train_set = featuresets[500:]

In [7]:
# Train a Naive Bayes classifier on the labeled feature sets in train_set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
# Classify a single name: extract its features, then ask the trained model for a label
classifier.classify(gender_features('rahul'))

'male'

In [9]:
# Measure the model's accuracy on the held-out test set and report it as a percentage
accuracy = nltk.classify.accuracy(classifier, test_set)
# print() with a single argument emits the same text under Python 2 and also runs under Python 3
print(accuracy * 100)

72.6


############################ When working with large corpora #######################################

In [17]:
# apply_features() returns a list-like object that does not keep every feature set in memory,
# so the train/test sets are built from slices of the raw (name, gender) pairs instead.

from nltk.classify import apply_features
test_set = apply_features(gender_features, labeled_names[:500])
train_set = apply_features(gender_features, labeled_names[500:])

In [18]:
# Train a classifier on the current train_set and report its test-set accuracy as a percentage
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
# print() with a single argument emits the same text under Python 2 and also runs under Python 3
print(accuracy * 100)

75.8


########################## modifying gender_features  ##########################################

In [13]:
# Richer feature extractor: uses both the final character and the final two characters
def gender_features(word):
    """Return the suffix features of ``word`` as a dict.

    'last_letter' holds the final character and 'last_2letter' the final two
    characters. Both are taken by slicing, so inputs shorter than the slice
    simply produce shorter (possibly empty) values.
    """
    features = {}
    features['last_letter'] = word[-1:]
    features['last_2letter'] = word[-2:]
    return features

In [14]:
# Rebuild the feature sets with the current gender_features, then re-split, retrain and re-score
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)
# print() with a single argument emits the same text under Python 2 and also runs under Python 3
print(accuracy * 100)

75.2


In [17]:
# Check: classify a sample name with the current classifier and feature extractor
classifier.classify(gender_features('rahul'))

'male'

In [38]:
# show_most_informative_features(10) lists the 10 features the classifier found most informative,
# each with the ratio of how much more likely it is under one label than the other
classifier.show_most_informative_features(10)

Most Informative Features
            last_2letter = u'na'          female : male   =     99.1 : 1.0
            last_2letter = u'la'          female : male   =     76.7 : 1.0
            last_2letter = u'us'            male : female =     67.1 : 1.0
            last_2letter = u'ia'          female : male   =     56.3 : 1.0
            last_2letter = u'ta'          female : male   =     44.1 : 1.0
             last_letter = u'a'           female : male   =     39.6 : 1.0
            last_2letter = u'ra'          female : male   =     36.2 : 1.0
            last_2letter = u'sa'          female : male   =     34.2 : 1.0
             last_letter = u'k'             male : female =     32.7 : 1.0
            last_2letter = u'io'            male : female =     28.3 : 1.0
