In [0]:
#Split a name into features
def gender_features(word):
    """Turn a name into the prefix/suffix features used by the classifier.

    The name is lower-cased, then its first and last one, two and three
    characters are extracted. Slices are used throughout (rather than
    indexing) so that an empty string yields empty-string features instead
    of raising ``IndexError``; names shorter than three characters simply
    produce shorter prefixes/suffixes.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with the keys ``first_letter``, ``first2_letter``,
        ``first3_letter``, ``last_letter``, ``last2_letter`` and
        ``last3_letter``, each mapped to the corresponding lower-cased
        substring.
    """
    word = word.lower()
    return {
        'first_letter': word[:1],   # slice, not word[0]: safe for ''
        'first2_letter': word[:2],
        'first3_letter': word[:3],
        'last_letter': word[-1:],   # slice, not word[-1]: safe for ''
        'last2_letter': word[-2:],
        'last3_letter': word[-3:],
    }

In [5]:
# Sanity-check `gender_features()` on a sample name; the returned feature
# dict is displayed as this cell's output.
gender_features("Tim")

{'first2_letter': 'ti',
 'first3_letter': 'tim',
 'first_letter': 't',
 'last2_letter': 'im',
 'last3_letter': 'tim',
 'last_letter': 'm'}

In [13]:
import random
import nltk
nltk.download('names')
from nltk.corpus import names

# Read the names from the corpus files.
# Label each name with the gender of the file it came from.
male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]

# Combine the lists.
labeled_names = male_names + female_names

# Shuffle the list with a dedicated, fixed-seed generator so that the
# train/test split (and therefore the reported accuracy) is the same on every
# "Restart & Run All", without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [0]:
from nltk import NaiveBayesClassifier

# Extract the features using the `gender_features()` function, keeping each
# name's gender label next to its feature dict.
featuresets = [(gender_features(name), label) for (name, label) in labeled_names]

# Number of examples held out for evaluation; all remaining examples are
# used for training.
TEST_SIZE = 500

# Split the dataset into train and test set: the first TEST_SIZE entries are
# the test set, the rest is the training set.
test_set = featuresets[:TEST_SIZE]
train_set = featuresets[TEST_SIZE:]

# Train a Naive Bayes classifier on the training portion only.
classifier = NaiveBayesClassifier.train(train_set)

In [15]:
# Measure the classifier's accuracy on the held-out test set and print it.
from nltk.classify import accuracy
test_accuracy = accuracy(classifier, test_set)
print(test_accuracy)

0.818


In [16]:
# Show the 10 most informative features together with their likelihood
# ratios (female : male or male : female).
classifier.show_most_informative_features(n=10)

Most Informative Features
            last2_letter = 'na'           female : male   =    164.8 : 1.0
            last2_letter = 'la'           female : male   =     75.3 : 1.0
            last2_letter = 'ia'           female : male   =     39.3 : 1.0
             last_letter = 'a'            female : male   =     38.2 : 1.0
            last2_letter = 'sa'           female : male   =     36.6 : 1.0
             last_letter = 'k'              male : female =     32.3 : 1.0
            last2_letter = 'do'             male : female =     26.1 : 1.0
            last2_letter = 'ra'           female : male   =     25.8 : 1.0
            last2_letter = 'us'             male : female =     25.5 : 1.0
            last2_letter = 'rd'             male : female =     25.0 : 1.0


In [0]:
# Function to get the guessed gender of one name
def getGender(name):
    """Print the gender the trained classifier guesses for ``name``.

    The name is converted to features with `gender_features()` and classified
    by the module-level `classifier`. The result is printed as a sentence;
    nothing is returned.
    """
    name_features = gender_features(name)
    predicted_label = classifier.classify(name_features)
    message = "{} is most probably a {}.".format(name, predicted_label)
    print(message)

In [0]:
# Function to get the guessed gender of a list of names
def getGenders(list):
    """Print the guessed gender for each name in ``list``, in order.

    Every element is handed to `getGender()`, which does the printing.
    Note: the parameter name shadows the built-in ``list``; it is kept as-is
    so that existing keyword callers keep working.
    """
    for current_name in list:
        getGender(current_name)

In [19]:
# Try the classifier on a single sample name; `getGender()` prints the guess.
getGender("Tim")

Tim is most probably a male.


In [20]:
# Try the classifier on several sample names; one guess is printed per name.
sample_names = ["Peter", "Petra", "Klaus", "Harald", "Silvia", "Elisa", "Thorsten"]
getGenders(sample_names)

Peter is most probably a male.
Petra is most probably a female.
Klaus is most probably a male.
Harald is most probably a male.
Silvia is most probably a female.
Elisa is most probably a female.
Thorsten is most probably a male.
