# Name Classification

### Objective

To develop a machine-learning classifier that takes an Indian name (as a string) and predicts its gender, religion, and ethnicity.

Setting up dependencies

In [1]:
import pandas as pd
import nltk
import random

### Gender Classification

Loading the CBSE results file for gender classification

In [4]:
# Read the CBSE results file and keep only the name-related columns used below.
df = pd.read_csv('training_data/results_2015.csv')[
    ['name', 'father_name', 'mother_name', 'first_name', 'last_name']
]

In [5]:
# First token of the father's full name; missing names stay None.
df['father_first_name'] = df['father_name'].map(
    lambda full: None if pd.isnull(full) else full.split()[0]
)
# Last token of the father's full name, only when the name has at least two tokens.
df['father_last_name'] = df['father_name'].map(
    lambda full: full.split()[-1] if pd.notnull(full) and len(full.split()) >= 2 else None
)

In [6]:
# First token of the mother's full name; missing names stay None.
df['mother_first_name'] = df['mother_name'].map(
    lambda full: None if pd.isnull(full) else full.split()[0]
)
# Last token of the mother's full name, only when the name has at least two tokens.
df['mother_last_name'] = df['mother_name'].map(
    lambda full: full.split()[-1] if pd.notnull(full) and len(full.split()) >= 2 else None
)

In [7]:
# Per first name: how often it appears as a father's name (male) and as a mother's
# name (female). Names seen for only one gender get a count of 0 for the other.
first_names = pd.concat(
    [df['father_first_name'].value_counts(), df['mother_first_name'].value_counts()],
    axis=1,
    keys=['male_count', 'female_count'],
).fillna(0)
first_names['total_count'] = first_names[['male_count', 'female_count']].sum(axis=1)
# Share of occurrences that are male; the float cast keeps this a true division.
first_names['male_prop'] = first_names['male_count'].astype(float) / first_names['total_count']

In [8]:
# Per last name: how often it appears on a father's name (male) and on a mother's
# name (female). Names seen for only one gender get a count of 0 for the other.
last_names = pd.concat(
    [df['father_last_name'].value_counts(), df['mother_last_name'].value_counts()],
    axis=1,
    keys=['male_count', 'female_count'],
).fillna(0)
last_names['total_count'] = last_names[['male_count', 'female_count']].sum(axis=1)
# Share of occurrences that are male; the float cast keeps this a true division.
last_names['male_prop'] = last_names['male_count'].astype(float) / last_names['total_count']

In [9]:
# Build the labelled training table: every father's name is labelled 'male' and
# every mother's name 'female'.
father_names = pd.DataFrame(df.father_name.values)
father_names['gender'] = 'male'
mother_names = pd.DataFrame(df.mother_name.values)
mother_names['gender'] = 'female'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
labeled_names = pd.concat([father_names, mother_names], ignore_index=True)
labeled_names.columns = ['name', 'gender']

In [10]:
# Drop rows without a name. The explicit copy makes the column assignments below
# act on an independent frame instead of a slice (avoids SettingWithCopyWarning).
labeled_names = labeled_names[labeled_names.name.notnull()].copy()
# First token of the name.
labeled_names['first_name'] = labeled_names['name'].apply(lambda x: x.split()[0])
# Last token of the name, or None when the name has a single token.
labeled_names['last_name'] = labeled_names['name'].apply(lambda x: x.split()[-1] if len(x.split()) > 1 else None)

In [11]:
# Replace remaining missing values (e.g. the None last name of single-token names) with ''.
labeled_names = labeled_names.fillna('')

In [12]:
# Note: prefixes and suffixes (1-5 characters) of the first and last name tokens are the gender cues.
def gender_features(name):
    """Build the prefix/suffix feature dict used by the gender classifier.

    The name is lower-cased and split on whitespace; the first and the last
    token are each described by their leading and trailing 1-5 characters.

    Parameters
    ----------
    name : str
        Full name or a single name token. Must contain at least one
        non-whitespace character.

    Returns
    -------
    dict
        20 entries keyed ``first_name_first`` .. ``first_name_last_five`` and
        ``last_name_first`` .. ``last_name_last_five``. A value is ``None``
        when the token is shorter than the requested affix, and every
        ``last_name_*`` value is ``None`` when the last token equals the first
        (which includes single-token names).

    Raises
    ------
    IndexError
        If ``name`` is empty or contains only whitespace.
    """
    def affixes(prefix, token):
        # token is None when there is no distinct last name.
        feats = {}
        for size, suffix in enumerate(('', '_two', '_three', '_four', '_five'), 1):
            usable = token is not None and len(token) >= size
            feats['%s_first%s' % (prefix, suffix)] = token[:size] if usable else None
            feats['%s_last%s' % (prefix, suffix)] = token[-size:] if usable else None
        return feats

    tokens = name.lower().split()
    first_name, last_name = tokens[0], tokens[-1]

    features = affixes('first_name', first_name)
    # Generating the keys programmatically fixes the original literal, which listed
    # 'last_name_last_five' twice and so never emitted 'last_name_first_five'.
    features.update(affixes('last_name', last_name if last_name != first_name else None))
    return features

In [13]:
# One (feature dict, gender) pair per labelled parent name.
# Only the first-name token is passed to gender_features, so during training its
# last-name features are always None.
feature_sets = [
    (gender_features(first), gender)
    for first, gender in zip(labeled_names['first_name'], labeled_names['gender'])
]

In [14]:
# 80/20 train/test split over the positions of feature_sets.
# A dedicated, seeded generator makes the split identical on every run without
# touching the global random state; range() works on Python 2 and 3 (xrange is 2-only).
train_indices = random.Random(42).sample(range(len(feature_sets)), int(0.8*len(feature_sets)))
# Set membership keeps the complement computation linear.
train_indices_set = set(train_indices)
test_indices = [i for i in range(len(feature_sets)) if i not in train_indices_set]

In [15]:
# Materialise the two splits from the sampled positions.
train_set, test_set = (
    [feature_sets[idx] for idx in indices]
    for indices in (train_indices, test_indices)
)

In [16]:
# Fit NLTK's Naive Bayes classifier on the training split of (feature dict, gender) pairs.
gender_classifier = nltk.NaiveBayesClassifier.train(train_set)

In [17]:
# Accuracy of the gender classifier on the held-out split (positions not sampled for training).
nltk.classify.accuracy(gender_classifier, test_set)

0.8760336680448907

In [18]:
# Accuracy on the training split itself, for comparison with the held-out figure.
nltk.classify.accuracy(gender_classifier, train_set)

0.8915623730941041

In [19]:
# Example: class probabilities for a single name.
name = 'Aniket Deb'
gender_dist = gender_classifier.prob_classify(gender_features(name))
# Use the distribution's public API rather than reading its private `_prob_dict`
# and undoing the log2 encoding by hand.
{label: gender_dist.prob(label) for label in gender_dist.samples()}

{'female': 0.4357356576065635, 'male': 0.5642643423934361}

### Religion Classification

In [22]:
# Combine the two labelled name dumps into one frame with a fresh index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
names = pd.concat(
    [pd.read_pickle('training_data/names.pkl'), pd.read_pickle('training_data/names_df.pkl')],
    ignore_index=True,
)
# Explicit copy so the column assignments below act on an independent frame.
names = names[['Mother Tongue', 'Ethnicity', 'Religion', 'broad_religion', 'gender', 'name']].copy()
# Strip honorifics from the names.
names['name'] = names['name'].apply(lambda x: x.replace('Dr ', '').replace('Mr ', '').replace('Dr. ', '').replace('Mr. ', '').replace('Ms ', '').replace('Ms. ', ''))
# Sikh is folded into Hindu for the broad religion label.
names['broad_religion'] = names['broad_religion'].apply(lambda x: x.replace('Sikh', 'Hindu'))
# Testing the token list (not the raw string length) also covers whitespace-only
# names, which previously raised IndexError on split()[0].
names['first_name'] = names['name'].apply(lambda x: x.split()[0] if x.split() else None)
names['last_name'] = names['name'].apply(lambda x: x.split()[-1] if x.split() else None)
# Drop placeholder entries and rows left without any name token.
names = names[(names.name.str.lower() != 'anonymous') & (names.name.str.lower() != 'abcdef')]
names = names[names.first_name.notnull()].copy()

In [23]:
#Note: These are useful for distinguishing between Muslim, Christian and Hindu names, but not for Hindu/Sikh/Jain/Buddhist names
def religion_features(name):
    """Build the feature dict used by the religion classifier.

    The name is lower-cased and stripped of surrounding whitespace. The "first
    name" is its first whitespace-separated token and the "last name" is
    everything after it (so it may span several tokens).

    Parameters
    ----------
    name : str
        Full name. Must contain at least one non-whitespace character.

    Returns
    -------
    dict
        * ``prop_a`` .. ``prop_z``: occurrences of each letter divided by the
          length of the normalised name (spaces between tokens are counted
          in that length).
        * ``first_name_first`` .. ``first_name_last_four`` and
          ``last_name_first`` .. ``last_name_last_four``: leading / trailing
          1-4 characters, ``None`` when the part is too short. All
          ``last_name_*`` values are ``None`` when there is no last name or it
          equals the first name.

    Raises
    ------
    IndexError
        If ``name`` is empty or contains only whitespace.
    """
    def affixes(prefix, part):
        # part is None when there is no usable last name.
        feats = {}
        for size, suffix in enumerate(('', '_two', '_three', '_four'), 1):
            usable = part is not None and len(part) >= size
            feats['%s_first%s' % (prefix, suffix)] = part[:size] if usable else None
            feats['%s_last%s' % (prefix, suffix)] = part[-size:] if usable else None
        return feats

    # Strip first: with leading whitespace the slice below used to start inside
    # the first name and produce a wrong "last name".
    name = name.lower().strip()
    first_name = name.split()[0]
    last_name = name[len(first_name):].strip()
    if not last_name or last_name == first_name:
        last_name = None

    features = {'prop_%s' % letter: 1. * name.count(letter) / len(name)
                for letter in 'abcdefghijklmnopqrstuvwxyz'}
    features.update(affixes('first_name', first_name))
    features.update(affixes('last_name', last_name))
    return features

In [24]:
# One (feature dict, broad religion) pair per name.
feature_sets = [(religion_features(i[0]), i[1]) for i in names[['name', 'broad_religion']].values]
# 80/20 split over positions. A dedicated, seeded generator makes the split
# identical on every run; range() works on Python 2 and 3 (xrange is 2-only).
train_indices = random.Random(42).sample(range(len(feature_sets)), int(0.8*len(feature_sets)))
train_indices_set = set(train_indices)
test_indices = [i for i in range(len(feature_sets)) if i not in train_indices_set]

In [25]:
# Materialise the two splits from the sampled positions, then fit Naive Bayes
# on the training part.
train_set, test_set = (
    [feature_sets[idx] for idx in indices]
    for indices in (train_indices, test_indices)
)
religion_classifier = nltk.NaiveBayesClassifier.train(train_set)

In [26]:
# Accuracy of the religion classifier on the held-out split.
nltk.classify.accuracy(religion_classifier, test_set)

0.9317535545023696

In [27]:
# Accuracy on the training split itself, for comparison with the held-out figure.
nltk.classify.accuracy(religion_classifier, train_set)

0.9537212609623134

In [28]:
# Example: class probabilities for a single name.
name = 'Ansar Kadri'
religion_dist = religion_classifier.prob_classify(religion_features(name))
# Use the distribution's public API rather than reading its private `_prob_dict`
# and undoing the log2 encoding by hand.
{label: religion_dist.prob(label) for label in religion_dist.samples()}

{u'Christian': 2.1598370594291562e-08,
 'Hindu': 9.589596574701158e-10,
 u'Muslim': 0.9999999774426755}

### Ethnicity Classification

It's hard to differentiate between Tamil, Telugu, Kannada and Malayalam names, so they are grouped together as 'South Indian'. The other mother tongues are likewise grouped into broad regions (North, East and West Indian).

In [29]:
# Collapse individual mother tongues into broad regions in a single pass;
# values that are not listed here are left unchanged.
region_by_tongue = {
    'Telugu': 'South Indian', 'Kannada': 'South Indian',
    'Malayalam': 'South Indian', 'Tamil': 'South Indian',
    'Kashmiri': 'North Indian', 'Dogri': 'North Indian', 'Hindi': 'North Indian',
    'Urdu': 'North Indian', 'Sindhi': 'North Indian', 'Bhojpuri': 'North Indian',
    'Oriya': 'East Indian', 'Bengali': 'East Indian', 'Assamese': 'East Indian',
    'Marathi': 'West Indian', 'Gujarati': 'West Indian',
}
names['Mother Tongue'] = names['Mother Tongue'].apply(
    lambda tongue: region_by_tongue.get(tongue, tongue)
)

In [30]:
def ethnicity_features(name, kind='last'):
    """Build the prefix/suffix feature dict used by the ethnicity classifiers.

    Parameters
    ----------
    name : str or None
        Full name or a single name token.
    kind : str
        ``'last'`` uses the last whitespace-separated token, ``'first'`` the
        first one. Any other value uses ``name`` as given.

    Returns
    -------
    dict
        ``name_first`` .. ``name_first_four`` and ``name_last`` ..
        ``name_last_four`` hold the leading / trailing 1-4 characters of the
        lower-cased token, and ``len_name`` its length. Tokens shorter than
        the affix size yield the whole token (no ``None`` padding). When there
        is no usable token (``None``, empty or whitespace-only input) every
        affix is ``None`` and ``len_name`` is 0.
    """
    missing = {'name_first': None, 'name_first_two': None,
               'name_first_three': None, 'name_first_four': None,
               'name_last': None, 'name_last_two': None,
               'name_last_three': None, 'name_last_four': None,
               'len_name': 0}

    # Previously only kind='last' tolerated None, and empty or whitespace-only
    # names raised IndexError; both kinds now share the same "missing" result.
    if name is None:
        return missing
    if kind in ('last', 'first'):
        tokens = name.split()
        if not tokens:
            return missing
        name = tokens[-1] if kind == 'last' else tokens[0]
    name = name.lower()
    if not name:
        return missing

    return {'name_first': name[0], 'name_first_two': name[:2],
            'name_first_three': name[:3], 'name_first_four': name[:4],
            'name_last': name[-1], 'name_last_two': name[-2:],
            'name_last_three': name[-3:], 'name_last_four': name[-4:],
            'len_name': len(name)}

In [31]:
# Last-name ethnicity model. Rows whose mother tongue is Konkani, English, Others
# or Kutchi are excluded before building (feature dict, mother-tongue group) pairs.
last_name_feature_sets = [(ethnicity_features(i[0], kind='last'), i[1]) for i in names[~names['Mother Tongue'].isin(['Konkani', 'English', 'Others', 'Kutchi'])][['last_name', 'Mother Tongue']].values]
# 80/20 split over positions. A dedicated, seeded generator makes the split
# identical on every run; range() works on Python 2 and 3 (xrange is 2-only).
train_indices = random.Random(42).sample(range(len(last_name_feature_sets)), int(0.8*len(last_name_feature_sets)))
train_indices_set = set(train_indices)
test_indices = [i for i in range(len(last_name_feature_sets)) if i not in train_indices_set]
# Keep model-specific names for this split: the generic train_set / test_set are
# rebound by the first-name cell that follows, so they stop referring to this split.
last_name_train_set = [last_name_feature_sets[i] for i in train_indices]
last_name_test_set = [last_name_feature_sets[i] for i in test_indices]
train_set, test_set = last_name_train_set, last_name_test_set
ethnicity_classifier_last_name = nltk.NaiveBayesClassifier.train(last_name_train_set)

In [32]:
# First-name ethnicity model. Rows whose mother tongue is Konkani, English, Others
# or Kutchi are excluded before building (feature dict, mother-tongue group) pairs.
first_name_feature_sets = [(ethnicity_features(i[0], kind='first'), i[1]) for i in names[~names['Mother Tongue'].isin(['Konkani', 'English', 'Others', 'Kutchi'])][['first_name', 'Mother Tongue']].values]
# 80/20 split over positions. A dedicated, seeded generator makes the split
# identical on every run; range() works on Python 2 and 3 (xrange is 2-only).
train_indices = random.Random(42).sample(range(len(first_name_feature_sets)), int(0.8*len(first_name_feature_sets)))
train_indices_set = set(train_indices)
test_indices = [i for i in range(len(first_name_feature_sets)) if i not in train_indices_set]
# Model-specific names for this split, alongside the generic train_set / test_set
# that the evaluation cells read.
first_name_train_set = [first_name_feature_sets[i] for i in train_indices]
first_name_test_set = [first_name_feature_sets[i] for i in test_indices]
train_set, test_set = first_name_train_set, first_name_test_set
ethnicity_classifier_first_name = nltk.NaiveBayesClassifier.train(first_name_train_set)

In [33]:
# NOTE(review): in notebook order `test_set` was last rebound by the first-name cell
# (In [32]), so this scores the last-name classifier on first-name feature sets rather
# than on its own held-out split. Evaluate before In [32] runs, or keep a separate
# last-name test set, to get the intended figure.
nltk.classify.accuracy(ethnicity_classifier_last_name, test_set)

0.5068785578747628

In [34]:
# NOTE(review): in notebook order `train_set` was last rebound by the first-name cell
# (In [32]), so this scores the last-name classifier on the first-name training split,
# not on the data it was trained on.
nltk.classify.accuracy(ethnicity_classifier_last_name, train_set)

0.5246679316888045

In [35]:
# Accuracy of the first-name ethnicity classifier on its held-out split
# (`test_set` was last assigned in the first-name training cell).
nltk.classify.accuracy(ethnicity_classifier_first_name, test_set)

0.648719165085389

In [36]:
# Accuracy of the first-name ethnicity classifier on its own training split,
# for comparison with the held-out figure.
nltk.classify.accuracy(ethnicity_classifier_first_name, train_set)

0.7077205882352942

In [40]:
def get_likely_ethnicity(f, l, confounding_surname=False):
    """Combine first-name and last-name class probabilities into one verdict.

    Parameters
    ----------
    f : dict
        Label -> probability from the first-name classifier.
    l : dict
        Label -> probability from the last-name classifier; must contain
        every label present in ``f``.
    confounding_surname : bool
        When True the first name is weighted 0.65 and the last name 0.35;
        otherwise the weights are 0.25 and 0.75.

    Returns
    -------
    tuple
        ``(label, combined, confidence)`` where ``combined`` maps each label
        of ``f`` to its weighted probability and ``confidence`` is one of
        'Very Confident', 'Confident', 'Somewhat Confident', 'Not Confident'.
    """
    top_last = max(l, key=l.get)
    top_first = max(f, key=f.get)

    if confounding_surname:
        w_first, w_last = 0.65, 0.35
    else:
        w_first, w_last = 0.25, 0.75
    comb = {label: w_first*f[label] + w_last*l[label] for label in f}

    # Grade the strongest combined probability.
    best = max(comb.values())
    conf = 'Not Confident'
    for threshold, verdict in ((0.8, 'Confident'), (0.5, 'Somewhat Confident')):
        if best > threshold:
            conf = verdict
            break

    # A near-certain surname wins outright unless the surname is confounding.
    if not confounding_surname and max(l.values()) > 0.98:
        return top_last, comb, 'Very Confident'
    # Both classifiers agree on the label.
    if top_last == top_first:
        return top_last, comb, conf
    # Otherwise the combined distribution decides.
    return max(comb, key=comb.get), comb, conf

In [43]:
# Example: combine the first-name and last-name models for a single name.
name = 'Pankhuri Goel'
# Read probabilities through the distribution's public API rather than its
# private `_prob_dict` (which stores log2 values).
first_name_dist = ethnicity_classifier_first_name.prob_classify(ethnicity_features(name, kind='first'))
first_name_stats = {label: first_name_dist.prob(label) for label in first_name_dist.samples()}

last_name_dist = ethnicity_classifier_last_name.prob_classify(ethnicity_features(name, kind='last'))
last_name_stats = {label: last_name_dist.prob(label) for label in last_name_dist.samples()}

# Surnames treated as confounding, which shifts the weighting toward the first name.
# capitalize() makes the lookup independent of how the name was typed.
is_counfounding = {'Kumar', 'Singh'}
likely_ethnicity, comb_prob, conf = get_likely_ethnicity(first_name_stats, last_name_stats,
                                                         confounding_surname = name.split()[-1].capitalize() in is_counfounding)

# Single-argument print(...) and .items() behave the same on Python 2 and 3.
print(sorted(first_name_stats.items(), reverse=True, key=lambda x: x[1]))
print(sorted(last_name_stats.items(), reverse=True, key=lambda x: x[1]))
print(sorted(comb_prob.items(), reverse=True, key=lambda x: x[1]))

print('Ethnicity: %s' % likely_ethnicity)
print('Confidence: %s' % conf)

[('West Indian', 0.9754985095171822), ('South Indian', 0.019688959068685816), ('North Indian', 0.004173894672426308), (u'Punjabi', 0.0005825211122566047), ('East Indian', 5.611562945147949e-05)]
[('North Indian', 0.7152946124510926), ('West Indian', 0.284636837455821), (u'Punjabi', 5.223433796380833e-05), ('East Indian', 9.922652964244658e-06), ('South Indian', 6.3931021559281975e-06)]
[('North Indian', 0.537514433006426), ('West Indian', 0.4573522554711613), ('South Indian', 0.004927034593788401), (u'Punjabi', 0.00018480603153700744), ('East Indian', 2.1470897086053367e-05)]
Ethnicity: North Indian
Confidence: Somewhat Confident


Performance is bad for short surnames like Wahi, Bhat, Vig, Saran, Kanoi, Dewan, Nehru etc. Only major flaw right now.

- get more and better data (!)
- change feature definitions to take care of these 'edge' cases

A better method could be used to get the ethnicity (e.g. a random forest) instead of the manually defined combination rules.

In [44]:
import pickle

In [45]:
# Persist the trained gender classifier; the context manager closes the file
# handle, which the bare open(...) call left open.
with open('models/gender_classifier.pkl', 'wb') as model_file:
    pickle.dump(gender_classifier, model_file)

In [46]:
# Persist the trained religion classifier; the context manager closes the file
# handle, which the bare open(...) call left open.
with open('models/religion_classifier.pkl', 'wb') as model_file:
    pickle.dump(religion_classifier, model_file)

In [47]:
# Persist the last-name ethnicity classifier; the context manager closes the
# file handle, which the bare open(...) call left open.
with open('models/ethnicity_classifier_last_name.pkl', 'wb') as model_file:
    pickle.dump(ethnicity_classifier_last_name, model_file)

In [48]:
# Persist the first-name ethnicity classifier; the context manager closes the
# file handle, which the bare open(...) call left open.
with open('models/ethnicity_classifier_first_name.pkl', 'wb') as model_file:
    pickle.dump(ethnicity_classifier_first_name, model_file)