In this file, we will work on how we can classify the nationalities of people by using their names. There is a lot about how we can play with names.

In [1]:
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import os
import re

In [2]:
# f_url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/Indian-Female-Names.csv"
# m_url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/Indian-Male-Names.csv"

# male_data = pd.read_csv(m_url)
# female_data = pd.read_csv(f_url)

male_data = pd.read_csv("nationality/male_names.csv")
female_data = pd.read_csv("nationality/female_names.csv")

In [3]:
female_data.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


Now we are creating helper functions for data cleaning and processing 

In [4]:
repl_list = ['s/o','d/o','w/o','/','&',',','-']

def clean_data(name):
    name = str(name).lower()
    name = (''.join(i for i in name if ord(i)<128)).strip()
    for repl in repl_list:
        name = name.replace(repl," ")
    if '@' in name:
        pos = name.find('@')
        name = name[:pos].strip()
    name = name.split(" ")
    name = " ".join([each.strip() for each in name])
    return name

In [5]:
def remove_records(merged_data):
    merged_data['delete'] = 0
    merged_data.loc[merged_data['name'].str.find('with') != -1,'delete'] = 1
    merged_data.loc[merged_data['count_words']>=5,'delete']=1
    merged_data.loc[merged_data['count_words']==0,'delete']=1
    merged_data.loc[merged_data['name'].str.contains(r'\d') == True,'delete']=1
    cleaned_data = merged_data[merged_data.delete==0]
    return cleaned_data

In [6]:
merged_data = pd.concat((male_data,female_data),axis=0)

In [7]:
merged_data['name'] = merged_data['name'].apply(clean_data)
merged_data['count_words'] = merged_data['name'].str.split().apply(len)

In [8]:
cleaned_data = remove_records(merged_data)

In [9]:
indian_cleaned_data = cleaned_data[['name','count_words']].drop_duplicates(subset='name',keep='first')
indian_cleaned_data['label'] = 'indian'

In [10]:
len(indian_cleaned_data)

13754

After loading and removing the wrong entries in the data, we got a few records around 13,000.

In [11]:
cleaned_data.head()

Unnamed: 0,name,gender,race,count_words,delete
0,barjraj,m,indian,1,0
1,ramdin verma,m,indian,2,0
2,sharat chandran,m,indian,2,0
3,birender mandal,m,indian,2,0
4,amit,m,indian,1,0


In [12]:
merged_data.race.value_counts()

indian    30227
Name: race, dtype: int64

In [13]:
indian_cleaned_data.head()

Unnamed: 0,name,count_words,label
0,barjraj,1,indian
1,ramdin verma,2,indian
2,sharat chandran,2,indian
3,birender mandal,2,indian
4,amit,1,indian


Lets create some non-Indian names using Faker - a pretty cool package to generate realistic names from different regions.

In [17]:
# pip install faker

In [18]:
from faker import Faker
fake = Faker("en_US")
fake.name()

'Alexandra Bennett'

In [19]:
from faker import Faker
import random
req = 15000
non_indian_names = []

langs = ['ar_EG','bs_BA','de_DE','dk_DK','en_AU','en_CA','en_GB',
'en_IN','en_NZ','en_US','it_IT','no_NO','ro_RO']

for i in range(0,req):
    lng_indx = random.randint(0,len(langs)-1)
    fake = Faker(langs[lng_indx])
    non_indian_names.append(fake.name().lower())

In [20]:
non_indian_names_orig = list(set(non_indian_names))

In [21]:
len(non_indian_names_orig)

14664

In [22]:
non_indian_data = pd.DataFrame({'name':non_indian_names_orig})
non_indian_data['count_words'] = non_indian_data['name'].str.split().apply(len)

We have generated approximately the same number of names as we have in the Indian data set. We then removed samples longer than 5 words. The Indian data set contained a lot of names with just first names. So we need to make the overall non-Indian distribution also similar.

In [23]:
non_indian_data.head()

Unnamed: 0,name,count_words
0,reece nash,2
1,veronica harris,2
2,suzanne green-garner,2
3,derek eaton,2
4,melissa harrison,2


Lets check the distribution of count of words in names. We dont want them to be too different

In [24]:
indian_cleaned_data['count_words'].value_counts()

2    7954
1    4322
3    1344
4     134
Name: count_words, dtype: int64

In [25]:
non_indian_data['count_words'].value_counts()

2    13058
3     1422
4      179
5        5
Name: count_words, dtype: int64

In [26]:
two_word_names = non_indian_data[non_indian_data['count_words']==2]['name']
one_word_req = 5000
names_one_two_words = [each.split()[0] for each in two_word_names[:one_word_req]] + list(two_word_names[one_word_req:])
count_words = [1] * one_word_req + [2] * len(two_word_names[one_word_req:])
not_two_words_pd  = non_indian_data[non_indian_data['count_words']!=2]
one_two_words_pd = pd.DataFrame({'name':names_one_two_words,'count_words':count_words})
non_indian_data = pd.concat((not_two_words_pd,one_two_words_pd),axis=0)
non_indian_data['count_words'].value_counts()
non_indian_data['label'] = 'non_indian'
non_indian_data = non_indian_data[non_indian_data['count_words']<5]
non_indian_data['count_words'].value_counts()

2    8058
1    5000
3    1422
4     179
Name: count_words, dtype: int64

In [27]:
full_data = pd.concat((non_indian_data[['name','label']],indian_cleaned_data[['name','label']]),axis=0)
name_data = full_data.sample(frac=1)

# full_data.to_csv("name_data.csv",index=False)
# from google.colab import files
# files.download('name_data.csv')

name_data.head()

Unnamed: 0,name,label
10640,hr frederik sørensen,non_indian
8752,sig. giampaolo bosio,non_indian
227,rinukanwr,indian
6921,austin torres,non_indian
548,raju garg,indian


In [28]:
# data_url = "https://raw.githubusercontent.com/ashavish/name-nationality/master/data/name_data.csv"
# name_data = pd.read_csv(data_url)

In [29]:
name_data['label'].value_counts()

non_indian    14659
indian        13754
Name: label, dtype: int64

We end up with about 14,000 non-Indian names and 13,000 Indian names. Now let’s build a neural network to classify nationalities using names:

In [30]:
from sklearn.model_selection import train_test_split
X = name_data['name'].astype(str)
Y = name_data['label']
train_names,test_names,train_labels,test_labels = train_test_split(X,Y,test_size=0.20,random_state =42,stratify=Y)

### Naive Bayes with Count Vectorizer for name classification

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report

vectorizer = CountVectorizer()
X_ = vectorizer.fit_transform(train_names.values.astype('U'))
len(vectorizer.get_feature_names())

11734

In [32]:
model = MultinomialNB()
model.fit(X_,train_labels)

X_test = vectorizer.transform(test_names.values.astype('U'))

test_predicted = model.predict(X_test)

print(classification_report(test_labels,test_predicted))

              precision    recall  f1-score   support

      indian       0.98      0.76      0.85      2751
  non_indian       0.81      0.99      0.89      2932

    accuracy                           0.87      5683
   macro avg       0.90      0.87      0.87      5683
weighted avg       0.89      0.87      0.87      5683



### Testing on new Names
Lets create some names which are not present in the data at all and check the model on these names

In [33]:
check_new_names = ['lalitha','tyson','shailaja','shyamala','vishwanathan','ramanujam','conan','kryslovsky',
'ratnani','diego','kakoli','shreyas','brayden','shanon']

X_new = vectorizer.transform(check_new_names)
predictions_nb_cv = model.predict(X_new)
test = pd.DataFrame({'names':check_new_names,'predictions_nb_cv':predictions_nb_cv}) 
test

Unnamed: 0,names,predictions_nb_cv
0,lalitha,non_indian
1,tyson,non_indian
2,shailaja,non_indian
3,shyamala,non_indian
4,vishwanathan,non_indian
5,ramanujam,non_indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,non_indian
9,diego,non_indian


Doesnt do well at all ! But thats expected. Now lets try with subword encoding

### Naive Bayes with SentencePiece Embedding

In [35]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 6.4 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [36]:
from tokenizers import ByteLevelBPETokenizer,CharBPETokenizer,SentencePieceBPETokenizer,BertWordPieceTokenizer


f = open("train_names.txt","w")
for each in list(train_names):
    f.write(str(each))
    f.write("\n")

f.close()

In [37]:
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["./train_names.txt"],vocab_size=2000,min_frequency=2)

encoded_tokens = [tokenizer.encode(str(each)).tokens for each in train_names]
encoded_tokens_test = [tokenizer.encode(str(each)).tokens for each in test_names]

encoded_tokens = [" ".join(each)  for each in encoded_tokens]
encoded_tokens_test = [" ".join(each)  for each in encoded_tokens_test]

encoded_tokens[:10]

['▁raju ▁d ass',
 '▁hr ▁b ørg e ▁lar s en',
 '▁l og an ▁mar sh al l',
 '▁parm ila ▁son a',
 '▁b ald ev',
 '▁sam s un g ▁s ▁singh',
 '▁na om i ▁kir k',
 '▁rita ▁rasm uss en',
 '▁ka ish av',
 '▁brand y ▁mor al es ▁md']

In [38]:
tfidf_vect = TfidfVectorizer()
X_ = tfidf_vect.fit_transform(encoded_tokens)
len(tfidf_vect.get_feature_names())

model = MultinomialNB()
model.fit(X_,train_labels)

X_test = tfidf_vect.transform(encoded_tokens_test)

test_predicted = model.predict(X_test)

print(classification_report(test_labels,test_predicted))

              precision    recall  f1-score   support

      indian       0.91      0.94      0.92      2751
  non_indian       0.94      0.91      0.92      2932

    accuracy                           0.92      5683
   macro avg       0.92      0.92      0.92      5683
weighted avg       0.92      0.92      0.92      5683



Pretty decent. Now lets check on some new words

In [39]:
encoded_tokens_check = [tokenizer.encode(str(each).lower()).tokens for each in check_new_names]
encoded_tokens_check = [" ".join(each)  for each in encoded_tokens_check]

X_new = tfidf_vect.transform(encoded_tokens_check)
predictions_nb_enc_tf = model.predict(X_new)
test = pd.DataFrame({'names':check_new_names,'predictions_nb_enc_tf':predictions_nb_enc_tf}) 
test

Unnamed: 0,names,predictions_nb_enc_tf
0,lalitha,indian
1,tyson,non_indian
2,shailaja,indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,non_indian


### Character based encoding with LSTM model

In [42]:
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import Callback
np.random.seed(42)

In [43]:
def char_encoded_representation(data,tokenizer,vocab_size,max_len):
    char_index_sentences = tokenizer.texts_to_sequences(data)
    sequences = [to_categorical(x, num_classes=vocab_size) for x in char_index_sentences]
    X = sequence.pad_sequences(sequences, maxlen=max_len)
    return X

In [44]:
max_len = max([len(str(each)) for each in train_names])
# mapping = get_char_mapping(train_names)
# vocab_size = len(mapping)

tok = Tokenizer(char_level=True)
tok.fit_on_texts(train_names)
vocab_size = len(tok.word_index) + 1
X_train = char_encoded_representation(train_names,tok,vocab_size,max_len)
X_train.shape

(22730, 36, 55)

In [45]:
X_test = char_encoded_representation(test_names,tok,vocab_size,max_len)
X_test.shape

(5683, 36, 55)

In [46]:
le = LabelEncoder()
le.fit(train_labels)
y_train = le.transform(train_labels)
y_test = le.transform(test_labels)

In [47]:
# Model Specification

def build_model(hidden_units,max_len,vocab_size):
    model = Sequential()
    # model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
    model.add(LSTM(hidden_units,input_shape=(max_len,vocab_size)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [48]:
class myCallback(Callback): 
    def __init__(self,X_test,y_test):
        self.X_test = X_test
        self.y_test = y_test
    def on_epoch_end(self, epoch, logs={}): 
        loss,acc = model.evaluate(self.X_test, self.y_test, verbose=0)
        print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))

In [49]:
model = build_model(100,max_len,vocab_size)
model.fit(X_train, y_train, epochs=50, batch_size=64,callbacks=myCallback(X_test,y_test))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               62400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 62,501
Trainable params: 62,501
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50

Testing loss: 0.37326768040657043, acc: 0.8319549560546875

Epoch 2/50

Testing loss: 0.317292183637619, acc: 0.8599331378936768

Epoch 3/50

Testing loss: 0.2842683792114258, acc: 0.8826324343681335

Epoch 4/50

Testing loss: 0.2879863381385803, acc: 0.881928563117981

Epoch 5/50

Testing loss: 0.2609037458896637, acc: 0.8924863338470459

Epoch 6/50

Testing loss: 0.23408614099025726, acc: 0.9030441641807556

Epoch 7/50

Testing loss: 0.22307248413562775, acc: 0.9111384749412537

Epoch 8/50


<keras.callbacks.History at 0x7f699f202e10>

In [50]:
X_predict = char_encoded_representation(check_new_names,tok,vocab_size,max_len)

predictions_prob = model.predict(X_predict)
predictions = np.array(predictions_prob)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
predictions = np.squeeze(predictions)
predictions_lstm_char = le.inverse_transform(list(predictions.astype(int)))
test = pd.DataFrame({'names':check_new_names,'predictions_lstm_char':predictions_lstm_char}) 
test

Unnamed: 0,names,predictions_lstm_char
0,lalitha,indian
1,tyson,non_indian
2,shailaja,indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,non_indian


### SentencePiece Encoding with LSTM

Lets also check with a encoding using the **SentencePiece Encoding** we used for Naive Bayes. But now we will use it with an LSTM with a much smaller vocabulary

In [51]:
from tokenizers import ByteLevelBPETokenizer,CharBPETokenizer,SentencePieceBPETokenizer,BertWordPieceTokenizer
vocab_size = 200

tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["./train_names.txt"],vocab_size=vocab_size,min_frequency=2)

In [52]:
def sent_piece_encoded_representation(data,tokenizer):
    encoded_tokens = [tokenizer.encode(str(each)).ids for each in data]
    sequences = [to_categorical(x, num_classes=vocab_size) for x in encoded_tokens]
    X = sequence.pad_sequences(sequences, maxlen=max_len)
    return X

In [53]:
max_len = max([len(str(each)) for each in train_names])
le = LabelEncoder()
le.fit(train_labels)
y_train = le.transform(train_labels)
y_test = le.transform(test_labels)

In [54]:
X_train = sent_piece_encoded_representation(train_names,tokenizer)
X_train.shape

(22730, 36, 200)

In [55]:
X_test = sent_piece_encoded_representation(test_names,tokenizer)
X_test.shape

(5683, 36, 200)

In [56]:
model = build_model(100,max_len,vocab_size)
model.fit(X_train, y_train, epochs=50, batch_size=64,callbacks=myCallback(X_test,y_test))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 120,501
Trainable params: 120,501
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50

Testing loss: 0.28326326608657837, acc: 0.8852718472480774

Epoch 2/50

Testing loss: 0.2601301968097687, acc: 0.8926623463630676

Epoch 3/50

Testing loss: 0.24539077281951904, acc: 0.9019883871078491

Epoch 4/50

Testing loss: 0.22528895735740662, acc: 0.9088509678840637

Epoch 5/50

Testing loss: 0.21613946557044983, acc: 0.9120182991027832

Epoch 6/50

Testing loss: 0.20123088359832764, acc: 0.9180010557174683

Epoch 7/50

Testing loss: 0.19604331254959106, acc: 0.9211683869361877

Ep

<keras.callbacks.History at 0x7f694bfe9050>

In [57]:
X_predict = sent_piece_encoded_representation(check_new_names,tokenizer)

predictions_prob = model.predict(X_predict)
predictions = np.array(predictions_prob)
predictions[np.where(predictions > 0.5)[0]] = 1
predictions[np.where(predictions <= 0.5)[0]] = 0
predictions = np.squeeze(predictions)

predictions_lstm_sent_enc = le.inverse_transform(list(predictions.astype(int)))

In [58]:
test = pd.DataFrame({'names':check_new_names,'predictions_lstm_sent_enc':predictions_lstm_sent_enc}) 
test

Unnamed: 0,names,predictions_lstm_sent_enc
0,lalitha,indian
1,tyson,non_indian
2,shailaja,indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,indian
