#### About
RNNs for classifying surname language of origin

Dataset Link - https://www.kaggle.com/datasets/sinclairg/surname-language-of-origin

In [3]:
#mandatory imports
import glob
import os
import unicodedata
import io
import string
import pandas as pd

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir('/content/drive/MyDrive/Datasets')
!unzip archive.zip

Archive:  archive.zip
  inflating: data/eng-fra.txt        
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [5]:
all_letters = string.ascii_letters +".,;':"
print(all_letters)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,;':


In [6]:
print(len(all_letters))

57


In [7]:
#helper fuinctions -https://www.kaggle.com/code/sinclairg/pytorch-rnn-name-classification
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )


# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [10]:
surname_class = {}
origins = []
for filename in glob.glob('/content/drive/MyDrive/Datasets/data/names/*.txt'):
    origin = filename.split('/')[-1].split('.')[0]
    origins.append(origin)
    lines = readLines(filename)
    for surname in lines:
        surname_class[surname] = origin

print(len(surname_class))

17416


In [11]:
print("{} different origins are available".format(len(origins)))

18 different origins are available


In [12]:
surname_class

{'Khoury': 'Arabic',
 'Nahas': 'Arabic',
 'Daher': 'Arabic',
 'Gerges': 'Arabic',
 'Nazari': 'Arabic',
 'Maalouf': 'Arabic',
 'Naifeh': 'Arabic',
 'Guirguis': 'Arabic',
 'Baba': 'Japanese',
 'Sabbagh': 'Arabic',
 'Attia': 'Arabic',
 'Tahan': 'Arabic',
 'Haddad': 'Arabic',
 'Aswad': 'Arabic',
 'Najjar': 'Arabic',
 'Dagher': 'Arabic',
 'Maloof': 'Arabic',
 'Isa': 'Arabic',
 'Asghar': 'Arabic',
 'Nader': 'Arabic',
 'Gaber': 'Arabic',
 'Abboud': 'Arabic',
 'Zogby': 'Arabic',
 'Srour': 'Arabic',
 'Bahar': 'Arabic',
 'Mustafa': 'English',
 'Hanania': 'Arabic',
 'Tuma': 'Arabic',
 'Saliba': 'Arabic',
 'Shamoon': 'Arabic',
 'Handal': 'Arabic',
 'Amari': 'Arabic',
 'Atiyeh': 'Arabic',
 'Said': 'English',
 'Khouri': 'Arabic',
 'Sleiman': 'Arabic',
 'Seif': 'Arabic',
 'Harb': 'Arabic',
 'Asker': 'Arabic',
 'Antar': 'Arabic',
 'Awad': 'English',
 'Shadid': 'Arabic',
 'Hajjar': 'Arabic',
 'Kalb': 'German',
 'Bazzi': 'Arabic',
 'Masih': 'Arabic',
 'Ghanem': 'Arabic',
 'Antoun': 'Arabic',
 'Sarraf': 

In [13]:
dataset = pd.DataFrame.from_dict(surname_class, orient="index").reset_index()
dataset.columns = ['surname', 'origin']


In [14]:
dataset

Unnamed: 0,surname,origin
0,Khoury,Arabic
1,Nahas,Arabic
2,Daher,Arabic
3,Gerges,Arabic
4,Nazari,Arabic
...,...,...
17411,Truong,Vietnamese
17412,Van,Vietnamese
17413,Vinh,Vietnamese
17414,Vuong,Vietnamese


## Approach
1. Since, we have 18 countries so we will make a softmax for 18 classes and feed letters in surname one by one to each RNN cell to predict it.
2. We will convert each char in surname to ASCII index and then pass it through an embedding layer prior passing it to RNN cells.

In [15]:
# pytorch imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



In [16]:
#creating dataset
class NameDataset(Dataset):
    def __init__(self,dataframe):
        self.dataframe = dataframe
        self.origin = self.dataframe['origin'].values.tolist()
        self.origin_list = list(sorted(set(self.origin)))
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        surname = self.dataframe['surname'][index]
        origin = self.dataframe['origin'][index]
        item = {'surname':surname, 'origin':origin}
        return item
    
    def get_origins(self):
        return self.origin_list

    def origin_id(self,origin):
        return self.origin_list.index(origin)
    
    def get_origin(self,id):
        return self.origin_list[id] 

In [17]:
training_dataset = NameDataset(dataset)
training_dataset[0]


{'surname': 'Khoury', 'origin': 'Arabic'}

In [18]:
print(training_dataset.get_origin(1))
print(training_dataset.get_origin(0))

Chinese
Arabic


In [19]:
print(training_dataset.get_origins())

['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']


In [20]:
print(training_dataset.origin_id('Russian'))

14


In [26]:

#creating data loader
dataloader = DataLoader(training_dataset, batch_size=4096,shuffle=True)

In [27]:
for batch in dataloader:
    print(batch['surname'], batch['origin'])
    break

['Hlopiev', 'Zhivoderov', 'Moh', 'Vantonder', 'Oom', 'Kerwar', 'Abzaev', 'Lebinson', 'Yam', 'Westwood', 'Loong', 'Adzhalov', 'Nyashin', 'Bektabegov', 'Makhmutov', 'Avdienko', 'Balazovsky', 'Pakhunov', 'Wall', 'Glenn', 'Aganbegyan', 'Veitch', 'Ryjkin', 'Travert', 'Eccleston', 'Vesninov', 'Airey', 'Blahut', 'Schoonenburg', 'Jandarbiev', 'Vaipan', 'Bakalov', 'Farrelly', 'Isaacs', 'Jagich', 'Maughan', 'Abelyan', 'Jekov', 'Dzhanibekov', 'Barrett', 'Meeuwe', 'Zhurin', 'Awelicheff', 'Schumacher', 'Lokhmatikov', 'Gomatos', 'Tanaka', 'Lumb', 'Tuvin', 'Dang', 'Oinuma', 'Kaberman', 'Munkata', 'Nikolaou', 'Zasukhin', 'Carr', 'Vinding', 'Zenger', 'Hunter', 'Bakirov', "Awak'Yan", 'Yalovets', 'Mohamed', 'Awtomovitch', 'Avik', 'Kelliher', 'Fabri', 'Hinchuk', 'Juzva', 'Abyzov', 'Ablesimov', 'Crocker', 'Duff', 'Hamzin', 'Zhdanov', 'Pahmutov', 'Bezubyak', 'Monfort', 'Veligura', 'Bahmetiev', 'Muklevich', 'Tillett', 'Shalyto', 'Langlois', 'Prokoshkin', 'Golobokov', 'Sinha', 'Hanbikov', 'Trickett', 'Makino'

In [44]:
# Hyperparams
HIDDEN_SIZE = 100
N_LAYERS = 4
BATCH_SIZE = 256
N_EPOCHS = 100
N_CHARS = 128  # ASCII


In [45]:
def origin_to_tensor(origins):
    origin_ids = [training_dataset.origin_id(
        origin) for origin in origins]
    return torch.LongTensor(origin_ids)

def str2ascii_arr(msg):
    arr = [ord(c) for c in msg]
    return arr, len(arr)


In [46]:
# pad sequences 
def pad_sequences(vectorized_seqs,seq_lens, origins):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lens.max())).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lens)):
        seq_tensor[idx,:seq_len] = torch.LongTensor(seq)

    # sorting tensors by length
    seq_lens, perm_idx = seq_lens.sort(0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    #print(seq_tensor)

    #sorting the target in same order
    #print(origins)
    target = origin_to_tensor(origins)
    if len(origins):
        target = target[perm_idx]

    return seq_tensor,seq_lens,target

In [47]:
def generate_io(surnames,origins):
    sequence_and_len = [str2ascii_arr(surname) for surname in surnames]
    vectorized_sequence = [item[0] for item in sequence_and_len]
    seq_lens = torch.LongTensor([item[1] for item in sequence_and_len])
    #print(vectorized_sequence, seq_lens)
    return (pad_sequences(vectorized_sequence,seq_lens, origins))

    

In [48]:
training_dataset.__getitem__(1)

{'surname': 'Nahas', 'origin': 'Arabic'}

In [49]:
for i, batch in enumerate(dataloader):
    surname , origin = batch['surname'],batch['origin'] 
    seq_tensor, seq_lens, target = generate_io(surname,origin)
    print(seq_tensor,seq_lens,target)
    if i ==3:
        break

tensor([[ 83, 104, 105,  ..., 116, 111, 118],
        [ 77,  97,  99,  ...,  97,   0,   0],
        [ 66, 101, 107,  ..., 118,   0,   0],
        ...,
        [ 86, 117,   0,  ...,   0,   0,   0],
        [ 73, 105,   0,  ...,   0,   0,   0],
        [ 82, 105,   0,  ...,   0,   0,   0]]) tensor([19, 17, 17,  ...,  2,  2,  2]) tensor([14,  8, 14,  ..., 17, 10, 11])
tensor([[ 66, 101, 107,  ..., 115, 107, 121],
        [ 71, 111, 114,  ...,   0,   0,   0],
        [ 66, 101, 115,  ...,   0,   0,   0],
        ...,
        [ 83, 111,   0,  ...,   0,   0,   0],
        [ 75, 111,   0,  ...,   0,   0,   0],
        [ 76, 121,   0,  ...,   0,   0,   0]]) tensor([18, 15, 15,  ...,  2,  2,  2]) tensor([14, 14, 14,  ..., 11, 11, 17])
tensor([[ 67, 104, 114,  ..., 108, 111, 115],
        [ 66,  97, 105,  ..., 107, 111,   0],
        [ 80,  97, 114,  ..., 111, 115,   0],
        ...,
        [ 77,  97,   0,  ...,   0,   0,   0],
        [ 83, 105,   0,  ...,   0,   0,   0],
        [ 89, 105,   

In [50]:
# creating our RNN classifier
class RNNClassifier(nn.Module):

    def __init__(self,input_size,hidden_size, output_size,n_layers=1,bidirectional=True):
        super().__init__()
        self.hidden_size=hidden_size
        self.n_layers = n_layers
        self.n_directions = int(bidirectional)+1

        self.embedding_layer = nn.Embedding(input_size,hidden_size)
        self.gru_layer = nn.GRU(hidden_size, hidden_size,n_layers,bidirectional=bidirectional)
        self.fc_layer = nn.Linear(hidden_size,output_size)

    def forward(self,input,seq_lens):
        # we shall run this all at once 
        # initial input shape = B,S 
        # transpose= Seq, Batch
        input = input.t()
        batch = input.size(1)

        hidden = self._init_hidden(batch).cuda()
        # embedding = S,B to S, B, I (embedding sie)
        embedding = self.embedding_layer(input)

        #packing them
        gru_input = pack_padded_sequence(embedding,seq_lens.data.cpu().numpy())

        self.gru_layer.flatten_parameters()
        output,hidden = self.gru_layer(gru_input,hidden)

        # using last layer output as FC input
        fc_output = self.fc_layer(hidden[-1])
        return fc_output

    def _init_hidden(self,batch_size):
        hidden = torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size)
        return hidden


In [51]:
N_ORIGINS=18
model = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_ORIGINS, N_LAYERS)

In [52]:
model

RNNClassifier(
  (embedding_layer): Embedding(128, 100)
  (gru_layer): GRU(100, 100, num_layers=4, bidirectional=True)
  (fc_layer): Linear(in_features=100, out_features=18, bias=True)
)

#### For multi-gpu

``` 
if torch.cuda.device_count() > 1:
         print("Let's use", torch.cuda.device_count(), "GPUs!")
         # dim = 0 [3, xxx] -> [1, ...], [1, ...], [1, ...] on 3 GPUs
         model = nn.DataParallel(model)

         ```

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)



cuda


## Training Phase

In [61]:
total_loss =0
num_epochs=1000
optimizer = torch.optim.SGD(model.parameters(),lr=0.000000001) #tried with ADAM too.
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for i, batch  in enumerate(dataloader):
        surname = batch['surname']
        origin = batch['origin']
        #generating padded inputs-outputs
        input,seq_lens,target = generate_io(surname, origin)
        #sending to device
        input = input.to(device)
        seq_lens = seq_lens.to(device)
        target = target.to(device)
        #generating output and calculating loss
        output = model(input,seq_lens)
        loss = criterion(output,target)
        total_loss+=loss.item()
        #resetting grads, backpropagating and updating weights
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    
    print("Epoch - {}, Loss - {}".format(epoch,total_loss))

Epoch - 0, Loss - 6.372887134552002
Epoch - 1, Loss - 12.825802206993103
Epoch - 2, Loss - 19.206522941589355
Epoch - 3, Loss - 25.64569902420044
Epoch - 4, Loss - 32.03822720050812
Epoch - 5, Loss - 38.485339641571045
Epoch - 6, Loss - 44.90578806400299
Epoch - 7, Loss - 51.256033062934875
Epoch - 8, Loss - 57.68836486339569
Epoch - 9, Loss - 64.04503798484802
Epoch - 10, Loss - 70.4618991613388
Epoch - 11, Loss - 76.87907040119171
Epoch - 12, Loss - 83.22955131530762
Epoch - 13, Loss - 89.63880515098572
Epoch - 14, Loss - 96.02927672863007
Epoch - 15, Loss - 102.4140555858612
Epoch - 16, Loss - 108.85346937179565
Epoch - 17, Loss - 115.23891568183899
Epoch - 18, Loss - 121.71614396572113
Epoch - 19, Loss - 128.13973927497864
Epoch - 20, Loss - 134.62423133850098
Epoch - 21, Loss - 141.07040238380432
Epoch - 22, Loss - 147.51491928100586
Epoch - 23, Loss - 153.88260507583618
Epoch - 24, Loss - 160.31077992916107
Epoch - 25, Loss - 166.72107899188995
Epoch - 26, Loss - 173.152480602264

# Remarks
This is a basic model and hence bound to perform poorly.


## Testing

In [63]:
input_name = "SURAJ"
model.eval()
input,seq_lens,target = generate_io([input_name],[])
input = input.to(device)
seq_lens = seq_lens.to(device)
target = target.to(device)
output = model(input,seq_lens)
pred = output.data.max(1,keepdim=True)[1]
origin_id = pred.cpu().numpy()[0][0]
print("Predicted origin is {}".format(training_dataset.get_origin(origin_id)))

Predicted origin is English


In [64]:
input_name = "JUNGWOO"
model.eval()
input,seq_lens,target = generate_io([input_name],[])
input = input.to(device)
seq_lens = seq_lens.to(device)
target = target.to(device)
output = model(input,seq_lens)
pred = output.data.max(1,keepdim=True)[1]
origin_id = pred.cpu().numpy()[0][0]
print("Predicted origin is {}".format(training_dataset.get_origin(origin_id)))

Predicted origin is English
