In [1]:
import os
import numpy as np
import unicodedata
import string
import torch 
import torch.nn as nn

In [3]:
# Data preprocessing.
# Alphabet of characters kept in names: ASCII letters plus space and a few
# punctuation marks. A character's position in this string is its one-hot index.
all_letters = string.ascii_letters + " .,:'"
# Alphabet size, i.e. the width of each one-hot character vector.
n_letters = len(all_letters)


In [5]:
def unicodeToAscii(letter, allowed=None):
    """Reduce a Unicode string to the characters of the allowed alphabet.

    The text is NFD-normalised so that accented characters split into a base
    character plus combining marks. Combining marks (Unicode category ``Mn``)
    are dropped, and only characters found in ``allowed`` are kept. With the
    default alphabet this also removes newlines.

    Args:
        letter: The string to clean (despite the name, a whole name or line
            may be passed).
        allowed: Characters to keep. Defaults to the module-level
            ``all_letters``.

    Returns:
        The cleaned string; it may be empty if no character survives.
    """
    if allowed is None:
        allowed = all_letters
    return ''.join(
        ch for ch in unicodedata.normalize('NFD', letter)
        # category() must be called per character; comparing the function
        # object itself to 'Mn' is always true and filters nothing.
        if unicodedata.category(ch) != 'Mn' and ch in allowed
    )

In [16]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and platform.
sorted(os.listdir('./datasets/RNN_Name_data'))

['Arabic.txt',
 'Chinese.txt',
 'Czech.txt',
 'Dutch.txt',
 'English.txt',
 'French.txt',
 'German.txt',
 'Greek.txt',
 'Irish.txt',
 'Italian.txt',
 'Japanese.txt',
 'Korean.txt',
 'Polish.txt',
 'Portuguese.txt']

In [40]:
# Load every "<Country>.txt" file: one name per line, labelled with the
# file's base name.
data_dir = "./datasets/RNN_Name_data"
all_names = []
all_country = []
# os.listdir order is arbitrary; sorting keeps sample indices reproducible.
for fname in sorted(os.listdir(data_dir)):
    country, ext = os.path.splitext(fname)
    if ext.lower() != ".txt":
        # Skip stray entries (e.g. checkpoint directories) that are not data files.
        continue
    # The context manager closes the handle. The explicit encoding avoids
    # depending on the platform default (assumes the files are UTF-8 -- TODO confirm).
    with open(os.path.join(data_dir, fname), "r", encoding="utf-8") as fh:
        cleaned = [unicodeToAscii(line) for line in fh]
    # Blank lines, or names with no representable character, clean to "" and
    # would become zero-length sequences; drop them.
    cleaned = [name for name in cleaned if name]
    all_names.extend(cleaned)
    all_country.extend([country] * len(cleaned))
    

In [42]:
# Number of names in the dataset (all_country holds one label per name).
n_rows = len(all_names)

In [43]:
# One-hot lookup table: row i encodes the i-th character of all_letters.
emb = torch.eye(n_letters)
# Give each distinct country a consecutive class id, in np.unique's sorted order.
mapping = {country: class_id for class_id, country in enumerate(np.unique(all_country))}

In [44]:
# Inspect the country -> class-id mapping.
mapping

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13}

In [50]:
def get_data(idx):
    """Return the one-hot encoded name and the class label for sample ``idx``.

    Args:
        idx: Position of the sample in ``all_names`` / ``all_country``.

    Returns:
        A tuple ``(chars, label)``. ``chars`` holds one row of ``emb`` per
        character of the name that occurs in ``all_letters``, in order, so
        its shape is ``(n_chars, n_letters)``. ``label`` is a 0-dim tensor
        holding ``mapping[country]``.
    """
    name = all_names[idx]
    country = all_country[idx]
    # Look each character up directly in the alphabet instead of rebuilding a
    # NumPy array of the alphabet and broadcasting a comparison on every call.
    positions = (all_letters.find(ch) for ch in name)
    # Characters outside the alphabet (find() == -1) are skipped. An empty
    # name gives an empty index tensor and therefore a (0, n_letters) result.
    indexes = torch.tensor([pos for pos in positions if pos >= 0], dtype=torch.long)
    return emb[indexes], torch.tensor(mapping[country])

In [52]:
# Sanity check: encode the first sample and display the result.
get_data(0)

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.,

In [53]:
class NeuralNet(nn.Module):
    """Recurrent classifier: an ``nn.RNN`` layer followed by a linear layer."""

    def __init__(self, n_country, n_letters):
        """Create the layers.

        Args:
            n_country: Number of output scores produced by the linear layer.
            n_letters: Size of each input vector; the RNN hidden size is
                twice this value.
        """
        super().__init__()
        hidden_size = 2 * n_letters
        self.rnn = nn.RNN(n_letters, hidden_size)
        self.fc = nn.Linear(hidden_size, n_country)

    def forward(self, x):
        """Run ``x`` through the RNN and score its last output step.

        Args:
            x: Input sequence whose first dimension is the sequence axis
                (``nn.RNN`` is created with its default ``batch_first=False``).

        Returns:
            The linear layer applied to the final entry of the RNN output
            along the first dimension.
        """
        rnn_out, _hidden = self.rnn(x)
        last_step = rnn_out[-1]
        return self.fc(last_step)

In [54]:
# One output score per distinct country; inputs are one-hot vectors of width n_letters.
model = NeuralNet(len(np.unique(all_country)), n_letters)

In [55]:
# Display the layer summary.
model

NeuralNet(
  (rnn): RNN(57, 114)
  (fc): Linear(in_features=114, out_features=14, bias=True)
)

In [58]:
# Cross-entropy between the model's class scores and the target class id.
loss_func = nn.CrossEntropyLoss()
# Plain SGD over all model parameters with a fixed learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr= 0.01)

In [63]:
# Train one sample at a time, visiting every sample once per epoch in a
# fresh random order.
epochs = 25
all_losses = []  # summed per-sample loss for each epoch
for epoch in range(epochs):
    arr = np.random.permutation(n_rows)
    # Accumulate as a Python float. Adding loss.detach().numpy() keeps the
    # running sum in NumPy float32 (torch's default dtype), which loses
    # precision over many samples and fails for tensors not on the CPU.
    epoch_loss = 0.0
    for ind in arr:
        data, target = get_data(ind)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    all_losses.append(epoch_loss)