In [1]:
import torch
import os
import numpy as np
import random

In [2]:
path = './names'
files = [file for file in os.listdir(path)]
vocabulary = set()
for file in files:
    with open(os.path.join(path, file), 'r', encoding= 'utf-8') as f:
        for word in f.readlines():
            for char in list(word.strip()):
                vocabulary.add(char)
        
vocabulary = list(vocabulary)
input_dim = len(vocabulary) + 1

In [3]:
class Data(torch.utils.data.Dataset):
    def __init__(self, path, files, vocabulary, input_dim):
        super().__init__()
        self.files = files
        self.vocabulary, self.input_dim = vocabulary, input_dim
        self.word_tensor, self.word_data, self.label_index, self.label = [], [], [], []

        for i in range(len(self.files)):
            with open(os.path.join(path, self.files[i]), 'r', encoding= 'utf-8') as f:
                for word in f.readlines():
                    self.word_tensor.append(self.onehotencoding(word.strip()))
                    self.label_index.append(torch.tensor(i))
                    self.word_data.append(word.strip())
                    self.label.append(self.files[i])

    def onehotencoding(self, word):
        ret = torch.zeros((len(word), 1, self.input_dim))
        for i in range(len(word)):
            if word[i] in self.vocabulary:
                ret[i, 0, self.vocabulary.index(word[i])] = 1
            else:
                ret[i, 0, -1] = 1
        return ret.reshape((1, len(word), -1))
    
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, index):
        return self.word_tensor[index], self.word_data[index], self.label_index[index], self.label[index]
    

In [4]:
alldata = Data('./names/', files, vocabulary, input_dim)
# note
train, test = torch.utils.data.random_split(alldata, [0.9, 0.1])

In [5]:
class Network(torch.nn.Module):
    def __init__(self, input_dim, out_features):
        super().__init__()
        self.rnn = torch.nn.RNN(input_size= input_dim, hidden_size= 128, num_layers= 1, batch_first= True)
        self.fc1 = torch.nn.Linear(in_features= 128, out_features= out_features)
    def forward(self, x):
        y, h = self.rnn(x)
        y = y[:, -1, :]
        return self.fc1(y)

In [6]:
model = Network(input_dim= input_dim, out_features= len(files))
model = model.to('cuda')

In [7]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr= 0.001, weight_decay= 0.0001)

In [8]:
model.train()
for epoch in range(30):
    batches = list(range(len(train)))
    # note
    random.shuffle(batches)
    # note
    batches = np.array_split(batches, len(batches) // 64)
    running_loss = 0.0
    for batch in batches:
        for i in batch:
            optimizer.zero_grad()
            word_tensor, word_data, label_index, label = train[i]
            word_tensor, label_index = word_tensor.to('cuda'), label_index.unsqueeze(0).to('cuda')
            output = model(word_tensor)
            loss = criterion(output, label_index)
            loss.backward()
            running_loss += loss.item()
            optimizer.step()
    print(f'epoch - {epoch}, loss = {running_loss}')
            

epoch - 0, loss = 20936.30409527826
epoch - 1, loss = 17121.894589433083
epoch - 2, loss = 15585.996399236436
epoch - 3, loss = 14947.810268568115
epoch - 4, loss = 14443.727006836383
epoch - 5, loss = 14458.36654445478
epoch - 6, loss = 14252.085662359408
epoch - 7, loss = 14193.348772683756
epoch - 8, loss = 14351.90128091197
epoch - 9, loss = 14551.83564152757
epoch - 10, loss = 14566.513487397206
epoch - 11, loss = 14684.958279825249
epoch - 12, loss = 15087.16189686989
epoch - 13, loss = 15087.881853227478
epoch - 14, loss = 15186.075643055905
epoch - 15, loss = 15184.144665243104
epoch - 16, loss = 15637.481440077849
epoch - 17, loss = 15384.58089388015
epoch - 18, loss = 15450.231729058905
epoch - 19, loss = 15547.618483333412
epoch - 20, loss = 15480.739470179113
epoch - 21, loss = 15749.851882539904
epoch - 22, loss = 15916.2795302523
epoch - 23, loss = 15966.253389358142
epoch - 24, loss = 15615.821221468956
epoch - 25, loss = 15893.593759628537
epoch - 26, loss = 15888.18294

In [9]:
model.eval()
all_pred, all_label = [], []
with torch.no_grad():
    batches = list(range(len(test)))
    random.shuffle(batches)
    batches = np.array_split(batches, len(batches) // 64)
    for batch in batches:
        for i in batch:
            word_tensor, word_data, label_index, label = test[i]
            word_tensor = word_tensor.to('cuda')
            output = model(word_tensor)
            val, index = torch.max(output, dim = 1)
            all_pred.append(index.to('cpu').numpy())
            all_label.append(label_index)


In [10]:
from sklearn.metrics import accuracy_score
print(accuracy_score(all_pred, all_label))

0.7249626307922272


In [11]:
word = 'Sharma'
onehotword = torch.zeros((len(word), 1, input_dim))
for i in range(len(word)):
    if word[i] in vocabulary:
        onehotword[i, 0, vocabulary.index(word[i])] = 1.0
    else:
        onehotword[i, 0, -1] = 1.0
onehotword = onehotword.reshape((1, len(word), input_dim))
model.eval()
onehotword = onehotword.to('cuda')
output = model(onehotword)
val, ind = torch.max(output, dim = 1)
answer = files[ind]
print(answer)

Arabic.txt
