In [81]:
import torch
import os
import numpy as np
import random

In [82]:
path = './names'
files = [file for file in os.listdir(path)]
vocabulary = set()
for file in files:
    with open(os.path.join(path, file), 'r', encoding= 'utf-8') as f:
        for word in f.readlines():
            for char in list(word.strip()):
                vocabulary.add(char)
        
vocabulary = list(vocabulary)
input_dim = len(vocabulary) + 1

In [85]:
class Data(torch.utils.data.Dataset):
    def __init__(self, path, files, vocabulary, input_dim):
        super().__init__()
        self.files = files
        self.vocabulary, self.input_dim = vocabulary, input_dim
        self.word_tensor, self.word_data, self.label_index, self.label = [], [], [], []

        for i in range(len(self.files)):
            with open(os.path.join(path, self.files[i]), 'r', encoding= 'utf-8') as f:
                for word in f.readlines():
                    self.word_tensor.append(self.onehotencoding(word.strip()))
                    self.label_index.append(torch.tensor(i))
                    self.word_data.append(word.strip())
                    self.label.append(self.files[i])

    def onehotencoding(self, word):
        ret = torch.zeros((len(word), 1, self.input_dim))
        for i in range(len(word)):
            if word[i] in self.vocabulary:
                ret[i, 0, self.vocabulary.index(word[i])] = 1
            else:
                ret[i, 0, -1] = 1
        return ret.reshape((1, len(word), -1))
    
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, index):
        return self.word_tensor[index], self.word_data[index], self.label_index[index], self.label[index]
    

In [86]:
alldata = Data('./names/', files, vocabulary, input_dim)
train, test = torch.utils.data.random_split(alldata, [0.9, 0.1])

In [87]:
class Network(torch.nn.Module):
    def __init__(self, input_dim, out_features):
        super().__init__()
        self.rnn = torch.nn.RNN(input_size= input_dim, hidden_size= 128, num_layers= 1, batch_first= True)
        self.fc1 = torch.nn.Linear(in_features= 128, out_features= out_features)
    def forward(self, x):
        y, h = self.rnn(x)
        y = y[:, -1, :]
        return self.fc1(y)

In [88]:
model = Network(input_dim= input_dim, out_features= len(files))
model = model.to('cuda')

In [89]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr= 0.001, weight_decay= 0.0001)

In [90]:
model.train()
for epoch in range(30):
    batches = list(range(len(train)))
    random.shuffle(batches)
    batches = np.array_split(batches, len(batches) // 64)
    running_loss = 0.0
    for batch in batches:
        for i in batch:
            optimizer.zero_grad()
            word_tensor, word_data, label_index, label = train[i]
            word_tensor, label_index = word_tensor.to('cuda'), label_index.unsqueeze(0).to('cuda')
            output = model(word_tensor)
            loss = criterion(output, label_index)
            loss.backward()
            running_loss += loss.item()
            optimizer.step()
    print(f'epoch - {epoch}, loss = {running_loss}')
            

epoch - 0, loss = 20651.485175973125
epoch - 1, loss = 17071.902968951792
epoch - 2, loss = 15635.974860505376
epoch - 3, loss = 14894.697162134587
epoch - 4, loss = 14526.435984684882
epoch - 5, loss = 14603.666376518519
epoch - 6, loss = 14480.88834379756
epoch - 7, loss = 14492.082215148665
epoch - 8, loss = 14646.508340227057
epoch - 9, loss = 14892.29424189874
epoch - 10, loss = 14667.916275638116
epoch - 11, loss = 14938.64810840923
epoch - 12, loss = 14806.568407434539
epoch - 13, loss = 15130.981915629694
epoch - 14, loss = 15227.648713134313
epoch - 15, loss = 15469.241608211883
epoch - 16, loss = 15605.151804631336
epoch - 17, loss = 15892.293474008373
epoch - 18, loss = 16196.0603591256
epoch - 19, loss = 16371.782870057286
epoch - 20, loss = 16608.052628370773
epoch - 21, loss = 16489.02255625074
epoch - 22, loss = 16530.51069921702
epoch - 23, loss = 16831.283150280637
epoch - 24, loss = 16910.571762392086
epoch - 25, loss = 16940.035541308163
epoch - 26, loss = 17103.9006

In [94]:
model.eval()
all_pred, all_label = [], []
with torch.no_grad():
    batches = list(range(len(test)))
    random.shuffle(batches)
    batches = np.array_split(batches, len(batches) // 64)
    for batch in batches:
        for i in batch:
            word_tensor, word_data, label_index, label = test[i]
            word_tensor = word_tensor.to('cuda')
            output = model(word_tensor)
            val, index = torch.max(output, dim = 1)
            all_pred.append(index.to('cpu').numpy())
            all_label.append(label_index)


In [95]:
from sklearn.metrics import accuracy_score
print(accuracy_score(all_pred, all_label))

0.7140009965122073
