In [1]:
import torch
# Prefer the GPU when CUDA reports itself available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

  cpu = _conversion_method_template(device=torch.device("cpu"))


'cpu'

### Data preparation

#### 1. From unicode to ASCII

In [2]:
import string
import unicodedata

# "_" represents an out-of-vocabulary character
allowed_characters = f"{string.ascii_letters} .,;'_"
n_letters = len(allowed_characters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the characters in ``allowed_characters``.

    The input is NFD-normalized, which splits accented letters into a base
    letter followed by combining marks. Combining marks (Unicode category
    ``Mn``) and every character not found in ``allowed_characters`` are
    dropped; whatever remains is joined back into a string.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent left over from the decomposition
        if ch not in allowed_characters:
            continue  # outside the vocabulary: removed, not replaced
        kept.append(ch)
    return ''.join(kept)

Example usage:

In [3]:
# Show unicodeToAscii on a name that contains accented letters.
print(f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")

converting 'Ślusàrski' to Slusarski


#### 2. Names to tensors

In [4]:
# Find a letter's index in allowed_characters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of ``letter`` within ``allowed_characters``.

    A letter that is not found maps to the index of "_" instead.
    """
    position = allowed_characters.find(letter)
    if position == -1:
        # Not in the vocabulary: fall back to the "_" slot.
        return allowed_characters.find("_")
    return position

# One-hot encode a line, one letter per row:
# the result has shape <line_length x 1 x n_letters>
def lineToTensor(line):
    """Build a ``(len(line), 1, n_letters)`` tensor of one-hot letter vectors.

    Row ``i`` is all zeros except for a 1 at ``letterToIndex(line[i])``.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, character in enumerate(line):
        one_hot[position, 0, letterToIndex(character)] = 1
    return one_hot

In [8]:
# Quick look at the encoding helpers: one letter's index, its one-hot tensor,
# and the tensor shape produced for a whole name.
print("a => index:  ", letterToIndex('a'))
print("a => tensor: ", lineToTensor('a'))
print("ouissal => tensor size: ", lineToTensor('ouissal').shape)  # (line_length, 1, n_letters)

a => index:   0
a => tensor:  tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]])
ouissal => tensor size:  torch.Size([7, 1, 58])


#### 3. Dataset creation

Each `Dataset` subclass needs to implement three methods: `__init__`, `__len__`, and `__getitem__`.

In [9]:
from io import open
import glob
import os
import time

import torch
from torch.utils.data import Dataset

In [None]:
class NamesDataset(Dataset):
    """Dataset of names, labelled by the text file each name was read from.

    Every ``*.txt`` file directly inside ``data_dir`` is treated as one class:
    the file name without its extension is the label, and each non-blank line
    of the file is one sample. Name tensors (built with ``lineToTensor``) and
    label-index tensors are computed once in ``__init__`` and cached.

    Attributes:
        data_dir: the directory that was scanned.
        load_time: ``time.struct_time`` (local time) captured at construction.
        data: the name strings, one per sample.
        data_tensors: ``lineToTensor(name)`` for each entry of ``data``.
        labels: the label string of each sample.
        labels_tensors: for each sample, a 1-element ``torch.long`` tensor
            holding the index of its label in ``labels_uniq``.
        labels_uniq: the distinct labels, sorted, so that a label's index is
            the same on every run.
    """

    def __init__(self, data_dir):
        """Read every ``*.txt`` file in ``data_dir`` and cache all tensors."""
        self.data_dir = data_dir
        # Call localtime() so a timestamp is stored rather than the function.
        self.load_time = time.localtime()
        labels_set = set()  # set of all classes

        self.data = []
        self.data_tensors = []
        self.labels = []  # label of each sample in the dataset
        self.labels_tensors = []

        # Read all the ``.txt`` files in the specified directory. glob makes
        # no ordering promise, so sort to keep the sample order reproducible.
        text_files = sorted(glob.glob(os.path.join(data_dir, '*.txt')))
        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label)
            # The context manager closes the file even if reading fails.
            with open(filename, encoding='utf-8') as handle:
                for raw_line in handle:
                    name = raw_line.strip()
                    if not name:
                        # A blank line would become a zero-length name tensor.
                        continue
                    self.data.append(name)
                    self.data_tensors.append(lineToTensor(name))
                    self.labels.append(label)

        # Cache the tensor representation of the labels. Sorting gives a
        # stable label -> index mapping (set iteration order is not stable
        # across interpreter runs), and the dict avoids a list scan per sample.
        self.labels_uniq = sorted(labels_set)
        label_to_index = {label: idx for idx, label in enumerate(self.labels_uniq)}
        for label in self.labels:
            # dtype must be long for loss functions such as CrossEntropyLoss
            self.labels_tensors.append(
                torch.tensor([label_to_index[label]], dtype=torch.long)
            )

    def __len__(self):
        """Return the number of samples (names) loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(label_tensor, data_tensor, data_label, data_item)`` for ``idx``."""
        data_item = self.data[idx]
        data_label = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]

        return label_tensor, data_tensor, data_label, data_item