📘 <a href="https://download.pytorch.org/tutorial/data.zip" target="_blank">Download data</a>

In [46]:
import zipfile

# Location of the downloaded archive (expected in the working directory)
zip_path = "data.zip"

# Unpack every member of the archive into the local 'data' directory
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path="data")

In [47]:
import string
import unicodedata

# Model alphabet: ASCII letters, a few punctuation marks, and a trailing "_" that
# stands in for any out-of-vocabulary character (one the model does not handle).
allowed_chars = "".join([string.ascii_letters, " .,;'", "_"])
n_letters = len(allowed_chars)

def unicodeToAscii(s):
    """Reduce a Unicode string to the characters found in `allowed_chars`.

    The input is NFD-normalized so that accented letters decompose into a base
    letter followed by combining marks. Combining marks (Unicode category 'Mn')
    are dropped, and so is every remaining character not in `allowed_chars`.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent: discard, the base letter is kept separately
        if ch not in allowed_chars:
            continue  # outside the model alphabet: discard
        kept.append(ch)
    return ''.join(kept)



In [48]:
# Sanity check: accents are stripped while the base letters are kept.
# (The name literal had been corrupted by a wrong-encoding round trip; restored here.)
print(f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")

converting 'Ślusàrski' to Slusarski


In [49]:
import torch
def letter_to_index(letter:str):
  """Return the position of `letter` inside `allowed_chars`.

  Anything that is not found maps to the position of "_", the placeholder
  reserved for out-of-vocabulary characters.
  """
  position = allowed_chars.find(letter)
  if position == -1:
    # Not part of the alphabet: fall back to the placeholder slot
    position = allowed_chars.find("_")
  return position

def name_to_tensor(name:str):
  """One-hot encode `name` as a tensor of shape (len(name), 1, n_letters).

  Entry [i, 0, k] is 1 where k = letter_to_index(name[i]); every other entry is 0.
  The middle dimension always has size 1.
  """
  one_hot = torch.zeros(len(name), 1, n_letters)
  for position in range(len(name)):
    one_hot[position, 0, letter_to_index(name[position])] = 1
  return one_hot

In [50]:
# Show the one-hot encoding of a sample name: one (1, n_letters) slice per character
print(name_to_tensor("Albert"))

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0

In [51]:
from torch.utils.data import Dataset
import os
import glob

# Custom Dataset class for loading names from per-label text files
class NamesDataset(Dataset):
  """Names read from ``<path>/*.txt``; each file's base name is the label of its lines.

  Attributes:
    path: folder that was scanned for .txt files.
    data: the names as strings.
    data_tensors: `name_to_tensor(name)` for each name, in the same order as `data`.
    labels: the label string (e.g. "English") of each name.
    label_tensors: for each name, a tensor of shape (1,) and dtype long holding the
      index of its label in `label_uniq`.
    label_uniq: alphabetically sorted list of the unique labels.
  """
  def __init__(self, path):
    self.path = path # Path to the folder containing .txt files
    labels_set = set() # Used to collect all unique labels
    self.data = [] # This will store all the names
    self.data_tensors = [] # This will store the tensor versions of the names
    self.labels = [] # This will store the label (e.g., "English") for each name
    self.label_tensors = [] # This will store the tensor versions of the labels

    # glob returns matches in arbitrary order; sort so the sample order is reproducible
    text_files = sorted(glob.glob(os.path.join(path, '*.txt')))

    for filename in text_files:
      basename = os.path.basename(filename) # e.g., "English.txt"
      label = os.path.splitext(basename)[0] # e.g., "English"
      labels_set.add(label) # Add label to the set of all unique labels
      with open(filename, encoding='utf-8') as file:
        names = file.read().strip().split("\n") # Split file by new lines
      for name in names:
        if not name:
          # A blank line (or an empty file) would yield a zero-length tensor,
          # which cannot be fed through the network
          continue
        self.data.append(name)
        self.labels.append(label)
        self.data_tensors.append(name_to_tensor(name))

    # Sorted list of unique labels (e.g., ['Arabic', 'Chinese', ...]). Sorting matters:
    # the iteration order of a set of strings can change between interpreter runs,
    # which would silently change which index means which label.
    self.label_uniq = sorted(labels_set)

    # Convert each label to a tensor (as an index into label_uniq)
    label_to_index = {label: index for index, label in enumerate(self.label_uniq)}
    for label in self.labels:
      self.label_tensors.append(torch.tensor([label_to_index[label]], dtype=torch.long))

  # Return total number of data samples (i.e., number of names)
  def __len__(self):
    return len(self.data)

  # Return a specific item: (name_tensor, label_tensor, name_as_string)
  def __getitem__(self, idx):
    data_item = self.data[idx]
    return self.data_tensors[idx], self.label_tensors[idx], data_item

In [52]:
# Build the dataset from the extracted archive. The doubled "data/data" presumably comes
# from the zip containing its own top-level "data/" folder — verify against the extracted tree.
all_data = NamesDataset("data/data/names")

In [53]:
# Hold out 20% of the names for testing. Names have different lengths, so the samples are
# not batched through a default DataLoader; the training loop groups them by hand instead.
# The seeded generator keeps the split identical across kernel restarts.
train_dataset, test_dataset = torch.utils.data.random_split(
    all_data, [0.8, 0.2], generator=torch.Generator().manual_seed(2024)
)

In [69]:
# Peek at one training sample; each item is (name_tensor, label_tensor, name_string)
print(f"Name: {train_dataset[0][-1]}, Tensor Shape: {train_dataset[0][0].shape}")

Name: Toal, Tensor Shape: torch.Size([4, 1, 58])


In [55]:
# Creating the network
import torch.nn as nn
class RNNcell(nn.Module):
  """One recurrent step: new_hidden = tanh(x_t(x) + h_t(previous_hidden))."""

  def __init__(self, input_size, hidden_size):
    super().__init__()

    # Linear map applied to the input of the current time step
    self.x_t = nn.Linear(input_size, hidden_size)

    # Linear map applied to the hidden state carried over from the previous time step
    self.h_t = nn.Linear(hidden_size, hidden_size)

  def forward(self, x, h_x):
    """Compute the next hidden state.

    x:   input for this step; its last dimension must be input_size.
    h_x: previous hidden state; its last dimension must be hidden_size.
    Returns a tensor whose last dimension is hidden_size.
    """
    from_input = self.x_t(x)
    from_hidden = self.h_t(h_x)
    return torch.tanh(from_input + from_hidden)



In [56]:
class RNN(nn.Module):
  """Unrolls a recurrent cell over a sequence, one time step at a time.

  Args:
    input_size: size of the last dimension of each step's input.
    hidden_size: size of the hidden state.
    RNNcell: class (or factory) called as RNNcell(input_size, hidden_size) to build the cell.
  """
  def __init__(self, input_size, hidden_size, RNNcell=RNNcell):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.RNNcell = RNNcell(input_size, hidden_size)

  def forward(self, x):
    """Run the cell over every step of `x`.

    `x` is indexed as x[step], so dim 0 is the time step and dim 1 the batch;
    each x[step] is passed to the cell together with the running hidden state.

    Returns (outputs, last_hidden): `outputs` stacks the hidden state after every
    step along a new leading dimension; `last_hidden` is the state after the final
    step (all zeros when the sequence is empty).
    """
    n_steps, batch_size = x.shape[0], x.shape[1]

    # Zero initial hidden state, one row per batch element. new_zeros() creates it
    # on x's device and with x's dtype; a plain torch.zeros() would always live on
    # the CPU and fail against CUDA inputs.
    h_x = x.new_zeros(batch_size, self.hidden_size)

    if n_steps == 0:
      # torch.stack() rejects an empty list, so return an explicitly empty output
      return x.new_zeros(0, batch_size, self.hidden_size), h_x

    output = []
    for i in range(n_steps):
      # Feed the current step and the previous hidden state into the cell
      h_x = self.RNNcell(x[i], h_x)
      output.append(h_x)

    return torch.stack(output), h_x

In [57]:
class charRNN(nn.Module):
  """Name classifier: an RNN over the input followed by a linear layer on its final hidden state.

  The default sizes are read from `n_letters` and `all_data.label_uniq` when this
  class is defined, so both must already exist at that point.
  """
  def __init__(self, input_size=n_letters, hidden_size=128, output_size=len(all_data.label_uniq) ):
    super().__init__()
    self.rnn = RNN(input_size, hidden_size)
    # Maps the final hidden state to one score per label
    self.h2o = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    # Only the last hidden state is used; the per-step outputs are discarded
    _, final_hidden = self.rnn(x)
    return self.h2o(final_hidden)

In [58]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = charRNN()
# Kept as the last expression of the cell so the model's structure is displayed
model.to(device)

charRNN(
  (rnn): RNN(
    (RNNcell): RNNcell(
      (x_t): Linear(in_features=58, out_features=128, bias=True)
      (h_t): Linear(in_features=128, out_features=128, bias=True)
    )
  )
  (h2o): Linear(in_features=128, out_features=18, bias=True)
)

In [59]:
# Adam over every model parameter; cross-entropy is applied to the model's raw class scores
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [60]:
import random
import numpy as np
# Train the model
epochs = 27
batch_size = 64
n_train = len(train_dataset)
model.train()
for epoch in range(epochs):
  running_loss = 0.0
  # Visit the training samples in a fresh random order every epoch
  order = list(range(n_train))
  random.shuffle(order)
  # Slice the order into chunks of at most batch_size samples. Unlike
  # np.array_split(order, n_train // batch_size), this also works when the
  # dataset holds fewer samples than one batch.
  batches = [order[start:start + batch_size] for start in range(0, n_train, batch_size)]
  for batch in batches:
    # Clear gradients first, so re-running an interrupted cell never reuses stale ones
    optimizer.zero_grad()
    # Names differ in length, so each one goes through the model on its own and
    # the per-sample losses are summed into a single batch loss
    sample_losses = []
    for i in batch:
      data, target, _ = train_dataset[i]
      data, target = data.to(device), target.to(device)
      output = model(data)
      sample_losses.append(criterion(output, target))
    batch_loss = torch.stack(sample_losses).sum()
    batch_loss.backward()
    optimizer.step()
    running_loss += batch_loss.item()
  # Average loss per training sample; max(..., 1) avoids dividing by zero on an empty dataset
  print(f"Epoch {epoch+1}, Loss: {running_loss/max(n_train, 1)}")



Epoch 1, Loss: 1.5168744960134086
Epoch 2, Loss: 1.07276706766815
Epoch 3, Loss: 0.9443094840233828
Epoch 4, Loss: 0.868816487726803
Epoch 5, Loss: 0.8022869074478245
Epoch 6, Loss: 0.7566109913817676
Epoch 7, Loss: 0.7251738077973666
Epoch 8, Loss: 0.6961696346848275
Epoch 9, Loss: 0.668678914565375
Epoch 10, Loss: 0.6420803967923631
Epoch 11, Loss: 0.6185171896909569
Epoch 12, Loss: 0.5968777989094165
Epoch 13, Loss: 0.5741230902903404
Epoch 14, Loss: 0.5545712854615777
Epoch 15, Loss: 0.5383377420798334
Epoch 16, Loss: 0.5194439397505479
Epoch 17, Loss: 0.4994999759075502
Epoch 18, Loss: 0.48689501603246477
Epoch 19, Loss: 0.4668450525362198
Epoch 20, Loss: 0.45308816020248244
Epoch 21, Loss: 0.4383830129877568
Epoch 22, Loss: 0.4217781994440784
Epoch 23, Loss: 0.4076407665333445
Epoch 24, Loss: 0.39048308168223606
Epoch 25, Loss: 0.3748656043079989
Epoch 26, Loss: 0.36341478405972644
Epoch 27, Loss: 0.3547410526133118


In [61]:
def get_topk_label(output, output_labels):
  """Return (label, index) for the highest-scoring entry of the first row of `output`.

  `output` is expected to hold one row of class scores; `output_labels` maps a
  class index to its label.
  """
  best_idx = output.topk(1).indices[0].item()
  return output_labels[best_idx], best_idx

In [62]:
# Determine accuracy on the held-out test split
model.eval()
total = len(test_dataset)
correct = 0
with torch.no_grad():  # inference only: no gradients needed
  for sample_idx in range(total):
    data, target, _ = test_dataset[sample_idx]
    data, target = data.to(device), target.to(device)
    output = model(data)
    label, label_idx = get_topk_label(output, all_data.label_uniq)
    # Count the sample when the predicted index matches the true label index
    correct += int(label_idx == target.item())
print(f"Accuracy: {100*correct/total}")

Accuracy: 81.58943697060289


In [63]:
# Show predictions for a few randomly chosen test names
import random
model.eval()
with torch.no_grad():
  # Never request more samples than the test split holds;
  # random.sample raises ValueError when asked for too many
  n_show = min(10, len(test_dataset))
  indices = random.sample(range(len(test_dataset)), n_show)
  test_sample = [test_dataset[i] for i in indices]
  for data, target, name in test_sample:
    data, target = data.to(device), target.to(device)
    output = model(data)
    label, label_idx = get_topk_label(output, all_data.label_uniq)
    print(f"Name: {name}, Predicted: {label}, Actual: {all_data.label_uniq[target.item()]}")




Name: Alman, Predicted: English, Actual: Russian
Name: Pointer, Predicted: English, Actual: English
Name: Junin, Predicted: Russian, Actual: Russian
Name: Maryanov, Predicted: Russian, Actual: Russian
Name: Tsegoev, Predicted: Russian, Actual: Russian
Name: Judin, Predicted: Russian, Actual: Russian
Name: Kennard, Predicted: English, Actual: English
Name: Ventura, Predicted: Spanish, Actual: English
Name: Abadi, Predicted: Arabic, Actual: Arabic
Name: Schlusser, Predicted: German, Actual: German


In [70]:
# Export this notebook (RNN.ipynb) to Markdown; the converter writes RNN.md
!jupyter nbconvert --to markdown --output=RNN RNN.ipynb

[NbConvertApp] Converting notebook RNN.ipynb to markdown
[NbConvertApp] Writing 10853 bytes to RNN.md
