In [6]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.datasets import  fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchvision import transforms
from tqdm import tqdm

In [3]:
!wget https://download.pytorch.org/tutorial/data.zip

--2024-02-22 11:22:22--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 108.138.26.43, 108.138.26.24, 108.138.26.16, ...
Connecting to download.pytorch.org (download.pytorch.org)|108.138.26.43|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.1’


2024-02-22 11:22:24 (2.14 MB/s) - ‘data.zip.1’ saved [2882130/2882130]



In [None]:
!unzip data.zip 

In [8]:
BASE_DIR = 'data/names/'

In [9]:
os.listdir(BASE_DIR)

['Scottish.txt',
 'Vietnamese.txt',
 'Korean.txt',
 'Irish.txt',
 'French.txt',
 'Italian.txt',
 'Czech.txt',
 'Chinese.txt',
 'Dutch.txt',
 'Portuguese.txt',
 'Arabic.txt',
 'Greek.txt',
 'English.txt',
 'Russian.txt',
 'Spanish.txt',
 'Japanese.txt',
 'German.txt',
 'Polish.txt']

In [11]:
from string import ascii_letters

all_letters = ascii_letters + " .,;'"
n_letters = len(all_letters)
n_letters

57

In [14]:
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [15]:
all_category = {}

for filename in os.listdir(BASE_DIR):
    lang_name = filename.split(".")[0]
    with open(os.path.join(BASE_DIR, filename), "r") as f:
        all_category[lang_name] = [unicodeToAscii(name) for name in f.readlines()]

In [18]:
all_category["Italian"][:5]

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

In [33]:
def findIndex(char):
    return all_letters.find(char)

def letter2vec(char) -> torch.Tensor:
    result = torch.zeros(n_letters)
    result[findIndex(char)] = 1
    return result

def word2vec(word: str) -> torch.Tensor:
    result = torch.zeros(len(word), n_letters)
    for i, char in enumerate(word):
        result[i] = letter2vec(char)
    return result.unsqueeze(1)

word2vec('Abandonato').size()


torch.Size([10, 1, 57])

In [34]:
rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [47]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
n_categories = len(all_category)
rnn = RNN(n_letters, n_hidden, n_categories)

In [53]:
input = letter2vec('A').unsqueeze(0)
hidden = rnn.initHidden()

output, next_hidden = rnn(input, hidden)
output.size()

torch.Size([1, 18])

In [54]:
input = word2vec('Albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-2.9288, -2.9927, -2.9489, -2.8924, -2.8164, -2.8930, -2.8824, -2.7824,
         -2.9522, -2.7228, -2.9295, -2.8716, -2.9190, -2.9778, -2.8913, -2.8750,
         -2.9026, -2.8863]], grad_fn=<LogSoftmaxBackward0>)


In [58]:
labels_ = list(all_category.keys())
# labels_

In [65]:
dataset = []
for label in all_category:
    for word in all_category.get(label):
        dataset.append((word2vec(word), torch.tensor(labels_.index(label)).long().unsqueeze(0)))


In [71]:
for data, label in dataset:
    print(data.size())
    print(label)
    break

torch.Size([5, 1, 57])
tensor([10])


In [68]:
from random import shuffle

shuffle(dataset)

In [None]:
out, hidden = rnn(time_step, hidden)

In [None]:
# train_loop

epoch_num = 3

for epoch in range(epoch_num):
    for data, label in dataset:
        pass