In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.datasets import  fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torchvision import transforms
from tqdm import tqdm

In [3]:
!wget https://download.pytorch.org/tutorial/data.zip

--2024-02-22 11:22:22--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 108.138.26.43, 108.138.26.24, 108.138.26.16, ...
Connecting to download.pytorch.org (download.pytorch.org)|108.138.26.43|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.1’


2024-02-22 11:22:24 (2.14 MB/s) - ‘data.zip.1’ saved [2882130/2882130]



In [None]:
!unzip data.zip 

In [2]:
BASE_DIR = 'data/names/'

In [3]:
os.listdir(BASE_DIR)

['Scottish.txt',
 'Vietnamese.txt',
 'Korean.txt',
 'Irish.txt',
 'French.txt',
 'Italian.txt',
 'Czech.txt',
 'Chinese.txt',
 'Dutch.txt',
 'Portuguese.txt',
 'Arabic.txt',
 'Greek.txt',
 'English.txt',
 'Russian.txt',
 'Spanish.txt',
 'Japanese.txt',
 'German.txt',
 'Polish.txt']

In [6]:
from string import ascii_letters

all_letters = ascii_letters + " .,;'-"
n_letters = len(all_letters)
n_letters
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'-"

In [8]:
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [9]:
all_category = {}

for filename in os.listdir(BASE_DIR):
    lang_name = filename.split(".")[0]
    with open(os.path.join(BASE_DIR, filename), "r") as f:
        all_category[lang_name] = [unicodeToAscii(name) for name in f.readlines()]

In [18]:
all_category["Italian"][:5]

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

In [13]:
def findIndex(char):
    return all_letters.find(char)

def letter2vec(char) -> torch.Tensor:
    result = torch.zeros(n_letters)
    result[findIndex(char)] = 1
    return result

def word2vec(word: str) -> torch.Tensor:
    # [seqLength, letterVecLength]
    result = torch.zeros(len(word), n_letters)
    # encoding for every character in the word!
    for i, char in enumerate(word):
        result[i] = letter2vec(char)
    # batch_size = 1
    return result.unsqueeze(1)

word2vec('Shalgham').size()

torch.Size([8, 1, 58])

In [34]:
rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [14]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = F.tanh(self.i2h(combined))
        output = self.h2o(hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
n_categories = len(all_category)
rnn = RNN(n_letters, n_hidden, n_categories)

In [53]:
input = letter2vec('A').unsqueeze(0)
hidden = rnn.initHidden()

output, next_hidden = rnn(input, hidden)
output.size()

torch.Size([1, 18])

In [16]:
input = word2vec('Albert')
hidden = rnn.initHidden()

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-0.0600,  0.0108,  0.0827,  0.0740,  0.0695,  0.0083,  0.0758, -0.1013,
         -0.0188, -0.0473, -0.0691, -0.0545, -0.0612, -0.0666, -0.0127, -0.0908,
          0.0555,  0.0888]], grad_fn=<AddmmBackward0>)


In [22]:
labels_ = list(all_category.keys())
labels_

['Scottish',
 'Vietnamese',
 'Korean',
 'Irish',
 'French',
 'Italian',
 'Czech',
 'Chinese',
 'Dutch',
 'Portuguese',
 'Arabic',
 'Greek',
 'English',
 'Russian',
 'Spanish',
 'Japanese',
 'German',
 'Polish']

In [21]:
dataset = []
for label in all_category:
    for word in all_category.get(label):
        dataset.append((word2vec(word), torch.tensor(labels_.index(label)).long().unsqueeze(0)))


In [23]:
for data, label in dataset:
    print(data.size())
    print(label)
    break

torch.Size([5, 1, 58])
tensor([0])


In [24]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, shuffle=True, random_state=42)

In [26]:
for data, label in train_data:
    print(data.size())
    print(label)
    break

torch.Size([9, 1, 58])
tensor([13])


In [27]:
print("train_size", len(train_data))
print("test_size", len(test_data))

train_size 16059
test_size 4015


In [32]:
learning_rate = 0.001

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

In [30]:
data, label = next(iter(train_data))
for elm in data:
    print(elm.size())

torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])
torch.Size([1, 58])


In [36]:
from random import shuffle
# train_loop

epoch_num = 3

rnn.train()
for epoch in range(epoch_num):
    shuffle(train_data)
    for i, (data, label) in enumerate(train_data):
        hidden = rnn.initHidden()
        for charVec in data:
            output, hidden = rnn(charVec, hidden)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 3000 == 0:
            print(f"Epoch{epoch + 1}/{epoch_num} | Step: {i}/{len(train_data)} | loss: {loss.item()}")

Epoch1/3 | Step: 0/16059 | loss: 0.1533963829278946
Epoch1/3 | Step: 3000/16059 | loss: 0.08053922653198242
Epoch1/3 | Step: 6000/16059 | loss: 2.7965054512023926
Epoch1/3 | Step: 9000/16059 | loss: 1.626822590827942
Epoch1/3 | Step: 12000/16059 | loss: 0.14609412848949432
Epoch1/3 | Step: 15000/16059 | loss: 0.01434278767555952
Epoch2/3 | Step: 0/16059 | loss: 0.6373049020767212
Epoch2/3 | Step: 3000/16059 | loss: 0.10608932375907898
Epoch2/3 | Step: 6000/16059 | loss: 2.552569627761841
Epoch2/3 | Step: 9000/16059 | loss: 2.259949207305908
Epoch2/3 | Step: 12000/16059 | loss: 0.1402139812707901
Epoch2/3 | Step: 15000/16059 | loss: 0.004119719844311476
Epoch3/3 | Step: 0/16059 | loss: 0.25335437059402466
Epoch3/3 | Step: 3000/16059 | loss: 0.028626656159758568
Epoch3/3 | Step: 6000/16059 | loss: 2.337625026702881
Epoch3/3 | Step: 9000/16059 | loss: 0.9881576895713806
Epoch3/3 | Step: 12000/16059 | loss: 0.026338184252381325
Epoch3/3 | Step: 15000/16059 | loss: 0.0037486536893993616


In [41]:
output

tensor([[-2.2384, -2.8640, -1.1613, -2.6243, -2.8200, -0.5234, -0.3238, -0.9883,
         -1.6166, -3.7954, -7.4164, -1.2244, -0.1587,  5.5800, -1.0634,  2.1558,
         -0.7491,  1.2025]], grad_fn=<AddmmBackward0>)

In [48]:
# test loop

rnn.eval()
correct = 0
output
for data, label in test_data:
    hidden = rnn.initHidden()
    for charVec in data:
        output, hidden = rnn(charVec, hidden)
    predict = output.argmax(dim=1)
    correct += (predict == label).item()

print(f"Accuracy: {(correct / len((test_data))*100):.4f}%")

Accuracy: 74.5205%


In [49]:
def predict_lan(word, model):
    model.eval()
    hidden = model.initHidden()
    for charVec in word2vec(word):
        output, hidden = model(charVec, hidden)
    return labels_[output.argmax(dim=1).item()]
    

In [58]:
predict_lan("Brad", rnn)

'English'

In [67]:
class RNNTorch(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        

    def forward(self, x):
        h = self.initHidden()
        x, _ = self.rnn(x, h)
        output = self.fc(x[-1])
        return output

    def initHidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size)

In [70]:
n_hidden = 128
n_categories = len(all_category)
model = RNNTorch(n_letters, n_hidden, n_categories)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr= 0.001)

In [68]:
from random import shuffle

def train(model,  epoch_num = 3):
    model.train()
    for epoch in range(epoch_num):
        shuffle(train_data)
        for i, (data, label) in enumerate(train_data):
            output = model(data)
            loss = criterion(output, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 3000 == 0:
                print(f"Epoch{epoch + 1}/{epoch_num} | Step: {i}/{len(train_data)} | loss: {loss.item()}")

In [71]:
train(model)

Epoch1/3 | Step: 0/16059 | loss: 2.8471620082855225
Epoch1/3 | Step: 3000/16059 | loss: 1.1972882747650146
Epoch1/3 | Step: 6000/16059 | loss: 1.2095849514007568
Epoch1/3 | Step: 9000/16059 | loss: 0.7750042676925659
Epoch1/3 | Step: 12000/16059 | loss: 0.4703407883644104
Epoch1/3 | Step: 15000/16059 | loss: 0.3705781102180481
Epoch2/3 | Step: 0/16059 | loss: 0.026021268218755722
Epoch2/3 | Step: 3000/16059 | loss: 2.0122952461242676
Epoch2/3 | Step: 6000/16059 | loss: 1.5778460502624512
Epoch2/3 | Step: 9000/16059 | loss: 0.043493784964084625
Epoch2/3 | Step: 12000/16059 | loss: 2.667238712310791
Epoch2/3 | Step: 15000/16059 | loss: 0.6212583780288696
Epoch3/3 | Step: 0/16059 | loss: 0.0011263700434938073
Epoch3/3 | Step: 3000/16059 | loss: 0.28189361095428467
Epoch3/3 | Step: 6000/16059 | loss: 0.0012041230220347643
Epoch3/3 | Step: 9000/16059 | loss: 0.13934014737606049
Epoch3/3 | Step: 12000/16059 | loss: 0.06082049757242203
Epoch3/3 | Step: 15000/16059 | loss: 2.0688114166259766


In [73]:
def predict_lan_torchrnn(word, model):
    output = model(word2vec(word))
    return labels_[output.argmax(dim=1).item()]

In [75]:
def test(model):
    model.eval()
    correct = 0
    for data, label in test_data:
        output= model(data)
        predict = output.argmax(dim=1)
        correct += (predict == label).item()

    print(f"Accuracy: {(correct / len((test_data))*100):.4f}%")

In [76]:
test(model)

Accuracy: 74.4956%


In [78]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        

    def forward(self, x):
        h = self.initHidden()
        x, _ = self.gru(x, h)
        output = self.fc(x[-1])
        return output

    def initHidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size)

In [79]:
n_hidden = 128
n_categories = len(all_category)
model = GRU(n_letters, n_hidden, n_categories)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr= 0.001)

In [80]:
train(model)

Epoch1/3 | Step: 0/16059 | loss: 2.803572654724121
Epoch1/3 | Step: 3000/16059 | loss: 0.9476546049118042
Epoch1/3 | Step: 6000/16059 | loss: 0.9676955342292786
Epoch1/3 | Step: 9000/16059 | loss: 0.02496783807873726
Epoch1/3 | Step: 12000/16059 | loss: 0.005485719535499811
Epoch1/3 | Step: 15000/16059 | loss: 0.0013106813421472907
Epoch2/3 | Step: 0/16059 | loss: 0.5188653469085693
Epoch2/3 | Step: 3000/16059 | loss: 0.00022635281493421644
Epoch2/3 | Step: 6000/16059 | loss: 0.0007683662115596235
Epoch2/3 | Step: 9000/16059 | loss: 0.6819219589233398
Epoch2/3 | Step: 12000/16059 | loss: 0.010615547187626362
Epoch2/3 | Step: 15000/16059 | loss: 2.9990482330322266
Epoch3/3 | Step: 0/16059 | loss: 1.4543427823809907e-05
Epoch3/3 | Step: 3000/16059 | loss: 0.01032357756048441
Epoch3/3 | Step: 6000/16059 | loss: 0.05435812473297119
Epoch3/3 | Step: 9000/16059 | loss: 0.9762640595436096
Epoch3/3 | Step: 12000/16059 | loss: 2.2053474822314456e-05
Epoch3/3 | Step: 15000/16059 | loss: 0.051877

In [81]:
test(model)

Accuracy: 82.4408%


In [82]:
n_hidden = 128
n_categories = len(all_category)
model = GRU(n_letters, n_hidden, n_categories, num_layers=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr= 0.001)

In [83]:
train(model)

Epoch1/3 | Step: 0/16059 | loss: 2.8702852725982666
Epoch1/3 | Step: 3000/16059 | loss: 0.028216827660799026
Epoch1/3 | Step: 6000/16059 | loss: 6.147946357727051
Epoch1/3 | Step: 9000/16059 | loss: 3.233870267868042
Epoch1/3 | Step: 12000/16059 | loss: 0.4650093615055084
Epoch1/3 | Step: 15000/16059 | loss: 0.0010240792762488127
Epoch2/3 | Step: 0/16059 | loss: 0.00775008462369442
Epoch2/3 | Step: 3000/16059 | loss: 3.5142769813537598
Epoch2/3 | Step: 6000/16059 | loss: 0.0770866870880127
Epoch2/3 | Step: 9000/16059 | loss: 0.007830873131752014
Epoch2/3 | Step: 12000/16059 | loss: 0.41017746925354004
Epoch2/3 | Step: 15000/16059 | loss: 0.008607430383563042
Epoch3/3 | Step: 0/16059 | loss: 3.0312552452087402
Epoch3/3 | Step: 3000/16059 | loss: 4.695867538452148
Epoch3/3 | Step: 6000/16059 | loss: 0.09136418253183365
Epoch3/3 | Step: 9000/16059 | loss: 0.001149708521552384
Epoch3/3 | Step: 12000/16059 | loss: 0.03540949895977974
Epoch3/3 | Step: 15000/16059 | loss: 2.3978610038757324


In [84]:
test(model)

Accuracy: 82.6899%


In [89]:
predict_lan_torchrnn("Yoshua", model)

'Japanese'