In [1]:
import regex as re
import random
import numpy as np

from tqdm import tqdm
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

##### Implement text classifier based on RNN. Classify sentences based on the language (use at least 3 languages). You can use data from here (https://www.manythings.org/anki/) to build training set.
1. Create training set: each sentence should be assigned with a label corresponding to its language. Perform necessary text preparation and normalization if needed.

In [2]:
eng_sent_len = 10

In [3]:
def preprocess(sent):
    sent = re.sub(r"([,.!?«»])", r"", sent.lower())
    sent = re.sub(r"(['])", r" ", sent)
    sent = re.sub(r"\(\w*\)", r"", sent)
    sent = re.sub(r"\xa0—", r"", sent)

    return sent

def read_pairs(path, eng_sent_len, num_sent=5000):
    text = open(path, encoding='utf-8').read()
    text = text.strip().split('\n')
    pairs = []
    for pair in text[:num_sent]:
        sent = pair.split('\t')
        pairs.append((preprocess(sent[0]).strip(), preprocess(sent[1]).strip()))
    for sent in pairs:
        if len(sent[0]) >= eng_sent_len:
            pairs.remove(sent)
            continue
        if len(sent[1]) >= eng_sent_len:
            pairs.remove(sent)


    return pairs

def bag_w(vocab):   
    w_id, id_w = {}, {}
    
    w_id['eos'] = 0
    id_w[0] = 'eos'   

    for i, word in enumerate(vocab):
        w_id[word] = i + 1
        id_w[i + 1] = word

    return w_id, id_w

In [4]:
pairs_bel = read_pairs('data/bel.txt', eng_sent_len=5)
print(len(pairs_bel), pairs_bel[:2])

pairs_ukr = read_pairs('data/ukr.txt', eng_sent_len=5)
print(len(pairs_ukr), pairs_ukr[:2])

pairs_rus = read_pairs('data/rus.txt', eng_sent_len=5)
print(len(pairs_rus), pairs_rus[:2])

1941 [('wow', 'нішто сабе'), ('jump', 'скачы')]
2501 [('go', 'йди'), ('hi', 'привіт')]
2502 [('go', 'марш'), ('go', 'иди')]


In [5]:
vocab = set()

for _, bel_sent in pairs_bel:
    for word in bel_sent.split(" "):
        vocab.add(word)

for _, ukr_sent in pairs_ukr:
    for word in ukr_sent.split(" "):
        vocab.add(word)

for _, rus_sent in pairs_rus:
    for word in rus_sent.split(" "):
        vocab.add(word)

w_id, id_w = bag_w(vocab)

print("Vocab size:", len(vocab))

Vocab size: 6324


In [47]:
w_id

{'eos': 0,
 'маё': 1,
 'чи': 2,
 'сподіваюся': 3,
 'шуміце': 4,
 'пальчаткі': 5,
 'ітак': 6,
 'ўваходзіць': 7,
 'пажаніліся': 8,
 'пиво': 9,
 'довірся': 10,
 'землю': 11,
 'помираємо': 12,
 'партнёрства': 13,
 'свабоду': 14,
 'бак': 15,
 'поверніться': 16,
 'отмени': 17,
 'монах': 18,
 'обеспокоен': 19,
 'давяраю': 20,
 'вижила': 21,
 'быстрей': 22,
 'маску': 23,
 'встречались': 24,
 'трансільваніі': 25,
 'прычыну': 26,
 'прыйдзе': 27,
 'валізи': 28,
 'чорным': 29,
 'стріляй': 30,
 'поговоріть': 31,
 'выиграла': 32,
 'японцы': 33,
 'відмовився': 34,
 'рой': 35,
 'сошёл': 36,
 'ўмееш': 37,
 'руж': 38,
 'смачно': 39,
 'мудры': 40,
 'е': 41,
 'будынкам': 42,
 'роки': 43,
 'шахматы': 44,
 'швидкий': 45,
 'гадзіннік': 46,
 'дасканалым': 47,
 'протестувала': 48,
 'сосредоточься': 49,
 'отойдите': 50,
 'вінавачу': 51,
 'іще': 52,
 'первый': 53,
 'казаць': 54,
 'жывяце': 55,
 'нью-ёрку': 56,
 'тэнісіст': 57,
 'відчиняй': 58,
 'достаньте': 59,
 'замкніть': 60,
 'павярнуся': 61,
 'працы': 62,
 '

In [6]:
bel_data, ukr_data, rus_data = [], [], []

for _, bel_sent in pairs_bel[0:1500]:
    bel_data.append(([w_id[word] for word in bel_sent.split(" ")], 0))

for _, ukr_sent in pairs_ukr[0:1500]:
    ukr_data.append(([w_id[word] for word in ukr_sent.split(" ")], 1))

for _, rus_sent in pairs_rus[0:1500]:
    rus_data.append(([w_id[word] for word in rus_sent.split(" ")], 2))

all_data = bel_data + ukr_data + rus_data

In [27]:
bs = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x = np.zeros((len(all_data), eng_sent_len+1), dtype=np.int32)
y = np.zeros((len(all_data), 3), dtype=np.float64)

for index, (sent, label) in enumerate(all_data):
    x[index, :len(sent)] = sent
    y[index, label] = 1

train_data = TensorDataset(torch.LongTensor(x).to(device), torch.FloatTensor(y).to(device))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

##### 2. Build RNN model (you can use pre-built rnn units from pytorch or tensorflow). Your model would have one RNN layer and one linear layer followed by softmax to predict class probability. You can optionally use an embedding layer of pytorch to avoid creating word vectors. 

In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, num_class):
        super().__init__()
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=emb_size) #100
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True) #50
        self.output = nn.Linear(hidden_size, num_class) #3
        #self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # print(x.shape)
        emb = self.embedding(x)
        #print(emb.shape)
        # x = emb.view(len(x), 32, -1)
        # print(x.shape)
        _, hid = self.gru(emb)
        #print(hid.shape)
        # print(hid[0].shape)
        out = self.output(hid)
        
        return out

##### 3. Perform training of the model.

In [9]:
learning_rate = 0.001
epochs = 10

In [10]:
rnn = RNN(input_size=len(vocab), emb_size=100, hidden_size=50, num_class=3)
rnn = rnn.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

loss_list = []

for epoch in range(epochs):
    loss_sum = 0
    for x_tensor, y_tensor in tqdm(train_dataloader):
        optimizer.zero_grad()
        y_pred = rnn(x_tensor)
        loss = loss_fn(y_pred, y_tensor.reshape(y_pred.size()))
        loss_sum += loss.item()
        loss.backward()
        optimizer.step()
    loss_list.append(loss.item())
    print('epoch: {}, loss {}'.format(epoch, loss_sum/len(train_dataloader)))


100%|██████████| 141/141 [00:00<00:00, 228.90it/s]


epoch: 0, loss 34.75879268781513


100%|██████████| 141/141 [00:00<00:00, 288.82it/s]


epoch: 1, loss 32.18474576151963


100%|██████████| 141/141 [00:00<00:00, 293.66it/s]


epoch: 2, loss 30.55444026838803


100%|██████████| 141/141 [00:00<00:00, 296.22it/s]


epoch: 3, loss 29.382725539782367


100%|██████████| 141/141 [00:00<00:00, 298.59it/s]


epoch: 4, loss 28.598963940397223


100%|██████████| 141/141 [00:00<00:00, 288.86it/s]


epoch: 5, loss 27.574151269087555


100%|██████████| 141/141 [00:00<00:00, 291.23it/s]


epoch: 6, loss 27.197062438261426


100%|██████████| 141/141 [00:00<00:00, 275.29it/s]


epoch: 7, loss 26.714005449984935


100%|██████████| 141/141 [00:00<00:00, 286.61it/s]


epoch: 8, loss 26.370326461521447


100%|██████████| 141/141 [00:00<00:00, 286.59it/s]

epoch: 9, loss 26.210150833670976





In [11]:
torch.save(rnn.state_dict(), "rnn_checkpoint.pt")

In [15]:
rnn = RNN(input_size=len(vocab), emb_size=100, hidden_size=50, num_class=3)
rnn = rnn.to(device)
rnn.load_state_dict(torch.load("rnn_checkpoint.pt"))
rnn.eval()

RNN(
  (embedding): Embedding(6324, 100)
  (gru): GRU(100, 50, batch_first=True)
  (output): Linear(in_features=50, out_features=3, bias=True)
)

In [23]:
random.shuffle(all_data)

In [46]:
sent, label = all_data[21]

print(" ".join([id_w[word] for word in sent]))

x = torch.tensor(sent, dtype=torch.long, device=device)
y_pred = rnn(x)

#print(y_pred)
#print(y_pred.argmax().item())

if y_pred.argmax().item() == 0:
    print('Belarusian')

if y_pred.argmax().item() == 1:
    print('Ukrainian')

if y_pred.argmax().item() == 2:
    print('Russian')

я крут
Russian


In [35]:
def predict(sent_my):
    sent_my_w = sent_my.split()
    sent_my_id = [w_id[word] for word in sent_my_w]
    x = torch.tensor(sent, dtype=torch.long, device=device)
    y_pred = rnn(x)

    if y_pred.argmax().item() == 0:
        print('Belarusian')

    if y_pred.argmax().item() == 1:
        print('Ukrainian')

    if y_pred.argmax().item() == 2:
        print('Russian')

In [37]:
sent_my_rus = 'я иду домой' #rus
sent_my_ukr = 'я йду додому' #ukr
sent_my_bel = 'я іду дадому' #bel

predict(sent_my_rus)
predict(sent_my_ukr)
predict(sent_my_bel)

Belarusian
Belarusian
Belarusian
