In [14]:
import subprocess
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
DOWNLOAD_DATA_URL = 'https://www.dropbox.com/s/yy2zqh34dyhv07i/data.txt?dl=1'
FULL_DATA_PATH = 'data.txt'
numbers = "".join(list(map(str, range(10))))
BOS = "<bos>"
EOS = "<eos>"
UNK = "<unk>"
PAD = "<pad>"
signs = "!*?.,-' "
MAX_LENGTH = 100
BATCH_SIZE = 32
HIDDEN_SIZE = 256
EMBEDDING_DIM = 256
CORRECT_ENGLISH = "".join(list(map(chr, range(97, 123)))) + signs + numbers
CORRECT_RUSSIAN = "".join(list(map(chr, range(1072, 1104)))) + signs + numbers
subprocess.check_call(['wget', DOWNLOAD_DATA_URL, '-O', FULL_DATA_PATH])

--2024-10-15 13:05:47--  https://www.dropbox.com/s/yy2zqh34dyhv07i/data.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6021:18::a27d:4112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/mw8tdyetqboqwkn5ma886/data.txt?rlkey=t9fmsizx27ikh0vak0ir265a6&dl=1 [following]
--2024-10-15 13:05:47--  https://www.dropbox.com/scl/fi/mw8tdyetqboqwkn5ma886/data.txt?rlkey=t9fmsizx27ikh0vak0ir265a6&dl=1
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucd146df9380bfb16393f459b05e.dl.dropboxusercontent.com/cd/0/inline/Ccf7JmazjwQEKzZfmmPjLZ5BC9eMlxQj6MrC7H5al36tiUhYUopfDKsJGQTVjCQZ-tF_NNCKlD9ujQvvIrm1ZOKEYswyPEB9HjdWrWp2kpufkiK1ch6jXaBbZi8pwegg0ZY/file?dl=1# [following]
--2024-10-15 13:05:48--  https://ucd146df9380bfb16393f459b05e.dl.dropboxusercontent.com/cd/0/inline/Ccf7JmazjwQ

0

In [3]:
def clean_text(sentence):
    sentence = sentence.lower()
    return "".join([letter for letter in sentence if letter in CORRECT_ENGLISH or letter in CORRECT_RUSSIAN])

In [4]:
eng_texts = []
rus_texts = []
with open(FULL_DATA_PATH, 'r') as f:
    for line in f.readlines():
        eng, rus = [clean_text(sentence) for sentence in line.strip().split("\t")]
        eng_texts.append(eng)
        rus_texts.append(rus)
        
eng_texts = np.array(eng_texts)
rus_texts = np.array(rus_texts)

In [5]:
eng_rus = list(set(CORRECT_RUSSIAN + CORRECT_ENGLISH))
token_to_idx = {token: i for i, token in enumerate(eng_rus + [BOS, EOS, UNK, PAD])}
idx_to_token = {i: token for i, token in enumerate(eng_rus + [BOS, EOS, UNK, PAD])}

In [6]:
X_train, X_test, y_train, y_test = train_test_split(eng_texts, rus_texts, test_size = .1, shuffle=True)

In [7]:
def tokenize(sentence):
    tokenized_sentence = list(sentence)
    return [BOS] + tokenized_sentence + [EOS]

def text_to_idx(sentence):
    sentence = tokenize(sentence)
    list_idx = [token_to_idx.get(letter, token_to_idx["<unk>"]) for letter in sentence]
    return list_idx

def idx_to_text(sentence):
    list_letters = [idx_to_token.get(letter, "<unk>") for letter in sentence]
    return list_letters

In [8]:
class EngRusDataset(Dataset):
    
    def __init__(self, eng, rus):
        super().__init__()
        self.eng = eng
        self.rus = rus
        
    def __len__(self):
        return len(self.eng)
    
    def __getitem__(self, idx):
        eng = text_to_idx(self.eng[idx])
        rus = text_to_idx(self.rus[idx])
        
        if len(eng) > 100:
            eng = eng[:100]
        elif len(eng) < 100:
            eng += (MAX_LENGTH - min(MAX_LENGTH, len(eng))) * [token_to_idx["<pad>"]] 
        
        if len(rus) > 100:
            rus = rus[:100]
        elif len(rus) < 100:
            rus += (MAX_LENGTH - min(MAX_LENGTH, len(rus))) * [token_to_idx["<pad>"]]
        
        eng, rus = torch.Tensor(eng).long(), torch.Tensor(rus).long()
        return eng, rus

In [9]:
train_dataset = EngRusDataset(X_train, y_train)
test_dataset = EngRusDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
class EncoderDecoder(nn.Module):
    def __init__(self, length, hidden_size, num_layers, embedding_dim, device, is_bidirectional=False):
        super().__init__()
        
        self.length = length
        self.hidden_size = hidden_size
        self.device = device
        
        self.embed1 = nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim = embedding_dim)
        self.embed2 = nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim = embedding_dim)
        
        self.encoder = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=is_bidirectional, dropout=0.0)
        
        self.decoder = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=is_bidirectional, dropout=0.0)
        
        self.layer1 = nn.Linear(hidden_size, len(token_to_idx))
        
    def forward(self, eng, rus):
        
        length_tensor = (eng != token_to_idx['<pad>']).sum(axis=1).cpu()
        
        eng_embed = self.embed1(eng)
        
        eng_embed = pack_padded_sequence(eng_embed, length_tensor, batch_first=True, enforce_sorted=False)
        
        _, (hn, cn) = self.encoder(eng_embed)
        bos_token = (torch.ones_like(rus)[:,0:1,...] * token_to_idx['<bos>']).to(self.device)
        rus_embed = self.embed2(bos_token)
        output = torch.zeros((rus.shape[0], self.length, len(token_to_idx))).to(self.device)
        index = 0
        while True:    
            logits, (hn, cn) = self.decoder(rus_embed, (hn, cn))
            temp_output = self.layer1(logits)
            argmax_output = torch.argmax(temp_output, dim=-1)
            rus_embed = self.embed2(argmax_output)
            output[:,index:index+1,:] = temp_output
            index += 1
                
            if index == self.length:
                return output

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = EncoderDecoder(length=MAX_LENGTH, hidden_size=HIDDEN_SIZE, num_layers=3, device=device, embedding_dim=EMBEDDING_DIM)
model = model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=token_to_idx['<pad>'])

In [19]:
def train(model, optim, loss_fn, loader, device):
    for index, (x, y) in enumerate(tqdm(loader)):
        model.train()
        x = x.to(device)
        y = y.to(device)
        
        output = model(x,y)
        loss = loss_fn(output.permute(0, 2, 1), y.long())
        
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optim.step()
        
        if index % 500 == 0:
            sentence1 = "".join(idx_to_text(x[0].tolist())).replace("<pad>", "").replace("<bos>", "").replace("<eos>", "")
            sentence2 = "".join(idx_to_text(torch.argmax(output, dim=-1)[0].tolist())).replace("<pad>", "").replace("<bos>", "").replace("<eos>", "")
            sentence3 = "".join(idx_to_text(y[0].tolist())).replace("<pad>", "").replace("<bos>", "").replace("<eos>", "")
            print(loss.item())
            print("Source Sentence")
            print(sentence1)
            print("Model Output")
            print(sentence2)
            print("Target Output")
            print(sentence3)
            

In [None]:
for epoch in range(20):
    print(epoch, end ='\n')
    train(model, optim, loss_fn, train_loader, device)
    

0


  0%|          | 1/1407 [00:00<08:56,  2.62it/s]

2.903298854827881
Source Sentence
guests can prepare homemade meals in the kitchen equipped with a stove, microwave and refrigerator.
Model Output
гости могут оосооооо  ооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооооо
Target Output
гости могут готовить самостоятельно на кухне, оснащенной плитой, микроволновой печью и холодильнико


 36%|███▌      | 501/1407 [01:55<03:27,  4.36it/s]

2.9782614707946777
Source Sentence
wi-fi is free throughout the premises, and free indoor parking is provided on site.
Model Output
н услугам гостей беспаатн                                                      аааааааааааааааааааа
Target Output
пользование wi-fi на территории отеля бесплатно. также вы можете бесплатно воспользоваться парковко


 71%|███████   | 1001/1407 [03:48<01:39,  4.09it/s]

2.934478759765625
Source Sentence
extras include a desk and bed linen.
Model Output
в сие т гое с   остт                 ееееееееееееееееееееееееееееееееееееееееееееееееееееееееее
Target Output
кроме того, в номерах установлен письменный стол и предоставляется постельное белье.


100%|██████████| 1407/1407 [05:20<00:00,  4.40it/s]


1


  0%|          | 1/1407 [00:00<06:20,  3.70it/s]

2.956096887588501
Source Sentence
each room includes a spacious work desk.
Model Output
в касдо  оомер   оотт                                                        
Target Output
в каждом номере установлен большой рабочий стол.


 36%|███▌      | 501/1407 [01:54<03:27,  4.37it/s]

2.9639172554016113
Source Sentence
room service can be requested for in-room dining comforts.
Model Output
по зеп ото  оооооо                                           оооооооооооооооооооо
Target Output
по запросу осуществляется доставка еды и напитков в номер.


 71%|███████   | 1001/1407 [03:49<01:34,  4.31it/s]

2.941469430923462
Source Sentence
centrally located in cesena, bb al re offers free wi-fi and elegant accommodation with air conditio
Model Output
комлллой то  с                                                                                     
Target Output
отель типа постель и завтрак al re расположен в центре города чезена. к услугам гостей бесплатный w


 80%|███████▉  | 1125/1407 [04:16<01:22,  3.42it/s]

In [97]:
for index, (x, y) in enumerate(tqdm(train_loader)):
        x = x.to(device)
        y = y.to(device)
        break

  0%|          | 0/1407 [00:00<?, ?it/s]


In [111]:
model.eval()
with torch.no_grad():
    temp = model.embed1(x)
    _, (hn, cn) = model.encoder(temp)
    
#     start = torch.tensor([rus_token_to_idx['<bos>']]).unsqueeze(dim=0).to(device)
#     start = model.embed2(start)
#     output = torch.tensor([])
#     result = ''
    
#     while output.tolist() != rus_token_to_idx['<eos>']:
#         logits, (hn, cn) = model.decoder(start, (hn, cn))
#         output = model.layer1(logits)
#         output = torch.argmax(output, dim=-1)[0]
#         start = output.unsqueeze(dim=0)
#         start = model.embed2(start)
#         result += "".join(idx_to_text(output.tolist(), lang='rus'))
#         if len(result) > 100:
#             break
#     print(result)

    start = y[0:1,...]
    start = model.embed2(start)
    output = torch.tensor([])
    result = ''
    index = 0
    while output.tolist() != rus_token_to_idx['<eos>']:
        index += 1
        logits, (hn, cn) = model.decoder(start, (hn, cn))
        output = model.layer1(logits)
        output = torch.argmax(output, dim=-1)[0]
        start = y[index:index+1,...]
        start = model.embed2(start)
        result += "".join(idx_to_text(output.tolist(), lang='rus'))
        if len(result) > 100:
            break
    print(result)    
    

<bos>билеты можно приобрести на территории отеля.<eos><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


EncoderDecoder(
  (embed1): Embedding(48, 32)
  (embed2): Embedding(54, 32)
  (encoder): LSTM(32, 256, num_layers=3, batch_first=True, dropout=0.5)
  (decoder): LSTM(32, 256, num_layers=3, batch_first=True, dropout=0.5)
  (layer1): Linear(in_features=256, out_features=54, bias=True)
)