# Local variable

In [1]:
# Root directory against which the data, dump and embedding paths used in
# this notebook are resolved.
BASE_PATH = "."

# Imports

In [253]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook, tqdm
from nltk.tokenize.regexp import WordPunctTokenizer
from sklearn.preprocessing import LabelBinarizer
from scipy import sparse

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
# from torch.nn.utils.rnn import pack_sequence

from gensim.models import FastText

In [3]:
# Column names of the corpus DataFrame built by load_data.
RU_LABEL = "RU"
EN_LABEL = "EN"

# Keys of the per-sample dictionaries returned by RUENDataset.__getitem__.
RU_DS_LABEL = "RU"
EN_DS_LABEL = "EN"

# Marker tokens added around a tokenized sentence by WordPunctTokenizerWrapper.
EOS_LABEL = "<EOS>"
SOS_LABEL = "<SOS>"

In [4]:
def load_data(word_dir: str) -> pd.DataFrame:
    """Load the parallel English/Russian corpus into a two-column DataFrame.

    Args:
        word_dir: Directory that contains the ``data/`` folder with the
            ``corpus.en_ru.1m.en`` and ``corpus.en_ru.1m.ru`` files.

    Returns:
        A DataFrame with one row per corpus line; the Russian sentences are
        stored under ``RU_LABEL`` and the English ones under ``EN_LABEL``.
    """
    english_path = os.path.join(word_dir, "data/corpus.en_ru.1m.en")
    english_lines = load_corpus(english_path)

    russian_path = os.path.join(word_dir, "data/corpus.en_ru.1m.ru")
    russian_lines = load_corpus(russian_path)

    return pd.DataFrame({RU_LABEL: russian_lines, EN_LABEL: english_lines})

def load_corpus(path: str) -> list:
    """Read a corpus file and return its lines stripped and lower-cased.

    The file is decoded explicitly as UTF-8 so that the Russian text is read
    the same way regardless of the platform's default locale encoding
    (assumes the corpus files are UTF-8 encoded).

    Args:
        path: Path of the text file to read.

    Returns:
        One string per line of the file, with surrounding whitespace removed
        and converted to lower case. Empty lines are kept as empty strings.
    """
    with open(path, mode="r", encoding="utf-8") as file:
        # Iterate the file lazily instead of materialising readlines() first.
        return [line.strip().lower() for line in file]

In [5]:
class WordPunctTokenizerWrapper:
    """Callable tokenizer that adds sentence boundary markers.

    Tokenization itself is delegated to ``WordPunctTokenizer.tokenize``; the
    wrapper always appends ``EOS_LABEL`` and, when ``with_SOS`` is set, also
    puts ``SOS_LABEL`` in front of the tokens.
    """

    def __init__(self, with_SOS=False):
        """Create the wrapper.

        Args:
            with_SOS: When true, every tokenized text starts with SOS_LABEL.
        """
        self.with_SOS = with_SOS
        self.tokenizer = WordPunctTokenizer()

    def __call__(self, text):
        """Tokenize ``text`` and return the token list with boundary markers."""
        words = self.tokenizer.tokenize(text)
        leading = [SOS_LABEL] if self.with_SOS else []
        return [*leading, *words, EOS_LABEL]

def tokenizer_factory(factory_name):
    """Return the tokenizer registered under ``factory_name``.

    Args:
        factory_name: ``"wpt"`` for a tokenizer that adds both the SOS and
            EOS markers, ``"ru_tok"`` for one that adds only the EOS marker.

    Returns:
        A WordPunctTokenizerWrapper instance, or None when the name is not
        one of the known keys.
    """
    if factory_name == "wpt":
        return WordPunctTokenizerWrapper(with_SOS=True)
    if factory_name == "ru_tok":
        return WordPunctTokenizerWrapper()
    # Unknown names are not an error here: the caller receives None.
    return None

In [6]:
class Vocabular:
    """Bidirectional mapping between tokens and consecutive integer indices.

    Indices are assigned in order of first appearance, starting at 0.
    ``max_index`` holds the largest index assigned so far (-1 when empty).
    """

    def __init__(self):
        self.max_index = -1
        self.word_to_index = {}
        self.index_to_word = {}

    def fit(self, X, Y=None):
        """Register every token of every sample in ``X`` that is not yet known.

        Args:
            X: Iterable of samples, each an iterable of hashable tokens.
            Y: Unused; accepted only for a fit(X, Y)-style signature.
        """
        for sample in X:
            for feature in sample:
                if feature in self.word_to_index:
                    continue
                self.max_index += 1
                self.word_to_index[feature] = self.max_index

        # Refresh the reverse mapping from the (possibly extended) forward one.
        self.index_to_word.update(
            {index: word for word, index in self.word_to_index.items()}
        )

    def transform(self, X):
        """Replace each token by its index; raises KeyError on unknown tokens."""
        return [[self.word_to_index[feature] for feature in sample] for sample in X]

    def inverse_transform(self, X):
        """Replace each index by its token; raises KeyError on unknown indices."""
        return [[self.index_to_word[feature] for feature in sample] for sample in X]

In [None]:
def allignment(sentence_list, encoder, max_length):
    """Encode every sentence with ``encoder.transform`` and collect the results.

    A progress bar is shown while iterating. ``max_length`` is currently
    unused: the padding/truncation step that would bring every encoded
    sentence to ``max_length`` rows (by stacking sparse zero rows) is disabled.

    Args:
        sentence_list: Iterable of sentences accepted by ``encoder.transform``.
        encoder: Object exposing a ``transform`` method.
        max_length: Intended common sequence length (not applied at present).

    Returns:
        A list with one encoded item per input sentence, in input order.
    """
    return [encoder.transform(sentence) for sentence in tqdm_notebook(sentence_list)]

In [7]:
class FastTextWrapper:
    """Adapter exposing a ``transform`` method on top of an embedding model.

    The wrapped ``embedder`` must provide a ``wv`` attribute that supports
    membership tests for single tokens and indexing with a list of tokens.
    """

    def __init__(self, embedder):
        self.embedder = embedder

    def transform(self, data):
        """Embed every sentence of ``data``.

        Args:
            data: Iterable of sentences, each an iterable of tokens.

        Returns:
            A list with one entry per sentence: the result of indexing
            ``embedder.wv`` with the sentence's tokens that ``embedder.wv``
            reports as contained (other tokens are dropped).
        """
        return [self._embed_sentence(sentence) for sentence in data]

    def _embed_sentence(self, sentence):
        """Look up the vectors of the tokens known to the embedder."""
        known_tokens = [token for token in sentence if token in self.embedder.wv]
        return self.embedder.wv[known_tokens]

In [127]:
class RUENDataset(Dataset):
    """Dataset of paired Russian/English sentences encoded on access.

    Each item is a dictionary holding a float32 tensor for the Russian
    sentence (key ``RU_DS_LABEL``) and one for the English sentence
    (key ``EN_DS_LABEL``). The dataset length is the number of English
    sentences.
    """

    def __init__(self, ru_data_list, en_data_list, ru_encoder, en_encoder):
        """Store the raw sentences and the encoders applied in __getitem__.

        Args:
            ru_data_list: Indexable collection of Russian sentences.
            en_data_list: Indexable collection of English sentences.
            ru_encoder: Object whose ``transform`` takes a list of sentences.
            en_encoder: Object whose ``transform`` takes a single sentence.
        """
        self.ru_encoder = ru_encoder
        self.en_encoder = en_encoder
        self.ru_data = ru_data_list
        self.en_data = en_data_list

    def __len__(self):
        return len(self.en_data)

    def __getitem__(self, pos):
        # The English side is encoded first, then the Russian side.
        en_tensor = torch.tensor(self._get_en_sentence(pos), dtype=torch.float32)
        ru_tensor = torch.tensor(self._get_ru_sentence(pos), dtype=torch.float32)
        return {RU_DS_LABEL: ru_tensor, EN_DS_LABEL: en_tensor}

    def _get_ru_sentence(self, pos):
        """Encode one Russian sentence.

        The encoder works on a list of sentences, so the sentence is wrapped
        into a one-element list and the single result is unwrapped again.
        """
        return self.ru_encoder.transform([self.ru_data[pos]])[0]

    def _get_en_sentence(self, pos):
        """Encode one English sentence by passing it directly to the encoder."""
        return self.en_encoder.transform(self.en_data[pos])

In [257]:
def show_translation(ru_list, en_list):
    """Print one randomly chosen sentence pair, tokens joined by spaces.

    A position is drawn uniformly from the valid positions of ``ru_list`` and
    the token sequences found at that position in both collections are
    printed, Russian first.

    Args:
        ru_list: Indexable collection of Russian token sequences.
        en_list: Indexable collection of English token sequences; it is
            indexed with the same position as ``ru_list``.
    """
    position = random.randint(0, len(ru_list) - 1)
    # Build both lines before printing anything.
    lines = [" ".join(sentences[position]) for sentences in (ru_list, en_list)]
    print(*lines, sep="\n")

# Load data

In [9]:
# Read the parallel corpus found under BASE_PATH into a DataFrame.
corpus_df = load_data(BASE_PATH)

In [None]:
# Preview the first rows of the corpus.
corpus_df.head()

In [10]:
# Keep only the first 10 sentence pairs; everything below runs on this subset.
corpus_df = corpus_df.iloc[:10]

# Convert English tokens into one-hot vectors

In [37]:
# English tokenizer: the "wpt" variant adds both the SOS and EOS markers.
tokenizer = tokenizer_factory("wpt")

In [38]:
# Tokenize the English column row by row and build the token <-> index
# vocabulary from the resulting token lists.
english_vocab = Vocabular()
english_tokens = corpus_df.apply(lambda x: tokenizer(x[EN_LABEL]), axis=1)
english_vocab.fit(english_tokens)

In [39]:
# Dense one-hot encoder fitted on every vocabulary index 0..max_index.
en_encoder = LabelBinarizer(sparse_output=False)
en_encoder.fit(range(english_vocab.max_index+1))

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [40]:
# English sentences as lists of vocabulary indices.
en_sentence_list = english_vocab.transform(english_tokens)

In [None]:
# Longest English sentence length (in tokens) plus one.
max_lenth = np.max([len(x) for x in en_sentence_list])
max_lenth += 1
print(f"Max sequence lenth is {max_lenth}")

In [None]:
# vectors = allignment(en_sentence_list, en_encoder, max_lenth)

## Dump

In [None]:
import pickle

In [None]:
# Serialize the encoded English sentences to dump/english_ohe.pkl.
# NOTE(review): `vectors` is only produced by the allignment(...) call, which
# is commented out above, so this cell fails with a NameError unless
# `vectors` was created or loaded some other way — confirm the intended flow.
path = os.path.join(BASE_PATH, "dump/english_ohe.pkl")
with open(path, mode="wb") as file:
    pickle.dump(vectors, file)
    
del path

In [None]:
# Restore the encoded English sentences from dump/english_ohe.pkl.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load dumps written by this notebook.
path = os.path.join(BASE_PATH, "dump/english_ohe.pkl")
with open(path, mode="rb") as file:
    vectors = pickle.load(file)

del path

# Length histogram

In [None]:
# Count how many English sentences have each exact token count.
# `en_sentence_list` is the index-encoded English corpus (the previously used
# name `sentence_list` is not defined anywhere in this notebook).
# np.bincount yields one bucket per integer length starting at 0, so bar i
# really is "sentences with i tokens"; np.histogram with `bins=max_lenth`
# spreads equal-width bins between the shortest and longest sentence, which
# does not line up with integer x positions.
sentence_lengths = [len(x) for x in en_sentence_list]
length_counts = np.bincount(sentence_lengths, minlength=max_lenth)

plt.bar(range(len(length_counts)), length_counts)
plt.title("Histogram of token amount in sentence")
plt.xlabel("Amount of tokens")
plt.ylabel("Amount of sentences")
plt.show()

# Convert Russian tokens into vectors

In [15]:
# Load the pre-trained FastText model from the embeddings folder and wrap it
# so it exposes the transform() interface used by RUENDataset.
path = os.path.join(BASE_PATH, "embeddings/skipgram_fasttext/araneum_none_fasttextskipgram_300_5_2018.model")
model = FastText.load(path)
ru_embedder = FastTextWrapper(model)
del path

In [16]:
# Russian tokenizer: the "ru_tok" variant adds only the EOS marker.
ru_tokenizer = tokenizer_factory("ru_tok")

In [17]:
# Tokenize the Russian column row by row.
russian_tokens = corpus_df.apply(lambda x: ru_tokenizer(x[RU_LABEL]), axis=1)

# Wrap into Dataset

In [128]:
# Pair Russian token lists with index-encoded English sentences; encoding to
# tensors happens lazily inside the dataset.
data_set = RUENDataset(russian_tokens, en_sentence_list, ru_embedder, en_encoder)
# One sentence pair per batch. NOTE(review): shuffle=True means the batch
# order differs from the dataset order on every pass over the loader.
dataloader = DataLoader(data_set, batch_size=1, shuffle=True)

In [131]:
# Pull a single batch from the loader for manual inspection.
iterator = iter(dataloader)
data = next(iterator)

# NN

In [201]:
class Encoder(nn.Module):
    """Single LSTM (batch-first) that encodes the source sequence."""

    def __init__(self, input_size, hidden_vector_size):
        """Build the LSTM.

        Args:
            input_size: Size of each input vector of the sequence.
            hidden_vector_size: Size of the LSTM hidden state.
        """
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_vector_size, batch_first=True)

    def forward(self, X):
        """Run the LSTM over ``X``.

        Returns:
            A tuple ``(last_output, state)`` where ``last_output`` is the LSTM
            output at the final time step of the first batch element only,
            and ``state`` is the state tuple returned by the LSTM.
        """
        per_step_output, state = self.encoder(X)
        last_output = per_step_output[0, -1]
        return last_output, state

class Decoder(nn.Module):
    """Single LSTM (batch-first) whose output is turned into log-probabilities."""

    def __init__(self, input_size, hidden_vector_size):
        """Build the LSTM.

        Args:
            input_size: Size of each input vector (one-hot target token).
            hidden_vector_size: Size of the LSTM hidden state; the normalised
                output has this size on its last axis.
        """
        super().__init__()
        self.decoder = nn.LSTM(batch_first=True, input_size=input_size, hidden_size=hidden_vector_size)

    def forward(self, X, hidden_state):
        """Advance the decoder and return log-probabilities over the last axis.

        ``log_softmax`` is used instead of ``softmax`` because the output is
        meant to be fed to ``nn.NLLLoss``, which expects log-probabilities.
        The position of the maximum is the same for both, so greedy decoding
        via ``topk``/``argmax`` is unaffected.

        Args:
            X: Decoder input; in this notebook a (1, 1, VOC_SIZE) one-hot tensor.
            hidden_state: LSTM state tuple to continue from.

        Return: X shape (1,1,VOC_SIZE) for a (1,1,VOC_SIZE) input, together
        with the updated LSTM state.
        """
        X, hidden_state = self.decoder(X, hidden_state)
        X = nn.functional.log_softmax(X, dim=2)
        return X, hidden_state

In [241]:
class Trainer:
    """Trains and runs an encoder/decoder pair one sentence pair at a time.

    The decoder is driven greedily: at every step its own most likely token
    (as a one-hot vector) becomes the next decoder input, and decoding stops
    early once the EOS index is produced. The code indexes only the first
    batch element, i.e. it assumes batches of size 1.
    """

    def __init__(self, model_save_path,
                 encoder,
                 decoder,
                 encoder_optimizer,
                 decoder_optimizer,
                 loss,
                 input_size, hidden_size,
                 EOS,
                 SOS,
                 epoch):
        """Store the models, optimizers and decoding constants.

        Args:
            model_save_path: Stored as-is; not used by the methods below.
            encoder: Module called as ``encoder(ru_vector)`` returning
                ``(output, hidden_state)``.
            decoder: Module called as ``decoder(Y, hidden_state)`` returning
                ``(scores, hidden_state)`` with scores of shape (1, 1, V).
            encoder_optimizer: Optimizer stepping the encoder parameters.
            decoder_optimizer: Optimizer stepping the decoder parameters.
            loss: Callable ``loss(scores, target)`` taking (1, V) scores and
                a (1,) class-index target.
            input_size: Stored as-is; not used by the methods below.
            hidden_size: Stored as-is; not used by the methods below.
            EOS: Integer vocabulary index of the end-of-sentence token.
            SOS: First decoder input (one-hot start-of-sentence tensor).
            epoch: Number of passes over the dataloader in ``train``.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_optimizer = encoder_optimizer
        self.decoder_optimizer = decoder_optimizer
        self.loss = loss
        self.model_save_path = model_save_path
        self.EOS = EOS
        self.SOS = SOS
        self.epoch = epoch

    def train(self, dataloader):
        """Run ``epoch`` passes over ``dataloader``, one update per batch."""
        for _ in range(self.epoch):
            for batch in dataloader:
                self.process_one_pair(batch[RU_DS_LABEL], batch[EN_DS_LABEL])

    def predict(self, dataloader):
        """Greedily decode every batch of ``dataloader``.

        Only the Russian side of each batch is read. At most
        ``ru_vector.shape[1] - 1`` tokens are produced per sentence.

        Returns:
            A list with one list of predicted vocabulary indices per batch,
            in the order the dataloader yields the batches.
        """
        result = []
        with torch.no_grad():
            for batch in dataloader:
                ru_vector = batch[RU_DS_LABEL]
                sentence = []

                _, hidden_state = self.encoder(ru_vector)
                Y = self.SOS
                for _ in range(1, ru_vector.shape[1]):
                    Y, hidden_state = self.decoder(Y, hidden_state)

                    word_index = self._greedy_index(Y)
                    sentence.append(word_index)
                    if word_index == self.EOS:
                        print("Achived EOS")
                        break
                    Y = self._one_hot(word_index, Y.size()[2])

                result.append(sentence)
        return result

    def process_one_pair(self, ru_vector, eng_vector):
        """Perform one optimisation step on a single sentence pair.

        Args:
            ru_vector: Encoder input for one sentence.
            eng_vector: One-hot target of shape (1, seq_len, V); position 0 is
                skipped (it holds the SOS token in this notebook).

        Returns:
            The summed per-token loss as a float, or 0.0 when the target has
            no token to predict (in which case no update is made).
        """
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        _, hidden_state = self.encoder(ru_vector)

        total_loss = None
        Y = self.SOS

        for i in range(1, eng_vector.shape[1]):
            # Class index of the target token: argmax over the vocabulary
            # axis of its one-hot row, shaped (1,) as the loss expects.
            class_index = eng_vector[0, i].argmax().view(1)

            Y, hidden_state = self.decoder(Y, hidden_state)

            # Flatten (1, 1, V) scores to (1, V) so that V is the class axis.
            step_loss = self.loss(Y.reshape(1, -1), class_index)
            # Out-of-place accumulation keeps every step's tensor intact.
            total_loss = step_loss if total_loss is None else total_loss + step_loss

            word_index = self._greedy_index(Y)
            if word_index == self.EOS:
                print("Achived EOS")
                break
            Y = self._one_hot(word_index, Y.size()[2])

        if total_loss is None:
            # Target of length <= 1: there is nothing to back-propagate.
            return 0.0

        total_loss.backward()
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        return total_loss.item()

    @staticmethod
    def _greedy_index(scores):
        """Return the index of the highest score of a single-token output."""
        _, word_index = scores.topk(1)
        return word_index.item()

    @staticmethod
    def _one_hot(word_index, vocab_size):
        """Build the (1, 1, vocab_size) one-hot decoder input for a token."""
        vector = torch.zeros(1, 1, vocab_size)
        vector[0, 0, word_index] = 1
        return vector

In [242]:
# Encoder input is a FastText vector; decoder input is a one-hot vector over
# the English vocabulary.
input_size = model.vector_size
decoder_input_size = english_vocab.max_index + 1
# The hidden size equals the vocabulary size because the decoder normalises
# the LSTM output directly, without a projection layer in between.
hidden_size = decoder_input_size

encoder = Encoder(input_size, hidden_size)
decoder = Decoder(decoder_input_size, hidden_size)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=0.01)

# NOTE(review): nn.NLLLoss expects log-probabilities as its input; verify the
# decoder output is in that form.
loss_function = nn.NLLLoss()

In [243]:
# EOS ends up as a plain integer: the vocabulary index of the EOS token.
EOS_vector = [[EOS_LABEL]]
EOS_vector = english_vocab.transform(EOS_vector)
EOS_vector = EOS_vector[0][0]

# SOS goes token -> vocabulary index -> one-hot row -> float tensor, and is
# finally reshaped to three dimensions so it can be fed to the decoder as its
# first input.
SOS_vector = [[SOS_LABEL]]
SOS_vector = english_vocab.transform(SOS_vector)
SOS_vector = en_encoder.transform(SOS_vector)
SOS_vector = torch.tensor(SOS_vector, dtype=torch.float32)
SOS_vector = SOS_vector.view(1, -1, SOS_vector.size()[1])

In [244]:
# The save path is left empty; the Trainer only stores it.
save_path = ""

# Wire models, optimizers, loss and the SOS/EOS constants together and train
# for two passes over the data.
trainer = Trainer(model_save_path=save_path,
                  encoder=encoder,
                  decoder=decoder, 
                  encoder_optimizer=encoder_optimizer,
                  decoder_optimizer=decoder_optimizer, 
                  loss=loss_function,
                  input_size=input_size,
                  hidden_size=hidden_size,
                  EOS=EOS_vector,
                  SOS=SOS_vector,
                  epoch=2)

In [245]:
# Train on every batch of the loader for the configured number of epochs.
trainer.train(dataloader)

In [246]:
# Predict with a non-shuffled loader so that prediction[i] belongs to
# russian_tokens[i]. `dataloader` is created with shuffle=True, which would
# return the predictions in a random order and break the index-based pairing
# done by show_translation.
eval_dataloader = DataLoader(data_set, batch_size=1, shuffle=False)
prediction = trainer.predict(eval_dataloader)
# Map the predicted vocabulary indices back to English tokens.
predicted_sentences_list = english_vocab.inverse_transform(prediction)

In [258]:
# Print a random Russian sentence next to the prediction stored at the same
# position.
show_translation(russian_tokens, predicted_sentences_list)

обычно я просыпался в 7 : 30 , спускался вниз и видел что дверь в дом открыта , на кухне и в гостиной стоит 600 банок пива и дома никого нет . <EOS>
albums go albums human human no no human no no human no no no no no no no no no no no no no no
