In [None]:
# Root directory that every data and embedding path in this notebook is joined onto.
BASE_PATH = "."

# Import

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
import pickle

import numpy as np
import pandas as pd
import torch
from gensim.models import FastText

import translator_constants.global_constant as glc
from text_utils.fast_text import FastTextWrapper
from text_utils.utils import tokenize_corpus
from text_utils.vocabulary import Vocabulary

# Load dataset

In [None]:
# Read the corpus CSV located under BASE_PATH into a DataFrame.
path = os.path.join(BASE_PATH, "data/shrinked_corpus.csv")
corpus_df = pd.read_csv(path)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
corpus_df.head()

In [None]:
# Normalise both text columns: lowercase first, then trim surrounding whitespace.
corpus_df[glc.RU_LABEL] = corpus_df[glc.RU_LABEL].str.lower().str.strip()
corpus_df[glc.EN_LABEL] = corpus_df[glc.EN_LABEL].str.lower().str.strip()

# Vectorization

## English

In [None]:
# Tokenise the English column and materialise the result as a list.
# NOTE(review): the Russian section wraps the same call in list(), which
# suggests tokenize_corpus may return a one-shot iterator. The tokens are
# consumed twice below (Vocabulary.fit, then Vocabulary.transform), so an
# un-materialised iterator would already be exhausted by the second pass.
english_tokens = list(tokenize_corpus(corpus_df, glc.EN_LABEL))

In [None]:
# Fit a fresh Vocabulary on the English tokens.
english_vocab = Vocabulary()
english_vocab.fit(english_tokens)

In [None]:
# Encode the English tokens with the fitted vocabulary
# (presumably token -> index; confirm in text_utils.vocabulary).
en_sentence_list = english_vocab.transform(english_tokens)

In [None]:
# Persist the fitted English vocabulary. The target directory is created
# first so the save does not depend on it already existing (fresh checkout).
path = os.path.join(BASE_PATH, "data/english_meta/english_vocab.json")
os.makedirs(os.path.dirname(path), exist_ok=True)
english_vocab.save(path)

In [None]:
# Pickle the encoded English sentences under data/english_meta.
path = os.path.join(BASE_PATH, "data/english_meta/english_token.pkl")
# open() does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, mode="wb") as file:
    pickle.dump(en_sentence_list, file)

## Russian

In [None]:
# Tokenise the Russian column and materialise the result as a list in one
# step. No later cell reads the DataFrame, so its reference is released here.
russian_tokens = list(tokenize_corpus(corpus_df, glc.RU_LABEL))
del corpus_df

In [None]:
# Fit a fresh Vocabulary on the Russian tokens, then encode the same tokens
# with it (presumably token -> index; confirm in text_utils.vocabulary).
russian_vocab = Vocabulary()
russian_vocab.fit(russian_tokens)
ru_sentence_list = russian_vocab.transform(russian_tokens)

In [None]:
# Persist the fitted Russian vocabulary. The target directory is created
# first so the save does not depend on it already existing (fresh checkout).
path = os.path.join(BASE_PATH, "data/russian_meta/russian_vocab.json")
os.makedirs(os.path.dirname(path), exist_ok=True)
russian_vocab.save(path)

In [None]:
# Pickle the encoded Russian sentences under data/russian_meta.
path = os.path.join(BASE_PATH, "data/russian_meta/russian_token.pkl")
# open() does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, mode="wb") as file:
    pickle.dump(ru_sentence_list, file)

# The encoded sentences are on disk now; drop the list and the temporaries
# from the kernel namespace.
del ru_sentence_list
del file
del path

In [None]:
# Load the pretrained FastText model from disk and wrap it in the project's
# FastTextWrapper, which is used below to turn words into vectors.
path = os.path.join(BASE_PATH, "embeddings/skipgram_fasttext/araneum_none_fasttextskipgram_300_5_2018.model")
model = FastText.load(path)
ru_embedder = FastTextWrapper(model)

In [None]:
# Assemble the embedding matrix for the Russian vocabulary: row i holds the
# vector of the word whose vocabulary index is i. Rows whose index has no
# entry in word_to_index stay all-zero.
# Requires numpy as `np` (imported with the other dependencies at the top).
EMBEDDING_DIM = 300  # presumably the model's vector size (cf. "_300_" in the model file name) - confirm
vocabulary_np = np.zeros((russian_vocab.max_index + 1, EMBEDDING_DIM), dtype=np.float32)
for word_str, index in russian_vocab.word_to_index.items():
    # transform() is called with a batch containing one single-word sentence;
    # the first (only) result is that word's vector.
    vector_np = ru_embedder.transform([[word_str]])[0]
    vocabulary_np[index] = vector_np

In [None]:
# Convert the embedding matrix to a float32 torch tensor and save it to disk.
# NOTE(review): this rebinds `vocabulary_np` to a torch.Tensor, so after this
# cell the name no longer matches the type; consider a separate name.
vocabulary_np = torch.tensor(vocabulary_np, dtype=torch.float32)
path = os.path.join(BASE_PATH, "data/russian_meta/vectorized_vocabulary.trch")
torch.save(vocabulary_np, path)