In [None]:
# Root directory that the "data/..." paths used below are joined onto.
BASE_PATH = "."

# Import

In [None]:
import sys
import os
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import translator_constants.global_constant as glc
from tokenizer.word_punct_tokenizer import tokenizer_factory

In [None]:
def load_data(word_dir: str) -> pd.DataFrame:
    """Read the parallel English/Russian corpus into a two-column DataFrame.

    Both corpus files are read with ``load_corpus`` from the ``data``
    sub-directory of ``word_dir``; row ``i`` of the result pairs line ``i``
    of the Russian file with line ``i`` of the English file.

    Args:
        word_dir: Directory containing the ``data`` folder that holds
            ``corpus.en_ru.1m.en`` and ``corpus.en_ru.1m.ru``.

    Returns:
        A DataFrame with the columns ``glc.RU_LABEL`` and ``glc.EN_LABEL``
        (in that order).
    """
    english_sentences = load_corpus(os.path.join(word_dir, "data/corpus.en_ru.1m.en"))
    russian_sentences = load_corpus(os.path.join(word_dir, "data/corpus.en_ru.1m.ru"))
    return pd.DataFrame(
        {glc.RU_LABEL: russian_sentences, glc.EN_LABEL: english_sentences}
    )


def load_corpus(path: str, encoding: str = "utf-8") -> list:
    """Read a text file and return its lines stripped and lower-cased.

    Args:
        path: Path of the text file; every line becomes one list entry.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding,
            which ``open`` would otherwise fall back to.

    Returns:
        A list with one string per line of the file, each with surrounding
        whitespace removed and converted to lower case.
    """
    with open(path, mode="r", encoding=encoding) as file:
        # Iterate the file object directly instead of materialising the raw
        # lines with readlines() before normalising them.
        return [line.strip().lower() for line in file]

In [None]:
# Read both corpus files found under BASE_PATH into one DataFrame.
corpus_df = load_data(BASE_PATH)

In [None]:
# Show the first rows as a quick sanity check of the loaded sentence pairs.
corpus_df.head()

# Evaluate sentence lengths (in tokens)

## Definition

In [None]:
def description_length(tokens_seq, percemtile=90):
    """Print summary statistics of the item lengths in ``tokens_seq``.

    Prints the maximum, minimum, mean and median length, followed by the
    requested percentile of the lengths.

    Args:
        tokens_seq: Iterable of sized items (e.g. token lists); ``len`` is
            taken of every item.
        percemtile: Percentile (0-100) reported in the last printed line.
            The misspelled parameter name is kept on purpose because callers
            pass it by keyword.

    Returns:
        None. The statistics are only printed.

    Raises:
        ValueError: If ``tokens_seq`` contains no items.
    """
    length_np = np.array([len(sentence) for sentence in tokens_seq])
    if length_np.size == 0:
        # Fail with a clear message instead of numpy's zero-size reduction error.
        raise ValueError("tokens_seq is empty: there are no lengths to describe")

    print(f"Max length: {length_np.max()}")
    print(f"Min length: {length_np.min()}")
    print(f"Mean length: {length_np.mean()}")
    print(f"Median length: {np.median(length_np)}")
    print(f"{percemtile} percentile length: {np.percentile(length_np, percemtile)}")


## English

In [None]:
# English side: tokenizer variant registered as WORD_PUNCT_TOKENIZER_WITH_SOS.
tokenizer = tokenizer_factory(glc.WORD_PUNCT_TOKENIZER_WITH_SOS)
# Tokenize the column directly: a row-wise DataFrame.apply(axis=1) hands a
# freshly built row Series to the callback for every sentence, which is
# needlessly slow when only one column is read.
english_tokens = corpus_df[glc.EN_LABEL].map(tokenizer)

In [None]:
# Length statistics of the tokenized English sentences (default percentile).
description_length(english_tokens)

## Russian

In [None]:
# Russian side: tokenizer variant registered as WORD_PUNCT_TOKENIZER_WITHOUT_SOS.
ru_tokenizer = tokenizer_factory(glc.WORD_PUNCT_TOKENIZER_WITHOUT_SOS)
# Tokenize the column directly: a row-wise DataFrame.apply(axis=1) hands a
# freshly built row Series to the callback for every sentence, which is
# needlessly slow when only one column is read.
russian_tokens = corpus_df[glc.RU_LABEL].map(ru_tokenizer)

In [None]:
# Length statistics of the tokenized Russian sentences, reporting the 90th percentile.
description_length(russian_tokens, percemtile=90)

# Form the length-filtered dataset

In [None]:
# Length cut-offs (in tokens) for the filter below: a sentence pair is kept
# only when both of its token sequences are strictly shorter than these values.
# NOTE(review): presumably derived from the length statistics printed above — confirm.
MAX_ENGLISH_SEQUENCE_LEN = 47
MAX_RUSSIAN_SEQUENCE_LEN = 41

In [None]:
print(f"Initial corpus size: {corpus_df.shape[0]}")
# Keep a sentence pair only when both token sequences are strictly shorter
# than their limits. The mask is computed per column in one pass instead of
# label-indexing both token Series once per row inside a Python loop.
is_short_enough = (
    (english_tokens.map(len) < MAX_ENGLISH_SEQUENCE_LEN)
    & (russian_tokens.map(len) < MAX_RUSSIAN_SEQUENCE_LEN)
)
sub_corpus = corpus_df[is_short_enough]

print(f"Shrinked corpus size: {sub_corpus.shape[0]}")
# The mask is only needed to build sub_corpus; drop it from the namespace.
del is_short_enough

In [None]:
# First Russian sentence that survived the length filter.
sub_corpus[glc.RU_LABEL].iloc[0]

In [None]:
# First English sentence that survived the length filter.
sub_corpus[glc.EN_LABEL].iloc[0]

# Save

In [None]:
# Write the filtered sentence pairs into the same data/ folder under BASE_PATH;
# index=False keeps the DataFrame index out of the CSV.
path = os.path.join(BASE_PATH, "data/shrinked_corpus.csv")
sub_corpus.to_csv(path, index=False)