In [None]:
# python -m venv .venv
# source ./.venv/bin/activate
# pip install ipykernel
# ipython kernel install --user --name=translational_variability

In [None]:
# ! pip install simalign
# ! python3 -m spacy download en_core_web_lg
# ! python3 -m spacy download ru_core_news_lg

In [1]:
import spacy
from pprint import pprint
from tqdm import tqdm
from simalign import SentenceAligner

In [8]:
# Read the Russian text: one list entry per line of the file, with trailing
# whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_ru.txt", "rt", encoding="utf-8") as file:
    ru_corpus_raw = list(map(str.rstrip, file))

# Preview the first five entries.
ru_corpus_raw[:5]

['Анна Каренина',
 'Толстой Лев Николаевич',
 'Мне отмщение, и аз воздам',
 'ЧАСТЬ ПЕРВАЯ',
 'Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему.']

In [9]:
# Read the English text: one list entry per line of the file, with trailing
# whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_en.txt", "rt", encoding="utf-8") as file:
    en_corpus_raw = list(map(str.rstrip, file))

# Preview the first five entries.
en_corpus_raw[:5]

['Anna Karenina',
 'Leo Tolstoy',
 'Vengeance is mine; I will repay.',
 'VOLUME ONE PART I',
 'ALL HAPPY FAMILIES resemble one another, but each unhappy family is unhappy in its own way.']

In [10]:
# Word aligner built on BERT embeddings over BPE sub-tokens.
# NOTE(review): "mai" presumably selects simalign's mwmf, argmax ("inter") and
# itermax matching methods -- confirm against the simalign documentation.
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

2023-01-04 21:21:47,299 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [23]:
# Load the English and Russian spaCy pipelines (English first, then Russian).
en_nlp_lg, ru_nlp_lg = (
    spacy.load(model_name)
    for model_name in ("en_core_web_lg", "ru_core_news_lg")
)

In [None]:
# Keep a direct handle on the tokenizer of each loaded pipeline.
ru_tokenizer, en_tokenizer = ru_nlp_lg.tokenizer, en_nlp_lg.tokenizer

In [39]:
def spacy_format_to_list_of_str(spacy_tokens):
    """Convert nested token containers into plain lists of strings.

    Args:
        spacy_tokens: Iterable of sentences, where each sentence is itself an
            iterable of tokens (any object ``str()`` accepts).

    Returns:
        A list with one inner list per sentence; each inner list holds
        ``str(token)`` for every token of that sentence, in original order.
    """
    return [[str(token) for token in sentence] for sentence in spacy_tokens]


In [56]:
# Run the Russian tokenizer over every corpus line, in order.
ru_tokens = list(map(ru_tokenizer, ru_corpus_raw))

In [57]:
# Run the English tokenizer over every corpus line, in order.
en_tokens = list(map(en_tokenizer, en_corpus_raw))

In [61]:
# Replace the tokenizer output with plain lists of token strings.
# NOTE: this rebinds the same names it reads, so the cell is not idempotent --
# running it a second time would split every token string into characters.
ru_tokens, en_tokens = (
    spacy_format_to_list_of_str(ru_tokens),
    spacy_format_to_list_of_str(en_tokens),
)

In [67]:
# Align each Russian sentence with the English sentence at the same index and
# collect the matched word pairs as "src<TAB>trg" lines, followed by one blank
# line per sentence pair.
if len(ru_tokens) != len(en_tokens):
    # Fail before the long run starts instead of hitting an IndexError (or
    # silently ignoring extra English lines) part-way through.
    raise ValueError(
        f"Corpora are not parallel: {len(ru_tokens)} Russian vs "
        f"{len(en_tokens)} English sentences"
    )

chunks = []  # output pieces; joined once at the end instead of repeated `+=`
for src, trg in tqdm(zip(ru_tokens, en_tokens), total=len(ru_tokens)):
    if src and trg:  # a pair with an empty side has no tokens to match
        alignments = myaligner.get_word_aligns(src, trg)
        # Only the 'itermax' matching is exported.
        for first, second in alignments['itermax']:
            chunks.append(f"{src[first]}\t{trg[second]}\n")
    chunks.append('\n')
text = ''.join(chunks)

100%|██████████| 17255/17255 [4:55:50<00:00,  1.03s/it]   


In [68]:
# Save the tab-separated alignments. The encoding is explicit because the text
# contains Cyrillic and the platform's default encoding is not guaranteed to be
# able to represent it; UTF-8 also matches how the corpora were read.
with open('aligned.txt', 'wt', encoding='utf-8') as f:
    f.write(text)

In [69]:
# Save the same alignment text under a corpus-specific file name. The encoding
# is explicit because the text contains Cyrillic and the platform's default
# encoding is not guaranteed to be able to represent it.
with open('karenina_aligned_data.txt', 'wt', encoding='utf-8') as f:
    f.write(text)