In [1]:
# python -m venv .venv
# source ./.venv/bin/activate
# pip install ipykernel
# ipython kernel install --user --name=translational_variability

In [2]:
# ! pip install simalign
# ! python3 -m spacy download en_core_web_lg
# ! python3 -m spacy download ru_core_news_lg

In [1]:
import spacy
from pprint import pprint
from tqdm import tqdm
from simalign import SentenceAligner

In [2]:
# Read the Russian side of the corpus: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_ru.txt", "rt", encoding="utf-8") as file:
    ru_corpus_raw = list(map(str.rstrip, file))

# Preview the first five entries.
ru_corpus_raw[:5]

['Анна Каренина',
 'Толстой Лев Николаевич',
 'Мне отмщение, и аз воздам',
 'ЧАСТЬ ПЕРВАЯ',
 'Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему.']

In [3]:
# Read the English side of the corpus: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_en.txt", "rt", encoding="utf-8") as file:
    en_corpus_raw = list(map(str.rstrip, file))

# Preview the first five entries.
en_corpus_raw[:5]

['Anna Karenina',
 'Leo Tolstoy',
 'Vengeance is mine; I will repay.',
 'VOLUME ONE PART I',
 'ALL HAPPY FAMILIES resemble one another, but each unhappy family is unhappy in its own way.']

In [4]:
# Word aligner used by the alignment loop further down.
# NOTE(review): "mai" presumably selects several matching methods at once, one
# of which must produce the 'itermax' result that the loop reads — confirm
# against the simalign documentation.
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [5]:
# Load the spaCy pipelines for both languages by package name; the packages
# must already be installed (see the commented-out download commands above).
en_nlp_lg = spacy.load("en_core_web_lg")
ru_nlp_lg = spacy.load("ru_core_news_lg")

In [6]:
# Short aliases for the tokenizer attribute of each loaded pipeline.
ru_tokenizer = ru_nlp_lg.tokenizer
en_tokenizer = en_nlp_lg.tokenizer

In [None]:
def spacy_format_to_list_of_str(spacy_tokens):
    """Convert nested token containers into plain nested lists of strings.

    Args:
        spacy_tokens: Iterable of sentences, each itself an iterable of tokens.

    Returns:
        A list with one inner list per sentence, in which every token has been
        replaced by ``str(token)``.
    """
    return [[str(token) for token in sentence] for sentence in spacy_tokens]

In [7]:
# Run the full Russian pipeline instead of only its tokenizer. A bare
# tokenizer call yields Docs on which no lemmatizer has run, so every
# `token.lemma_` read later would be an empty string. The token boundaries
# are still produced by the same tokenizer, so the token texts are unchanged.
# This is slower than tokenizing alone, hence the progress bar.
ru_tokens = list(tqdm(ru_nlp_lg.pipe(ru_corpus_raw), total=len(ru_corpus_raw)))

In [8]:
# Run the full English pipeline instead of only its tokenizer. A bare
# tokenizer call yields Docs on which no lemmatizer has run, so every
# `token.lemma_` read later would be an empty string. The token boundaries
# are still produced by the same tokenizer, so the token texts are unchanged.
# This is slower than tokenizing alone, hence the progress bar.
en_tokens = list(tqdm(en_nlp_lg.pipe(en_corpus_raw), total=len(en_corpus_raw)))

In [14]:
# Inspect one tokenized English sentence (the empty `[]` subscript that used
# to follow was a syntax error).
en_tokens[10]

On the third day after his quarrel with his wife, Prince Stephen Arkadyevich Oblonsky – Steve, as he was called in his set in Society – woke up at his usual time, eight o'clock, not in his wife's bedroom but on the morocco leather-covered sofa in his study.

In [9]:
def spacy_format_to_list_of_str_lemmas(spacy_tokens):
    """Convert nested token containers into nested lists of lemma strings.

    Args:
        spacy_tokens: Iterable of sentences, each an iterable of tokens that
            expose a ``lemma_`` attribute.

    Returns:
        A list with one inner list per sentence, in which every token has been
        replaced by ``str(token.lemma_)``.
    """
    return [[str(token.lemma_) for token in sentence] for sentence in spacy_tokens]

In [10]:
# Extract per-sentence lemma lists for both languages. The inputs must still
# hold token objects exposing `lemma_` at this point, i.e. this has to run
# before `ru_tokens`/`en_tokens` are rebound to plain strings.
ru_tokens_lemmas = spacy_format_to_list_of_str_lemmas(ru_tokens)
en_tokens_lemmas = spacy_format_to_list_of_str_lemmas(en_tokens)

In [11]:
# Render each sentence as its lemmas joined by single spaces, one sentence per
# line (every line, including the last, ends with "\n"). Lines are collected
# in lists and concatenated once at the end.
ru_lines = []
en_lines = []

# Both languages are indexed by the Russian sentence count, so the English
# list is expected to be at least as long.
for idx in tqdm(range(len(ru_tokens))):
    ru_lines.append(" ".join(ru_tokens_lemmas[idx]) + "\n")
    en_lines.append(" ".join(en_tokens_lemmas[idx]) + "\n")

ru_lemmas_tokenized = "".join(ru_lines)
en_lemmas_tokenized = "".join(en_lines)

100%|██████████| 17255/17255 [00:00<00:00, 608143.42it/s]


In [12]:
# Write with an explicit UTF-8 encoding (matching how the corpora are read) so
# that saving the Cyrillic text does not depend on the platform's default
# encoding.
# NOTE(review): the target directory is "woland" although the inputs are read
# from "karenina" — confirm this path is intended.
with open('corpora/woland/ru_lemmas_tokenized.txt', 'wt', encoding='utf-8') as f:
    f.write(ru_lemmas_tokenized)

In [None]:
# Replace the token containers with plain nested lists of strings.
# NOTE: this rebinds `ru_tokens`/`en_tokens`; afterwards the elements are
# `str` objects without a `lemma_` attribute, so
# `spacy_format_to_list_of_str_lemmas` can no longer be applied to them.
ru_tokens = spacy_format_to_list_of_str(ru_tokens)
en_tokens = spacy_format_to_list_of_str(en_tokens)

In [None]:
# Align every sentence pair and render the 'itermax' alignment as
# "source<TAB>target" lines, with a blank line after each sentence pair.
# Per-sentence chunks are kept in a list and concatenated once at the end.
aligned_chunks = []

for idx in tqdm(range(len(ru_tokens))):
    src, trg = ru_tokens[idx], en_tokens[idx]
    alignments = myaligner.get_word_aligns(src, trg)
    # Each alignment entry is a (source index, target index) pair.
    pair_lines = "".join(
        f"{src[src_idx]}\t{trg[trg_idx]}\n"
        for src_idx, trg_idx in alignments['itermax']
    )
    aligned_chunks.append(pair_lines + '\n')

text = "".join(aligned_chunks)

100%|██████████| 17255/17255 [4:55:50<00:00,  1.03s/it]   


In [None]:
# Save the alignment text. The explicit UTF-8 encoding (matching how the
# corpora are read) keeps the Cyrillic tokens independent of the platform's
# default encoding.
with open('aligned.txt', 'wt', encoding='utf-8') as f:
    f.write(text)

In [None]:
# Save the same alignment text under a corpus-specific file name. The explicit
# UTF-8 encoding (matching how the corpora are read) keeps the Cyrillic tokens
# independent of the platform's default encoding.
with open('karenina_aligned_data.txt', 'wt', encoding='utf-8') as f:
    f.write(text)