In [1]:
import spacy
from pprint import pprint
from tqdm import tqdm
from simalign import SentenceAligner

In [2]:
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-01-23 01:32:57,457 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [3]:
en_nlp_lg = spacy.load("en_core_web_lg")
ru_nlp_lg = spacy.load("ru_core_news_lg")
ru_tokenizer = ru_nlp_lg.tokenizer
en_tokenizer = en_nlp_lg.tokenizer

In [4]:
def spacy_format_to_list_of_str(spacy_tokens):
    normal_tokens = []
    for sentence in spacy_tokens:
        list_sentence = []
        for token in sentence:
            list_sentence.append(str(token))
        normal_tokens.append(list_sentence)
    return normal_tokens

# Karenina

In [5]:
with open("./corpora/karenina/Books_ru.txt", "rt", encoding="utf-8") as file:
    ru_corpus_raw = [line.rstrip() for line in file]

ru_corpus_raw[:5]

['Анна Каренина',
 'Толстой Лев Николаевич',
 'Мне отмщение, и аз воздам',
 'ЧАСТЬ ПЕРВАЯ',
 'Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему.']

In [6]:
with open("./corpora/karenina/Books_en.txt", "rt", encoding="utf-8") as file:
    en_corpus_raw = [line.rstrip() for line in file]

en_corpus_raw[:5]

['Anna Karenina',
 'Leo Tolstoy',
 'Vengeance is mine; I will repay.',
 'VOLUME ONE PART I',
 'ALL HAPPY FAMILIES resemble one another, but each unhappy family is unhappy in its own way.']

In [7]:
ru_tokens = [ru_tokenizer(sentence) for sentence in ru_corpus_raw]
en_tokens = [en_tokenizer(sentence) for sentence in en_corpus_raw]

In [8]:
ru_tokens = spacy_format_to_list_of_str(ru_tokens)
en_tokens = spacy_format_to_list_of_str(en_tokens)

In [9]:
mwmf = ""
itermax = ""
inter = ""

for i in tqdm(range(len(ru_tokens))):
    src, trg = (ru_tokens[i], en_tokens[i])
    alignments = myaligner.get_word_aligns(src, trg)
    mwmf += " ".join([f"{x}-{y}" for x, y in alignments["mwmf"]])
    itermax += " ".join([f"{x}-{y}" for x, y in alignments["itermax"]])
    inter += " ".join([f"{x}-{y}" for x, y in alignments["inter"]])
    mwmf += "\n"
    itermax += "\n"
    inter += "\n"

100%|██████████| 17255/17255 [4:52:04<00:00,  1.02s/it]   


In [10]:
with open('corpora/karenina/mwmf.txt', 'wt') as f:
    f.write(mwmf)

In [11]:
with open('corpora/karenina/itermax.txt', 'wt') as f:
    f.write(itermax)

In [12]:
with open('corpora/karenina/inter.txt', 'wt') as f:
    f.write(inter)

# Woland

In [13]:
ru_tokens = [ru_tokenizer(sentence) for sentence in ru_corpus_raw]
en_tokens = [en_tokenizer(sentence) for sentence in en_corpus_raw]

In [14]:
ru_tokens = spacy_format_to_list_of_str(ru_tokens)
en_tokens = spacy_format_to_list_of_str(en_tokens)

In [15]:
mwmf = ""
itermax = ""
inter = ""

for i in tqdm(range(len(ru_tokens))):
    src, trg = (ru_tokens[i], en_tokens[i])
    alignments = myaligner.get_word_aligns(src, trg)
    mwmf += " ".join([f"{x}-{y}" for x, y in alignments["mwmf"]])
    itermax += " ".join([f"{x}-{y}" for x, y in alignments["itermax"]])
    inter += " ".join([f"{x}-{y}" for x, y in alignments["inter"]])
    mwmf += "\n"
    itermax += "\n"
    inter += "\n"

100%|██████████| 17255/17255 [4:43:58<00:00,  1.01it/s]   


In [16]:
with open('corpora/woland/mwmf.txt', 'wt') as f:
    f.write(mwmf)

In [17]:
with open('corpora/woland/itermax.txt', 'wt') as f:
    f.write(itermax)

In [18]:
with open('corpora/woland/inter.txt', 'wt') as f:
    f.write(inter)