In [2]:
# ! pip install spacy
# ! python -m spacy download en_core_web_lg
# ! python -m spacy download en_core_web_trf
# ! python -m spacy download ru_core_news_lg
# ! pip install matplotlib
# ! pip install numpy
# ! pip install simalign

In [2]:
import spacy
from pprint import pprint
from tqdm import tqdm
from simalign import SentenceAligner

In [None]:
# Build the simalign aligner once; the log output below shows it loading
# bert-base-multilingual-cased.
# NOTE(review): "mai" is expected to enable the three matching methods whose
# results are read later under the keys "mwmf", "inter" and "itermax" —
# confirm the one-letter codes against the simalign documentation.
myaligner = SentenceAligner(
    model="bert",
    token_type="bpe",
    matching_methods="mai",
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-03-29 12:13:06,063 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [8]:
# English pipeline and its tokenizer component.
en_nlp_lg = spacy.load("en_core_web_lg")
en_tokenizer = en_nlp_lg.tokenizer

# Russian pipeline and its tokenizer component.
ru_nlp_lg = spacy.load("ru_core_news_lg")
ru_tokenizer = ru_nlp_lg.tokenizer

In [10]:
def spacy_format_to_list_of_str(spacy_tokens):
    """Convert tokenized sentences into plain nested lists of strings.

    Args:
        spacy_tokens: Iterable of sentences, each itself an iterable of
            tokens (for example the documents produced by a spaCy tokenizer).

    Returns:
        A list with one inner list per sentence, containing ``str(token)``
        for every token in its original order.
    """
    return [[str(token) for token in sentence] for sentence in spacy_tokens]

In [15]:
def spacy_to_list_of_lemmas(spacy_tokens):
    """Extract the lemma of every token as nested lists of strings.

    Args:
        spacy_tokens: Iterable of sentences, each an iterable of objects that
            expose a ``lemma_`` attribute (for example processed spaCy docs).

    Returns:
        A list with one inner list per sentence, containing
        ``str(token.lemma_)`` for every token in its original order.
    """
    return [
        [str(token.lemma_) for token in sentence]
        for sentence in spacy_tokens
    ]

# Karenina

In [3]:
# Read the Russian side of the corpus: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_ru.txt", "rt", encoding="utf-8") as corpus_file:
    ru_corpus_raw = list(map(str.rstrip, corpus_file))

# Preview the first five lines.
ru_corpus_raw[:5]

['Анна Каренина',
 'Толстой Лев Николаевич',
 'Мне отмщение, и аз воздам',
 'ЧАСТЬ ПЕРВАЯ',
 'Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему.']

In [4]:
# Read the English side of the corpus: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("./corpora/karenina/Books_en.txt", "rt", encoding="utf-8") as corpus_file:
    en_corpus_raw = list(map(str.rstrip, corpus_file))

# Preview the first five lines.
en_corpus_raw[:5]

['Anna Karenina',
 'Leo Tolstoy',
 'Vengeance is mine; I will repay.',
 'VOLUME ONE PART I',
 'ALL HAPPY FAMILIES resemble one another, but each unhappy family is unhappy in its own way.']

In [13]:
# Apply only the tokenizer component of each pipeline to every raw line.
en_tokens = list(map(en_tokenizer, en_corpus_raw))
ru_tokens = list(map(ru_tokenizer, ru_corpus_raw))

In [14]:
# Replace the tokenizer output with plain nested lists of token strings.
# The variable names are reused, so from here on *_tokens hold list[list[str]].
en_tokens = spacy_format_to_list_of_str(en_tokens)
ru_tokens = spacy_format_to_list_of_str(ru_tokens)

In [18]:
# Serialize the token lists as text: one sentence per line, tokens separated
# by single spaces. Both strings are built from the same index, so line i of
# one corresponds to line i of the other.
ru_orig_tokenized = en_orig_tokenized = ""

for i in tqdm(range(len(ru_tokens))):
    ru_orig_tokenized += " ".join(ru_tokens[i]) + "\n"
    en_orig_tokenized += " ".join(en_tokens[i]) + "\n"

100%|██████████| 17255/17255 [00:00<00:00, 189472.75it/s]


In [19]:
# Persist the tokenized Russian text.
# The encoding is explicit because the text is Cyrillic: without it open()
# uses the platform's locale encoding, which may be unable to encode these
# characters or may differ from the UTF-8 used when the corpus was read.
with open('corpora/karenina/ru_orig_tokenized.txt', 'wt', encoding='utf-8') as f:
    f.write(ru_orig_tokenized)

In [20]:
# Persist the tokenized English text.
# The encoding is explicit so the file is written as UTF-8 (matching how the
# corpus was read) regardless of the platform's locale encoding; the text may
# contain non-ASCII punctuation.
with open('corpora/karenina/en_orig_tokenized.txt', 'wt', encoding='utf-8') as f:
    f.write(en_orig_tokenized)

In [None]:
# Word-align every sentence pair (Russian as src, English as trg) and store
# each matching method's result as space-separated "x-y" pairs, one sentence
# per line. Presumably x/y are source/target token indices — confirm against
# the simalign documentation.
mwmf = itermax = inter = ""

for i in tqdm(range(len(ru_tokens))):
    src, trg = ru_tokens[i], en_tokens[i]
    alignments = myaligner.get_word_aligns(src, trg)
    mwmf += " ".join(f"{x}-{y}" for x, y in alignments["mwmf"]) + "\n"
    itermax += " ".join(f"{x}-{y}" for x, y in alignments["itermax"]) + "\n"
    inter += " ".join(f"{x}-{y}" for x, y in alignments["inter"]) + "\n"

In [10]:
# Persist the "mwmf" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/karenina/mwmf.txt', 'wt', encoding='utf-8') as f:
    f.write(mwmf)

In [11]:
# Persist the "itermax" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/karenina/itermax.txt', 'wt', encoding='utf-8') as f:
    f.write(itermax)

In [12]:
# Persist the "inter" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/karenina/inter.txt', 'wt', encoding='utf-8') as f:
    f.write(inter)

In [5]:
# NOTE(review): the same two pipelines are already loaded near the top of the
# notebook; this reload only matters when starting from this cell on a fresh
# kernel and otherwise just repeats the (slow) model load.
ru_nlp_lg = spacy.load("ru_core_news_lg")
en_nlp_lg = spacy.load("en_core_web_lg")

In [6]:
# Run the full Russian pipeline over every corpus line. nlp.pipe() streams the
# texts through the pipeline in batches, which spaCy recommends over calling
# nlp(text) once per string; the documents are yielded in input order.
ru_nlp_docs = list(ru_nlp_lg.pipe(ru_corpus_raw))

In [7]:
# Run the full English pipeline over every corpus line. nlp.pipe() streams the
# texts through the pipeline in batches, which spaCy recommends over calling
# nlp(text) once per string; the documents are yielded in input order.
en_nlp_docs = list(en_nlp_lg.pipe(en_corpus_raw))

In [8]:
# Spot check: lemma of the token at index 2 in the document at index 10.
ru_nlp_docs[10][2].lemma_

'день'

In [18]:
# Reduce the processed documents to nested lists of lemma strings.
en_lemmas = spacy_to_list_of_lemmas(en_nlp_docs)
ru_lemmas = spacy_to_list_of_lemmas(ru_nlp_docs)

# Serialize the lemmas as text: one sentence per line, lemmas separated by
# single spaces, with both languages built from the same index.
ru_orig_lemmas = en_orig_lemmas = ""

for i in tqdm(range(len(ru_lemmas))):
    ru_orig_lemmas += " ".join(ru_lemmas[i]) + "\n"
    en_orig_lemmas += " ".join(en_lemmas[i]) + "\n"

100%|██████████| 17255/17255 [00:00<00:00, 222723.66it/s]


In [19]:
# Persist the lemmatized text for both languages.
# The encoding is explicit because the Russian lemmas are Cyrillic: without it
# open() uses the platform's locale encoding, which may be unable to encode
# them or may differ from the UTF-8 used when the corpus was read.
with open('corpora/karenina/ru_orig_lemmas.txt', 'wt', encoding='utf-8') as f:
    f.write(ru_orig_lemmas)
with open('corpora/karenina/en_orig_lemmas.txt', 'wt', encoding='utf-8') as f:
    f.write(en_orig_lemmas)

# Woland

In [21]:
# NOTE(review): ru_corpus_raw / en_corpus_raw are not reassigned anywhere
# between the Karenina section and this cell, so as written this tokenizes
# the Karenina text again (the progress bars in this section also report the
# same 17255 lines). Load the Woland corpus into these variables first —
# TODO confirm the intended input files.
en_tokens = list(map(en_tokenizer, en_corpus_raw))
ru_tokens = list(map(ru_tokenizer, ru_corpus_raw))

In [22]:
# Replace the tokenizer output with plain nested lists of token strings.
# The variable names are reused, so from here on *_tokens hold list[list[str]].
en_tokens = spacy_format_to_list_of_str(en_tokens)
ru_tokens = spacy_format_to_list_of_str(ru_tokens)

In [23]:
# Serialize the token lists as text: one sentence per line, tokens separated
# by single spaces. Both strings are built from the same index, so line i of
# one corresponds to line i of the other.
ru_orig_tokenized = en_orig_tokenized = ""

for i in tqdm(range(len(ru_tokens))):
    ru_orig_tokenized += " ".join(ru_tokens[i]) + "\n"
    en_orig_tokenized += " ".join(en_tokens[i]) + "\n"

100%|██████████| 17255/17255 [00:00<00:00, 211888.73it/s]


In [24]:
# Persist the tokenized Russian text.
# The encoding is explicit because the text is Cyrillic: without it open()
# uses the platform's locale encoding, which may be unable to encode these
# characters or may differ from the UTF-8 used when the corpus was read.
with open('corpora/woland/ru_orig_tokenized.txt', 'wt', encoding='utf-8') as f:
    f.write(ru_orig_tokenized)

In [25]:
# Persist the tokenized English text.
# The encoding is explicit so the file is written as UTF-8 (matching how the
# corpus was read) regardless of the platform's locale encoding; the text may
# contain non-ASCII punctuation.
with open('corpora/woland/en_orig_tokenized.txt', 'wt', encoding='utf-8') as f:
    f.write(en_orig_tokenized)

In [15]:
# Word-align every sentence pair (Russian as src, English as trg) and store
# each matching method's result as space-separated "x-y" pairs, one sentence
# per line. Presumably x/y are source/target token indices — confirm against
# the simalign documentation.
# NOTE: slow — the recorded run below took about 4h44m for 17255 pairs.
mwmf = itermax = inter = ""

for i in tqdm(range(len(ru_tokens))):
    src, trg = ru_tokens[i], en_tokens[i]
    alignments = myaligner.get_word_aligns(src, trg)
    mwmf += " ".join(f"{x}-{y}" for x, y in alignments["mwmf"]) + "\n"
    itermax += " ".join(f"{x}-{y}" for x, y in alignments["itermax"]) + "\n"
    inter += " ".join(f"{x}-{y}" for x, y in alignments["inter"]) + "\n"

100%|██████████| 17255/17255 [4:43:58<00:00,  1.01it/s]   


In [16]:
# Persist the "mwmf" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/woland/mwmf.txt', 'wt', encoding='utf-8') as f:
    f.write(mwmf)

In [17]:
# Persist the "itermax" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/woland/itermax.txt', 'wt', encoding='utf-8') as f:
    f.write(itermax)

In [18]:
# Persist the "inter" alignments. The encoding is given explicitly so the
# output does not depend on the platform's locale encoding.
with open('corpora/woland/inter.txt', 'wt', encoding='utf-8') as f:
    f.write(inter)