In [22]:
import os
from pprint import pprint
from tqdm import tqdm
import  spacy
from spacy.tokens import DocBin
from collections import Counter

In [2]:
from easynmt import EasyNMT
# Load the 'opus-mt' EasyNMT model once; the translation loop further down calls
# model.translate(...) with source_lang='ru' and target_lang='en'.
model = EasyNMT('opus-mt')

In [4]:
# Raw subtitle lines, one entry per line of the file, with surrounding
# whitespace removed.
with open("subtitles_raw/en_raw_0-900.txt", "rt", encoding="utf-8") as f:
    en_all = list(map(str.strip, f))

with open("subtitles_raw/ru_raw_0-900.txt", "rt", encoding="utf-8") as f:
    ru_all = list(map(str.strip, f))

# Serialized spaCy DocBin for the Russian subtitles.
# NOTE(review): the rest of the notebook indexes ru_all and ru_all_docs with the
# same line numbers, so they are presumably aligned one-to-one — TODO confirm.
with open("corpora/subtitles/ru_subtitles_spacy_dump.bin", "rb") as f:
    restored_bytes_data = f.read()

# A blank Russian pipeline supplies the vocab needed to rebuild the Doc objects.
nlp = spacy.blank("ru")
doc_bin = DocBin().from_bytes(restored_bytes_data)
ru_all_docs = list(doc_bin.get_docs(nlp.vocab))

In [5]:
def most_common_lemmas(docs, n: int):
    """Return the ``n`` most frequent lower-cased lemmas across ``docs``.

    Tokens flagged as stop words or punctuation are not counted.

    Args:
        docs: iterable of token sequences; each token must expose
            ``is_stop``, ``is_punct`` and ``lemma_``.
        n: how many entries to return.

    Returns:
        A list of ``(lemma, count)`` pairs, most frequent first.
    """
    lemma_counts = Counter(
        token.lemma_.lower()
        for doc in docs
        for token in doc
        if not (token.is_stop or token.is_punct)
    )
    return lemma_counts.most_common(n)

def most_common_lemmas_tagged(docs, tag, n: int):
    """Return the ``n`` most frequent lower-cased lemmas whose ``pos_`` equals ``tag``.

    Stop words and punctuation are skipped before the part-of-speech check.

    Args:
        docs: iterable of token sequences; each token must expose
            ``is_stop``, ``is_punct``, ``pos_`` and ``lemma_``.
        tag: part-of-speech value compared against ``token.pos_``.
        n: how many entries to return.

    Returns:
        A list of ``(lemma, count)`` pairs, most frequent first.
    """
    def _counts_towards_tag(token) -> bool:
        # Same evaluation order as a stop/punct filter followed by the tag test.
        if token.is_stop or token.is_punct:
            return False
        return token.pos_ == tag

    tagged_counts = Counter()
    for doc in docs:
        tagged_counts.update(
            token.lemma_.lower() for token in doc if _counts_towards_tag(token)
        )
    return tagged_counts.most_common(n)

In [24]:
# NOTE(review): despite its name, `common_nouns` holds the 100 most frequent
# VERB lemmas (the tag passed is "VERB") as (lemma, count) pairs.
common_nouns = most_common_lemmas_tagged(ru_all_docs, "VERB", 100)
common_nouns

[('знать', 27172),
 ('сказать', 18957),
 ('говорить', 14248),
 ('думать', 13659),
 ('видеть', 10555),
 ('сделать', 9162),
 ('идти', 8690),
 ('делать', 8078),
 ('пойти', 7671),
 ('хотеть', 6727),
 ('любить', 5621),
 ('дать', 5565),
 ('найти', 5433),
 ('давать', 5338),
 ('прийти', 5308),
 ('понимать', 5112),
 ('взять', 4637),
 ('вернуться', 4525),
 ('помочь', 4211),
 ('ждать', 3986),
 ('слышать', 3838),
 ('просить', 3823),
 ('посмотреть', 3755),
 ('смочь', 3671),
 ('жить', 3645),
 ('убить', 3641),
 ('случиться', 3449),
 ('понять', 3438),
 ('остаться', 3391),
 ('смотреть', 3363),
 ('увидеть', 3361),
 ('бояться', 3324),
 ('нравиться', 3184),
 ('уйти', 3180),
 ('простить', 3103),
 ('работать', 3029),
 ('оставить', 2975),
 ('стоить', 2938),
 ('забыть', 2859),
 ('получить', 2798),
 ('помнить', 2677),
 ('выйти', 2674),
 ('собираться', 2665),
 ('слушать', 2590),
 ('уходить', 2467),
 ('спать', 2407),
 ('умереть', 2398),
 ('рассказать', 2391),
 ('надеяться', 2366),
 ('поговорить', 2346),
 ('подум

In [25]:
# Keep only the lemma of each (lemma, count) pair and skip the first nine entries.
# NOTE(review): presumably the nine most frequent lemmas were handled in an
# earlier run — TODO confirm why the slice starts at 9.
common = [pair[0] for pair in common_nouns][9:]
common

['хотеть',
 'любить',
 'дать',
 'найти',
 'давать',
 'прийти',
 'понимать',
 'взять',
 'вернуться',
 'помочь',
 'ждать',
 'слышать',
 'просить',
 'посмотреть',
 'смочь',
 'жить',
 'убить',
 'случиться',
 'понять',
 'остаться',
 'смотреть',
 'увидеть',
 'бояться',
 'нравиться',
 'уйти',
 'простить',
 'работать',
 'оставить',
 'стоить',
 'забыть',
 'получить',
 'помнить',
 'выйти',
 'собираться',
 'слушать',
 'уходить',
 'спать',
 'умереть',
 'рассказать',
 'надеяться',
 'поговорить',
 'подумать',
 'иметь',
 'позволить',
 'узнать',
 'подождать',
 'верить',
 'решить',
 'играть',
 'звать',
 'поехать',
 'чувствовать',
 'пытаться',
 'прийтись',
 'приехать',
 'искать',
 'извинить',
 'считать',
 'уехать',
 'принести',
 'хватить',
 'здравствовать',
 'выглядеть',
 'показать',
 'ехать',
 'купить',
 'пройти',
 'начать',
 'оставаться',
 'послушать',
 'стать',
 'ходить',
 'вести',
 'приходить',
 'волноваться',
 'выпить',
 'принять',
 'спросить',
 'написать',
 'сидеть',
 'потерять',
 'произойти',
 'в

In [26]:
def find_lines_with_word(original_word: str, docs) -> list:
    """Return the positions of the docs that contain ``original_word`` as a lemma.

    Each token's lemma is lower-cased before the comparison; ``original_word``
    is compared as given. A doc is reported at most once, however many of its
    tokens match.

    Args:
        original_word: lemma to look for.
        docs: iterable of token sequences whose tokens expose ``lemma_``.

    Returns:
        Zero-based positions within ``docs``, in ascending order.
    """
    return [
        position
        for position, sentence in enumerate(docs)
        if any(token.lemma_.lower() == original_word for token in sentence)
    ]

In [27]:
# Collect, word by word, the line numbers of every Russian sentence that
# contains one of the selected lemmas. A line containing several of the words
# is listed once per word, so `indexes` may contain duplicates.
indexes = []
for original_word in common:
    indexes.extend(find_lines_with_word(original_word, ru_all_docs))
len(indexes)

250775

In [6]:
# Persist the selected line numbers, one per line, in a per-word folder.
# NOTE(review): `original_word` is not assigned in this cell. In this notebook
# the only assignment is the loop variable of the cell that builds `indexes`,
# so the folder is named after the last word of `common` — confirm that this
# is the intended folder name.
folder_name = f"corpora/subtitles/translations/{original_word}"
# makedirs(exist_ok=True) keeps the cell re-runnable and also creates missing
# parent directories; os.mkdir raised in both situations.
os.makedirs(folder_name, exist_ok=True)
with open(f"{folder_name}/indexes", "wt", encoding="utf-8") as f:
    f.writelines(f"{number}\n" for number in indexes)

In [28]:
# Translation cache: one entry per line of the file; an empty string marks a
# sentence that has not been translated yet.
filename_translated_all = "corpora/subtitles/translations/opus10_whole.txt"
with open(filename_translated_all, "rt", encoding="utf-8") as f:
    translated_all = [line.rstrip() for line in f]
len(translated_all)

900000

In [30]:
# Translate the selected Russian sentences in chunks, checkpointing the whole
# translation cache to disk after every chunk so an interrupted run can resume.
chunk_size = 500
latest_chunk = 0  # position in `indexes` to start from; raise it to resume a run

for chunk_start in range(latest_chunk, len(indexes), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(indexes))

    for i in tqdm(range(chunk_start, chunk_end)):
        line_no = indexes[i]
        # Already translated (earlier run, or a duplicate entry in `indexes`).
        if translated_all[line_no] != "":
            continue
        sentence = ru_all[line_no]
        translation = model.translate(sentence, source_lang='ru', target_lang='en', beam_size=10, max_length=200)
        # The cache file stores exactly one translation per line, so a line break
        # inside a translation would shift every following entry on reload.
        translated_all[line_no] = " ".join(translation.splitlines())

    # Write the checkpoint to a temporary file and swap it in with os.replace,
    # so an interrupt in the middle of the write cannot truncate the only copy
    # of the cache.
    tmp_filename = f"{filename_translated_all}.tmp"
    with open(tmp_filename, 'w', encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in translated_all)
    os.replace(tmp_filename, filename_translated_all)

100%|██████████| 500/500 [10:14<00:00,  1.23s/it]
100%|██████████| 500/500 [08:19<00:00,  1.00it/s]
100%|██████████| 500/500 [08:16<00:00,  1.01it/s]
100%|██████████| 500/500 [06:54<00:00,  1.21it/s]
100%|██████████| 500/500 [08:35<00:00,  1.03s/it]
100%|██████████| 500/500 [07:30<00:00,  1.11it/s]
100%|██████████| 500/500 [08:46<00:00,  1.05s/it]
100%|██████████| 500/500 [07:42<00:00,  1.08it/s]
100%|██████████| 500/500 [07:24<00:00,  1.12it/s]
100%|██████████| 500/500 [07:43<00:00,  1.08it/s]
100%|██████████| 500/500 [07:00<00:00,  1.19it/s]
100%|██████████| 500/500 [08:04<00:00,  1.03it/s]
100%|██████████| 500/500 [07:20<00:00,  1.13it/s]
100%|██████████| 500/500 [10:00<00:00,  1.20s/it]
100%|██████████| 500/500 [09:29<00:00,  1.14s/it]
100%|██████████| 500/500 [09:08<00:00,  1.10s/it]
100%|██████████| 500/500 [07:41<00:00,  1.08it/s]
100%|██████████| 500/500 [07:41<00:00,  1.08it/s]
100%|██████████| 500/500 [07:50<00:00,  1.06it/s]
100%|██████████| 500/500 [07:23<00:00,  1.13it/s]


KeyboardInterrupt: 

In [51]:
# Inspect the first 20 entries of the translation cache.
translated_all[:20]

['ab',
 '',
 '',
 '',
 '',
 '10',
 '',
 '',
 '',
 'ff',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [53]:
# Save the translation cache back to disk, one entry per line.
with open(filename_translated_all, 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in translated_all)

Word alignment (выравнивание)

Dump of the spaCy docs (дамп доков)

In [11]:
# Load the en_core_web_lg spaCy pipeline; it is applied below to every
# non-empty entry of translated_all.
en_nlp_lg = spacy.load("en_core_web_lg")

In [12]:
# Sanity check: number of entries currently held in the translation cache.
len(translated_all)

900000

In [13]:
# One shared Doc built from the empty string stands in for every entry that
# has no translation, so docs[i] always corresponds to translated_all[i].
empty = en_nlp_lg("")

docs = []

for sentence in tqdm(translated_all):
    docs.append(empty if sentence == "" else en_nlp_lg(sentence))

# Pack all Docs into a single DocBin and serialize it to bytes.
doc_bin = DocBin()
for doc in tqdm(docs):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

with open("corpora/subtitles/translations/opus10_spacy.bin", "wb") as file:
    file.write(bytes_data)

100%|██████████| 900000/900000 [04:24<00:00, 3396.59it/s]
100%|██████████| 900000/900000 [00:20<00:00, 43743.95it/s]


In [38]:
# Locations of the three word-alignment files, one per alignment method.
# NOTE(review): this rebinds `folder_name`, which the cell that writes the
# `indexes` file sets to a per-word sub-folder.
folder_name = "corpora/subtitles/translations"
fname_mwmf = f"{folder_name}/mwmf"
fname_itermax = f"{folder_name}/itermax"
fname_inter = f"{folder_name}/inter"

In [39]:
# Load previously saved alignments; each list gets one entry per line of its
# file, with trailing whitespace removed.
with open(fname_mwmf, "rt", encoding="utf-8") as f:
    mwmf = [line.rstrip() for line in f]
with open(fname_itermax, "rt", encoding="utf-8") as f:
    itermax = [line.rstrip() for line in f]
with open(fname_inter, "rt", encoding="utf-8") as f:
    inter = [line.rstrip() for line in f]

In [27]:
# Start from blank alignments: 900000 empty strings per method.
# NOTE(review): when the notebook is run top to bottom this replaces the lists
# just loaded from disk by the preceding cell — run it only for a fresh start.
mwmf = [""] * 900000
itermax = [""] * 900000
inter = [""] * 900000

In [17]:
# Expose the parsed English translations under the name the alignment loop reads.
# NOTE(review): `docs` is only defined once the cell that parses translated_all
# has been run in this session.
translated_all_docs = docs

In [35]:
from simalign import SentenceAligner
# Build the aligner once; the alignment loop below reads the "mwmf", "itermax"
# and "inter" entries from the result of myaligner.get_word_aligns(...).
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-05-04 19:19:58,710 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [21]:
# Word-align every translated sentence with its Russian source, in chunks,
# checkpointing the three alignment files after each chunk.
chunk_size = 20000
latest_chunk = 0  # first line number to process; raise it to resume a run
n_lines = len(translated_all)  # replaces the hard-coded 900000

for chunk_start in range(latest_chunk, n_lines, chunk_size):
    # Clamp the last chunk so a line count that is not a multiple of
    # chunk_size cannot index past the end of the lists.
    chunk_end = min(chunk_start + chunk_size, n_lines)

    for i in tqdm(range(chunk_start, chunk_end)):
        # Nothing to align until the sentence has a translation.
        if translated_all[i] == "":
            continue
        # A non-empty mwmf entry marks the line as already aligned.
        if mwmf[i] != "":
            continue
        ru_tokens = [token.text for token in ru_all_docs[i]]
        translated_tokens = [token.text for token in translated_all_docs[i]]
        alignments = myaligner.get_word_aligns(ru_tokens, translated_tokens)
        # Each alignment is stored as space-separated "source-target" index pairs.
        mwmf[i] = " ".join([f"{x}-{y}" for x, y in alignments["mwmf"]])
        itermax[i] = " ".join([f"{x}-{y}" for x, y in alignments["itermax"]])
        inter[i] = " ".join([f"{x}-{y}" for x, y in alignments["inter"]])

    # Each file goes to a temporary name first and is swapped in with
    # os.replace, so an interrupt cannot leave a truncated file behind.
    # mwmf is written last because it is the "already aligned" marker checked
    # above: a line marked done on disk then also has its itermax/inter entries.
    for path, lines in (
        (fname_itermax, itermax),
        (fname_inter, inter),
        (fname_mwmf, mwmf),
    ):
        tmp_path = f"{path}.tmp"
        with open(tmp_path, 'w', encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in lines)
        os.replace(tmp_path, path)

100%|██████████| 20000/20000 [01:51<00:00, 180.08it/s]
100%|██████████| 20000/20000 [01:41<00:00, 196.78it/s]
100%|██████████| 20000/20000 [01:30<00:00, 221.43it/s]
100%|██████████| 20000/20000 [01:43<00:00, 193.37it/s] 
100%|██████████| 20000/20000 [01:29<00:00, 223.95it/s] 
100%|██████████| 20000/20000 [01:23<00:00, 239.34it/s]
100%|██████████| 20000/20000 [01:29<00:00, 222.64it/s]
100%|██████████| 20000/20000 [01:29<00:00, 222.68it/s] 
100%|██████████| 20000/20000 [01:23<00:00, 239.95it/s] 
100%|██████████| 20000/20000 [01:17<00:00, 259.64it/s] 
100%|██████████| 20000/20000 [01:41<00:00, 197.82it/s]
100%|██████████| 20000/20000 [02:09<00:00, 154.52it/s]
100%|██████████| 20000/20000 [01:52<00:00, 178.41it/s]
100%|██████████| 20000/20000 [02:20<00:00, 142.16it/s] 
100%|██████████| 20000/20000 [01:39<00:00, 200.73it/s]
100%|██████████| 20000/20000 [01:53<00:00, 176.27it/s]
100%|██████████| 20000/20000 [02:01<00:00, 164.04it/s]
100%|██████████| 20000/20000 [02:00<00:00, 166.02it/s]
100%

merge two translations from different laptops

In [31]:
# Load the translation cache produced on the other machine.
filename_another_translated_all = "translations_to_merge/translations/opus10_whole.txt"
# Fix: open the *other* file. The original opened filename_translated_all, so
# the merge below compared the local cache with a copy of itself.
with open(filename_another_translated_all, "rt", encoding="utf-8") as f:
    another_translated_all = [line.rstrip() for line in f]
len(another_translated_all)

900000

In [36]:
# Locations of the alignment files produced on the other machine.
another_folder_name = "translations_to_merge/translations"
# Fix: build the paths from another_folder_name. The original used
# folder_name, which made these point at the local alignment files.
another_fname_mwmf = f"{another_folder_name}/mwmf"
another_fname_itermax = f"{another_folder_name}/itermax"
another_fname_inter = f"{another_folder_name}/inter"

In [40]:
# Load the other machine's alignments; each list gets one entry per line of
# its file, with trailing whitespace removed.
with open(another_fname_mwmf, "rt", encoding="utf-8") as f:
    another_mwmf = [line.rstrip() for line in f]
with open(another_fname_itermax, "rt", encoding="utf-8") as f:
    another_itermax = [line.rstrip() for line in f]
with open(another_fname_inter, "rt", encoding="utf-8") as f:
    another_inter = [line.rstrip() for line in f]

In [32]:
# Fill the gaps in the local cache with translations from the other machine;
# entries that already have a local translation are left untouched.
# Both caches are indexed by line number, so check the lengths up front: a
# mismatch would otherwise surface as an IndexError part-way through the merge
# or as silently ignored extra lines.
if len(another_translated_all) != len(translated_all):
    raise ValueError(
        f"cannot merge translations: {len(translated_all)} local lines "
        f"vs {len(another_translated_all)} in the other file"
    )

for i, other_line in enumerate(another_translated_all):
    if translated_all[i] == "" and other_line != "":
        translated_all[i] = other_line

In [33]:
# Save the merged translation cache, one entry per line.
with open(filename_translated_all, 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in translated_all)

In [41]:
# Fill the gaps in the local alignments with those from the other machine.
# An empty mwmf entry marks a line as not yet aligned locally; the three
# methods are copied together so they stay consistent for that line.
# All six lists are indexed by the same line number, so check the lengths up
# front instead of failing with an IndexError part-way through the merge.
alignment_lengths = {
    len(mwmf), len(itermax), len(inter),
    len(another_mwmf), len(another_itermax), len(another_inter),
}
if len(alignment_lengths) != 1:
    raise ValueError(
        f"cannot merge alignments: line counts differ {sorted(alignment_lengths)}"
    )

for i, other_mwmf in enumerate(another_mwmf):
    if mwmf[i] == "" and other_mwmf != "":
        mwmf[i] = other_mwmf
        itermax[i] = another_itermax[i]
        inter[i] = another_inter[i]

In [None]:
# Save the merged alignments, one entry per line, in the same order as before:
# mwmf, then itermax, then inter.
with open(fname_mwmf, 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in mwmf)

with open(fname_itermax, 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in itermax)

with open(fname_inter, 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in inter)