In [110]:
import os
from pprint import pprint
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
from collections import Counter

In [111]:
from easynmt import EasyNMT
# Translation model: EasyNMT with its 'opus-mt' backend; `model.translate` is called later for ru -> en.
model = EasyNMT('opus-mt')

In [107]:
from simalign import SentenceAligner
# Word aligner on a BERT backbone working on BPE sub-tokens.
# NOTE(review): "mai" presumably enables the mwmf / inter / itermax matchers whose results are read later — confirm against the simalign docs.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-05-10 13:55:54,105 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [3]:
# Raw parallel subtitle files: one entry per line, surrounding whitespace stripped.
with open("subtitles_raw/en_raw_0-900.txt", "rt", encoding="utf-8") as f:
    en_all = list(map(str.strip, f))

with open("subtitles_raw/ru_raw_0-900.txt", "rt", encoding="utf-8") as f:
    ru_all = list(map(str.strip, f))

TypeError: object of type '_io.TextIOWrapper' has no len()

In [2]:
en_all[100000]

"I 'm not here to discuss Miss Swallow . I 'm looking for George . I 'll find him if ..."

In [77]:
# Read the serialised DocBin holding the Russian docs.
with open("corpora/subtitles/ru_subtitles_spacy_dump.bin", "rb") as f:
    restored_bytes_data = f.read()

# A blank "ru" pipeline is created only to provide the vocab needed for deserialisation.
nlp = spacy.blank("ru")
doc_bin = DocBin().from_bytes(restored_bytes_data)
# Materialise the generator into a list: the docs are indexed by line number later on.
ru_all_docs = list(doc_bin.get_docs(nlp.vocab))

In [113]:
# Translation file: one line per source line, trailing whitespace removed.
filename_translated_all = "corpora/subtitles/translations/opus10_whole.txt"
with open(filename_translated_all, "rt", encoding="utf-8") as f:
    translated_all = list(map(str.rstrip, f))
len(translated_all)

900000

In [88]:
def find_lines_with_word(original_word: str, docs) -> list:
    """Return the positions of the docs that contain ``original_word`` as a lemma.

    The comparison is case-insensitive on both sides: token lemmas are
    lower-cased and so is ``original_word``. (Before, only the lemmas were
    lower-cased, so a query with any upper-case letter could never match.)

    Args:
        original_word: Lemma to search for.
        docs: Iterable of tokenised sentences; every sentence must be iterable
            and yield tokens exposing a string attribute ``lemma_``.

    Returns:
        Positions (0-based, ascending) of the matching sentences in ``docs``;
        each position appears at most once.
    """
    target = original_word.lower()
    return [
        position
        for position, sentence in enumerate(docs)
        # any() stops at the first matching token, like the original `break`.
        if any(token.lemma_.lower() == target for token in sentence)
    ]

In [66]:
def most_common_lemmas(docs, n: int):
    """Return the ``n`` most frequent lemmas across ``docs``.

    Lemmas are lower-cased before counting; tokens flagged as stop words or
    punctuation are ignored. The result is a list of ``(lemma, count)`` pairs,
    most frequent first.
    """
    lemma_counts = Counter(
        token.lemma_.lower()
        for doc in docs
        for token in doc
        if not (token.is_stop or token.is_punct)
    )
    return lemma_counts.most_common(n)

def most_common_lemmas_tagged(docs, tag, n: int):
    """Return the ``n`` most frequent lemmas among tokens whose ``pos_`` equals ``tag``.

    Stop words and punctuation are skipped, lemmas are lower-cased before
    counting. The result is a list of ``(lemma, count)`` pairs, most frequent
    first.
    """
    lemma_counts = Counter(
        token.lemma_.lower()
        for doc in docs
        for token in doc
        if not (token.is_stop or token.is_punct) and token.pos_ == tag
    )
    return lemma_counts.most_common(n)

In [72]:
most_common_lemmas_tagged(ru_all_docs, "NOUN", 100)

[('человек', 10941),
 ('спасибо', 8292),
 ('дело', 8209),
 ('время', 7552),
 ('мистер', 6779),
 ('день', 6667),
 ('год', 5763),
 ('раз', 5616),
 ('ночь', 5060),
 ('жизнь', 4963),
 ('сэр', 4919),
 ('деньга', 4842),
 ('отец', 4505),
 ('дом', 4493),
 ('место', 4146),
 ('друг', 4141),
 ('женщина', 3996),
 ('ребёнок', 3993),
 ('вечер', 3844),
 ('правда', 3706),
 ('работа', 3619),
 ('доктор', 3430),
 ('рука', 3306),
 ('утро', 3297),
 ('мисс', 3284),
 ('жена', 3277),
 ('мама', 3205),
 ('что-то', 3181),
 ('привет', 3166),
 ('час', 3097),
 ('девушка', 3038),
 ('господин', 2969),
 ('порядок', 2842),
 ('парень', 2821),
 ('слово', 2716),
 ('минута', 2667),
 ('имя', 2481),
 ('машина', 2429),
 ('мир', 2420),
 ('вещь', 2398),
 ('мужчина', 2353),
 ('муж', 2342),
 ('город', 2237),
 ('конец', 2207),
 ('свидание', 2152),
 ('голова', 2124),
 ('папа', 2091),
 ('случай', 2091),
 ('миссис', 2091),
 ('вид', 2058),
 ('вопрос', 1963),
 ('глаз', 1928),
 ('мать', 1923),
 ('любовь', 1922),
 ('комната', 1874),
 ('д

In [87]:
# Lemmas of interest: the lines containing any of them are collected and translated in the following cells.
common = ['номер', 'смочь', 'получить', 'надеяться', 'вид', 'порядок', 'делать', 'дом', 'встреча', 'чёрт']

In [89]:
# Line numbers of the Russian docs containing each target lemma, lemma by lemma.
# A line that holds several target lemmas is listed once per lemma.
indexes = []
for original_word in common:
    indexes.extend(find_lines_with_word(original_word, ru_all_docs))
len(indexes)

30411

In [84]:
chunk_size = 300
latest_chunk = 0

# Translate the selected lines chunk by chunk; the whole translation file is
# rewritten after every chunk so finished work is kept on disk.
for chunk_start in range(latest_chunk, len(indexes), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(indexes))

    for i in tqdm(range(chunk_start, chunk_end)):
        line_no = indexes[i]
        # Only lines that have no translation yet are sent to the model.
        if translated_all[line_no] == "":
            sentence = ru_all[line_no]
            translation = model.translate(sentence, source_lang='ru', target_lang='en', beam_size=10, max_length=200)
            translated_all[line_no] = translation

    with open(filename_translated_all, 'w', encoding="utf-8") as f:
        f.writelines(line + '\n' for line in translated_all)

100%|██████████| 300/300 [05:45<00:00,  1.15s/it]
100%|██████████| 300/300 [05:45<00:00,  1.15s/it]
100%|██████████| 300/300 [04:23<00:00,  1.14it/s]
100%|██████████| 300/300 [05:11<00:00,  1.04s/it]
100%|██████████| 300/300 [04:03<00:00,  1.23it/s]
100%|██████████| 300/300 [05:38<00:00,  1.13s/it]
100%|██████████| 300/300 [06:06<00:00,  1.22s/it]
100%|██████████| 300/300 [06:33<00:00,  1.31s/it]
100%|██████████| 300/300 [04:12<00:00,  1.19it/s]
100%|██████████| 300/300 [04:17<00:00,  1.17it/s]
100%|██████████| 300/300 [04:13<00:00,  1.18it/s]
100%|██████████| 300/300 [06:41<00:00,  1.34s/it] 
100%|██████████| 300/300 [05:25<00:00,  1.08s/it]
100%|██████████| 300/300 [06:08<00:00,  1.23s/it]
100%|██████████| 300/300 [04:32<00:00,  1.10it/s]
100%|██████████| 300/300 [04:08<00:00,  1.20it/s]
100%|██████████| 300/300 [03:59<00:00,  1.25it/s]
100%|██████████| 300/300 [03:58<00:00,  1.26it/s]
100%|██████████| 300/300 [06:01<00:00,  1.20s/it]
100%|██████████| 300/300 [05:27<00:00,  1.09s/it]

выравнивание

дамп доков

In [86]:
# English spaCy pipeline ("en_core_web_lg") used below to parse the translated sentences.
en_nlp_lg = spacy.load("en_core_web_lg")

In [85]:
# Read the serialised DocBin holding the parsed English translations.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "rb") as file:
    en_translated_bytes_data = file.read()

# A blank "en" pipeline is created only to provide the vocab needed for deserialisation.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
# Materialise the generator into a list so the docs can be indexed by line number.
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))

KeyboardInterrupt: 

In [102]:
# One empty Doc shared as the placeholder for every line without a translation.
empty = en_nlp_lg("")

# Keep `docs` index-aligned with `translated_all`: one Doc per line.
docs = []
for sentence in tqdm(translated_all):
    docs.append(en_nlp_lg(sentence) if sentence != "" else empty)

# Pack all docs into a DocBin and store its byte representation on disk.
doc_bin = DocBin()
for doc in tqdm(docs):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

with open("corpora/subtitles/translations/opus10_spacy_final.bin", "wb") as file:
    file.write(bytes_data)

100%|██████████| 900000/900000 [22:07<00:00, 677.73it/s]  
100%|██████████| 900000/900000 [00:34<00:00, 26182.07it/s]


In [103]:
en_translated_docs = docs

In [None]:
# Re-read the translation file: one line per source line, trailing whitespace removed.
filename_translated_all = "corpora/subtitles/translations/opus10_whole.txt"
with open(filename_translated_all, "rt", encoding="utf-8") as f:
    translated_all = list(map(str.rstrip, f))
len(translated_all)

In [109]:
# Alignment files, one per matching method, all in the translations folder.
folder_name = "corpora/subtitles/translations"
fname_mwmf = f"{folder_name}/mwmf"
fname_itermax = f"{folder_name}/itermax"
fname_inter = f"{folder_name}/inter"

# Each file is loaded as a list of lines with trailing whitespace removed.
with open(fname_mwmf, "rt", encoding="utf-8") as f:
    mwmf = list(map(str.rstrip, f))
with open(fname_itermax, "rt", encoding="utf-8") as f:
    itermax = list(map(str.rstrip, f))
with open(fname_inter, "rt", encoding="utf-8") as f:
    inter = list(map(str.rstrip, f))

In [106]:
translated_all_docs = en_translated_docs

In [108]:
chunk_size = 20000
latest_chunk = 0  # first line to process; presumably raised by hand to resume a run
# Take the bound from the data instead of hard-coding the corpus size (900000).
total_lines = len(translated_all)

# Align every translated line with its Russian source, chunk by chunk, and
# rewrite the three alignment files after each chunk.
for chunk_start in range(latest_chunk, total_lines, chunk_size):
    # Clamp the final chunk: without it, a `latest_chunk` that is not a multiple
    # of `chunk_size` (or a corpus size that is not) indexes past the end.
    chunk_end = min(chunk_start + chunk_size, total_lines)

    for i in tqdm(range(chunk_start, chunk_end)):
        # Skip lines without a translation and lines that already have an alignment.
        if translated_all[i] == "" or mwmf[i] != "":
            continue
        ru_tokens = [token.text for token in ru_all_docs[i]]
        translated_tokens = [token.text for token in translated_all_docs[i]]
        alignments = myaligner.get_word_aligns(ru_tokens, translated_tokens)
        # Each alignment is stored as space-separated "src-trg" index pairs.
        mwmf[i] = " ".join(f"{x}-{y}" for x, y in alignments["mwmf"])
        itermax[i] = " ".join(f"{x}-{y}" for x, y in alignments["itermax"])
        inter[i] = " ".join(f"{x}-{y}" for x, y in alignments["inter"])

    # Checkpoint: write the three files in the same order as before.
    for fname, lines in ((fname_mwmf, mwmf), (fname_itermax, itermax), (fname_inter, inter)):
        with open(fname, 'w', encoding="utf-8") as f:
            f.writelines(line + '\n' for line in lines)

100%|██████████| 20000/20000 [05:24<00:00, 61.59it/s] 
100%|██████████| 20000/20000 [04:34<00:00, 72.84it/s] 
100%|██████████| 20000/20000 [03:41<00:00, 90.30it/s] 
100%|██████████| 20000/20000 [04:35<00:00, 72.49it/s] 
100%|██████████| 20000/20000 [04:31<00:00, 73.78it/s] 
100%|██████████| 20000/20000 [04:23<00:00, 75.87it/s] 
100%|██████████| 20000/20000 [04:37<00:00, 72.13it/s] 
100%|██████████| 20000/20000 [05:23<00:00, 61.83it/s] 
100%|██████████| 20000/20000 [04:28<00:00, 74.57it/s] 
100%|██████████| 20000/20000 [04:22<00:00, 76.09it/s] 
100%|██████████| 20000/20000 [05:05<00:00, 65.56it/s] 
100%|██████████| 20000/20000 [05:28<00:00, 60.92it/s] 
100%|██████████| 20000/20000 [04:53<00:00, 68.20it/s] 
100%|██████████| 20000/20000 [05:58<00:00, 55.77it/s]  
100%|██████████| 20000/20000 [04:24<00:00, 75.54it/s] 
100%|██████████| 20000/20000 [04:54<00:00, 67.83it/s] 
100%|██████████| 20000/20000 [04:49<00:00, 69.04it/s] 
100%|██████████| 20000/20000 [04:49<00:00, 69.00it/s] 
100%|████

 # ВОТ ТУТ СТОП

Merge the translations produced on two different laptops.

In [96]:
# Second translation file, loaded the same way: one entry per line, trailing whitespace removed.
filename_another_translated_all = "corpora/subtitles/translations/opus10_whole_nikitas.txt"
with open(filename_another_translated_all, "rt", encoding="utf-8") as f:
    another_translated_all = list(map(str.rstrip, f))
len(another_translated_all)

900000

In [19]:
# Paths of the second set of alignment files (presumably the copies from the other laptop).
# The " (2)" suffix is part of the actual file names.
another_folder_name = "tmp_alignment"
another_fname_mwmf = f"{another_folder_name}/mwmf (2)"
another_fname_itermax = f"{another_folder_name}/itermax (2)"
another_fname_inter = f"{another_folder_name}/inter (2)"

In [20]:
# Load the second set of alignment files as lists of lines, trailing whitespace removed.
with open(another_fname_mwmf, "rt", encoding="utf-8") as f:
    another_mwmf = list(map(str.rstrip, f))
with open(another_fname_itermax, "rt", encoding="utf-8") as f:
    another_itermax = list(map(str.rstrip, f))
with open(another_fname_inter, "rt", encoding="utf-8") as f:
    another_inter = list(map(str.rstrip, f))

In [7]:
len(translated_all)

900000

In [98]:
# Fill the gaps: where our line is empty and the other file has a translation, take it.
for i, current in enumerate(translated_all):
    if current == "" and another_translated_all[i] != "":
        translated_all[i] = another_translated_all[i]

In [45]:
# Consistency check between the parsed docs and the mwmf alignments:
#   lost_translation - an alignment exists but the parsed doc is empty
#   lost_alignment   - a parsed doc exists but the alignment is empty
lost_translation = []
lost_alignment = []
for i in range(len(translated_all)):
    has_doc = len(docs[i]) != 0
    has_alignment = len(mwmf[i]) != 0
    if has_alignment and not has_doc:
        lost_translation.append(i)
    elif has_doc and not has_alignment:
        lost_alignment.append(i)

In [46]:
lost_alignment

[100146,
 100187,
 100189,
 100195,
 100199,
 100251,
 100337,
 100422,
 100665,
 100667,
 100694,
 100708,
 100749,
 100753,
 100789,
 100816,
 100975,
 101002,
 101040,
 101099,
 101147,
 101224,
 101336,
 101488,
 101551,
 101555,
 101709,
 101710,
 101926,
 101927,
 102035,
 102036,
 102083,
 102157,
 102169,
 102173,
 102249,
 102261,
 102300,
 102450,
 102533,
 102581,
 102664,
 102670,
 102684,
 102703,
 102705,
 102736,
 102756,
 102761,
 102795,
 102808,
 102845,
 102886,
 102925,
 102989,
 103034,
 103066,
 103221,
 103264,
 103279,
 103296,
 103355,
 103393,
 103747,
 103756,
 103766,
 103947,
 103953,
 103969,
 103971,
 104012,
 104033,
 104039,
 104158,
 104226,
 104354,
 104361,
 104434,
 104436,
 104452,
 104486,
 104489,
 104509,
 104689,
 104690,
 104691,
 104726,
 104811,
 104820,
 104834,
 105002,
 105004,
 105019,
 105025,
 105037,
 105075,
 105110,
 105127,
 105131,
 105195,
 105217,
 105228,
 105229,
 105238,
 105274,
 105290,
 105302,
 105327,
 105344,
 105420,
 

In [13]:
another_translated_all[75]

''

In [12]:
translated_all[75]

''

In [99]:
# Write the merged translations back, one per line.
with open(filename_translated_all, 'w', encoding="utf-8") as f:
    f.writelines(line + '\n' for line in translated_all)

In [38]:
# Where our mwmf alignment is missing and the other set has one, copy all three
# alignment variants for that line from the other set.
for i, own_alignment in enumerate(mwmf):
    if own_alignment == "" and another_mwmf[i] != "":
        mwmf[i] = another_mwmf[i]
        itermax[i] = another_itermax[i]
        inter[i] = another_inter[i]

In [39]:
# Write the three alignment lists back to their files, one entry per line.
for fname, lines in ((fname_mwmf, mwmf), (fname_itermax, itermax), (fname_inter, inter)):
    with open(fname, 'w', encoding="utf-8") as f:
        f.writelines(line + '\n' for line in lines)

синк доков

In [1]:
# Read the serialised DocBin holding the parsed English translations.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "rb") as file:
    en_translated_bytes_data = file.read()

# A blank "en" pipeline is created only to provide the vocab needed for deserialisation.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
# Materialise the generator into a list so the docs can be indexed by line number.
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))
len(en_translated_docs)

NameError: name 'spacy' is not defined

In [77]:
# Read a second serialised DocBin of parsed English translations from a different folder.
with open("translations_after_req_words/translations/opus10_spacy_check2.bin", "rb") as file:
    another_en_translated_bytes_data = file.read()

# A blank "en" pipeline is created only to provide the vocab needed for deserialisation.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(another_en_translated_bytes_data)
# Materialise the generator into a list so the docs can be indexed by line number.
another_en_translated_docs = list(doc_bin.get_docs(nlp.vocab))
len(another_en_translated_docs)

900000

In [9]:
len(en_translated_docs[820])

7

In [11]:
# English spaCy pipeline ("en_core_web_lg") used below to parse translations that have no parsed doc yet.
en_nlp_lg = spacy.load("en_core_web_lg")

In [13]:
len(en_translated_docs)

900000

In [59]:
len(translated_all)

900000

In [79]:
# One empty Doc shared as the placeholder for every line without a translation.
empty = en_nlp_lg("")
docs = []

# Rebuild `docs` so it is index-aligned with `translated_all`.
# tqdm wraps the list itself rather than the enumerate() generator: a generator
# has no length, so tqdm could only show a bare counter without total or ETA.
for i, sentence in enumerate(tqdm(translated_all)):
    if sentence == "":
        docs.append(empty)
    elif len(en_translated_docs[i]) != 0:
        # A non-empty parsed doc already exists for this line: reuse it.
        docs.append(en_translated_docs[i])
    else:
        # Translation present but no parsed doc yet: parse it now.
        spacy_doc = en_nlp_lg(sentence)
        docs.append(spacy_doc)

# Pack all docs into a DocBin and store its byte representation on disk.
doc_bin = DocBin()
for doc in tqdm(docs):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

with open("corpora/subtitles/translations/opus10_spacy_final.bin", "wb") as file:
    file.write(bytes_data)

900000it [00:00, 2368423.31it/s]
100%|██████████| 900000/900000 [00:28<00:00, 31274.75it/s]


In [80]:
len(docs)

900000

In [47]:
translated_all_docs = docs

In [44]:
len(translated_all_docs)

900000

In [53]:
chunk_size = 10000
latest_chunk = 100000  # first line to process in this run
# Process lines up to (not including) 500000, but never beyond the data itself.
stop_line = min(500000, len(translated_all))

# Align every translated line in [latest_chunk, stop_line) with its Russian
# source, chunk by chunk, and rewrite the three alignment files after each chunk.
for chunk_start in range(latest_chunk, stop_line, chunk_size):
    # Clamp the final chunk: without it, a `latest_chunk` that is not a multiple
    # of `chunk_size` makes the last chunk run past the intended stop line.
    chunk_end = min(chunk_start + chunk_size, stop_line)

    for i in tqdm(range(chunk_start, chunk_end)):
        # Skip lines without a translation and lines that already have an alignment.
        if translated_all[i] == "" or mwmf[i] != "":
            continue
        ru_tokens = [token.text for token in ru_all_docs[i]]
        translated_tokens = [token.text for token in translated_all_docs[i]]
        alignments = myaligner.get_word_aligns(ru_tokens, translated_tokens)
        # Each alignment is stored as space-separated "src-trg" index pairs.
        mwmf[i] = " ".join(f"{x}-{y}" for x, y in alignments["mwmf"])
        itermax[i] = " ".join(f"{x}-{y}" for x, y in alignments["itermax"])
        inter[i] = " ".join(f"{x}-{y}" for x, y in alignments["inter"])

    # Checkpoint: write the three files in the same order as before.
    for fname, lines in ((fname_mwmf, mwmf), (fname_itermax, itermax), (fname_inter, inter)):
        with open(fname, 'w', encoding="utf-8") as f:
            f.writelines(line + '\n' for line in lines)

100%|██████████| 10000/10000 [00:38<00:00, 262.44it/s]
100%|██████████| 10000/10000 [00:37<00:00, 266.13it/s]
100%|██████████| 10000/10000 [00:49<00:00, 203.71it/s]
100%|██████████| 10000/10000 [00:46<00:00, 214.62it/s]
100%|██████████| 10000/10000 [00:46<00:00, 213.69it/s]
100%|██████████| 10000/10000 [00:33<00:00, 294.90it/s]
100%|██████████| 10000/10000 [01:00<00:00, 164.22it/s]
100%|██████████| 10000/10000 [00:35<00:00, 281.37it/s]
100%|██████████| 10000/10000 [00:46<00:00, 214.23it/s]
100%|██████████| 10000/10000 [00:46<00:00, 217.07it/s]
100%|██████████| 10000/10000 [00:39<00:00, 252.42it/s]
100%|██████████| 10000/10000 [00:52<00:00, 192.30it/s]
100%|██████████| 10000/10000 [00:34<00:00, 291.96it/s]
100%|██████████| 10000/10000 [01:09<00:00, 143.82it/s]
100%|██████████| 10000/10000 [00:40<00:00, 245.35it/s]
100%|██████████| 10000/10000 [00:43<00:00, 231.53it/s]
100%|██████████| 10000/10000 [00:55<00:00, 178.66it/s]
100%|██████████| 10000/10000 [01:08<00:00, 146.08it/s]
100%|█████