In [11]:
import os
from pprint import pprint
from tqdm import tqdm
import  spacy
from spacy.tokens import DocBin
from collections import Counter

In [45]:
from easynmt import EasyNMT
model = EasyNMT('opus-mt')

In [4]:
# Raw English subtitle lines, surrounding whitespace stripped from each one.
with open("subtitles_raw/en_raw_0-900.txt", "rt", encoding="utf-8") as en_file:
    en_all = list(map(str.strip, en_file))

# Raw Russian subtitle lines, read the same way.
with open("subtitles_raw/ru_raw_0-900.txt", "rt", encoding="utf-8") as ru_file:
    ru_all = list(map(str.strip, ru_file))

In [3]:
# Read the serialized spaCy DocBin holding the Russian subtitle docs.
with open("corpora/subtitles/ru_subtitles_spacy_dump.bin", "rb") as f:
    restored_bytes_data = f.read()

# The blank "ru" pipeline is only used for its vocab when the docs are rebuilt.
nlp = spacy.blank("ru")
doc_bin = DocBin().from_bytes(restored_bytes_data)
# Materialize every doc into a list so it can be indexed by line number later.
ru_all_docs = list(doc_bin.get_docs(nlp.vocab))

In [19]:
def most_common_lemmas(docs, n: int):
    """Return the `n` most frequent lowercased lemmas found in `docs`.

    Tokens flagged as stop words or punctuation are ignored.

    Args:
        docs: iterable of token sequences; each token exposes `is_stop`,
            `is_punct` and `lemma_`.
        n: how many entries to return.

    Returns:
        A list of `(lemma, count)` pairs as produced by `Counter.most_common`.
    """
    lemma_counts = Counter(
        token.lemma_.lower()
        for doc in docs
        for token in doc
        if not (token.is_stop or token.is_punct)
    )
    return lemma_counts.most_common(n)

def most_common_lemmas_tagged(docs, tag, n: int):
    """Return the `n` most frequent lowercased lemmas whose `pos_` equals `tag`.

    Tokens flagged as stop words or punctuation are ignored.

    Args:
        docs: iterable of token sequences; each token exposes `is_stop`,
            `is_punct`, `pos_` and `lemma_`.
        tag: part-of-speech label compared against `token.pos_`.
        n: how many entries to return.

    Returns:
        A list of `(lemma, count)` pairs as produced by `Counter.most_common`.
    """
    lemma_counts = Counter(
        token.lemma_.lower()
        for doc in docs
        for token in doc
        if not (token.is_stop or token.is_punct) and token.pos_ == tag
    )
    return lemma_counts.most_common(n)

In [43]:
common_nouns = most_common_lemmas_tagged(ru_all_docs, "NOUN", 100)
common_nouns

[('человек', 10941),
 ('спасибо', 8292),
 ('дело', 8209),
 ('время', 7552),
 ('мистер', 6779),
 ('день', 6667),
 ('год', 5763),
 ('раз', 5616),
 ('ночь', 5060),
 ('жизнь', 4963),
 ('сэр', 4919),
 ('деньга', 4842),
 ('отец', 4505),
 ('дом', 4493),
 ('место', 4146),
 ('друг', 4141),
 ('женщина', 3996),
 ('ребёнок', 3993),
 ('вечер', 3844),
 ('правда', 3706),
 ('работа', 3619),
 ('доктор', 3430),
 ('рука', 3306),
 ('утро', 3297),
 ('мисс', 3284),
 ('жена', 3277),
 ('мама', 3205),
 ('что-то', 3181),
 ('привет', 3166),
 ('час', 3097),
 ('девушка', 3038),
 ('господин', 2969),
 ('порядок', 2842),
 ('парень', 2821),
 ('слово', 2716),
 ('минута', 2667),
 ('имя', 2481),
 ('машина', 2429),
 ('мир', 2420),
 ('вещь', 2398),
 ('мужчина', 2353),
 ('муж', 2342),
 ('город', 2237),
 ('конец', 2207),
 ('свидание', 2152),
 ('голова', 2124),
 ('папа', 2091),
 ('случай', 2091),
 ('миссис', 2091),
 ('вид', 2058),
 ('вопрос', 1963),
 ('глаз', 1928),
 ('мать', 1923),
 ('любовь', 1922),
 ('комната', 1874),
 ('д

In [48]:
def find_lines_with_word(original_word: str, docs) -> list:
    """Return the positions in `docs` of every sentence containing `original_word`.

    A sentence matches when at least one of its tokens has a lowercased lemma
    equal to `original_word`; `original_word` itself is compared as given.
    Each matching sentence is reported once, in ascending order.
    """
    return [
        line_no
        for line_no, sentence in enumerate(docs)
        if any(token.lemma_.lower() == original_word for token in sentence)
    ]

In [49]:
common = ['милый', 'отличный', 'ужасный', 'красивый', 'полный', 'маленький', 'странный', 'старый', 'новый', 'нравиться', 'бояться', 'просить', 'позволить', 'считать', 'решить', 'рука', 'история', 'путь', 'место', 'дело', 'случай']

In [50]:
# Concatenate, word by word, the line numbers that contain each lemma in `common`.
# A line matching several words is appended once per word, so repeats are kept.
indexes = []
for original_word in common:
    indexes.extend(find_lines_with_word(original_word, ru_all_docs))
len(indexes)

53251

In [5]:
# Translations file: one line per subtitle line, trailing whitespace removed on load.
filename_translated_all = "corpora/subtitles/translations/opus10_whole.txt"
with open(filename_translated_all, "rt", encoding="utf-8") as translations_file:
    translated_all = list(map(str.rstrip, translations_file))
len(translated_all)

900000

In [54]:
# Translate the selected Russian lines chunk by chunk, checkpointing after each chunk.
chunk_size = 500
latest_chunk = 1000  # position in `indexes` to resume from

# The outer range stops at len(indexes) // 2, so only the first half of `indexes`
# is started from this loop.
for chunk_start in range(latest_chunk, len(indexes) // 2, chunk_size):

    for i in tqdm(range(chunk_start, min(chunk_start + chunk_size, len(indexes)))):
        line_idx = indexes[i]
        # Skip lines that already have a translation (earlier run, or the same line
        # reached through an earlier repeated entry in `indexes`).
        if translated_all[line_idx] != "":
            continue
        sentence = ru_all[line_idx]
        translation = model.translate(sentence, source_lang='ru', target_lang='en', beam_size=10, max_length=200)
        translated_all[line_idx] = translation

    # Checkpoint: write the whole list to a sibling temp file and swap it in with
    # os.replace, so interrupting the kernel mid-write cannot truncate the existing
    # translations file.
    tmp_filename = f"{filename_translated_all}.tmp"
    with open(tmp_filename, 'w', encoding="utf-8") as f:
        for line in translated_all:
            f.write(line)
            f.write('\n')
    os.replace(tmp_filename, filename_translated_all)

100%|██████████| 500/500 [08:35<00:00,  1.03s/it]
100%|██████████| 500/500 [12:03<00:00,  1.45s/it] 
100%|██████████| 500/500 [10:25<00:00,  1.25s/it]
100%|██████████| 500/500 [12:38<00:00,  1.52s/it]
100%|██████████| 500/500 [08:17<00:00,  1.01it/s]
100%|██████████| 500/500 [12:29<00:00,  1.50s/it] 
100%|██████████| 500/500 [10:53<00:00,  1.31s/it]
100%|██████████| 500/500 [11:44<00:00,  1.41s/it]
100%|██████████| 500/500 [12:52<00:00,  1.54s/it]  
100%|██████████| 500/500 [10:12<00:00,  1.22s/it]
100%|██████████| 500/500 [09:21<00:00,  1.12s/it]
100%|██████████| 500/500 [10:38<00:00,  1.28s/it]
100%|██████████| 500/500 [11:26<00:00,  1.37s/it]
100%|██████████| 500/500 [13:19<00:00,  1.60s/it]
100%|██████████| 500/500 [13:04<00:00,  1.57s/it]
100%|██████████| 500/500 [11:55<00:00,  1.43s/it]
100%|██████████| 500/500 [12:11<00:00,  1.46s/it]
100%|██████████| 500/500 [13:35<00:00,  1.63s/it] 
100%|██████████| 500/500 [14:03<00:00,  1.69s/it]
100%|██████████| 500/500 [12:49<00:00,  1.54s

In [53]:
# Persist the in-memory translations, one per line, overwriting the file.
with open(filename_translated_all, 'w', encoding="utf-8") as out_file:
    out_file.writelines(f"{line}\n" for line in translated_all)

выравнивание

дамп доков

In [14]:
en_nlp_lg = spacy.load("en_core_web_lg")

In [12]:
len(translated_all)

900000

In [12]:
# Read the serialized spaCy DocBin of the English translation docs.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "rb") as file:
    en_translated_bytes_data = file.read()

# The blank "en" pipeline is only used for its vocab when the docs are rebuilt.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
# Materialize every doc into a list so it can be indexed by line number later.
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))

In [16]:
# Build one spaCy doc per translated line and save them all as a DocBin.
# Lines without a translation share a single empty doc.
empty = en_nlp_lg("")

docs = []

# The index must come from enumerate(): without it `i` is whatever value an earlier
# cell left behind, and every line would be checked against the same cached doc.
for i, sentence in enumerate(tqdm(translated_all)):
    if sentence == "":
        docs.append(empty)
    elif len(en_translated_docs[i]) != 0:
        # A non-empty doc is already stored for this line; reuse it instead of re-parsing.
        docs.append(en_translated_docs[i])
    else:
        spacy_doc = en_nlp_lg(sentence)
        docs.append(spacy_doc)

doc_bin = DocBin()
for doc in tqdm(docs):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

# Overwrites the file that `en_translated_docs` is loaded from.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "wb") as file:
    file.write(bytes_data)

100%|██████████| 900000/900000 [15:09<00:00, 990.06it/s] 
100%|██████████| 900000/900000 [00:22<00:00, 39981.31it/s]


In [17]:
en_translated_docs = docs

In [28]:
# Re-read the English translation DocBin from disk, replacing `en_translated_docs`.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "rb") as file:
    en_translated_bytes_data = file.read()
 
# The blank "en" pipeline is only used for its vocab when the docs are rebuilt.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))

In [23]:
folder_name = "corpora/subtitles/translations"
fname_mwmf = f"{folder_name}/mwmf"
fname_itermax = f"{folder_name}/itermax"
fname_inter = f"{folder_name}/inter"

In [24]:
# Load the three saved alignment files, one entry per line, trailing whitespace removed.
with open(fname_mwmf, "rt", encoding="utf-8") as mwmf_file:
    mwmf = list(map(str.rstrip, mwmf_file))

with open(fname_itermax, "rt", encoding="utf-8") as itermax_file:
    itermax = list(map(str.rstrip, itermax_file))

with open(fname_inter, "rt", encoding="utf-8") as inter_file:
    inter = list(map(str.rstrip, inter_file))

In [27]:
# Start from blank alignments: three independent lists of 900000 empty strings.
mwmf = [""] * 900000
itermax = [""] * 900000
inter = [""] * 900000

In [29]:
translated_all_docs = en_translated_docs

In [31]:
len(mwmf)

900000

In [57]:
from simalign import SentenceAligner
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-05-09 14:09:37,401 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [39]:
# Word-align each Russian line with its translation, chunk by chunk, saving the
# three alignment files after every chunk.
chunk_size = 20000
latest_chunk = 180000  # line number to resume from
align_stop = 900000    # exclusive upper bound of the line range processed here

for chunk_start in range(latest_chunk, align_stop, chunk_size):

    # Clamp the chunk so a chunk_size that does not divide the range cannot run past it.
    for i in tqdm(range(chunk_start, min(chunk_start + chunk_size, align_stop))):
        # Nothing to align without a translation; lines aligned earlier are kept as is.
        if translated_all[i] == "" or mwmf[i] != "":
            continue
        ru_tokens = [token.text for token in ru_all_docs[i]]
        translated_tokens = [token.text for token in translated_all_docs[i]]
        alignments = myaligner.get_word_aligns(ru_tokens, translated_tokens)
        # Each method's result is stored as space-separated "x-y" pairs.
        mwmf[i] = " ".join(f"{x}-{y}" for x, y in alignments["mwmf"])
        itermax[i] = " ".join(f"{x}-{y}" for x, y in alignments["itermax"])
        inter[i] = " ".join(f"{x}-{y}" for x, y in alignments["inter"])

    # Checkpoint: write each file to a sibling temp file and swap it in with
    # os.replace, so interrupting the kernel mid-write cannot truncate saved results.
    for out_path, out_lines in ((fname_mwmf, mwmf), (fname_itermax, itermax), (fname_inter, inter)):
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, 'w', encoding="utf-8") as f:
            for line in out_lines:
                f.write(line)
                f.write('\n')
        os.replace(tmp_path, out_path)

100%|██████████| 20000/20000 [00:21<00:00, 921.12it/s]  
100%|██████████| 20000/20000 [03:11<00:00, 104.19it/s]
100%|██████████| 20000/20000 [04:10<00:00, 79.74it/s] 
100%|██████████| 20000/20000 [03:09<00:00, 105.67it/s]
100%|██████████| 20000/20000 [02:57<00:00, 112.36it/s] 
100%|██████████| 20000/20000 [02:41<00:00, 123.96it/s]
100%|██████████| 20000/20000 [03:06<00:00, 107.27it/s]
100%|██████████| 20000/20000 [02:41<00:00, 124.22it/s]
100%|██████████| 20000/20000 [02:45<00:00, 120.69it/s]
100%|██████████| 20000/20000 [02:39<00:00, 125.24it/s]
100%|██████████| 20000/20000 [02:26<00:00, 136.98it/s]
100%|██████████| 20000/20000 [02:27<00:00, 135.66it/s]
100%|██████████| 20000/20000 [02:42<00:00, 123.07it/s]
100%|██████████| 20000/20000 [02:10<00:00, 152.90it/s]
100%|██████████| 20000/20000 [03:20<00:00, 99.79it/s] 
100%|██████████| 20000/20000 [03:14<00:00, 102.83it/s]
100%|██████████| 20000/20000 [03:08<00:00, 106.13it/s]
100%|██████████| 20000/20000 [02:12<00:00, 151.19it/s]
100%|██

merge two translations from different laptops

In [2]:
# Translations produced on the other machine; same one-line-per-entry layout.
filename_another_translated_all = "corpora/subtitles/translations/opus10_whole_case.txt"
with open(filename_another_translated_all, "rt", encoding="utf-8") as other_file:
    another_translated_all = list(map(str.rstrip, other_file))
len(another_translated_all)

900000

In [6]:
for i in range(70, 80):
    print(i, another_translated_all[i])

70 
71 So when he, uh, asked for help, I couldn't say no.
72 
73 
74 
75 Stay where you are!
76 
77 It's a risky case, but the risk is well paid.
78 
79 In one week, the fisherman earns more than he earns for two years.


In [19]:
another_folder_name = "tmp_alignment"
another_fname_mwmf = f"{another_folder_name}/mwmf (2)"
another_fname_itermax = f"{another_folder_name}/itermax (2)"
another_fname_inter = f"{another_folder_name}/inter (2)"

In [20]:
# Load the other machine's three alignment files, trailing whitespace removed per line.
with open(another_fname_mwmf, "rt", encoding="utf-8") as mwmf_file:
    another_mwmf = list(map(str.rstrip, mwmf_file))

with open(another_fname_itermax, "rt", encoding="utf-8") as itermax_file:
    another_itermax = list(map(str.rstrip, itermax_file))

with open(another_fname_inter, "rt", encoding="utf-8") as inter_file:
    another_inter = list(map(str.rstrip, inter_file))

In [7]:
len(translated_all)

900000

In [7]:
# Fill gaps in `translated_all` from the other machine's list.
# Lines that already have a translation are never overwritten.
for i, own_line in enumerate(translated_all):
    if own_line != "":
        continue
    other_line = another_translated_all[i]
    if other_line != "":
        translated_all[i] = other_line

In [40]:
# Consistency scan between translations and mwmf alignments:
#   lost_translation - lines with an alignment but no translation
#   lost_alignment   - lines with a translation but no alignment
lost_translation, lost_alignment = [], []
for i, translation in enumerate(translated_all):
    has_translation = len(translation) != 0
    has_alignment = len(mwmf[i]) != 0
    if has_alignment and not has_translation:
        lost_translation.append(i)
    elif has_translation and not has_alignment:
        lost_alignment.append(i)

In [41]:
lost_alignment

[100146,
 100187,
 100189,
 100195,
 100199,
 100251,
 100337,
 100422,
 100665,
 100667,
 100694,
 100708,
 100749,
 100753,
 100789,
 100816,
 100975,
 101002,
 101040,
 101099,
 101147,
 101224,
 101336,
 101488,
 101551,
 101555,
 101709,
 101710,
 101926,
 101927,
 102035,
 102036,
 102083,
 102157,
 102169,
 102173,
 102249,
 102261,
 102300,
 102450,
 102533,
 102581,
 102664,
 102670,
 102684,
 102703,
 102705,
 102736,
 102756,
 102761,
 102795,
 102808,
 102845,
 102886,
 102925,
 102989,
 103034,
 103066,
 103221,
 103264,
 103279,
 103296,
 103355,
 103393,
 103747,
 103756,
 103766,
 103947,
 103953,
 103969,
 103971,
 104012,
 104033,
 104039,
 104158,
 104226,
 104354,
 104361,
 104434,
 104436,
 104452,
 104486,
 104489,
 104509,
 104689,
 104690,
 104691,
 104726,
 104811,
 104820,
 104834,
 105002,
 105004,
 105019,
 105025,
 105037,
 105075,
 105110,
 105127,
 105131,
 105195,
 105217,
 105228,
 105229,
 105238,
 105274,
 105290,
 105302,
 105327,
 105344,
 105420,
 

In [13]:
another_translated_all[75]

''

In [12]:
translated_all[75]

''

In [8]:
# Write the merged translations back, one per line, overwriting the file.
with open(filename_translated_all, 'w', encoding="utf-8") as out_file:
    out_file.writelines(f"{line}\n" for line in translated_all)

In [38]:
# Fill missing alignments from the other machine. The mwmf entry decides whether a
# line is missing; when it is copied, itermax and inter are copied along with it.
for i, own_alignment in enumerate(mwmf):
    if own_alignment != "":
        continue
    if another_mwmf[i] == "":
        continue
    mwmf[i] = another_mwmf[i]
    itermax[i] = another_itermax[i]
    inter[i] = another_inter[i]

In [39]:
# Write the three alignment lists back to their files, one entry per line.
for out_path, alignment_lines in (
    (fname_mwmf, mwmf),
    (fname_itermax, itermax),
    (fname_inter, inter),
):
    with open(out_path, 'w', encoding="utf-8") as out_file:
        out_file.writelines(f"{line}\n" for line in alignment_lines)

синк доков

In [1]:
# Load the English translation docs from the saved DocBin.
# Uses `spacy` and `DocBin`, so the import cell has to be run before this one.
with open("corpora/subtitles/translations/opus10_spacy_final.bin", "rb") as file:
    en_translated_bytes_data = file.read()

# The blank "en" pipeline is only used for its vocab when the docs are rebuilt.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))
# Display how many docs were restored.
len(en_translated_docs)

NameError: name 'spacy' is not defined

In [77]:
# Load a second DocBin of English translation docs from a different folder.
with open("translations_after_req_words/translations/opus10_spacy_check2.bin", "rb") as file:
    another_en_translated_bytes_data = file.read()

# The blank "en" pipeline is only used for its vocab when the docs are rebuilt.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(another_en_translated_bytes_data)
another_en_translated_docs = list(doc_bin.get_docs(nlp.vocab))
# Display how many docs were restored.
len(another_en_translated_docs)

900000

In [9]:
len(en_translated_docs[820])

7

In [11]:
en_nlp_lg = spacy.load("en_core_web_lg")

In [13]:
len(en_translated_docs)

900000

In [59]:
len(translated_all)

900000

In [79]:
# Bring the stored docs in line with `translated_all`: keep docs that already exist,
# parse lines that have a translation but no stored doc, then save everything.
empty = en_nlp_lg("")
docs = []

for i, sentence in tqdm(enumerate(translated_all)):
    # No translation for this line: use the shared empty doc.
    if sentence == "":
        docs.append(empty)
        continue
    # A non-empty stored doc is reused as is.
    if len(en_translated_docs[i]) != 0:
        docs.append(en_translated_docs[i])
        continue
    # Translation present but no stored doc yet: parse it now.
    spacy_doc = en_nlp_lg(sentence)
    docs.append(spacy_doc)

doc_bin = DocBin()
for doc in tqdm(docs):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

with open("corpora/subtitles/translations/opus10_spacy_final.bin", "wb") as file:
    file.write(bytes_data)

900000it [00:00, 2368423.31it/s]
100%|██████████| 900000/900000 [00:28<00:00, 31274.75it/s]


In [80]:
len(docs)

900000

In [82]:
translated_all_docs = docs

In [63]:
len(translated_all_docs)

900000

In [89]:
# Word-align each Russian line with its translation, chunk by chunk, saving the
# three alignment files after every chunk.
chunk_size = 20000
latest_chunk = 720000  # line number to resume from
align_stop = 800000    # exclusive upper bound of the line range processed here

for chunk_start in range(latest_chunk, align_stop, chunk_size):

    # Clamp the chunk so a chunk_size that does not divide the range cannot run past it.
    for i in tqdm(range(chunk_start, min(chunk_start + chunk_size, align_stop))):
        # Nothing to align without a translation; lines aligned earlier are kept as is.
        if translated_all[i] == "" or mwmf[i] != "":
            continue
        ru_tokens = [token.text for token in ru_all_docs[i]]
        translated_tokens = [token.text for token in translated_all_docs[i]]
        alignments = myaligner.get_word_aligns(ru_tokens, translated_tokens)
        # Each method's result is stored as space-separated "x-y" pairs.
        mwmf[i] = " ".join(f"{x}-{y}" for x, y in alignments["mwmf"])
        itermax[i] = " ".join(f"{x}-{y}" for x, y in alignments["itermax"])
        inter[i] = " ".join(f"{x}-{y}" for x, y in alignments["inter"])

    # Checkpoint: write each file to a sibling temp file and swap it in with
    # os.replace, so interrupting the kernel mid-write cannot truncate saved results.
    for out_path, out_lines in ((fname_mwmf, mwmf), (fname_itermax, itermax), (fname_inter, inter)):
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, 'w', encoding="utf-8") as f:
            for line in out_lines:
                f.write(line)
                f.write('\n')
        os.replace(tmp_path, out_path)

100%|██████████| 20000/20000 [00:44<00:00, 450.22it/s]  
100%|██████████| 20000/20000 [03:28<00:00, 96.07it/s] 
 38%|███▊      | 7649/20000 [01:40<02:42, 76.20it/s] 


KeyboardInterrupt: 