In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from transformers import pipeline
from collections import defaultdict

In [2]:
# Build a fill-mask pipeline from the `bert-base-uncased` checkpoint.
# Later cells call `model(text)` with text that contains a `[MASK]` placeholder.
model = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Smoke test: ask the pipeline for candidate fillers of the [MASK] slot.
pred = model("I love pizza and [MASK].")
# Bare last expression so the notebook displays the returned list of candidate
# dicts (keys: score, token, token_str, sequence).
pred

[{'score': 0.18336044251918793,
  'token': 10733,
  'token_str': 'pizza',
  'sequence': 'i love pizza and pizza.'},
 {'score': 0.12329741567373276,
  'token': 5404,
  'token_str': 'beer',
  'sequence': 'i love pizza and beer.'},
 {'score': 0.050967153161764145,
  'token': 4511,
  'token_str': 'wine',
  'sequence': 'i love pizza and wine.'},
 {'score': 0.045108597725629807,
  'token': 24857,
  'token_str': 'pasta',
  'sequence': 'i love pizza and pasta.'},
 {'score': 0.03131162002682686,
  'token': 8808,
  'token_str': 'cheese',
  'sequence': 'i love pizza and cheese.'}]

In [4]:
# Two-level mapping whose missing keys are created on access:
# outer key -> inner key -> list.
# NOTE(review): no visible cell reads or writes `bert_dict`; presumably it was meant to
# collect the per-word results of fill_bert_dict_for_one_word -- confirm or remove.
bert_dict = defaultdict(lambda : defaultdict(list))

In [5]:
# Raw subtitle text: one whitespace-stripped string per line of each file.
with open("subtitles_raw/en_raw_0-900.txt", "rt", encoding="utf-8") as en_file:
    en_all = list(map(str.strip, en_file))

with open("subtitles_raw/ru_raw_0-900.txt", "rt", encoding="utf-8") as ru_file:
    ru_all = list(map(str.strip, ru_file))

In [6]:
# Read the serialized spaCy DocBin for the Russian subtitles into memory.
with open("corpora/subtitles/ru_subtitles_spacy_dump.bin", "rb") as f:
    restored_bytes_data = f.read()

# The blank "ru" pipeline is created only to supply the vocab used to rebuild the docs.
# NOTE: `nlp`, `doc_bin` and `restored_bytes_data` are rebound by the next loading cell.
nlp = spacy.blank("ru")
doc_bin = DocBin().from_bytes(restored_bytes_data)
# Materialise all docs as a list; later cells enumerate it and read `token.lemma_`.
ru_all_docs = list(doc_bin.get_docs(nlp.vocab))

In [7]:
# Read the serialized spaCy DocBin for the English subtitles into memory.
with open("corpora/subtitles/en_subtitles_spacy_dump.bin", "rb") as f:
    restored_bytes_data = f.read()

# The blank "en" pipeline is created only to supply the vocab used to rebuild the docs.
# NOTE: this rebinds `nlp`, `doc_bin` and `restored_bytes_data` from the previous cell.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(restored_bytes_data)
# Materialise all docs as a list; later cells index it by subtitle line number.
en_all_docs = list(doc_bin.get_docs(nlp.vocab))

In [15]:
# Read the serialized spaCy DocBin stored under translations/ (opus10_spacy.bin).
# NOTE(review): presumably the English output of a translation model -- confirm provenance.
with open("corpora/subtitles/translations/opus10_spacy.bin", "rb") as file:
    en_translated_bytes_data = file.read()

# The blank "en" pipeline is created only to supply the vocab used to rebuild the docs.
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(en_translated_bytes_data)
# Materialise all docs as a list; get_variety_dict_from_alignment indexes it by line number.
en_translated_docs = list(doc_bin.get_docs(nlp.vocab))

In [8]:
def find_lines_with_word(original_word: str, docs) -> list:
    """Return the positions of the docs that contain `original_word` as a lemma.

    A doc qualifies as soon as one of its tokens has a lower-cased `lemma_`
    equal to `original_word`. The word itself is compared as given, so a
    value containing upper-case characters never matches.

    Args:
        original_word: Lemma to search for (expected lower-case).
        docs: Iterable of token sequences; each token exposes `lemma_`.

    Returns:
        Ascending list of positions in `docs`, each listed at most once.
    """
    return [
        position
        for position, doc in enumerate(docs)
        if any(tok.lemma_.lower() == original_word for tok in doc)
    ]

In [9]:
# indexes = []
# for original_word in common:
#     indexes += find_lines_with_word(original_word, ru_all_docs)
# len(indexes)

In [16]:
def _sole_item(candidates):
    """Return the single item yielded by `candidates`, or None for zero or several items."""
    found = None
    for candidate in candidates:
        if found is not None:
            # A second hit makes the match ambiguous; stop consuming immediately.
            return None
        found = candidate
    return found


def fill_bert_dict_for_one_word(original_word: str, aligned_text: list):
    """Collect fill-mask predictions for the English counterpart of a Russian lemma.

    A subtitle line is used only when both conditions hold:

    * exactly one token of the line's doc in the global ``ru_all_docs`` has a
      lower-cased lemma equal to ``original_word``;
    * exactly one ``(src, tgt)`` pair of the line's alignment has ``src`` equal
      to that token's index.

    For such a line the English token at index ``tgt`` (taken from the global
    ``en_all_docs``) is replaced with ``[MASK]``, the tokens are joined with
    single spaces and the sentence is passed to the global ``model`` pipeline.

    Args:
        original_word: Lemma to look for; compared as-is with lower-cased
            lemmas, so it should already be lower-case.
        aligned_text: One list of ``(src, tgt)`` token-index pairs per subtitle
            line, indexed in parallel with ``ru_all_docs``. Lines with an empty
            alignment are skipped.

    Returns:
        ``defaultdict(list)`` mapping ``str(line_idx)`` to the value returned by
        ``model`` for that line's masked sentence.

    Raises:
        IndexError: if ``aligned_text`` has fewer entries than ``ru_all_docs``,
            or an aligned target index lies outside the English token list.
    """
    MASK_TOKEN = '[MASK]'
    bert_word_dict = defaultdict(list)
    # tqdm wraps the list itself (not the enumerate object) so it can read the
    # length and render a real progress bar with an ETA.
    for line_idx, ru_doc in enumerate(tqdm(ru_all_docs)):
        alignment = aligned_text[line_idx]
        if not alignment:
            continue

        # Position of the word in the Russian line; None when absent or repeated.
        ru_token_idx = _sole_item(
            j for j, token in enumerate(ru_doc)
            if token.lemma_.lower() == original_word
        )
        if ru_token_idx is None:
            continue

        # English token aligned to it; None when unaligned or aligned to several tokens.
        en_token_idx = _sole_item(tgt for src, tgt in alignment if src == ru_token_idx)
        if en_token_idx is None:
            continue

        # Build the token list only for lines that are actually sent to the model.
        en_tokens = [token.text for token in en_all_docs[line_idx]]
        en_tokens[en_token_idx] = MASK_TOKEN
        bert_word_dict[str(line_idx)] = model(' '.join(en_tokens))
    return bert_word_dict

In [11]:
def _read_alignment(path):
    """Parse an alignment file into one list of int tuples per line.

    Every whitespace-separated item on a line is split on '-' and each piece
    is converted with int(), e.g. "0-0 1-1 2-3" -> [(0, 0), (1, 1), (2, 3)].
    """
    with open(path, "rt", encoding="utf-8") as alignment_file:
        return [
            [tuple(map(int, pair.split('-'))) for pair in line.strip().split()]
            for line in alignment_file
        ]


# Alignments between the Russian and English subtitle lines, one file per method name.
aligned_inter = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_inter.txt")
aligned_itermax = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_itermax.txt")
aligned_mwmf = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_mwmf.txt")

In [53]:
# Spot-check one parsed line: a list of (src, tgt) token-index pairs.
print(aligned_inter[80])

[(0, 0), (1, 1), (2, 3), (3, 4), (4, 5), (5, 6), (6, 8), (8, 9), (9, 10)]


In [12]:
# Run the fill-mask pass for the lemma "новый" using the `aligned_inter` alignments.
# The recorded run iterated over 900000 lines in about five and a half minutes.
bert_word_dict = fill_bert_dict_for_one_word("новый", aligned_inter)
# Display the whole mapping (line index as str -> list of prediction dicts).
bert_word_dict

900000it [05:33, 2697.69it/s] 


defaultdict(list,
            {'374': [{'score': 0.2799505293369293,
               'token': 2047,
               'token_str': 'new',
               'sequence': "shouldn't you be over there with your new friends?"},
              {'score': 0.10067027807235718,
               'token': 2060,
               'token_str': 'other',
               'sequence': "shouldn't you be over there with your other friends?"},
              {'score': 0.08709819614887238,
               'token': 2190,
               'token_str': 'best',
               'sequence': "shouldn't you be over there with your best friends?"},
              {'score': 0.06691211462020874,
               'token': 2214,
               'token_str': 'old',
               'sequence': "shouldn't you be over there with your old friends?"},
              {'score': 0.03306974843144417,
               'token': 2210,
               'token_str': 'little',
               'sequence': "shouldn't you be over there with your little friends?"}],
   

In [60]:
# Number of subtitle lines for which a prediction was stored.
len(bert_word_dict)

2185

In [14]:
def _read_alignment(path):
    """Parse an alignment file into one list of int tuples per line.

    Every whitespace-separated item on a line is split on '-' and each piece
    is converted with int(). Defined in this cell so the cell runs on its own.
    """
    with open(path, "rt", encoding="utf-8") as alignment_file:
        return [
            [tuple(map(int, pair.split('-'))) for pair in line.strip().split()]
            for line in alignment_file
        ]


# Alignments stored under numerical_alignment/.
# NOTE(review): these three files are also read by an earlier cell; this cell reads them again.
aligned_inter = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_inter.txt")
aligned_itermax = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_itermax.txt")
aligned_mwmf = _read_alignment("corpora/subtitles/numerical_alignment/subtitles_mwmf.txt")

# Alignments stored under translations/ (used together with en_translated_docs).
t_aligned_inter = _read_alignment("corpora/subtitles/translations/inter")
t_aligned_itermax = _read_alignment("corpora/subtitles/translations/itermax")
t_aligned_mwmf = _read_alignment("corpora/subtitles/translations/mwmf")

In [17]:
# Aliases for the full doc lists; get_variety_dict_from_alignment reads the
# globals `ru_docs` and `en_docs` rather than `ru_all_docs` / `en_all_docs`.
ru_docs = ru_all_docs
en_docs = en_all_docs

In [18]:
import spacy
from spacy.tokens import DocBin
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# Stop-word filtering is switched off: both collections are deliberately empty.
# The disabled variant built them from set(stopwords.words("russian")) and
# set(stopwords.words("english")), then added 'это', 'всё', 'свой', 'ещё', 'весь'
# to the Russian set and "'s" to the English set.
ru_stopwords = list()
en_stopwords = list()

from string import punctuation

# Tokens treated as punctuation: every character of string.punctuation plus
# a double hyphen, an ellipsis and an en dash.
punct_list = list(punctuation) + ['--', '...', '–']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\warri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def get_variety_dict_from_alignment(original_word: str, aligned_text: list, t_aligned_text: list):
    """Group subtitle lines by how a Russian lemma is rendered in two English versions.

    For each line ``i`` the function gathers the English tokens aligned to every
    occurrence of ``original_word`` (matched against the lower-cased lemma of
    tokens in the global ``ru_docs``):

    * original side: tokens of ``en_docs[i]`` via ``aligned_text[i]``; a token is
      dropped when its lower-cased *text* is in ``en_stopwords`` or ``punct_list``,
      otherwise its lower-cased lemma is kept;
    * translated side: tokens of ``en_translated_docs[i]`` via ``t_aligned_text[i]``;
      a token is dropped when its lower-cased *lemma* is in ``en_stopwords`` or
      ``punct_list``, otherwise that lemma is kept.

    The kept lemmas of each side are sorted and joined with single spaces to
    form the phrase key. A line is recorded only if the original side produced
    at least one lemma.

    Args:
        original_word: Lemma to look for; compared as-is with lower-cased
            lemmas, so it should already be lower-case.
        aligned_text: Per-line lists of ``(src_idx, tgt_idx)`` pairs linking
            ``ru_docs[i]`` tokens to ``en_docs[i]`` tokens.
        t_aligned_text: Per-line lists of ``(src_idx, tgt_idx)`` pairs linking
            ``ru_docs[i]`` tokens to ``en_translated_docs[i]`` tokens.

    Returns:
        A 3-tuple ``(variety_dict, translation_dict, hell)``:

        * ``variety_dict``: original-side phrase -> set of line indices;
        * ``translation_dict``: translated-side phrase -> set of line indices;
        * ``hell``: list of line indices that matched on the original side but
          yielded no lemma on the translated side (excluded from both dicts).

    Raises:
        IndexError: if an alignment index lies outside the corresponding doc,
            or a needed line index is missing from one of the global doc lists.
    """
    variety_dict = defaultdict(set)
    translation_dict = defaultdict(set)
    lines_without_translation = []

    def is_filtered(word: str) -> bool:
        # Stop words and punctuation never contribute to a phrase.
        return word in en_stopwords or word in punct_list

    # tqdm wraps the list itself (not the enumerate object) so it can read the
    # length and render a real progress bar with an ETA.
    for i, num_sentence_pairs in enumerate(tqdm(aligned_text)):
        original_lemmas = []
        for src_idx, tgt_idx in num_sentence_pairs:
            if ru_docs[i][src_idx].lemma_.lower() != original_word:
                continue
            en_token = en_docs[i][tgt_idx]
            if not is_filtered(en_token.text.lower()):
                original_lemmas.append(en_token.lemma_.lower())

        if not original_lemmas:
            continue

        translated_lemmas = []
        for tsrc_idx, ttgt_idx in t_aligned_text[i]:
            # Check the source word first so the translated token is only
            # looked up for pairs that can actually contribute.
            if ru_docs[i][tsrc_idx].lemma_.lower() != original_word:
                continue
            tlemma = en_translated_docs[i][ttgt_idx].lemma_.lower()
            if not is_filtered(tlemma):
                translated_lemmas.append(tlemma)

        if not translated_lemmas:
            lines_without_translation.append(i)
            continue

        # Sorting makes the key independent of the order of the alignment pairs.
        variety_dict[' '.join(sorted(original_lemmas))].add(i)
        translation_dict[' '.join(sorted(translated_lemmas))].add(i)

    return variety_dict, translation_dict, lines_without_translation

In [20]:
# Russian lemma to analyse.
word = "работа"
# orig / trans: phrase -> set of line indices, built from en_docs and en_translated_docs
# respectively; hell: line indices that matched in en_docs but produced nothing on the
# translated side.
orig, trans, hell = get_variety_dict_from_alignment(word, aligned_inter, t_aligned_inter)

900000it [00:31, 28865.84it/s]


In [21]:
# NOTE(review): `plot_variety_pie` is not defined or imported in any visible cell, and the
# recorded run of this cell raised NameError. Define or import it before this cell so the
# notebook survives Restart & Run All.
plot_variety_pie(trans, word)

NameError: name 'plot_variety_pie' is not defined