In [1]:
from PyPDF2 import PdfReader
from razdel import sentenize
import docx
import re
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

In [3]:
# вытащили текст из docx файлов
bur = getText('АнгархаевАЛ_МунхэНогоонХасуури.docx')
rus = getText('АнгархаевАЛ_ВечныйЦвет.docx')

In [4]:
# разбили на параграфы
bur_para = bur.split('\n')
rus_para = rus.split('\n')

In [5]:
# в бурятской книге главы пронумерованы римскими цифрами. Заменяю их на арабские

roman = []
for i in range(len(bur_para[1:-1])):
    if bur_para[i-1] == '' and bur_para[i+1] == '' and bur_para[i] != '' and len(bur_para[i])<10:
        roman.append(bur_para[i])

i = 0
arab = range(1, 41)
for j, para in enumerate(bur_para):
    if i < 40:
        if para == roman[i] :
            bur_para[j] = str(arab[i])
            i += 1        

почему-то в бурятском варианте 40 глав, а в русском переводе 46

In [6]:
ru_dict = dict.fromkeys(range(1,47))
bur_dict = dict.fromkeys(range(1,41))

In [7]:
ru_text = ' '.join(rus_para)
ru_chapters = re.split(r'  \d+  ', ru_text)
bur_text = ' '.join(bur_para)
bur_chapters = re.split(r'  \d+  ', bur_text)

In [8]:
len(bur_chapters)

41

In [9]:
len(ru_chapters)

47

# alignment

In [10]:
mname = 'SaranaAbidueva/labse_bur'
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModel.from_pretrained(mname)

Some weights of the model checkpoint at SaranaAbidueva/labse_bur were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def clean_text(book):
    # убираем лишние символы
    book_text = ' '.join(book)
    book_text = book_text.replace('\xa0', ' ')
    book_text = book_text.replace('\n', '')
    book_text = book_text.replace('■', ' ')
    book_text = book_text.replace('•', ' ')
    book_text = book_text.replace('*', '')
    book_text = book_text.replace('|', '')
    book_text = book_text.replace('^', '')
    book_text = book_text.replace('\xad', '')
    book_text = book_text.replace('Ь', 'h')
    book_text = book_text.replace('һ', 'h')
    # разделяем на предложения
    book_text = sentenize(book_text)
    book_text = [sent.text for sent in book_text]
    book_text = [x for x in book_text if x != '' and x != ' ' and x != '  ']
    return book_text

In [12]:
def embed(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.inference_mode():
        model_output = model(**encoded_input.to(model.device))
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

def center_norm(v):
    v = v - v.mean(0)
    return v /  (v**2).sum(1, keepdims=True) ** 0.5


def center_dot(x, y):
    m = (x.sum(0) + y.sum(0)) / (x.shape[0] + y.shape[0])
    x = x - m
    y = y - m
    x =  x /  (x**2).sum(1, keepdims=True) ** 0.5
    y =  y /  (y**2).sum(1, keepdims=True) ** 0.5
    return np.dot(x, y.T)
def get_top_mean_by_row(x, k=5):
    m, n = x.shape
    k = min(k, n)
    topk_indices = np.argpartition(x, -k, axis=1)[:, -k:]
    rows, _ = np.indices((m, k))
    return x[rows, topk_indices].mean(1)
def align(sims):
    
    #sims = np.dot(center_norm(orig_vecs), center_norm(sum_vecs).T) ** 3
    #sims = center_dot(orig_embeds, sum_embeds) #** 3

    rewards = np.zeros_like(sims)
    choices = np.zeros_like(sims).astype(int)  # 1: choose this pair, 2: decrease i, 3: decrease j

    # алгоритм, разрешающий пропускать сколько угодно пар, лишь бы была монотонность
    for i in range(sims.shape[0]):
        for j in range(0, sims.shape[1]):
            # вариант первый: выровнять i-тое предложение с j-тым
            score_add = sims[i, j]
            if i > 0 and j > 0:  # вот как тогда выровняются предыдущие 
                score_add += rewards[i-1, j-1]
                choices[i, j] = 1
            best = score_add
            if i > 0 and rewards[i-1, j] > best:
                best = rewards[i-1, j]
                choices[i, j] = 2
            if j > 0 and rewards[i, j-1] > best:
                best = rewards[i, j-1]
                choices[i, j] = 3
            rewards[i, j] = best
    alignment = []
    i = sims.shape[0] - 1
    j = sims.shape[1] - 1
    while i > 0 and j > 0:
        if choices[i, j] == 1:
            alignment.append([i, j])
            i -= 1
            j -= 1
        elif choices[i, j] == 2:
            i -= 1
        else:
            j -= 1
    return alignment[::-1]

In [1]:
# посмотрела кол-во предложений в каждой главе, надеялась, что будут совпадения
for i, chapter in enumerate(ru_chapters):
    if i < 40:
        sents_bxr = [s.text for s in sentenize(bur_chapters[i])]
        sents_ru = [s.text for s in sentenize(ru_chapters[i])]
        print(i, ' ', len(sents_bxr), ' ', len(sents_ru))
    else:
        sents_ru = [s.text for s in sentenize(ru_chapters[i])]
        print(i, ' ', len(sents_ru))

NameError: name 'ru_chapters' is not defined

In [None]:
pairs = []
for i, chapter in enumerate(tqdm(ru_chapters)):
    sents_ru = clean_text(chapter)
    sents_bxr = clean_text(bur_chapters[i] + bur_chapters[i+1])
    print(0)
    emb_ru = np.stack([embed(s) for s in sents_ru])
    emb_er = np.stack([embed(s) for s in sents_bxr])
    print(0.5)
    pen = np.array([[min(len(x), len(y)) / max(len(x), len(y)) for x in sents_bxr] for y in sents_ru])
    sims = np.maximum(0, np.dot(emb_ru, emb_er.T)) ** 1 * pen
    print(1)
    alpha = 0.2
    penalty = 0.2
    sims_rel = (sims.T - get_top_mean_by_row(sims) * alpha).T - get_top_mean_by_row(sims.T) * alpha - penalty

    alignment = align3(sims_rel)

    total_score = sum(sims[i, j] for i, j in alignment) / min(sims.shape)
    if total_score < 0.15:
        continue
    
    for i, j in alignment:
        if sims[i, j] >= 0.50: # порог высоковат; часть предложений мы потеряем, но полученные зато будут чистыми
            pairs.append([sents_bxr[j], sents_ru[i]])
    tq.set_description(str(len(pairs)))

  0%|                                                                                           | 0/47 [00:00<?, ?it/s]

0
0.5
