In [10]:
import json
import os
import re

import bert_score
from bert_score import score

In [11]:
def load_dict_from_json(path):
    """Parse the UTF-8 encoded JSON file at ``path``.

    Returns whatever ``json.load`` produces for the file's top-level value.
    """
    with open(path, mode='r', encoding='utf-8') as source:
        return json.load(source)


def split_page_and_metadata(full_text):
    """Separate a ``[(page N) (latest_image_id M)]`` marker from fragment text.

    Returns a ``(metadata, content)`` tuple. When ``full_text`` does not begin
    with ``"[(page"`` or holds no complete marker, ``metadata`` is ``None`` and
    ``content`` is ``full_text`` unchanged. Otherwise ``metadata`` is the first
    marker found and ``content`` is ``full_text`` with every marker removed and
    the surrounding whitespace stripped.
    """
    marker_re = r'\[\(page \d+\) \(latest_image_id \d+\)\]'

    # Only texts opening with the marker prefix are candidates for splitting.
    found = re.search(marker_re, full_text) if full_text.startswith("[(page") else None
    if found is None:
        return None, full_text

    # re.sub drops every occurrence of the marker, not only the first one.
    remainder = re.sub(marker_re, '', full_text).strip()
    return found.group(0), remainder

In [16]:
# Directory entries of the fragments folder, in sorted order.
all_raw_articles = os.listdir("./../txt_articles/Фрагменты")
all_raw_articles.sort()

def remove_digits(text):
    """Return ``text`` without the characters for which ``str.isdigit`` is true."""
    kept = []
    for symbol in text:
        if symbol.isdigit():
            continue
        kept.append(symbol)
    return ''.join(kept)


def merge_paragraphs(text_file_path):
    """Concatenate all paragraphs stored in a JSON file into one string.

    Paths containing "Очистка_текста" are read from the "cleaned_text" key,
    every other path from the "fragments" key. Each paragraph is passed through
    ``split_page_and_metadata`` and only its content part is kept; no separator
    is inserted between paragraphs.
    """
    section_key = "cleaned_text" if "Очистка_текста" in text_file_path else "fragments"
    paragraphs = load_dict_from_json(text_file_path)[section_key]

    # Index 1 of the split result is the text without the metadata marker.
    return "".join(split_page_and_metadata(paragraph)[1] for paragraph in paragraphs)

# Parallel corpora: index i of both lists refers to the same article.
all_raw_text = []
all_cleaned_text = []

for now_raw_article in all_raw_articles:
    print(f"Processing: {now_raw_article}")
    full_file_path = os.path.join("./../txt_articles/Фрагменты", now_raw_article)
    # File name with its last dot-separated suffix dropped.
    article_name = str(".".join(now_raw_article.split(".")[:-1]))

    # Build the cleaned counterpart from the directory and the file name
    # separately, so the name substitution cannot touch the directory part and
    # a file name containing "Фрагменты" is left intact.
    cleaned_path = os.path.join(
        "./../txt_articles/Очистка_текста",
        now_raw_article.replace("_paragraphs_cleaned", "_paragraphs_cleaned_cleaned"),
    )

    article_text = merge_paragraphs(full_file_path)
    cleaned_text = merge_paragraphs(cleaned_path)

    # NOTE(review): digits are stripped from the raw side only, the cleaned
    # text keeps them — confirm this asymmetry is intentional.
    article_text = remove_digits(article_text)

    # An article is skipped only when both versions are blank; a pair with a
    # single blank side is still scored.
    if not article_text.strip() and not cleaned_text.strip():
        continue

    all_raw_text.append(article_text)
    all_cleaned_text.append(cleaned_text)

# First argument is the candidate corpus, second the reference corpus.
# NOTE(review): every article is passed as one string; BERT-family models
# accept a bounded number of tokens, so long articles are presumably truncated
# before scoring — confirm against the bert_score documentation.
P, R, F1 = score(all_cleaned_text, all_raw_text, lang="ru", model_type="bert-base-multilingual-cased")

# Mean values over all scored articles
print("Precision:", P.mean().item())
print("Recall:", R.mean().item())
print("F1:", F1.mean().item())

Processing: 41479_paragraphs_cleaned.json
Processing: HYDROGENENERGETICS_HYDROGENSTORAGEINABOUNDSTATE_paragraphs_cleaned.json
Processing: Novikov_Vodorodnaja jenergetika_paragraphs_cleaned.json
Processing: Автореферат Спирин АВ_paragraphs_cleaned.json
Processing: Диссертация Кораблева ЕА_paragraphs_cleaned.json
Processing: Диссертация Липилин АС_paragraphs_cleaned.json
Processing: Диссертация Новик НН_paragraphs_cleaned.json
Processing: Книга Керамика ZrO2_paragraphs_cleaned.json
Processing: МАТЕРИАЛЫ ДЛЯ ВОДОРОДНОЙ ЭНЕРГЕТИКИ_paragraphs_cleaned.json
Processing: ОСОБЕННОСТИ ТЕХНОЛОГИИ ПОЛУЧЕНИЯ ВОДОРОДА С ИСПОЛЬЗОВАНИЕМ ЭАВ_paragraphs_cleaned.json
Processing: РАЗРАБОТКА ПРОЦЕССА ПОЛУЧЕНИЯ ВОДОРОДА ИЗ ВОДЫ С ИСПОЛЬЗОВАНИЕМ ЭАВ_paragraphs_cleaned.json
Processing: Статья Немудрый АП_paragraphs_cleaned.json
Processing: Статья Твёрдые электролиты ZrO2–Sc2O3_paragraphs_cleaned.json
Processing: ЭНЕРГОАККУМУЛИРУЮЩИЕ ВЕЩЕСТВА В ВОДОРОДНОЙ ЭНЕРГЕТИКЕ_paragraphs_cleaned.json
Processing: Энергоакк