In [2]:
!pip install python-docx sentence-transformers rouge-score bert-score

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ca9c7ff3674d279b8547abb288238b68f45497abe5abf4650f2a09b559cd845b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/46106

In [3]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [6]:
import os
import json
from google.colab import files
from docx import Document
from sentence_transformers import SentenceTransformer
from rouge import Rouge
from bert_score import BERTScorer

#Извлечение текста из документа DOCX, включая параграфы и таблицы
def parse_docx(file_path):
    doc = Document(file_path)
    text_data = []
    num_tables = len(doc.tables)

    # Извлечение текста из параграфов
    for para in doc.paragraphs:
        if para.text.strip():  # Проверка на пустые строки
            text_data.append(para.text.strip())

    # Извлечение текста из таблиц
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                if cell.text.strip():
                    text_data.append(cell.text.strip())

    return text_data, num_tables

#Векторизация текста с использованием модели BERT
def vectorize_text(text_data):
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    vectors = model.encode(text_data)
    return vectors

#Оценка качества ответов с использованием метрик ROUGE и BertScore
def evaluate_responses(generated_responses, reference_responses):

    # ROUGE
    rouge = Rouge()
    rouge_scores = rouge.get_scores(generated_responses, reference_responses, avg=True)

    # BertScore
    scorer = BERTScorer(lang='en', rescale_with_baseline=True)
    P, R, F1 = scorer.score(generated_responses, reference_responses)

    return {
        'rouge': rouge_scores,
        'bert_score': {
            'precision': P.mean().item(),
            'recall': R.mean().item(),
            'f1': F1.mean().item()
        }
    }

# Загрузка файла
uploaded = files.upload()

# Получаем имя загруженного файла
docx_file_path = list(uploaded.keys())[0]

# Проверка существования файла
if not os.path.exists(docx_file_path):
    print(f"Файл не найден: {docx_file_path}")
else:
    # Парсинг документа
    text_data, num_tables = parse_docx(docx_file_path)

    # Вывод информации о парсинге
    print(f"Извлечено {len(text_data)} элементов текста из файла '{docx_file_path}':")
    print(f"Количество извлеченных параграфов: {len([para for para in text_data if para.strip()])}")
    print(f"Количество извлеченных таблиц: {num_tables}")

    print("Извлеченный текст:")
    for item in text_data:
        print(f"- {item}")

    # Векторизация
    vectors = vectorize_text(text_data)
    print("Векторы текста:")
    print(vectors)

    # Оценка качества ответов
    evaluation_results = evaluate_responses(generated_responses, reference_responses)
    print("Оценка качества ответов:")
    print(json.dumps(evaluation_results, indent=2))

Saving example.docx to example (2).docx
Извлечено 10 элементов текста из файла 'example (2).docx':
Количество извлеченных параграфов: 10
Количество извлеченных таблиц: 1
Извлеченный текст:
- Привет 1-03.
- 1
- 2
- 3
- аа
- бб
- вв
- А2а
- Б3б
- И5и
Векторы текста:
[[-1.3845092  -0.36675274  0.24337459 ... -0.3546616   0.5580823
   0.23140013]
 [-0.7217686  -0.10978892  0.14601806 ... -0.20666307  0.1631821
  -0.04253246]
 [-0.27175066 -0.20269282  0.49825206 ... -0.03542368  0.04647548
  -0.16507316]
 ...
 [-0.9785894   0.10704031  0.16611525 ... -0.12310312  0.44577533
  -0.12866648]
 [-0.24296033  0.27862197  0.3335077  ... -0.1856792   0.08455054
  -0.43749207]
 [-1.3600903  -0.2866409   0.18361102 ... -1.0180781   0.3478168
   0.2460928 ]]


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Оценка качества ответов:
{
  "rouge": {
    "rouge-1": {
      "r": 0.3333333333333333,
      "p": 0.25,
      "f": 0.2857142808163266
    },
    "rouge-2": {
      "r": 0.0,
      "p": 0.0,
      "f": 0.0
    },
    "rouge-l": {
      "r": 0.3333333333333333,
      "p": 0.25,
      "f": 0.2857142808163266
    }
  },
  "bert_score": {
    "precision": 0.2888505756855011,
    "recall": 0.5310782194137573,
    "f1": 0.408188134431839
  }
}
