In [5]:
!pip install openai
!pip install evaluate
!pip install llama-cpp-python
!pip install pinecone-client
!pip install langchain==0.0.300
!pip install --upgrade chromadb
!pip install sentence-transformers==2.2.2

Collecting sentence-transformers==2.2.2
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.4.1
    Uninstalling sentence-transformers-3.4.1:
      Successfully uninstalled sentence-transformers-3.4.1
Successfully installed sentence-transformers-2.2.2


In [6]:
!pip uninstall -y sentence-transformers huggingface_hub
!pip install "sentence-transformers>=2.2.2" "huggingface_hub<1.0.0"

Found existing installation: sentence-transformers 2.2.2
Uninstalling sentence-transformers-2.2.2:
  Successfully uninstalled sentence-transformers-2.2.2
Found existing installation: huggingface-hub 0.29.1
Uninstalling huggingface-hub-0.29.1:
  Successfully uninstalled huggingface-hub-0.29.1
Collecting sentence-transformers>=2.2.2
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface_hub<1.0.0
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Using cached huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
Installing collected packages: huggingface_hub, sentence-transformers
Successfully installed huggingface_hub-0.29.1 sentence-transformers-3.4.1


In [1]:
from langchain.document_loaders import PDFMinerLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader, UnstructuredHTMLLoader
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from multiprocessing.pool import ThreadPool
from langchain.vectorstores import Chroma
from langchain.schema import Document as LCDocument
from chromadb.config import Settings
from typing import Any
from tqdm import tqdm

import uuid
from dataclasses import dataclass
from typing import List, Dict

import chromadb
from sentence_transformers import SentenceTransformer

In [2]:
import pandas as pd
import statistics
import time
import tqdm
import glob
import os

#Лабораторная работа №5

##Loader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from dataclasses import dataclass
from typing import List
import pyarrow.parquet as pq
import hashlib

In [5]:
@dataclass
class Document:
    content: str
    metadata: dict

In [6]:
class Loader:
    def load_documents(self, source_dir: str) -> List[Document]:
        """Загрузка данных из Parquet-файла"""
        # Чтение Parquet-файла
        df = pd.read_parquet(source_dir)

        documents = []
        for _, row in df.iterrows():
            doc = Document(
                content=row['text'],
                metadata={
                    'class': row['label'],
                    'doc_id': str(uuid.uuid4())  # Генерация уникального ID
                }
            )
            documents.append(doc)

        return documents

    def load_chunked_documents(self, source_dir: str, chunk_size: int = 1000) -> List[Document]:
        """Постепенная загрузка больших файлов"""
        parquet_file = pq.ParquetFile(source_dir)
        documents = []

        for batch in parquet_file.iter_batches(batch_size=chunk_size):
            df = batch.to_pandas()
            for _, row in df.iterrows():
                doc = Document(
                    content=row['text'],
                    metadata={
                        'class': row['label'],
                        'doc_id': str(uuid.uuid4())
                    }
                )
                documents.append(doc)

        return documents

##Splitter

In [7]:
class Splitter:
    def __init__(
        self,
        chunk_size: int = 128,
        chunk_overlap: int = 32,
        model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    ):
        self.text_splitter = SentenceTransformersTokenTextSplitter(
            model_name=model_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True
        )
        self.tokenizer = SentenceTransformer(model_name).tokenizer

    def split_documents(self, documents: List[Document]) -> List[Document]:
        chunks = []
        for doc in documents:
            text_chunks = self.text_splitter.split_text(doc.content)
            tokenized_chunks = self.tokenizer(text_chunks, padding=False, truncation=False)["input_ids"]

            for chunk_text, chunk_tokens in zip(text_chunks, tokenized_chunks):
                new_metadata = doc.metadata.copy()
                new_metadata.update({
                    "chunk_tokens": len(chunk_tokens),
                    "original_length": len(doc.content),
                    "chunk_hash": hash(chunk_text)
                })
                chunks.append(Document(content=chunk_text, metadata=new_metadata))

        return chunks

##Vector database

In [8]:
class Collector:
    def add(self, texts: list[str], metadatas: list[dict]):
        raise NotImplementedError

    def add_from_directory(self, dir_path: str):
        raise NotImplementedError

    def get(self, search_strings: list[str], n_results: int) -> list[Document]:
        raise NotImplementedError

    def get_documents(self, search_string: str, n_results: int, score_threshold: float) -> list[Document]:
        raise NotImplementedError

    def clear(self):
        raise NotImplementedError

In [9]:
class Embedder:
    def get_embedder(self):
        raise NotImplementedError

In [10]:
class HuggingFaceEmbedder(Embedder):
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"):
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name

    def get_embedder(self):
        return self.model

    def encode(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(
            texts,
            convert_to_tensor=False,
            normalize_embeddings=True
        ).tolist()


In [11]:
from tqdm import tqdm
import os
import shutil
from google.colab import drive

In [12]:
class ChromaCollector(Collector):
    def __init__(
        self,
        embedder: HuggingFaceEmbedder,
        loader: Loader = None,
        splitter: Splitter = None,
        collection_name: str = "documents",
        persist_dir: str = "chroma_db",
        chunk_size: int = 128,
        chunk_overlap: int = 32,
        max_retries: int = 3,
        batch_size: int = 1000,
    ):
        self.embedder = embedder
        self.loader = loader or Loader()
        self.splitter = splitter or Splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            model_name=embedder.model_name
        )
        self.persist_dir = persist_dir
        self.collection_name = collection_name
        self.max_retries = max_retries
        self.batch_size = batch_size
        self.distance_metric = "cosine"

        self._init_client()


    def _init_client(self):
        self.client = chromadb.PersistentClient(path=self.persist_dir)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": self.distance_metric}
        )

    def save_to_gdrive(self, gdrive_path: str):
        """Сохраняет индекс на Google Drive"""
        try:
            drive.mount('/content/drive')
            source_dir = self.persist_dir
            target_dir = f"/content/drive/MyDrive/{gdrive_path}"

            # Копируем с перезаписью
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
            shutil.copytree(source_dir, target_dir)

            print(f"Индекс сохранен в Google Drive: {target_dir}")
        except Exception as e:
            raise RuntimeError(f"Ошибка сохранения: {str(e)}")

    def load_from_gdrive(self, gdrive_path: str):
        """Загружает индекс с Google Drive"""
        try:
            drive.mount('/content/drive')
            source_dir = f"/content/drive/MyDrive/{gdrive_path}"
            target_dir = self.persist_dir

            if not os.path.exists(source_dir):
                raise FileNotFoundError("Индекс не найден на Google Drive")

            # Очищаем локальную копию
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)

            # Копируем данные
            shutil.copytree(source_dir, target_dir)

            # Переинициализируем клиента
            self._init_client()
            print(f"Индекс загружен из Google Drive: {source_dir}")
        except Exception as e:
            raise RuntimeError(f"Ошибка загрузки: {str(e)}")


    def _process_texts(self, texts: List[str], metadatas: List[dict]) -> tuple:
        temp_docs = [
            Document(content=text, metadata=meta)
            for text, meta in zip(texts, metadatas)
        ]

        chunks = self.splitter.split_documents(temp_docs)

        chunk_texts = [chunk.content for chunk in chunks]
        chunk_metas = [chunk.metadata for chunk in chunks]

        return chunk_texts, chunk_metas

    def add(self, texts: list[str], metadatas: list[dict]):
      chunks, metas = self._process_texts(texts, metadatas)

      # Рассчитываем общее количество батчей
      total_batches = (len(chunks) + self.batch_size - 1) // self.batch_size

      # Создаем прогресс-бар
      with tqdm(total=total_batches, desc="Добавление батчей", unit="batch") as pbar:
          for i in range(0, len(chunks), self.batch_size):
              batch_chunks = chunks[i:i+self.batch_size]
              batch_metas = metas[i:i+self.batch_size]
              success = False

              for attempt in range(self.max_retries):
                  try:
                      # Кодирование и добавление
                      embeddings = self.embedder.encode(batch_chunks)
                      ids = [str(uuid.uuid4()) for _ in batch_chunks]

                      self.collection.add(
                          ids=ids,
                          embeddings=embeddings,
                          metadatas=batch_metas,
                          documents=batch_chunks
                      )
                      success = True
                      break
                  except Exception as e:
                      pbar.write(f"Ошибка в батче {i//self.batch_size}: {str(e)}")
                      if attempt < self.max_retries - 1:
                          sleep_time = 2 ** attempt
                          pbar.write(f"Повтор через {sleep_time} сек...")
                          time.sleep(sleep_time)

              if success:
                  pbar.update(1)
                  pbar.set_postfix({
                      "добавлено": f"{min(i+self.batch_size, len(chunks))}/{len(chunks)}"
                  })
              else:
                  raise RuntimeError(f"Не удалось добавить батч {i//self.batch_size} после {self.max_retries} попыток")



    def add_from_directory(self, dir_path: str):
        documents = self.loader.load_documents(dir_path)
        texts = [d.content for d in documents]
        metas = [d.metadata for d in documents]
        self.add(texts, metas)

    def get(self, search_strings: list[str], n_results: int) -> list[Document]:
        embeddings = self.embedder.encode(search_strings)
        results = self.collection.query(
            query_embeddings=embeddings,
            n_results=n_results,
            include=["documents", "metadatas"]
        )
        return self._format_results(results)

    def get_documents(self, search_string: str, n_results: int, score_threshold: float) -> list[Document]:
        embedding = self.embedder.encode([search_string])[0]

        results = self.collection.query(
            query_embeddings=[embedding],
            n_results=n_results,
            include=["documents", "metadatas", "distances"]
        )

        # Фильтрация результатов по порогу
        filtered = []
        for doc, meta, distance in zip(results["documents"][0],
                                      results["metadatas"][0],
                                      results["distances"][0]):
            # Конвертируем расстояние в схожесть для косинусной метрики
            if self.distance_metric == "cosine":
                similarity = 1 - distance
            else:
                similarity = distance

            if similarity >= score_threshold:
                new_meta = meta.copy()
                new_meta["similarity"] = similarity
                filtered.append(Document(content=doc, metadata=new_meta))

        # Сортировка по убыванию схожести
        filtered.sort(key=lambda x: x.metadata["similarity"], reverse=True)

        return filtered[:n_results]

    def clear(self):
        self.client.reset()

    def _format_results(self, results) -> list[Document]:
        documents = []
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            documents.append(Document(
                content=doc,
                metadata=meta
            ))
        return documents

###Implementation vector database

In [13]:
loader = Loader()
embedder = HuggingFaceEmbedder()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
collector = ChromaCollector(
    embedder=embedder,
    loader=loader,
    persist_dir="documents",
    chunk_size=128
)

In [24]:
collector.add_from_directory("/content/drive/MyDrive/NLP Labs/Lab 5/data/train.parquet")

Token indices sequence length is longer than the specified maximum sequence length for this model (130 > 128). Running this sequence through the model will result in indexing errors
Добавление батчей: 100%|██████████| 125/125 [15:54<00:00,  7.63s/batch, добавлено=124804/124804]


In [None]:
collector.save_to_gdrive("MyDrive/NLP Labs/Lab 5/indexes")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Индекс сохранен в Google Drive: /content/drive/MyDrive/MyDrive/NLP Labs/Lab 5/indexes


In [None]:
!mv /content/documents '/content/drive/MyDrive/NLP Labs/Lab 5/indexes/documents/'

##Search

In [26]:
query = 'Donald Trump' #@param {type:"string"}
n_results = 3 #@param {type:"integer"}
score_threshold = 0.5 # @param {type:"slider", min:0, max:1, step:0.1}


In [27]:
query_results = collector.get_documents(query, n_results, score_threshold)

In [28]:
query_results

[Document(content='Donald Trumps Field Luke Donald shoots a 4-under-par 68 for a two-stroke lead after three rounds of the Dunhill Links Championship in St. Andrews, Scotland on Saturday.', metadata={'chunk_hash': -4657731494467214896, 'chunk_tokens': 43, 'class': 1, 'doc_id': '3840d261-69a9-4713-acda-502afeed62c6', 'original_length': 168, 'similarity': 0.5858850479125977}),
 Document(content='Donald Trumps Field Luke Donald shoots a 4-under-par 68 for a two-stroke lead after three rounds of the Dunhill Links Championship in St. Andrews, Scotland on Saturday.', metadata={'chunk_hash': 7604359559524370935, 'chunk_tokens': 43, 'class': 1, 'doc_id': 'e1839375-3b12-4df6-ade2-9199debaceb9', 'original_length': 168, 'similarity': 0.5858850479125977}),
 Document(content='Donald is fired-up to carry form to Detroit Luke Donald hailed yesterday #39;s European Masters victory in Switzerland as even more satisfying than his Scandinavian Masters success a month ago.', metadata={'chunk_hash': -15122

In [29]:
collector.get_documents("Presidential election results", n_results, 0.6)

[Document(content='Results of Presidential Campaign Polls (AP) AP - Results of recent polls on the presidential race. Listed above each set of results is the name of the 2000 winner in a given state, the organization that conducted the poll, the dates, the number interviewed, whether it was adults, registered voters (RV) or likely voters (LV) and the margin of error (MoE). Results may not total 100 percent because of rounding.', metadata={'chunk_hash': -859200330647925587, 'chunk_tokens': 102, 'class': 0, 'doc_id': 'cbe586aa-25d7-4b08-afed-147ea460e460', 'original_length': 411, 'similarity': 0.7503176927566528}),
 Document(content='Results of Presidential Campaign Polls (AP) AP - Results of recent polls on the presidential race. Listed above each set of results is the name of the 2000 winner in a given state, the organization that conducted the poll, the dates, the number interviewed, whether it was adults, registered voters (RV) or likely voters (LV) and the margin of error (MoE). Res

In [30]:
collector.get_documents("Brexit negotiation", n_results, 0.6)

[Document(content='Britain, Ireland push for Northern Ireland deal At a fairytale English castle, the prime ministers of Britain and Ireland will this week attempt to broker an unlikely marriage of convenience between the two extremes of Northern Irish politics.', metadata={'chunk_hash': 5808954147847351333, 'chunk_tokens': 52, 'class': 0, 'doc_id': '51922f1b-a0f9-4e6e-96eb-28bbc1ed9bf8', 'original_length': 243, 'similarity': 0.654464602470398}),
 Document(content='Britain, Ireland push for Northern Ireland deal At a fairytale English castle, the prime ministers of Britain and Ireland will this week attempt to broker an unlikely marriage of convenience between the two extremes of Northern Irish politics.', metadata={'chunk_hash': -4267666134119383305, 'chunk_tokens': 52, 'class': 0, 'doc_id': 'b8abf2ed-84a1-4c3f-bf36-a1ac57d90663', 'original_length': 243, 'similarity': 0.654464602470398}),
 Document(content='Blair in Northern Ireland bargaining Tony Blair is preparing to urge unionists

In [31]:
collector.get_documents("Grand Theft Auto", n_results, 0.5)

[Document(content='Car Theft Without Penalty of Prison in #39;GTA: San Andreas #39; quot;Grand Theft Auto: San Andreas quot; takes gamers back to the days of gangster rap, low-riders and quot;messing quot; with the police.', metadata={'chunk_hash': 8407607549837734116, 'chunk_tokens': 65, 'class': 3, 'doc_id': '7a32281b-44a4-4eab-a048-a32cb08cfbc2', 'original_length': 206, 'similarity': 0.7211950421333313}),
 Document(content='Car Theft Without Penalty of Prison in #39;GTA: San Andreas #39; quot;Grand Theft Auto: San Andreas quot; takes gamers back to the days of gangster rap, low-riders and quot;messing quot; with the police.', metadata={'chunk_hash': -7238619119816768451, 'chunk_tokens': 65, 'class': 3, 'doc_id': '190b3d21-3ada-4cf6-8c5e-fb3c8e9dd2f0', 'original_length': 206, 'similarity': 0.7211950421333313}),
 Document(content='\'Grand Theft Auto\': Keeping America safe from crime Turns out that "Grand Theft Auto," the game that many of America\'s moral watchdogs fear will turn the

In [32]:
collector.get_documents("Oscar winner", n_results, 0.5)

[Document(content='Win, lose Oscar is no gamble LAS VEGAS -- There is another reason 1992 Olympic champion Oscar De La Hoya is so appropriately called The Golden Boy.', metadata={'chunk_hash': 3802681004122757604, 'chunk_tokens': 36, 'class': 1, 'doc_id': '9b0a4f2a-fdc4-4f33-a95c-8a1e343dca60', 'original_length': 147, 'similarity': 0.5801349878311157}),
 Document(content='Win, lose Oscar is no gamble LAS VEGAS -- There is another reason 1992 Olympic champion Oscar De La Hoya is so appropriately called The Golden Boy.', metadata={'chunk_hash': -3646040845517606146, 'chunk_tokens': 36, 'class': 1, 'doc_id': '064432f0-7686-4939-ba6c-8d8fa862c4c9', 'original_length': 147, 'similarity': 0.5801349878311157}),
 Document(content="Story is too good for words Every once in a while an Oscar winner gets up there and wings an acceptance speech because quot;I never thought I'd win, so I didn't prepare anything. quot;", metadata={'chunk_hash': 230508244584051013, 'chunk_tokens': 49, 'class': 1, 'doc_

In [33]:
collector.get_documents("Bitcoin", n_results, 0.5)

[Document(content='Crypto researchers abuzz over flaws Presenters at the Crypto 2004 conference identify faster ways to forge digital signatures with common security algorithms.', metadata={'chunk_hash': 5957659949392002447, 'chunk_tokens': 34, 'class': 3, 'doc_id': '242b6077-d136-4cf3-83af-2784424785a7', 'original_length': 158, 'similarity': 0.5027420520782471}),
 Document(content='Crypto researchers abuzz over flaws Presenters at the Crypto 2004 conference identify faster ways to forge digital signatures with common security algorithms.', metadata={'chunk_hash': -4275013436857274757, 'chunk_tokens': 34, 'class': 3, 'doc_id': '51c9d5ee-5c5b-4f1d-8360-fb7f2e337dd4', 'original_length': 158, 'similarity': 0.5027420520782471})]

In [34]:
collector.get_documents("Artificial Intellegence", n_results, 0.5)

[Document(content='Software Tutors Offer Help and Customized Hints Broadly defined, an intelligent tutoring system is educational software containing an artificial intelligence component.', metadata={'chunk_hash': 4634018994060838319, 'chunk_tokens': 34, 'class': 3, 'doc_id': '97513169-5d9c-4539-a3e3-8f8f5c5387ec', 'original_length': 168, 'similarity': 0.5824275016784668}),
 Document(content='Software Tutors Offer Help and Customized Hints Broadly defined, an intelligent tutoring system is educational software containing an artificial intelligence component.', metadata={'chunk_hash': -5870643244644449535, 'chunk_tokens': 34, 'class': 3, 'doc_id': '63cafbb3-079c-411d-90ba-b77bf028676e', 'original_length': 168, 'similarity': 0.5824275016784668}),
 Document(content="Computers with multiple personalities The jury's still out on whether a computer can ever truly be intelligent, but there's no question that it can have multiple personalities. It's just a matter of software.", metadata={'chun

# Lab 6

In [35]:
import torch
from transformers import pipeline
from typing import List

class QASystem:
    def __init__(self, chroma_collector):
        # Инициализация Chroma
        self.chroma = chroma_collector

        # Инициализация пайплайна
        self.pipe = pipeline(
            "text-generation",
            model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

    def generate_answer(self, question: str, top_n: int = 2) -> str:
        # Получение контекста
        context_docs = self.chroma.get_documents(question, n_results=top_n, score_threshold=0.5)
        context = "\n".join([doc.content for doc in context_docs]) if context_docs else "No context available"

        # Формирование сообщений
        messages = [
            {
                "role": "system",
                "content": f"Answer the question using only this context: {context}"
            },
            {
                "role": "user",
                "content": question
            }
        ]

        # Создание промпта
        prompt = self.pipe.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Генерация ответа
        outputs = self.pipe(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )

        # Извлечение ответа
        full_response = outputs[0]["generated_text"]
        return full_response.split("<|assistant|>")[-1].strip()

In [36]:
general_qa_pairs = [
    (
        "What is photosynthesis?",
        "Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen using chlorophyll."
    ),
    (
        "How does a blockchain work?",
        "Blockchain is a decentralized digital ledger that records transactions across multiple computers in encrypted blocks linked in a chronological chain."
    ),
    (
        "What causes earthquakes?",
        "Earthquakes occur due to sudden energy release from tectonic plate movements along fault lines in Earth's crust."
    ),
    (
        "What are the main components of blood?",
        "Blood consists of plasma (55%), red blood cells (erythrocytes), white blood cells (leukocytes), and platelets (thrombocytes)."
    ),
    (
        "How do vaccines work?",
        "Vaccines stimulate the immune system by introducing weakened or inactivated pathogens to create antibody memory."
    ),
    (
        "What is the greenhouse effect?",
        "The greenhouse effect is the trapping of heat in Earth's atmosphere by gases like CO2 and methane, maintaining Earth's temperature."
    ),
    (
        "Explain Newton's laws of motion",
        "1) Inertia: Objects maintain motion unless acted on; 2) F=ma; 3) Every action has equal and opposite reaction."
    ),
    (
        "What is machine learning?",
        "Machine learning is a subset of AI where algorithms improve automatically through experience and data patterns."
    ),
    (
        "How do airplanes stay airborne?",
        "Airplanes achieve lift through wing shape (airfoil) creating pressure difference between upper and lower surfaces."
    ),
    (
        "What causes ocean tides?",
        "Tides are primarily caused by gravitational interactions between Earth, Moon, and Sun, with lunar gravity being dominant."
    )
]

In [37]:
qa = QASystem(collector)

Device set to use cuda:0


In [23]:
qa.generate_answer(general_qa_pairs[0][0])

'Photosynthesis is the process by which plants, algae, and certain bacteria produce food (glucose) by capturing light energy from the sun and converting it into chemical energy in the form of ATP (adenosine triphosphate) and NADPH (nicotinamide adenine dinucleotide phosphate). This process occurs in the chloroplasts of leaves, stems, and roots of plants. In photosynthesis, the chlorophyll pigments (chlorophyll a and b) absorb light energy from the sun and convert it into chemical energy, which is stored in the form of glucose (a type of sugar) and oxygen. The oxygen that is produced during photosynthesis is released back into the atmosphere as a byproduct. Photosynthesis is essential for the growth, survival, and reproduction of plants and other organisms that rely on photosynthesis for their energy needs.'

In [39]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [40]:
from bert_score import score

def calculate_bertscore(results):
    references = [res['true_answer'] for res in results]
    candidates = [res['predicted_answer'] for res in results]

    P, R, F1 = score(candidates, references, lang="en", verbose=True)

    for i, res in enumerate(results):
        res['bertscore'] = {
            "precision": P[i].item(),
            "recall": R[i].item(),
            "f1": F1[i].item()
        }

    return results

In [41]:
def evaluate(qa_system, test_pairs):
    results = []
    for question, true_answer in test_pairs:
        answer = qa_system.generate_answer(question)
        results.append({
            "question": question,
            "true_answer": true_answer,
            "predicted_answer": answer,
            "bertscore": None
        })
        results = calculate_bertscore(results)
    return results

In [42]:
# Запуск оценки
evaluation_results = evaluate(qa, general_qa_pairs)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.14 seconds, 6.96 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 12.58 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.21 seconds, 14.39 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.26 seconds, 15.61 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.34 seconds, 14.49 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


done in 0.37 seconds, 16.13 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.50 seconds, 14.13 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.53 seconds, 15.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.60 seconds, 15.05 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.82 seconds, 12.16 sentences/sec


In [47]:
# Тестирование
for result in evaluation_results:
    print(f"question: {result['question']}")
    print(f"Expected: {result['true_answer']}")
    print(f"Actual: {result['predicted_answer']}")
    print("Bert score metrics:")
    print(result['bertscore'])
    print('-'*50)

question: What is photosynthesis?
Expected: Photosynthesis is the process by which plants convert sunlight, water, and carbon dioxide into glucose and oxygen using chlorophyll.
Actual: Photosynthesis is a process where organisms convert sunlight into chemical energy in the form of glucose (a type of carbohydrate) by breaking down carbon dioxide and water into oxygen and glucose. It is a crucial process in the food chain, as it is the primary source of energy for plants and many other organisms. In photosynthesis, chlorophyll, a green pigment, absorbs light energy and converts it into electrical energy, which drives the process of chemical reactions.
Bert score metrics:
{'precision': 0.8815720081329346, 'recall': 0.9450684785842896, 'f1': 0.9122166037559509}
--------------------------------------------------
question: How does a blockchain work?
Expected: Blockchain is a decentralized digital ledger that records transactions across multiple computers in encrypted blocks linked in a chro