In [1]:
!pip install evaluate==0.4.3
!pip install llama-cpp-python==0.1.9
!pip install pinecone-client==5.0.1
!pip install langchain_community==0.2.16
!pip install langchain-chroma==0.1.4
!pip install chromadb==0.5.11
!pip install sentence-transformers==3.1.1

Collecting evaluate==0.4.3
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate==0.4.3)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate==0.4.3)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate==0.4.3)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate==0.4.3)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate==0.4.3)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from evaluate==0.4.3)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━

In [2]:
from langchain_community.document_loaders import PDFMinerLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from multiprocessing.pool import ThreadPool
from langchain_chroma.vectorstores import Chroma
from langchain.schema import Document
from chromadb.config import Settings
from llama_cpp import Llama
from evaluate import load
from typing import Any
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
import numpy as np
import statistics
import pinecone
import glob
import os

In [4]:
from sentence_transformers import SentenceTransformer

#Лабораторная работа №5

##Declaring constant

In [5]:
# Словарь, сопоставляющий расширения файлов с соответствующими загрузчиками данных и их параметрами
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}

In [6]:
# Параметры конфигурации для векторного поиска и разделения текста
INDEX_NAME = "VDB"  # Название индекса для хранения векторных представлений
EMBEDDINGS = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'  # Название модели эмбеддингов, используемой для векторизации текстов
SIZE = 250  # Размер фрагмента текста для разделения документов
OVERLAP = 50  # Перекрытие между фрагментами текста для обеспечения контекста

##Loader

In [7]:
import itertools
import re

pattern = r"{price_pattern}|{abbr_patterns}|({phone_pattern})|({email_pattern})|(\'?[\w\-]+)|([^A-Za-z0-9 \n])"
sent_pattern = r"((?<=\.|\?|!|\;))({abbr_patterns})\s"

phone_pattern = r"\+?[0-9] ?\(?[0-9]+\)?[0-9 -]+"
# [\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}
email_pattern = r"[^@ \t\r\n]+@[^@ \t\r\n]+\.[^@ \t\r\n]+"
price_pattern = r"(\$ ?\d*\.?\d+)|(\d*\.?\d+ ?\$)"

english_abbr = ["Mr.", "Mrs.", "Mss.", "Ms.", "Dr."]
english_abbr = [x.replace(".", "\.") for x in english_abbr]
english_abbr.extend(map(lambda x: x.lower(), english_abbr.copy()))

sent_pattern = sent_pattern.format(abbr_patterns="".join(map(lambda x: fr"(?<!{x})", english_abbr)))
sent_pattern = re.compile(sent_pattern)

def split_to_sentence(text: str) -> list[str]:
    return list(filter(lambda x: len(x) if x else False, sent_pattern.split(text)))

In [8]:
def load_dataset(split_type="train", n: int | None = None, dataset_path = "../../assets/{split_type}.csv", random_state=42) -> pd.DataFrame:
    assert split_type == "train" or split_type == "test"
    dataset_path = dataset_path.format(split_type=split_type)
    if not os.path.exists(dataset_path):
        splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet',
                  'test': 'yelp_review_full/test-00000-of-00001.parquet'}
        df = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits[split_type])
        df.to_csv(dataset_path, index=False)
    else:
        df = pd.read_csv(dataset_path)
    if n is None:
        return df
    else:
        return train_test_split(df, train_size=n, stratify=df["label"], random_state=random_state)[0]

In [9]:
def process_df(df: pd.DataFrame) -> list[tuple[list[str], str|int]]:
    data = []
    meta = []
    ids = []
    for idx, row in df.iterrows():
        label, text = row["label"], row["text"]
        chunks = splitter.split_document(text)
        data.extend(chunks)
        meta.extend([{"label": label} for _ in range(len(chunks))])
        ids.extend([f"{idx}_{i}" for i in range(len(chunks))])
    return data, meta, ids

def dataset_batch_iter(df, batch_size):
    for df_b in np.array_split(df, batch_size):
        yield process_df(df_b)
    return

In [10]:
# Класс для загрузки документов из различных источников, поддерживающий работу с разными форматами файлов
class Loader:
    def load_single_document(self, file_path: str):
        return

    def load_documents(self, source_dir: str):
        pass  # Метод для загрузки всех документов из указанной директории

##Splitter

In [11]:
# Класс для разделения документов на фрагменты определённого размера с заданным перекрытием
class Splitter:
    def __init__(self, chunk_size, chunk_overlap):
        assert chunk_size > chunk_overlap
        self.chunk_size = chunk_size
        self.chunk_overlap=chunk_overlap

    def split_document(self, document: str):
        # Метод для разделения переданных документов на фрагменты
        doc_sents = []
        for sent in split_to_sentence(document):
          for i in range(0, len(sent), self.chunk_size-self.chunk_overlap):
            start, end = i, i+self.chunk_size
            doc_sents.append(sent[start: end])
        return doc_sents

In [12]:
splitter=Splitter(SIZE, OVERLAP)

In [13]:
# splitter.split_document("I love driving and I dont loke kaksd asdkf")

In [14]:
df = load_dataset(split_type="train", n=10, dataset_path='./assets/{split_type}.csv')
for i in dataset_batch_iter(df, batch_size = 2):
  print(i)
  break

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(['Stayed here mid July 2012 on a weekend.', 'I was able to snag a 3 nights for $150 deal through Facebook.', 'Overall I was satisfied with our stay but this hotel seems more like hotel for \\"older folk\\".', 'When you first walk you feel like your at a hospital because its really bright and white.', 'The renovations are noticeable and the rooms are comfy.', 'Won a bit of money on black-jack and roulette.', 'The coconut/smoke mixture in the air got annoying after the 2nd day.', 'Resort fee is 15 dollars which is less than a majority of the other resorts.', "The light let in from the blinds didn't really bug me.", 'I would recommend this hotel if you are on a budget.\\n\\nPros\\n-Check-in was fast.\\n-Hotel does not get crowded (like other hotels).\\n-Room was nice.\\n-Pool area is nice even though it was a mixture of older and younger folk.', ' older and younger folk.', "\\n\\nCons\\n-Our room felt like it was miles from the casino floor.\\n-Bathrooms didn't seem so remodeled to me.",

  return bound(*args, **kwds)


##Vector database

In [15]:
# Базовый класс для работы с коллекцией документов, поддерживающий добавление, поиск и очистку данных
class Collector:
    def add(self, texts: list[str], metadatas: list[dict]):
        pass  # Метод для добавления текстов и связанных с ними метаданных в коллекцию

    def add_from_directory(self, dir_path: str):
        pass  # Метод для добавления документов в коллекцию из указанной директории

    def get(self, search_strings: list[str], n_results: int) -> list[Document]:
        pass  # Метод для поиска документов по строкам запроса с ограничением на количество результатов

    def get_documents(self, search_string: str, n_results: int, score_threshold: float) -> list[Document]:
        pass  # Метод для поиска документов с учётом порога релевантности и количества возвращаемых результатов

    def clear(self):
        pass  # Метод для очистки коллекции документов

In [16]:
# Базовый класс для создания эмбеддингов, обеспечивающий интерфейс для получения модели эмбеддингов
class Embedder:
    def __init__(self, model_name):
        pass  # Инициализация эмбеддера

    def get_embedding(self, sent):
        pass  # Метод для получения модели эмбеддингов, которая будет использоваться для векторизации текстов

In [17]:
class SentenceEmbedder(Embedder):
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"):
       self.model = SentenceTransformer(model_name)
    def get_embedding(self, sent):
        # Метод для получения модели эмбеддингов, которая будет использоваться для векторизации текстов
        return self.model.encode(sent).tolist()
    def __call__(self, input):
        return self.get_embedding(input)

In [18]:
class ChromaCollector(Collector):
    def __init__(self, name_prefix, root_path, embeddnig_fn, distance_fn):
      self.client = chromadb.PersistentClient(path=root_path)
      self.distance_fn = distance_fn
      self.embedding_fn = embeddnig_fn
      self._collection_name = name_prefix + self.distance_fn
      self.database = self.get_database()

    def get_database(self):
      return self.client.get_or_create_collection(
            self._collection_name,
            metadata={"hnsw:space": self.distance_fn},
            embedding_function=self.embedding_fn
        )

    def load_dataset(self, df: pd.DataFrame, batch_size=128) -> None:
        for chunks, metas, ids in tqdm(dataset_batch_iter(df, batch_size = batch_size), total=math.ceil(df.shape[0] / batch_size), desc="loading to the DB"):
          self.database.add(
                documents=chunks,
                metadatas=metas,
                ids=ids
            )

    def query(self, query, n_results: int, query_texts=None, where=None, where_document=None):
        return self.database.query(
            n_results=n_results,
            query_texts=query_texts,
            query_embeddings=self.embedding_fn(query),
            where=where,
            where_document=where_document
        )

    def clear(self):
        self.client.delete_collection(self._collection_name)

###Implementation vector database

In [19]:
path_to_index = '/VDB' #@param {type:"string"}
path_to_df = './assets/{split_type}.csv' #@param {type:"string"}

In [20]:
import chromadb
import math

In [21]:
df = load_dataset(split_type="train", n=10_000, dataset_path=path_to_df)
emedder = SentenceEmbedder()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
database_cos = ChromaCollector("my_db", path_to_index, emedder, "cosine")

In [23]:
database_cos.load_dataset(df)

loading to the DB:   0%|          | 0/79 [00:00<?, ?it/s]

  return bound(*args, **kwds)


##Search

In [24]:
query = 'What is your favorite food?' #@param {type:"string"}
n_results = 5 #@param {type:"integer"}
# score_threshold = 0.5 # @param {type:"slider", min:0, max:1, step:0.1}

result = database_cos.query(query, n_results=5)
for dist, document, meta in zip(result["distances"][0], result["documents"][0], result["metadatas"][0]):
  print(f"{dist:0.2f}  {meta['label']}  {document}")

0.14  4  Maybe my favorite side dish ever.
0.16  4  one of my favorite dishes of all time!
0.19  4  Love the food.
0.19  4  Love the food.
0.19  4  My absolute favorite place to eat.


In [25]:
query = 'What is your favorite restaurant?' #@param {type:"string"}
n_results = 5 #@param {type:"integer"}

result = database_cos.query(query, n_results=5)
for dist, document, meta in zip(result["distances"][0], result["documents"][0], result["metadatas"][0]):
  print(f"{dist:0.2f}  {meta['label']}  {document}")

0.08  4  One of my favorite restaurants on the planet.
0.08  4  This is officially my favorite restaurant.
0.12  4  Absolutely my most favorite restaurant ever.
0.12  4  This is one of my favorite places to eat.
0.12  3  This is one of my favorite places to eat.


In [26]:
query = 'What do you think about this doctor?' #@param {type:"string"}
n_results = 5 #@param {type:"integer"}

result = database_cos.query(query, n_results=5)
for dist, document, meta in zip(result["distances"][0], result["documents"][0], result["metadatas"][0]):
  print(f"{dist:0.2f}  {meta['label']}  {document}")

0.26  2   Seems like a good Doctor.
0.28  1  I used to like this doctor, but now I don't.
0.33  2  This is my primary doctor...
0.37  0  I hear he's in med school now?
0.37  3   There is another Doctor in the office however, Dr. Kessler.


##Evaluation

In [None]:
# Класс для оценки работы коллектора, предоставляющий функционал для поиска, оценки и расчета статистики по результатам
class CollectorEvaluator:
    def __init__(self, collector: Collector, n_top=100):
        pass  # Инициализация коллектора и параметра n_top для ограничения числа возвращаемых результатов

    def explore_collector(self, text):
        pass  # Метод для поиска документов в коллекторе на основе текста запроса

    def eval(self, query, answer):
        pass  # Метод для оценки корректности найденных документов на основе запроса и правильного ответа

    def calculate_statistics(self, data):
        pass  # Метод для расчета статистических показателей (например, минимальное, максимальное, среднее значение)

    def explore_and_calculate(self, data):
        pass  # Метод для проведения поиска по данным и расчета статистики на основе результатов

(10, 2)

In [120]:
path_to_dataset = '/content/QA.csv' #@param {type:"string"}
n_docs = 100 #@param {type:"integer"}
n_top = 100 #@param {type:"integer"}

In [None]:
df_qa = df.iloc[:n_docs,]

In [114]:
def find_id_index(result_ids, my_id, trunc=False):
  for idx, rid in enumerate(result_ids):
    if trunc:
      if rid.split("_")[0]==my_id.split("_")[0]:
        return idx
    else:
      if rid == my_id:
        return idx
  return

In [115]:
find_id_index(["111", "123"], "123", trunc=True)

1

In [None]:
find_id_indexes = []
for chunks, metas, ids in tqdm(dataset_batch_iter(df_qa, batch_size = 1), desc="qa DB"):
  for chunk, meta, id_ in zip(chunks, metas, ids):
    results = database_cos.query(query=chunk, n_results=n_top)
    ch_id = find_id_index(results["ids"][0], id_, trunc=False)
    find_id_indexes.append(ch_id)

qa DB: 0it [00:00, ?it/s]

  return bound(*args, **kwds)


In [None]:
indexes = np.array(find_id_indexes, dtype="float64")

print("Средняя позиция документа:" ,np.mean(indexes[~np.isnan(indexes)]))
print("Количество не найденных документов:", np.count_nonzero(np.isnan(indexes)))

Средняя позиция документа: 0.0
Количество не найденных документов: 20


In [None]:
df[["text"]].head()

Unnamed: 0,text
618104,Review at the bottom-- Picky Much? I love tha...
121392,This review is based on the fact I was a guest...
360065,This place is just awful. Don't get me wrong t...
120474,Get your picture taken with the Million Dollar...
340905,I was having family come visit from out of sta...


In [92]:
idx = 80
df.iloc[idx, 1], df.iloc[idx].name

("Perhaps we had unrealistic expectations for this place, having gone to L'Avenue the day before, but boy was it bad. Service beyond slow, despite the place being only half full, latte so bad as to be un drinkable . I actually sent it back, twice and told them to forget it. Pancakes were meh, poached eggs rubbery, need I say more? When we got the check, they made a big deal of saying they didn't charge me for the latte I couldn't drink!!! Should hope not....won't be back.",
 335445)

In [116]:
queries = [ # text_query, doc_id
    ("What color is a parking pass?", 121392),
    ("Does the discount compensate the workers incompetent?", 340905),
    ("How long I was a Centurylink customer?",561808),
    ("What was a spicy level for the Panang Curry", 153961),
    ("How many times does the servers came back?", 596493),
    ("Where is a good Ted Wien's store located?", 257236),
    ("Which DJ played on Paul Oakenfield night?", 213914),
    ("How many start does the Carvel lost since last year?", 418978),
    ("What item did a man from Arizona buy two at a time?", 33736),
    ("Did the poor service, unpalatable latte, and subpar food items disappoint the diner to such an extent that they felt the need to send back the latte twice and even requested not to be charged for it?", 335445)
]

In [122]:
find_id_indexes = []
for data in tqdm(queries):
  query, id_ = data
  results = database_cos.query(query=query, n_results=n_top)
  ch_id = find_id_index(results["ids"][0], str(id_), trunc=True)
  find_id_indexes.append(ch_id)

  0%|          | 0/10 [00:00<?, ?it/s]

In [123]:
indexes = np.array(find_id_indexes, dtype="float64")

print("Средняя позиция документа:" ,np.mean(indexes[~np.isnan(indexes)]))
print("Количество не найденных документов:", np.count_nonzero(np.isnan(indexes)))

Средняя позиция документа: 10.555555555555555
Количество не найденных документов: 1
