In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

In [3]:
from sentence_transformers import CrossEncoder
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "Who wrote 'To Kill a Mockingbird'?"
documents = [
    "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.",
    "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.",
    "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.",
    "Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.",
    "The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.",
    "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan."
]

model.rank(query, documents, return_documents=True)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[{'corpus_id': 0,
  'score': 10.678579,
  'text': "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature."},
 {'corpus_id': 2,
  'score': 9.761675,
  'text': "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961."},
 {'corpus_id': 1,
  'score': -3.3099546,
  'text': "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil."},
 {'corpus_id': 5,
  'score': -4.8989124,
  'text': "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaLLM, OllamaEmbeddings

import weaviate
from weaviate.classes.data import DataObject # type: ignore
from weaviate.classes.config import Configure, VectorDistances # type: ignore

import weaviate.classes.query as wq
from weaviate.classes.query import Filter
from weaviate.classes.query import Rerank, MetadataQuery
from weaviate.classes.config import Property, DataType

from enum import Enum
from typing import Dict, Tuple
import numpy as np
import json

ollama_url = 'localhost'

In [None]:
class BooksProcessor:
    def __init__(self, ollama_url: str = 'localhost', embedding_model_name:str = 'nomic-embed-text',
                 wv_port_rest: int = 8080, wv_port_grpc: int = 50051):
        self.embedding_model_name = embedding_model_name
        self.wv_client = weaviate.connect_to_local(
            host=ollama_url,
            port=wv_port_rest,
            grpc_port=wv_port_grpc,
        )
        print(f"Connected to Weaviate: {self.wv_client.is_ready()}")
        self.big_chunks = self.create_collection_if_not_exists('big_chunks')
        self.medium_chunks = self.create_collection_if_not_exists('medium_chunks')
        self.small_chunks = self.create_collection_if_not_exists('small_chunks')


    def create_collection_if_not_exists(self, collection_name: str):
        # Проверка наличия коллекции
        if self.wv_client.collections.exists(collection_name):
            print(f"Getting '{collection_name}'")
        else:
            print(f"Creating '{collection_name}'")

            # Создать коллекцию с привязанным векторизатором. Два варианта векторизаторов
            book_chunks = self.wv_client.collections.create(
                name=collection_name,
                properties=[Property(name="chunk", data_type=DataType.TEXT),
                            Property(name="book_name", data_type=DataType.TEXT),
                            Property(name="chunk_num", data_type=DataType.INT)],
                
                vectorizer_config=[
                    Configure.NamedVectors.text2vec_ollama(
                        name="book_vectorizer",
                        source_properties=["book_chunks"],
                        api_endpoint="http://ollama:11434",
                        model=self.embedding_model_name,
                        vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE))
                ]
            )
        return self.wv_client.collections.get(collection_name)

    def check_book_exists(self, book_name: str):
        book_filter = Filter.by_property("book_name").equal(book_name)
        
        # Выполняем запрос
        result = self.big_chunks.query.fetch_objects(
            filters=book_filter,
            limit=1
        )
        return result.objects

    def split_book(self, book_text: str, chunk_size: int, chunk_overlap: int):
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,  # Максимальный размер чанка
            chunk_overlap=chunk_overlap  # Перекрытие между чанками
        )
        return splitter.create_documents([book_text])      

    def send_to_db(self, collection, chunks, book_name):
        with collection.batch.fixed_size(batch_size=10) as batch:
            for i, d in enumerate(chunks):
                batch.add_object({
                    "chunk": d.page_content,
                    "book_name": book_name,
                    "chunk_num": int(i)
                })

    def delete_book(self, collection_name: str):
        book_filter = Filter.by_property("book_name").equal(collection_name)
        
        # Выполнение пакетного удаления
        response_big = self.big_chunks.data.delete_many(
            where=book_filter
        )
        
        # Выполнение пакетного удаления
        response_medium = self.medium_chunks.data.delete_many(
            where=book_filter
        )

        # Выполнение пакетного удаления
        response_small = self.small_chunks.data.delete_many(
            where=book_filter
        )
        
        # Проверка результата
        if response_small.successful > 0:
            print(f"Successfully deleted {response_big.successful}, {response_medium.successful}, {response_small.successful} objects.")
        else:
            print("Nothing to delete")
    
    def process_book(self, book_name: str, book_txt: str):
        if self.check_book_exists(book_name):
            print("Book already exists")
            return
        else:
            print("Processing book")

        big_chunks = self.split_book(book_txt, 3000, 1000)
        self.send_to_db(collection=self.big_chunks, chunks=big_chunks, book_name=book_name)

        medium_chunks = self.split_book(book_txt, 1500, 500)
        self.send_to_db(collection=self.medium_chunks, chunks=medium_chunks, book_name=book_name)

        small_chunks = self.split_book(book_txt, 750, 250)
        self.send_to_db(collection=self.small_chunks, chunks=small_chunks, book_name=book_name)
        
        print("Book successfully processed")

class ChunkSize(Enum):
    SMALL = 'small_chunks'
    MEDIUM = 'medium_chunks'
    LARGE = 'big_chunks'

class Search:
    def __init__(self, ollama_url: str = 'localhost', llm_name:str = 'Llama3.2',
                 wv_port_rest: int = 8080, wv_port_grpc: int = 50051):
        self.llm = OllamaLLM(model=llm_name, temperature=0, base_url=f"{ollama_url}:11434")
        self.wv_client = weaviate.connect_to_local(
            host=ollama_url,
            port=wv_port_rest,
            grpc_port=wv_port_grpc,
        )
        print(f"Connected to Weaviate: {self.wv_client.is_ready()}")

        with open('classifier_prompt.j2') as f:
            template = f.read()
        
        self.prompt = PromptTemplate(
            input_variables=["query"],
            template=template,
            template_format="jinja2"
        )

    def classify_query(self, query: str) -> Dict[str, int]:
        prompt = self.prompt.format(query=query)
        response = self.llm.invoke(prompt).strip().upper()
        try:
            return ChunkSize[response].value
        except KeyError:
            return ChunkSize.MEDIUM.value
    
    def get_chunk_count(collection_name, book_name):
        # Создаем фильтр для конкретной книги
        book_filter = Filter.by_property("book_name").equal(book_name)
        
        # Выполняем агрегирующий запрос для подсчета количества чанков
        result = self.wv_client.query.aggregate("YourClassName").with_where(book_filter).with_group_by(["book_name"]).with_fields("count").do()
        
        # Извлекаем количество из результата
        count = result['data']['Aggregate']['YourClassName'][0]['count']
        return count

    def search(self, query: str, book: list):
        collection = self.classify_query(query)
        books = self.wv_client.collections.get(collection)
        response = books.query.hybrid(
            query=query,  # The model provider integration will automatically vectorize the query
            limit=9,
            filters=(
                Filter.by_property("book_name").equal(book)
            ),
        )


In [None]:
books_processor = BooksProcessor()
with open('Sherlock Study in Scarlet.txt', 'r', encoding='utf8') as file:
    text = file.read()
books_processor.process_book('Sherlock Study in Scarlet', text)
#books_processor.delete_book('Sherlock Study in Scarlet')

In [50]:
books_processor.big_chunks.query.hybrid('hi')



In [None]:
books_processor.big_chunks.

In [47]:
search = Search()

Connected to Weaviate: True


In [48]:
search.search(query='what is this book about?', books_to_search_in=['Sherlock Study in Scarlet'])

Коллекция для поиска: big_chunks


TypeError: _HybridQueryAsync.hybrid() got an unexpected keyword argument 'sort'