In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import weaviate
from weaviate.classes.data import DataObject
from weaviate.classes.config import Configure, VectorDistances

from langchain_ollama import OllamaEmbeddings
import numpy as np
import json

import weaviate.classes.query as wq
from weaviate.classes.query import Filter
from weaviate.classes.query import Rerank, MetadataQuery


# Bare hostname (not a full URL, despite the name) of the machine running the
# local services: it is passed as `host` to weaviate.connect_to_local and is
# combined with port 11434 to address the Ollama server.
ollama_url = 'localhost'

In [3]:
# Load the whole book into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as book_file:
    text = book_file.read()

In [4]:
# Chunking setup: chunk_size is the maximum chunk size and chunk_overlap is the
# overlap shared by neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)

# Split the book; the single metadata dict is paired with the single input text.
documents = splitter.create_documents(
    [text],
    metadatas=[{'book': 'alice_in_wonderland'}],
)

In [5]:
# Connect to the local Weaviate instance.
# This cell only opens the connection: it deliberately does not delete any
# collections, so re-running it can never wipe data stored in the instance.
client = weaviate.connect_to_local(
    host=ollama_url,   # host name of the Weaviate server
    port=8080,         # HTTP port
    grpc_port=50051,   # gRPC port
)

# Sanity check that the server answers before any collection work starts.
print(client.is_ready())

True


In [8]:
# (Re)create the "Books" collection from scratch. Only this collection is
# dropped, so any other collections in the same Weaviate instance survive a re-run.
if client.collections.exists("Books"):
    client.collections.delete("Books")

books = client.collections.create(
    name="Books",
    # One named vector, computed from the "text" property by the Ollama
    # embedding integration. (A single unnamed vectorizer via
    # Configure.Vectorizer.text2vec_ollama is the other possible setup.)
    vectorizer_config=[
        Configure.NamedVectors.text2vec_ollama(
            name="book_vectorizer",
            source_properties=["text"],
            # Address of Ollama as seen by Weaviate from inside its Docker container.
            api_endpoint="http://ollama:11434",
            model="nomic-embed-text",
            vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
        )
    ],
    # Generative integration, also served by Ollama.
    generative_config=Configure.Generative.ollama(
        api_endpoint="http://ollama:11434",
        model="llama3.2",
    ),
)

# Populate the collection: one object per text chunk.
with books.batch.fixed_size(batch_size=10) as batch:
    for d in documents:
        batch.add_object({
            "text": d.page_content,
            "book": d.metadata['book']
        })

# Batch imports do not raise on per-object errors, so surface them explicitly
# instead of continuing with a partially filled collection.
failed_objects = books.batch.failed_objects
if failed_objects:
    raise RuntimeError(
        f"{len(failed_objects)} of {len(documents)} chunks failed to import; "
        f"first failure: {failed_objects[0]!r}"
    )

In [58]:
# Run a hybrid (vector + keyword) search over the "Books" collection and
# rerank the hits against the same question using the "text" property.
books = client.collections.get("Books")
query = "was Bill killed?"
response = books.query.hybrid(
    query=query,  # the model provider integration vectorizes the query automatically
    limit=9,
    # Hybrid search ranks by a fused score, so request the score rather than
    # a vector distance.
    return_metadata=wq.MetadataQuery(score=True),
    rerank=Rerank(prop="text", query=query),
    # Optional: restrict the hits with a filter, e.g.
    # filters=Filter.by_property("text").contains_any(["mouse"]),
)

# Show the retrieved chunks, separated by a visual divider.
for obj in response.objects:
    print('========')
    print(obj.properties['text'])

`Oh! So Bill's got to come down the chimney, has he?' said
Alice to herself.  `Shy, they seem to put everything upon Bill!
I wouldn't be in Bill's place for a good deal:  this fireplace is
narrow, to be sure; but I THINK I can kick a little!'

  She drew her foot as far down the chimney as she could, and
waited till she heard a little animal (she couldn't guess of what
sort it was) scratching and scrambling about in the chimney close
above her:  then, saying to herself `This is Bill,' she gave one
sharp kick, and waited to see what would happen next.

  The first thing she heard was a general chorus of `There goes
Bill!' then the Rabbit's voice along--`Catch him, you by the
hedge!' then silence, and then another confusion of voices--`Hold
up his head--Brandy now--Don't choke him--How was it, old fellow?
What happened to you?  Tell us all about it!'
One of the jurors had a pencil that squeaked.  This of course,
Alice could not stand, and she went round the court and got
behind him, and 

In [59]:
# Wrap every retrieved chunk in <doc>...</doc> tags (one per line) and
# concatenate them into a single context string for the LLM.
prompt_new = ''.join(
    '<doc>' + hit.properties['text'] + '</doc>\n'
    for hit in response.objects
)
print(prompt_new)

<doc>`Oh! So Bill's got to come down the chimney, has he?' said
Alice to herself.  `Shy, they seem to put everything upon Bill!
I wouldn't be in Bill's place for a good deal:  this fireplace is
narrow, to be sure; but I THINK I can kick a little!'

  She drew her foot as far down the chimney as she could, and
waited till she heard a little animal (she couldn't guess of what
sort it was) scratching and scrambling about in the chimney close
above her:  then, saying to herself `This is Bill,' she gave one
sharp kick, and waited to see what would happen next.

  The first thing she heard was a general chorus of `There goes
Bill!' then the Rabbit's voice along--`Catch him, you by the
hedge!' then silence, and then another confusion of voices--`Hold
up his head--Brandy now--Don't choke him--How was it, old fellow?
What happened to you?  Tell us all about it!'</doc>
<doc>One of the jurors had a pencil that squeaked.  This of course,
Alice could not stand, and she went round the court and got


In [62]:
from llmlingua import PromptCompressor

# LLMLingua-2 prompt compressor, loaded on the CPU.
llm_lingua = PromptCompressor(
    model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    device_map="cpu",  # or an equivalent device setting
    use_llmlingua2=True,  # use the LLMLingua-2 variant
)

# Compress the retrieved context; newlines and question marks are passed as
# force_tokens so the compressor is asked to preserve them.
compressed_prompt = llm_lingua.compress_prompt(
    prompt_new,
    rate=0.48,
    force_tokens=['\n', '?'],
)
# To bypass compression while debugging, use instead:
# compressed_prompt = {'compressed_prompt': prompt_new}

Token indices sequence length is longer than the specified maximum sequence length for this model (2223 > 512). Running this sequence through the model will result in indexing errors


In [65]:
# Display the compressed text stored under the 'compressed_prompt' key of the
# compressor's result.
compressed_prompt['compressed_prompt']

"Bill's come down chimney?' said\n Alice `Shy they put everything upon Bill!\n wouldn't be in Bill's place for deal fireplace\n narrow THINK I can kick little!'\n\n drew foot down chimney\n waited till heard animal\n scratching in chimney\n `This is Bill gave\n sharp kick waited to see what next\n\n first heard chorus of `There goes\n Bill!' Rabbit's voice`Catch him\n hedge!' silence confusion of voices`Hold\n up head-Brandy-Don't choke him old fellow?\n What happened? Tell us about it!\n had pencil squeaked\n Alice stand went\n found opportunity taking it\n away quickly juror\n Bill, Lizard could make out\n obliged to write\n with one finger\n left no mark on slate\n\n `Herald read accusation!' said King.\n\n White Rabbit blew three blasts on trumpet\n unrolled parchment scroll read\n\n Queen of Hearts made tarts\n summer day\n Knave of Hearts stole tarts\n took them away!'\n\n`Consider verdict King said to jury.\n\n `Not yet! Rabbit interrupted\n great deal to come before that!'</doc

In [63]:
from langchain_ollama import OllamaLLM

# Local Ollama model used to answer the question; the base URL carries an
# explicit scheme so it is a well-formed URL.
llm = OllamaLLM(model="llama3.2", temperature=0.1, base_url=f"http://{ollama_url}:11434")

# Final prompt: compressed context, then the question, then answering instructions.
final_prompt = (
    compressed_prompt['compressed_prompt'] + '\n\n'
    + '\n<question>\n' + query + '\n</question>'
    + '\n Answer shortly, do not overthink. Use ONLY the materials above!'
)

# The answer gets its own name: `response` keeps holding the Weaviate search
# result, so the cell that builds the prompt from `response.objects` can still
# be re-run after this one.
answer = llm.invoke(final_prompt)
print(answer)

  llm = OllamaLLM(model="llama3.2", temperature=0.1, base_url=f"{ollama_url}:11434")


No, Bill was not killed. He was held by two guinea-pigs and given something from a bottle, but there is no indication that he was harmed or killed.


In [None]:
# TODO: implement the 4th container: reranker + llmlingua-2 + the call to Ollama. Some reasoning could optionally be added as well.