In [6]:
from qdrant_client.http.models import (
    PointStruct, VectorParams, Distance, PayloadSchemaType
)
from qdrant_client.http.models import TextIndexParams, TokenizerType

from qdrant_client import QdrantClient, models

import numpy as np
import pandas as pd
import sys, json
from pathlib import Path
from qdrant_client.http.models import TextIndexParams
from more_itertools import batched
from tqdm.notebook import tqdm

sys.path.append(str(Path.cwd().parent))
from app import gen_queries, embed, rag_pipeline
from config import LLM_NAME, ENCODER_NAME, COLLECTION_NAME, LLM_PORT, ENCODER_PORT, QDRANT_PORT

from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = f"http://localhost:{ENCODER_PORT}/v1"

llm_client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
    timeout=10000
)

q_client = QdrantClient(host="localhost", port=QDRANT_PORT, check_compatibility=False, timeout=1000)
Docs_Dense = "Docs_Dense"

await rag_pipeline('what is this service about')

{'answer': 'This service is about providing detailed settings for various Moodle activities, including Choice, Building Feedback, Feedback settings, Advanced grading methods, Survey settings, and others. It covers administrative tasks such as editing existing activities, configuring appointments, managing chat settings, and adding question types to lessons. Additionally, it provides information on technical skills needed for Moodle administration and how to use specific plugins like AMOS for translation workflows.',
 'context': QueryResponse(points=[ScoredPoint(id=387, version=0, score=2.1666667, payload={'text': "##Choice settings\n\n###\nThis page explores in more detail the settings for the Choice activity once you have added it to your course and also covers the Site administration settings.\nNote that if you want to edit an existing choice, click the Edit link to its right and choose the action you wish to take, e.g. 'Edit settings'.\n\n\n===SECTION_BREAK===\n\n###General\nIn the 

# Создание индекса qudrant

In [4]:
embs_table = pd.read_csv('../data/buffer_table.csv', index_col=0).reset_index()
retrieve_texts = embs_table["text"]
embs_table

Unnamed: 0,url,text,lengths
0,https://docs.moodle.org/403/en/2D/3D_structure...,##2D/3D structure display short answer\n\n###O...,323
1,https://docs.moodle.org/403/en/2D/3D_structure...,##2D/3D structure display short answer\n\nYou ...,241
2,https://docs.moodle.org/403/en/2D/3D_structure...,##2D/3D structure display short answer\n\n===S...,59
3,https://docs.moodle.org/403/en/2D/3D_structure...,##2D/3D structure display short answer\n\n###F...,334
4,https://docs.moodle.org/403/en/2D/3D_structure...,##2D/3D structure display short answer\n\n===S...,177
...,...,...,...
44627,https://docs.moodle.org/403/en/wikindx,##Wikindx\n\n5. Re the citation/bibliographic ...,293
44628,https://docs.moodle.org/403/en/wikindx,##Wikindx\n\n. In fact doubly easy if moodle ...,224
44629,https://docs.moodle.org/403/en/wikindx,##Wikindx\n\n6,12
44630,https://docs.moodle.org/403/en/wikindx,##Wikindx\n\n. What capabilities are there in ...,424


In [7]:
all_responses = []
tests_batches = list(batched(retrieve_texts.to_list(), 1024))
for batch in tqdm(tests_batches, desc="Processing batches"):
    responses = llm_client.embeddings.create(
        input=batch,
        model=ENCODER_NAME,
    )
    all_responses.extend(responses.data)

Processing batches:   0%|          | 0/44 [00:00<?, ?it/s]

In [10]:
texts = retrieve_texts.to_list()
embeddings = list(map(lambda x: x.embedding, responses.data))
ids = embs_table.index.to_list()
urls = embs_table.url.to_list()

In [11]:
if q_client.collection_exists(Docs_Dense):
    q_client.delete_collection(Docs_Dense)
    
q_client.create_collection(
    collection_name=Docs_Dense,
    vectors_config=VectorParams(size=2048, distance=Distance.COSINE),
)

points = [
    PointStruct(id=idx, vector=vec, payload={"text": text, "urls": url})
    for idx, vec, text, url in zip(ids, embeddings, texts, urls)
]

batches = list(batched(points, 512))
for batch in tqdm(batches):
    q_client.upsert(collection_name=Docs_Dense, points=list(batch), wait=True)

q_client.create_payload_index(
    collection_name=Docs_Dense,
    field_name="text",
    field_schema=TextIndexParams( 
        type="text",
        tokenizer=TokenizerType.MULTILINGUAL,
        lowercase=True,
        min_token_len=2,
        max_token_len=20,
    ),
    wait=True
)

q_client.get_collections()

  0%|          | 0/2 [00:00<?, ?it/s]

CollectionsResponse(collections=[CollectionDescription(name='Docs_BM25'), CollectionDescription(name='Docs_Dense')])