In [11]:
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from qdrant_client import QdrantClient
from qdrant_client.http import models
import os
import re

In [12]:
load_dotenv()

client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ["HF_TOKEN"],
)

database = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)

In [13]:
database.recreate_collection(
    collection_name="PassportKnowledgeBase",
    vectors_config={
        "chunk_vector": models.VectorParams(size=768, distance=models.Distance.COSINE),
        "summary_vector": models.VectorParams(size=768, distance=models.Distance.COSINE)
    },
    on_disk_payload=True
)

  database.recreate_collection(


True

In [14]:
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'#{1,6}\s*', '', text)
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
    text = re.sub(r'\|', ' ', text)
    text = re.sub(r'-{3,}', '', text)
    text = re.sub(r'={3,}', '', text)
    text = re.sub(r'[\*=\-_]{3,}', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\\n', ' ').replace('\\t', ' ')
    text = text.strip()
    return text

with open('chunkedKnowledge.txt', 'r', encoding='utf-8') as f:
    content = f.read()
    chunks = [chunk.strip() for chunk in content.split('\n\n') if chunk.strip()]

chunk_embeddings = []
summary_embeddings = []
texts = []
summaries = []

for i, chunk in enumerate(chunks):
    try:
        cleanedText = clean_text(chunk)
        
        chunk_vector = client.feature_extraction(
            text=cleanedText,
            model="google/embeddinggemma-300m"
        )
        summary_result = client.summarization(
            text=cleanedText,
            model="facebook/bart-large-cnn"
        )
        
        summary_text = summary_result.summary_text
        
        summary_vector = client.feature_extraction(
            text=summary_text,
            model="google/embeddinggemma-300m"
        )
        
        chunk_embeddings.append(chunk_vector)
        summary_embeddings.append(summary_vector)
        texts.append(cleanedText)
        summaries.append(summary_text)
        
    except Exception as e:
        print(f"Error processing chunk {i + 1}: {e}")
        continue


In [16]:
for i, chunk in enumerate(chunks):
    database.upsert(
        collection_name="PassportKnowledgeBase",
        points=[
            models.PointStruct(
                id=i,
                vector={
                    "chunk_vector": chunk_embeddings[i],
                    "summary_vector": summary_embeddings[i]
                },
                payload={
                    "text": texts[i],
                    "summary": summaries[i]
                }
            )
        ]
    )
