In [8]:
from qdrant_client import QdrantClient, models

QDRANT_HOST = "http://localhost:6333"
COLLECTION_NAME = "financial_reports"
DENSE_VECTOR_DIM = 768 # e.g., for a model like all-MiniLM-L6-v2

# 1. Initialize Qdrant Client
client = QdrantClient(url=QDRANT_HOST)

# 2. Define the collection with both dense and sparse vector configurations
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "dense": models.VectorParams(
            size=DENSE_VECTOR_DIM,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams(),
    }
)

print(f"Collection '{COLLECTION_NAME}' created with Dense and Sparse vector support.")

Collection 'financial_reports' created with Dense and Sparse vector support.


  client.recreate_collection(


In [10]:
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
import torch
from splade.models.transformer_rep import Splade  # For sparse vectors

client = QdrantClient("localhost", port=6333)  # Or cloud URL

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'splade'

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from qdrant_client import models
from typing import List

# Load a pre-trained sentence transformer model for dense embeddings
# For production, use a domain-specific financial model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def create_dense_embedding(text: str) -> List[float]:
    """Generates the dense (semantic) vector."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use CLS token embedding, mean-pooled or similar
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().tolist()
    return embedding

def create_sparse_embedding(text: str):
    """Generates the sparse (lexical/BM25) vector. 
    In Qdrant, a custom function/API (e.g., a dedicated sparse model) is typically used for this,
    but we'll show the structure."""
    # This is highly simplified. A real sparse model (like a splade variant or explicit BM25)
    # would generate indices and values.
    # For a real implementation, you would use a dedicated sparse vector generation library
    # that integrates with Qdrant's sparse vector format.
    
    # Placeholder for sparse vector (indices and values)
    # Example: {'indices': [100, 200, 300], 'values': [0.5, 0.3, 0.9]}
    
    # The actual generation logic is complex and dependent on the chosen sparse model.
    # For this guide, we focus on the Qdrant storage structure.
    return {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} 

# --- Ingestion Loop ---
def ingest_document_chunk(chunk_id: str, company: str, year: int, section: str, text: str):
    dense_vec = create_dense_embedding(text)
    sparse_vec = create_sparse_embedding(text) # NOTE: Placeholder
    
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=chunk_id,
                # Store both vector types under their respective names
                vectors={
                    "dense": dense_vec,
                    "sparse": models.SparseVector(indices=sparse_vec['indices'], values=sparse_vec['values'])
                },
                payload={
                    "company": company,
                    "year": year,
                    "section": section,
                    "text": text
                }
            )
        ]
    )

# Example ingestion of a chunk (requires a separate script for full text extraction/chunking)
ingest_document_chunk(
    chunk_id="AAPL_2024_RISK_001",
    company="Apple Inc.",
    year=2024,
    section="Risk Factors",
    text="Our operations are subject to risks associated with global supply chain disruptions..."
)