In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
from qdrant_search import QdrantSearchEngine

qdrant_search = QdrantSearchEngine()
await qdrant_search.drop_collection()
await qdrant_search.initialize()



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Collection discussion_points dropped successfully


In [3]:
from psql_helpers import get_session

In [4]:
self = qdrant_search

In [6]:
await self.sync_from_postgres(get_session)

In [34]:
from qdrant_client import AsyncQdrantClient
from fastembed import TextEmbedding


In [37]:
import os

In [38]:
QDRANT_HOST = os.getenv('QDRANT_HOST', '127.0.0.1')
QDRANT_PORT = os.getenv('QDRANT_PORT', '6333')




In [39]:
self.client = AsyncQdrantClient(QDRANT_HOST, port=QDRANT_PORT)
# Set up both dense and sparse models
self.model = TextEmbedding("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
self.client.set_sparse_model("Qdrant/bm25")
self.collection_name = "discussion_points"

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [40]:

async with get_session() as session:
    # Get existing meeting_ids from Qdrant
    existing_points = await self.client.scroll(
        collection_name=self.collection_name,
        limit=10000000,
        with_payload=["meeting_id"]
    )
    existing_meeting_ids = set()
    if existing_points[0]:
        existing_meeting_ids = {
            point.payload.get("meeting_id") 
            for point in existing_points[0] 
            if point.payload.get("meeting_id")
        }

In [41]:
existing_points

([], None)

In [42]:
from sqlalchemy import select
from psql_models import DiscussionPoint, Meeting, Speaker

In [43]:
query = select(DiscussionPoint, Meeting, Speaker).join(
    Meeting, DiscussionPoint.meeting_id == Meeting.meeting_id
).join(
    Speaker, DiscussionPoint.speaker_id == Speaker.id
)

result = await session.execute(query)
rows = result.fetchall()

In [45]:
points = []
current_meeting = True


In [46]:
rows[0]

(<psql_models.DiscussionPoint object at 0x7efc184c4710>, <psql_models.Meeting object at 0x7efc184c67d0>, <psql_models.Speaker object at 0x7efc184c7dd0>)

In [47]:
dp, meeting, speaker = rows[0]

In [51]:
current_meeting = meeting

In [52]:
meeting_id_str = str(meeting.meeting_id)

In [53]:
if current_meeting and current_meeting.meeting_id != meeting.meeting_id:
    if current_meeting.transcript:
        transcript_points = await self._process_transcript(current_meeting)
        points.extend(transcript_points)

In [55]:
embedding_strings = [dp.topic_name if dp.topic_name else "", dp.summary if dp.summary else "", dp.details if dp.details else "", speaker.name]
embeddings = self.model.embed(embedding_strings)
topic_vector,summary_vector,details_vector,speaker_vector = list(embeddings)


In [18]:
from fastembed import TextEmbedding
from typing import List

# Example list of documents
documents: List[str] = [
    "This is built to be faster and lighter than other embedding libraries e.g. Transformers, Sentence-Transformers, etc.",
    "fastembed is supported by and maintained by Qdrant.",
]

# This will trigger the model download and initialization
embedding_model = TextEmbedding(model="paraphrase-multilingual-MiniLM-L12-v2",cuda=False)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
embeddings = list(embedding_model.embed(documents))

In [20]:
embeddings

[array([-0.1115468 ,  0.00976557,  0.00524553,  0.01951895, -0.01934952,
         0.02943452, -0.1051909 , -0.00890122,  0.01831437,  0.014868  ,
        -0.05642497,  0.02561355, -0.00120166,  0.0063746 ,  0.02633464,
         0.00892209,  0.05313656,  0.03955455, -0.04400251, -0.02929405,
         0.04691842, -0.02515871,  0.00778645, -0.05410658, -0.04362101,
         0.01275096, -0.02304645, -0.02250822,  0.01992304, -0.19920595,
         0.01895351, -0.02651558,  0.08252289, -0.02281935, -0.057813  ,
        -0.01367696, -0.03570741,  0.05386206, -0.10155279,  0.0209727 ,
         0.00652712,  0.03738065, -0.03508432, -0.00091194,  0.03425015,
        -0.02445885, -0.00739984, -0.0308434 , -0.03605131, -0.02814274,
         0.01547161, -0.02158502,  0.0254423 ,  0.02438808, -0.02049716,
        -0.02665791,  0.03727326,  0.08809512,  0.02471924, -0.0085119 ,
        -0.02016545,  0.05734606, -0.05127538,  0.02370393, -0.02993907,
        -0.02091446, -0.0392783 , -0.02315619,  0.0

In [2]:
from fastembed import TextEmbedding

# Initialize with explicit model name and force download
embeddings_generator = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    max_length=512,
    cache_dir=None  # This will use default cache location but ensure fresh download
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
from fastembed import SparseTextEmbedding

model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
embeddings = list(model.embed(documents))

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
embeddings_generator = embedding_model.embed(['test'])
