In [None]:
# connect to milvus db

from pymilvus import MilvusClient

MILVUS_URI = "https://in05-f2065fb7edb22e2.serverless.aws-eu-central-1.cloud.zilliz.com"
MILVUS_TOKEN = "e548ca0b2cdf6d09cbe43de608c2a80a52d001d36248cdbc3aab61e9ba2ff49eb57fe576b2c7d792fab406f1cb35470b860abf96"
MILVUS_COLLECTION_NAME = "test"

milvus_client = MilvusClient(
    uri=MILVUS_URI,
    token=MILVUS_TOKEN,
    collection_name=MILVUS_COLLECTION_NAME
)


In [None]:
import os
from google import genai
from google.genai.types import EmbedContentConfig
from typing import List

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
genai_client = genai.Client(api_key=GEMINI_API_KEY)


doc_cfg = EmbedContentConfig(task_type="RETRIEVAL_QUERY", output_dimensionality=768)

def get_embedding(text: str) -> List[float]:
    """Generate embedding for the given text using Gemini."""
    resp = genai_client.models.embed_content(
        model="gemini-embedding-001",
        contents=text,
        config=doc_cfg
    )
    return [i.values for i in resp.embeddings][0]

In [None]:
# prepare data

import json

data_path='resources/clean_pecha_ids.json'
with open(data_path,'r',encoding='utf-8') as f:
    data=json.load(f)

data_text=list(data.values())



In [45]:
# inset data into milvus

prepare_data=[]
for i in range(len(data_text)):
    text=data_text[i]
    embedding=get_embedding(text) #get embedding for the first text
    prepare_data.append({"text":data_text[i],"vector":embedding})

# insert data into milvus
from pymilvus import MilvusClient

def insert_rows(milvus_client: MilvusClient, rows: list[dict]):
    """
    rows: [{"id": "...", "text": "...", "lang": "bo", "embedding": [..DIM..]}, ...]
    """
    # Milvus insert wants column-oriented data by field order
    milvus_client.insert(
        collection_name=MILVUS_COLLECTION_NAME,
        data=rows
    )
    milvus_client.flush(collection_name=MILVUS_COLLECTION_NAME) 

insert_rows(milvus_client,prepare_data)


In [44]:
# search a text in milvus

from pymilvus import AnnSearchRequest, RRFRanker


text_search="abstract"

def search_text(milvus_client: MilvusClient, text: str):
    """
    search a text in milvus
    """
    query_embedding=get_embedding(text_search)
    results = milvus_client.search(
    collection_name=MILVUS_COLLECTION_NAME,
    data=[query_embedding],
    limit=5,
    output_fields=["text"]
    )
    
    return results

def hybrid_search(milvus_client: MilvusClient, text: str):
    query_embedding=get_embedding(text_search)
    limit=5
    search_param_1 = {
        "data": [query_embedding],
        "anns_field": "vector",
        "param": {},
        "limit": limit
    } 
    request_1 = AnnSearchRequest(**search_param_1)
    
    search_param_2 = {
        "data": [query_embedding],
        "anns_field": "vector",
        "param": {"drop_ratio_search": 0.2},
        "limit": limit
    }
    request_2 = AnnSearchRequest(**search_param_2)
    
    
    results = milvus_client.hybrid_search(
    collection_name=MILVUS_COLLECTION_NAME,
    vector=[query_embedding],
    reqs=[request_1,request_2],
    ranker=RRFRanker(),
    limit=limit
)
    return results

hybrid_search(milvus_client,text_search)

data: [[{'id': '462953313717749614', 'distance': 0.032786883413791656, 'entity': {}}, {'id': '462953313717749908', 'distance': 0.032258063554763794, 'entity': {}}, {'id': '462953313717749606', 'distance': 0.0317460335791111, 'entity': {}}, {'id': '462953313717749900', 'distance': 0.03125, 'entity': {}}, {'id': '462953313717749907', 'distance': 0.03076923079788685, 'entity': {}}]],{'cost': 6}