In [None]:
!bash ../storage/standalone_embed.sh start

^C


In [3]:
os.chdir('../storage')

In [4]:
import os
import functools
from dotenv import load_dotenv

from datasets import load_dataset
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_milvus.utils.sparse import BM25SparseEmbedding
from langchain_milvus.retrievers import MilvusCollectionHybridSearchRetriever
from pymilvus import (
    MilvusClient,
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    WeightedRanker,
    connections,
)

load_dotenv()

True

In [5]:
CONNECTION_URI = os.getenv("MILVUS_CONNECTION_URI")
connections.connect(uri=CONNECTION_URI)

# client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
# if client.has_collection(collection_name="habr_collection"):
#     client.drop_collection(collection_name="habr_collection")

In [6]:
texts = [
    "In 'The Whispering Walls' by Ava Moreno, a young journalist named Sophia uncovers a decades-old conspiracy hidden within the crumbling walls of an ancient mansion, where the whispers of the past threaten to destroy her own sanity.",
    "In 'The Last Refuge' by Ethan Blackwood, a group of survivors must band together to escape a post-apocalyptic wasteland, where the last remnants of humanity cling to life in a desperate bid for survival.",
    "In 'The Memory Thief' by Lila Rose, a charismatic thief with the ability to steal and manipulate memories is hired by a mysterious client to pull off a daring heist, but soon finds themselves trapped in a web of deceit and betrayal.",
    "In 'The City of Echoes' by Julian Saint Clair, a brilliant detective must navigate a labyrinthine metropolis where time is currency, and the rich can live forever, but at a terrible cost to the poor.",
    "In 'The Starlight Serenade' by Ruby Flynn, a shy astronomer discovers a mysterious melody emanating from a distant star, which leads her on a journey to uncover the secrets of the universe and her own heart.",
    "In 'The Shadow Weaver' by Piper Redding, a young orphan discovers she has the ability to weave powerful illusions, but soon finds herself at the center of a deadly game of cat and mouse between rival factions vying for control of the mystical arts.",
    "In 'The Lost Expedition' by Caspian Grey, a team of explorers ventures into the heart of the Amazon rainforest in search of a lost city, but soon finds themselves hunted by a ruthless treasure hunter and the treacherous jungle itself.",
    "In 'The Clockwork Kingdom' by Augusta Wynter, a brilliant inventor discovers a hidden world of clockwork machines and ancient magic, where a rebellion is brewing against the tyrannical ruler of the land.",
    "In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions.",
    "In 'The Dreamwalker's Journey' by Lyra Snow, a young dreamwalker discovers she has the ability to enter people's dreams, but soon finds herself trapped in a surreal world of nightmares and illusions, where the boundaries between reality and fantasy blur.",
]

In [7]:
model_name = "deepvk/USER-bge-m3"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

dense_embedding_func = hf
sparse_embedding_func = BM25SparseEmbedding(corpus=texts)

In [8]:
pk_field = "doc_id"
dense_field = "dense_vector"
sparse_field = "sparse_vector"
text_field = "text"

fields = [
    FieldSchema(
        name=pk_field,
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=True,
        max_length=100,
    ),
    FieldSchema(name=dense_field, dtype=DataType.FLOAT_VECTOR, dim=1024),
    FieldSchema(name=sparse_field, dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name=text_field, dtype=DataType.VARCHAR, max_length=65_535),
]

In [9]:
schema = CollectionSchema(fields=fields, enable_dynamic_field=False)
collection = Collection(
    name="habr_collection", schema=schema, consistency_level="Strong"
)

In [10]:
dense_index = {"index_type": "DISKANN", "metric_type": "IP"}
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}

collection.create_index("dense_vector", dense_index)
collection.create_index("sparse_vector", sparse_index)

collection.flush()

In [11]:
entities = []
for text in texts:
    entity = {
        dense_field: hf.embed_query(text),
        sparse_field: sparse_embedding_func.embed_query(text),
        text_field: text,
    }
    entities.append(entity)
collection.insert(entities)
collection.load()

In [12]:
sparse_search_params = {"metric_type": "IP"}
dense_search_params = {"metric_type": "IP", "params": {}}

retriever = MilvusCollectionHybridSearchRetriever(
    collection=collection,
    rerank=WeightedRanker(0.8, 0.2),
    anns_fields=[dense_field, sparse_field],
    field_embeddings=[dense_embedding_func, sparse_embedding_func],
    field_search_params=[dense_search_params, sparse_search_params],
    top_k=5,
    text_field=text_field,
)

In [14]:
query = "What are the story about ventures?"

res = retriever.invoke(query)

"In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions."

In [20]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-large')
input_texts = [f'query: {query}'] + [f"passage: {item.page_content}" for item in res]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:
input_texts

['query: What are the story about ventures?',
 "passage: In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions.",
 "passage: In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions.",
 "passage: In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions.",
 "passage: In 'The Shadow Weaver' by Piper Redding, a young orphan discovers she has the ability to weave powerful illusions, but soon finds herself at the center of a deadly game of cat and mouse b

In [22]:
embeddings = model.encode(input_texts, normalize_embeddings=True)

In [51]:
query_emb = embeddings[0]
passage_embs = embeddings[1:]

ranks = []
for passage_emb in passage_embs:
    ranks.append(model.similarity(query_emb, passage_emb).item())
indexes = np.array(list(reversed(np.array(ranks).argsort())))

In [45]:
import numpy as np

In [52]:
indexes

array([2, 0, 1, 4, 3])

In [46]:
ranks

[0.7380368113517761,
 0.7380368113517761,
 0.7380368113517761,
 0.7359192967414856,
 0.7359192967414856]

In [None]:
data=[{"id": i, 
       "text": docs[i], 
       "sparse": sparse_embedding_func.embed_query(docs[i]), 
       "dense": model.encode(docs[i], normalize_embeddings=True)} for i in range(len(docs))]

In [196]:
from pymilvus import (
    MilvusClient, DataType
)

# Create schema
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)
# Add fields to schema
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=1000)
schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=1024)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}, {'name': 'dense', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}], 'enable_dynamic_field': True}

In [197]:
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="dense",
    index_name="dense_index",
    index_type="DISKANN",
    metric_type="IP",
)

index_params.add_index(
    field_name="sparse",
    index_name="sparse_index",
    index_type="SPARSE_INVERTED_INDEX",  # Index type for sparse vectors
    metric_type="IP",  # Currently, only IP (Inner Product) is supported for sparse vectors
    params={"drop_ratio_build": 0.2},  # The ratio of small vector values to be dropped during indexing
)

client.create_collection(
    collection_name="habr_collection",
    schema=schema,
    index_params=index_params
)


In [198]:
res = client.insert(
    collection_name="habr_collection",
    data=data
)

In [199]:
query_dense_vector = model.encode("Who is Alan Turing?", normalize_embeddings=True)
query_sparse_vector = sparse_embedding_func.embed_query("Who is Alan Turing?")

In [200]:
from pymilvus import AnnSearchRequest

search_param_1 = {
    "data": [query_dense_vector],
    "anns_field": "dense",
    "param": {
        "metric_type": "IP",
        "params": {"nprobe": 10}
    },
    "limit": 2
}
request_1 = AnnSearchRequest(**search_param_1)

search_param_2 = {
    "data": [query_sparse_vector],
    "anns_field": "sparse",
    "param": {
        "metric_type": "IP",
        "params": {"drop_ratio_build": 0.2}
    },
    "limit": 2
}
request_2 = AnnSearchRequest(**search_param_2)

reqs = [request_1, request_2]

In [201]:
from pymilvus import WeightedRanker
from pymilvus import RRFRanker


rerank= WeightedRanker(0.8, 0.3)

In [217]:
from pymilvus import MilvusClient

res = client.hybrid_search(
    collection_name="habr_collection",
    reqs=reqs,
    ranker=rerank,
    limit=2
)

In [211]:
res = client.search(
    anns_field="dense",
    collection_name="habr_collection",  # target collection
    data=[query_dense_vector],  # query vectors
    limit=2
)