## Load data from author json file

In [None]:
import logging
from tqdm.auto import tqdm
from pymilvus import connections, utility, Collection
from multiprocessing import Pool
from embedding_search.vector_store import (
    AUTHOR_DIR,
    create_authors_collection,
    create_articles_collection,
    make_author_data_package,
    make_articles_data_packages,
)

logging.basicConfig(level=logging.DEBUG)

### Connect to Milvus

In [None]:
# Connect to the Milvus server; `standalone` is the docker-compose service name.
connections.connect(alias="default", host="standalone", port="19530")
# Print the server version as a quick connectivity check.
print(utility.get_server_version())

## Make Milvus collections

Initialize the author and article collections; the data from the JSON files is loaded in the "Ingest data" section below.

In [None]:
create_authors_collection()
create_articles_collection()

Drop collections (destructive: run these cells only to reset the database)

In [None]:
utility.drop_collection("articles")

In [None]:
# Destructive: drops every collection that the server currently lists.
for collection_name in utility.list_collections():
    utility.drop_collection(collection_name)

In [None]:
connections.disconnect(alias="default")

In [None]:
# Open the "authors" collection and report how many entities it holds.
author_collection = Collection("authors")
print(f"There are {author_collection.num_entities} authors in the DB.")

# Same for the "articles" collection.
article_collection = Collection("articles")
print(f"There are {article_collection.num_entities} articles in the DB.")

In [None]:
author_collection

## Ingest data

Authors

In [None]:
AUTHOR_DIR

In [None]:
# Author ids are the file stems of the JSON files found in AUTHOR_DIR.
author_ids = [path.stem for path in AUTHOR_DIR.glob("*.json")]

# DEBUG
# author_ids = author_ids[:100]

# NOTE(review): the step that builds `data_packages` is commented out below,
# yet a later cell inserts `data_packages` into the authors collection.
# Confirm where it is meant to come from before running that cell.
# with Pool(8) as p:
#     data_packages = p.map(make_author_data_package, author_ids)

In [None]:
author_ids[3]

In [None]:
tmp = make_articles_data_packages(author_ids[2026])

In [None]:
len(tmp)

In [None]:
# NOTE(review): no active code in the cells above assigns `data_packages`
# (the only earlier assignment is commented out); confirm it is defined and
# holds author data before running this cell.
author_collection.insert(data_packages)
# Flush after the insert; presumably this persists the inserted rows (confirm).
author_collection.flush()

Articles

In [None]:
# Author ids come from the JSON file names; only the first 100 are ingested.
author_ids = [json_file.stem for json_file in AUTHOR_DIR.glob("*.json")][:100]

# Insert each author's article packages, then flush once after the loop.
article_collection = Collection("articles")
for author_id in tqdm(author_ids):
    data_packages = make_articles_data_packages(author_id)
    article_collection.insert(data_packages)

article_collection.flush()

## Create Milvus index

In [None]:
# Index settings: IVF_FLAT index with the inner-product ("IP") metric and
# nlist set to 1024.
index_params = dict(
    metric_type="IP",
    index_type="IVF_FLAT",
    params=dict(nlist=1024),
)

In [None]:
article_collection.create_index("embedding", index_params)

In [None]:
utility.index_building_progress("articles")

## Load collection and test search

In [None]:
article_collection.load()

In [None]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
search_vector = embeddings.embed_query("Dark Higgs's boson")

In [None]:
# Fetch up to 10 rows (starting at offset 0) whose author_id equals 106927,
# returning only the title and author_id fields.
# NOTE(review): the following cells use `res[0]` like vector-search hits
# (iterating it and reading `.distances`); confirm that this filter query
# is what they expect, since `search_vector` from the previous cell is unused.
res = article_collection.query(
    expr="author_id == 106927",
    offset=0,
    limit=10,
    output_fields=["title", "author_id"],
)

In [None]:
for x in res[0]:
    print(x)

In [None]:
res[0].distances

In [None]:
from typing import Any
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings

# Open and load the "articles" collection, and create the OpenAI embedding
# client, once at module level.
ARTICLE_COLLECTION = Collection(name="articles")
ARTICLE_COLLECTION.load()
EMBEDDINGS = OpenAIEmbeddings()


def search(
    query: str,
    output_fields: "list | None" = None,
    top_k: int = 3,
    distance_threshold: float = 0.2,
    pow: float = 3.0,
) -> list:
    """Search the articles collection for entries similar to ``query``.

    The query text is embedded with ``EMBEDDINGS`` and used for an
    inner-product vector search against the ``embedding`` field of
    ``ARTICLE_COLLECTION``.

    Args:
        query: Free-text search query.
        output_fields: Names of the entity fields to return with each hit.
            Optional; when omitted, ``None`` is forwarded to Milvus.
        top_k: Maximum number of hits to return.
        distance_threshold: Currently unused; accepted for compatibility.
        pow: Currently unused; accepted for compatibility. The name shadows
            the ``pow`` builtin inside this function.

    Returns:
        The Milvus search results: one list-like group of hits per query
        vector, so ``result[0]`` holds the hits for ``query``.
    """
    search_vector = EMBEDDINGS.embed_query(query)

    # The metric must match the one the index was built with ("IP").
    search_params = {"metric_type": "IP", "params": {"nprobe": 16}}

    # Use the module-level collection handle defined next to this function
    # rather than whatever `article_collection` happens to be bound to.
    return ARTICLE_COLLECTION.search(
        data=[search_vector],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        output_fields=output_fields,
    )

In [None]:
r = search("galaxy far away")

In [None]:
r[0][2].entity.get("author_id")

In [None]:
from pymilvus.orm.search import SearchResults


def _flatten_results(results: SearchResults) -> list[dict]:
    """Flatten Milvus search results."""

    return [
        {
            "ids": result.ids,
            "distances": result.distances,
            "scores": result.scores,
        }
        for result in results[0]
    ]

In [None]:
r.on_result()

In [None]:
type(r)

In [None]:
r[0][0].__dict__

In [None]:
for x in r[0]:
    print(type(x))

In [None]:
def search_with_emb(*args, **kwargs):
    """Placeholder for a search helper that has not been written yet.

    The intended signature and behavior are not defined; any call raises
    ``NotImplementedError`` so the gap is explicit.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("search_with_emb is not implemented yet")

In [None]:
articles[0][0]

Get only the title

In [None]:
articles[0][0].entity.get("title")

## Drop collection

In [None]:
utility.list_collections()

In [None]:
# utility.drop_collection("articles")