## Load data from author JSON files

In [None]:
import logging
from tqdm.auto import tqdm
from pymilvus import connections, utility, Collection
from multiprocessing import Pool
from embedding_search.vector_store import (
    AUTHOR_DIR,
    create_authors_collection,
    create_articles_collection,
    make_author_data_package,
    make_articles_data_packages,
)

# Request DEBUG-level root logging so debug messages from the libraries used
# below are visible while the notebook runs.
logging.basicConfig(level=logging.DEBUG)

### Connect to Milvus

In [None]:
# Connect pymilvus to the Milvus server at milvus-standalone:19530.
# Every Collection/utility call below goes through this connection.
connections.connect(host="milvus-standalone", port="19530")

## Make Milvus collections

Initialize the collections and load data from the JSON files

In [None]:
# Project helpers from embedding_search.vector_store. Presumably each one
# defines the schema and creates its Milvus collection ("authors" /
# "articles") -- TODO confirm against that module.
create_authors_collection()
create_articles_collection()

List collections

In [None]:
# Display the names of the collections that exist on the connected server.
utility.list_collections()

In [None]:
# Get a handle to the "authors" collection by name and report its entity count.
author_collection = Collection("authors")
print(f"There are {author_collection.num_entities} authors in the DB.")

# Same for the "articles" collection; both handles are reused by later cells.
article_collection = Collection("articles")
print(f"There are {article_collection.num_entities} articles in the DB.")

## Ingest data

Authors

In [None]:
# One JSON file per author in AUTHOR_DIR; the file stem serves as the author id.
author_ids = [json_file.stem for json_file in AUTHOR_DIR.glob("*.json")]

# Build one data package per author, spreading the work over 8 worker
# processes. The pool is closed when the with-block exits.
with Pool(processes=8) as p:
    data_packages = p.map(make_author_data_package, author_ids)

In [None]:
# Insert the author packages built in the previous cell (`data_packages`),
# then flush so the inserted rows are persisted before moving on.
author_collection.insert(data_packages)
author_collection.flush()

Articles

In [None]:
# Re-derive the author ids and the collection handle so this cell can run on
# its own, without depending on the author-ingest cells having been executed.
author_ids = [path.stem for path in AUTHOR_DIR.glob("*.json")]

article_collection = Collection("articles")
for author_id in tqdm(author_ids):
    # Deliberately NOT named `data_packages`: that global holds the author
    # payload consumed by the authors insert cell. Overwriting it here would
    # make a later re-run of that cell push article rows into the authors
    # collection.
    article_packages = make_articles_data_packages(author_id)
    # One insert call per author; the flush happens once, in the next cell.
    article_collection.insert(article_packages)

In [None]:
# Flush once after all per-author inserts so the inserted rows are persisted.
article_collection.flush()

## Create Milvus index

In [None]:
# Index definition for the article embedding field: an IVF_FLAT index
# compared by inner product.
index_params = {
    "metric_type": "IP",  # inner product
    "index_type": "IVF_FLAT",
}
# Build-time parameters for the index type chosen above.
index_params["params"] = {"nlist": 1024}

In [None]:
# Build the index described by `index_params` on the "embedding" field of the
# articles collection.
article_collection.create_index("embedding", index_params)

In [None]:
# Display the index build progress for the "articles" collection.
utility.index_building_progress("articles")

## Load collection and test search

In [None]:
# Load the articles collection so that it can be searched in the cells below.
article_collection.load()

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): OpenAIEmbeddings presumably needs OpenAI credentials in the
# environment (e.g. OPENAI_API_KEY) -- confirm before running.
embeddings = OpenAIEmbeddings()
# Embed the free-text test query; the resulting vector is the search input below.
search_vector = embeddings.embed_query("Dark Higgs's boson")

In [None]:
# Use the same metric ("IP") the index was created with; nprobe is the
# search-time parameter for the IVF index.
search_params = {"metric_type": "IP", "params": {"nprobe": 16}}
# Single-query search (`data` holds one vector) against the "embedding" field,
# asking for at most 2 hits and returning title and author_id with each hit.
articles = article_collection.search(
    data=[search_vector],
    anns_field="embedding",
    param=search_params,
    limit=2,
    output_fields=["title", "author_id"],
)

In [None]:
# articles[0] holds the hits for the only query vector; [0] is its first hit.
articles[0][0]

Get only the title

In [None]:
# Read only the "title" output field from the first hit of the first query.
articles[0][0].entity.get("title")

## Drop collection

In [None]:
# Display the existing collections again before deciding whether to drop any.
utility.list_collections()

In [None]:
# Destructive: uncomment to drop the "articles" collection.
# Kept commented out so that running all cells does not delete it.
# utility.drop_collection("articles")