## Load data from author json file

In [1]:
import os
import logging
from tqdm.auto import tqdm
from pymilvus import connections, utility, Collection
from embedding_search.vector_store import (
    AUTHORS_DIR,
    create_author_collection,
    create_article_collection,
    make_articles_data_packages,
    push_data,
)
from dotenv import load_dotenv
from pathlib import Path

logging.basicConfig(level=logging.DEBUG)

  from .autonotebook import tqdm as notebook_tqdm


### Check all raw data is available locally

n should = 3313

In [2]:
load_dotenv()
AUTHORS_DIR = Path(os.getenv("AUTHORS_DIR"))
print(AUTHORS_DIR)
print(f"n={len(list(AUTHORS_DIR.glob('*.json')))}")

campus_users
n=2927


### Connect to Milvus

In [3]:
# `standalone` is the service name from docker-compose
MILVUS_HOST = os.getenv("MILVUS_HOST")
connections.connect("default", host=MILVUS_HOST, port="19530")
print(utility.get_server_version())

v2.2.11


## Make Milvus collections

Init collection and load data from json file

In [6]:
create_author_collection()
create_article_collection()

<Collection>:
-------------
<name>: articles
<description>: Articles
<schema>: {'auto_id': True, 'description': 'Articles', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'doi', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'author_id', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'publication_year', 'description': '', 'type': <DataType.INT32: 4>}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'abstract', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'cited_by', 'description': '', 'type': <DataType.INT32: 4>}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1536}}]}

In [4]:
utility.list_collections()

['authors']

In [5]:
# Drop all
[utility.drop_collection(c) for c in utility.list_collections()]

[None]

List collections

In [None]:
# connections.disconnect(alias="default")

In [None]:
author_collection = Collection("authors")
print(f"There are {author_collection.num_entities} authors in the DB.")

article_collection = Collection("articles")
print(f"There are {article_collection.num_entities} articles in the DB.")

## Ingest data

Authors

In [None]:
author_ids = [path.stem for path in AUTHORS_DIR.glob("*.json")]
author_ids = author_ids[:100]

In [None]:
data_packages = make_articles_data_packages(author_ids[0])

In [None]:
author_collection.insert(data_packages)
author_collection.flush()

Articles

In [None]:
article_collection = Collection("articles")
for author_id in tqdm(author_ids):
    data_packages = make_articles_data_packages(author_id)
    article_collection.insert(data_packages)

article_collection.flush()

## Create Milvus index

In [None]:
index_params = {
    "metric_type": "IP",  # inner product
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024},
}

In [None]:
article_collection.create_index("embedding", index_params)

In [None]:
author_collection.create_index("embedding", index_params)

In [None]:
utility.index_building_progress("articles")

In [None]:
utility.index_building_progress("authors")

## Load collection and test search

In [None]:
author_collection.load()

In [None]:
article_collection.load()

In [None]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
search_vector = embeddings.embed_query("Dark Higgs's boson")

In [None]:
articles = article_collection.search(
    data=[search_vector],
    anns_field="embedding",
    param={"metric_type": "IP", "params": {"nprobe": 16}},
    limit=10,
    output_fields=["title", "author_id", "doi", "publication_year", "cited_by"],
)

In [None]:
authors = author_collection.query(
    expr="community_name == ''",
    offset=0,
    limit=1,
    output_fields=["id", "first_name", "last_name", "community_name"],
)

In [None]:
authors

In [None]:
articles[0][0]

In [None]:
articles[0][0].entity.get("title")

## Drop collection

In [None]:
utility.list_collections()

In [None]:
utility.drop_collection("articles")
utility.drop_collection("authors")