## Load data from author json file

In [1]:
import os
import logging
from tqdm.auto import tqdm
from pymilvus import connections, utility, Collection
from embedding_search.vector_store import (
    AUTHORS_DIR,
    create_author_collection,
    create_article_collection,
    make_articles_data_packages,
    push_data,
)
from dotenv import load_dotenv
from pathlib import Path

logging.basicConfig(level=logging.DEBUG)

  from .autonotebook import tqdm as notebook_tqdm


### Check all raw data is available locally

n should = 3313

In [2]:
load_dotenv()
AUTHORS_DIR = os.getenv("AUTHORS_DIR")
print(AUTHORS_DIR)
print(f"n={len(list(Path(AUTHORS_DIR).glob('*.json')))}")

/community-search/data
n=2927


### Connect to Milvus

In [3]:
# `standalone` is the service name from docker-compose
connections.connect("default", host="localhost", port="19530")
print(utility.get_server_version())

v2.2.11


## Make Milvus collections

Init collection and load data from json file

In [None]:
# create_author_collection()
# create_article_collection()

In [4]:
utility.list_collections()

['articles', 'authors']

In [None]:
# Drop all
# [utility.drop_collection(c) for c in utility.list_collections()]

List collections

In [None]:
# connections.disconnect(alias="default")

In [6]:
author_collection = Collection("authors")
print(f"There are {author_collection.num_entities} authors in the DB.")

article_collection = Collection("articles")
print(f"There are {article_collection.num_entities} articles in the DB.")

There are 100 authors in the DB.
There are 4364 articles in the DB.


## Ingest data

Authors

In [None]:
author_ids = [path.stem for path in AUTHORS_DIR.glob("*.json")]
author_ids = author_ids[:100]

In [None]:
data_packages = make_articles_data_packages(author_ids[0])

In [None]:
author_collection.insert(data_packages)
author_collection.flush()

Articles

In [None]:
article_collection = Collection("articles")
for author_id in tqdm(author_ids):
    data_packages = make_articles_data_packages(author_id)
    article_collection.insert(data_packages)

article_collection.flush()

## Create Milvus index

In [None]:
index_params = {
    "metric_type": "IP",  # inner product
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024},
}

In [None]:
article_collection.create_index("embedding", index_params)

In [None]:
author_collection.create_index("embedding", index_params)

In [None]:
utility.index_building_progress("articles")

In [None]:
utility.index_building_progress("authors")

## Load collection and test search

In [15]:
author_collection.load()

In [17]:
article_collection.load()

In [13]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
search_vector = embeddings.embed_query("Dark Higgs's boson")

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": [[26915, 473, 62247, 596, 43746, 263]], "model": "text-embedding-ada-002", "encoding_format": "base64"}' message='Post details'
DEBUG:urllib3.util.retry:Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openai.com:443
DEBUG:urllib3.connectionpool:https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=38 request_id=1c35a41668659d15c3bc46d0022d0ae5 response_code=200


In [23]:
articles = article_collection.search(
    data=[search_vector],
    anns_field="embedding",
    param={"metric_type": "IP", "params": {"nprobe": 16}},
    limit=10,
    output_fields=["title", "author_id", "doi", "publication_year", "cited_by"],
)

In [10]:
authors = author_collection.query(
    expr="community_name == ''",
    offset=0,
    limit=1,
    output_fields=["id", "first_name", "last_name", "community_name"],
)

In [12]:
authors

[{'id': 29081,
  'first_name': 'Jessica',
  'last_name': 'Meindl',
  'community_name': ''}]

In [24]:
articles[0][0]

id: 443623367193404589, distance: 0.8111650943756104, entity: {'title': 'The algebra U q ( sl 2 ) in disguise', 'author_id': 278425, 'doi': '10.1016/J.LAA.2014.07.022', 'publication_year': 2014, 'cited_by': 0}

In [19]:
articles[0][0].entity.get("title")

'The algebra U q ( sl 2 ) in disguise'

## Drop collection

In [None]:
utility.list_collections()

In [25]:
utility.drop_collection("articles")
utility.drop_collection("authors")