In [1]:
# Load necessary libraries
import numpy as np
import os
import glob
import hydra
import cudf
import cupy as cp
from pymilvus import (
    db,
    connections,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    utility,
    MilvusClient
)
from tqdm import tqdm

In [2]:
# Configuration for Milvus
milvus_host = "localhost"
milvus_port = "19530"
milvus_uri = "http://localhost:19530"
milvus_token = "root:Milvus"
milvus_user = "root"
milvus_password = "Milvus"
milvus_database = "t2kg_primekg"

In [3]:
# Connect to Milvus
connections.connect(
    alias="default",
    host=milvus_host,
    port=milvus_port,
    user=milvus_user,
    password=milvus_password
)

In [4]:
# Switch to the desired database
db.using_database(milvus_database)

In [5]:
# List all collections
for coll in utility.list_collections():
    print(f"Collection: {coll}")

    # Load the collection to get stats
    collection = Collection(name=coll)
    print(collection.num_entities)

Collection: t2kg_primekg_nodes_disease
17054
Collection: t2kg_primekg_nodes_drug
6759
Collection: t2kg_primekg_nodes_gene_protein
18797
Collection: t2kg_primekg_nodes_molecular_function
10951
Collection: t2kg_primekg_edges
3904610
Collection: t2kg_primekg_nodes
84981
Collection: t2kg_primekg_nodes_biological_process
27409
Collection: t2kg_primekg_nodes_cellular_component
4011


In [6]:
utility.list_collections()

['t2kg_primekg_nodes_gene_protein',
 't2kg_primekg_nodes_molecular_function',
 't2kg_primekg_edges',
 't2kg_primekg_nodes',
 't2kg_primekg_nodes_biological_process',
 't2kg_primekg_nodes_cellular_component',
 't2kg_primekg_nodes_disease',
 't2kg_primekg_nodes_drug']

In [7]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_nodes')

# Load the collection into memory before query
collection.load()

CPU times: user 2.41 ms, sys: 874 μs, total: 3.28 ms
Wall time: 7.22 ms


In [8]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_nodes')

# Load the collection into memory before query
collection.load()

# Query by expr on node_index
expr = "node_index == 13814"
output_fields = ["node_index", "node_id", "node_name", "node_type", "desc", "desc_emb"]

results = collection.query(expr, output_fields=output_fields)

print(results)

data: ["{'node_name': 'Copper', 'node_type': 'drug', 'desc': 'Copper belongs to drug node. Copper is a transition metal and a trace element in the body. It is important to the function of many enzymes including cytochrome c oxidase, monoamine oxidase and superoxide dismutase. Copper is commonly used in contraceptive intrauterine devices (IUD). Copper is absorbed from the gut via high affinity copper uptake protein and likely through low affinity copper uptake protein and natural resistance-associated macrophage protein-2. It is believed that copper is reduced to the Cu1+ form prior to transport. Once inside the enterocyte, it is bound to copper transport protein ATOX1 which shuttles the ion to copper transporting ATPase-1 on the golgi membrane which take up copper into the golgi apparatus. Once copper has been secreted by enterocytes into the systemic circulation it remain largely bound by ceruloplasmin (65-90%), albumin (18%), and alpha 2-macroglobulin (12%).  Copper is nearly entirel

In [9]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_nodes')

# Load the collection into memory before query
collection.load()

# Query by expr on node_index
expr = "node_index == 13814"
output_fields = ["node_index", "node_id", "node_name", "node_type", "desc", "desc_emb"]

results = collection.query(expr, output_fields=output_fields)

print(results)

data: ["{'node_index': 13814, 'node_id': 'Copper_(14012)', 'node_name': 'Copper', 'node_type': 'drug', 'desc': 'Copper belongs to drug node. Copper is a transition metal and a trace element in the body. It is important to the function of many enzymes including cytochrome c oxidase, monoamine oxidase and superoxide dismutase. Copper is commonly used in contraceptive intrauterine devices (IUD). Copper is absorbed from the gut via high affinity copper uptake protein and likely through low affinity copper uptake protein and natural resistance-associated macrophage protein-2. It is believed that copper is reduced to the Cu1+ form prior to transport. Once inside the enterocyte, it is bound to copper transport protein ATOX1 which shuttles the ion to copper transporting ATPase-1 on the golgi membrane which take up copper into the golgi apparatus. Once copper has been secreted by enterocytes into the systemic circulation it remain largely bound by ceruloplasmin (65-90%), albumin (18%), and alph

In [10]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_nodes')

# Load the collection into memory before query
collection.load()

# Query by expr on node_index
expr = "node_index in [13814, 13815]"
output_fields = ["node_index", "node_id", "node_name", "node_type", "desc", "desc_emb"]

results = collection.query(expr, output_fields=output_fields)

print(results)

data: ["{'desc_emb': [-0.008437113, -0.0066760327, 0.017289385, -0.031070003, -0.024708688, 0.0269318, -0.025539007, -0.016030515, -0.022900734, -0.0172626, 0.016351929, 0.022257907, -0.013927932, 0.00728538, -0.010954854, 0.013639999, 0.033105623, 0.025713105, -0.0011316445, 0.019137515, -0.042480193, 0.021963278, 0.0012337604, -0.011530721, 0.0017661019, 0.011249484, 0.030427175, -0.039641038, -0.003863662, -0.021789178, -0.0007855387, 0.0020724495, -0.020061579, 0.018146489, 0.010546391, 0.011095473, 0.019512497, 0.0062073045, -0.0025277857, -0.006558851, 0.018280411, 0.0065488066, -0.006351271, 0.02423996, -0.00078260916, 0.009568757, 0.005782101, 0.0011140672, -0.025391692, 0.006451713, 0.0092004705, 0.012454785, -0.005999725, -0.015856415, -0.00074871007, -0.021869533, 0.007794285, 0.01845451, 0.01199945, -0.020356208, -0.01134323, -0.010820932, -0.011624467, 0.024373882, 0.0006189727, 0.031391416, 0.007030927, 0.008109002, -0.0015032792, 0.017235816, 0.006471801, 0.031150356, 0.

In [11]:
# Check results
[r['node_index'] for r in results]

[13814, 13815]

In [16]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_edges')

# Load the collection into memory before query
collection.load()

# Query by expr on triplet_index
expr = "triplet_index == 0"
output_fields = ["triplet_index", "head_id", "tail_id", "edge_type", "feat", "feat_emb"]

results = collection.query(expr, output_fields=output_fields)
results

CPU times: user 4.75 ms, sys: 0 ns, total: 4.75 ms
Wall time: 11.4 ms


data: ["{'edge_type': 'gene/protein|ppi|gene/protein', 'feat': 'PHYHIP (gene/protein) has a direct relationship of protein_protein:ppi with KIF15 (gene/protein).', 'feat_emb': [-0.01934238, 0.0011752498, 0.004431808, -0.033904973, -0.01556057, 0.011148459, 0.0011645806, 0.028704984, 0.012691384, -0.005199988, 0.020970657, 0.01423431, 0.009421695, 0.0033714569, -0.0027230997, -0.0106954295, 0.015586833, -0.015100975, 0.01833127, 0.014812087, -0.008581293, 0.019014098, 0.005584078, -0.036977693, 0.010511592, 0.034535274, 0.010833308, -0.026236303, -0.025107013, -0.037502944, 0.027575694, -0.019513085, -0.001110414, 0.015416126, 0.0045368583, -0.0110171465, 0.009841897, -0.021863585, 0.016400972, -0.0021190608, -0.0024818124, -0.0077540223, -0.006516399, 0.0013426737, -0.0038474659, -0.0059189256, 0.0074257404, -0.022231262, -0.017438343, 0.008292405, -0.014102997, 0.002123985, -0.012579769, -0.020051468, 0.0068545295, -0.0032368612, 0.0050850892, 0.00063440506, -0.028127206, -0.004412111

In [17]:
# Check the ground truth for the search
results[0]['triplet_index'], results[0]['head_id'], results[0]['tail_id'], results[0]['edge_type'], results[0]['feat']

(0,
 'PHYHIP_(0)',
 'KIF15_(8889)',
 'gene/protein|ppi|gene/protein',
 'PHYHIP (gene/protein) has a direct relationship of protein_protein:ppi with KIF15 (gene/protein).')

In [18]:
%%time

# Assume node_coll_name is defined and collection exists
collection = Collection('t2kg_primekg_edges')

# Load the collection into memory before query
collection.load()

# Vector similarity search in Milvus
vector_to_search = np.array(results[0]['feat_emb']).tolist() # merged_edges_df["edge_emb"].iloc[0]
search_params = {"metric_type": "COSINE"}
results = collection.search(
    data=[vector_to_search],
    anns_field="feat_emb",
    param=search_params,
    limit=10,
    output_fields=["head_id", "tail_id", "edge_type", "feat"]
)
results

CPU times: user 3.27 ms, sys: 1.3 ms, total: 4.57 ms
Wall time: 25.9 ms


data: [[{'triplet_index': 0, 'distance': 1.000000238418579, 'entity': {'head_id': 'PHYHIP_(0)', 'tail_id': 'KIF15_(8889)', 'edge_type': 'gene/protein|ppi|gene/protein', 'feat': 'PHYHIP (gene/protein) has a direct relationship of protein_protein:ppi with KIF15 (gene/protein).'}}, {'triplet_index': 3069556, 'distance': 0.9815623760223389, 'entity': {'head_id': 'KIF15_(8889)', 'tail_id': 'PHYHIP_(0)', 'edge_type': 'gene/protein|ppi|gene/protein', 'feat': 'KIF15 (gene/protein) has a direct relationship of protein_protein:ppi with PHYHIP (gene/protein).'}}, {'triplet_index': 93582, 'distance': 0.9684444069862366, 'entity': {'head_id': 'PHYHIP_(0)', 'tail_id': 'PRKD2_(9221)', 'edge_type': 'gene/protein|ppi|gene/protein', 'feat': 'PHYHIP (gene/protein) has a direct relationship of protein_protein:ppi with PRKD2 (gene/protein).'}}, {'triplet_index': 305788, 'distance': 0.9657970070838928, 'entity': {'head_id': 'PHYHIP_(0)', 'tail_id': 'PPIE_(5629)', 'edge_type': 'gene/protein|ppi|gene/protein'

In [19]:
# Get node indices from the results
[n['triplet_index'] for n in results[0]]

[0, 3069556, 93582, 305788, 39303, 300682, 3154085, 3201926, 3334423, 110476]

In [20]:
# Check the first result
results[0][0]['triplet_index'], results[0][0]['head_id'], results[0][0]['tail_id'], results[0][0]['edge_type'], results[0][0]['feat']

(0,
 'PHYHIP_(0)',
 'KIF15_(8889)',
 'gene/protein|ppi|gene/protein',
 'PHYHIP (gene/protein) has a direct relationship of protein_protein:ppi with KIF15 (gene/protein).')