In [None]:
import os
import torch
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection, utility

# Pull variables from a local .env file into the process environment
load_dotenv()

# Sentence-embedding model; placed on the GPU when CUDA is available,
# otherwise on the CPU
EMBEDDING_MODEL_NAME = "multi-qa-MiniLM-L6-cos-v1"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

# Milvus connection args
# Defaults point at a local server. MILVUS_HOST / MILVUS_PORT in the
# environment (including values loaded from .env) override them; an empty
# value falls back to the default so a blank .env entry cannot break the
# connection.
MILVUS_HOST = os.environ.get("MILVUS_HOST") or "127.0.0.1"
MILVUS_PORT = os.environ.get("MILVUS_PORT") or "19530"

# Global connection
# None until connect_to_milvus() has opened the "default" connection,
# after which it is set to True.
_connection = None

def connect_to_milvus():
    """Open the "default" Milvus connection once per process.

    The module-level ``_connection`` flag records that the connection has
    been made; later calls see the flag and return without reconnecting.
    """
    global _connection

    # Already connected on an earlier call: nothing to do
    if _connection is not None:
        return

    connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
    _connection = True

def fix_collection_index(collection_name: str):
    """Make sure a collection is indexed with the COSINE metric.

    Progress is reported with ``print``. Three outcomes are possible:

    * the collection does not exist: a message is printed and nothing else
      happens;
    * an existing index already reports ``metric_type == 'COSINE'``: the
      collection is loaded and left untouched;
    * otherwise the collection is released, its index is dropped, a new
      IVF_FLAT / COSINE index (nlist=128) is created on the ``embedding``
      field, and the collection is loaded.

    Errors raised while releasing or dropping are printed and do not stop
    the rebuild.

    Args:
        collection_name: Name of the Milvus collection to inspect.
    """
    connect_to_milvus()

    if not utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' does not exist")
        return

    target = Collection(collection_name)

    print(f"\n=== Checking collection: {collection_name} ===")
    existing = target.indexes
    print(f"Current indexes: {existing}")

    # Report every index in turn; stop at the first one already on COSINE
    for index in existing or ():
        print(f"Index details: {index.params}")
        current_metric = index.params.get('metric_type', 'Unknown')
        print(f"Current metric type: {current_metric}")

        if current_metric != 'COSINE':
            continue

        print("Index already uses COSINE metric. No changes needed.")
        target.load()
        return

    # The collection is released first, then the old index is dropped;
    # failures in either step are reported and the rebuild continues
    try:
        target.release()
    except Exception as e:
        print(f"Collection release note: {e}")
    else:
        print("Released collection")

    try:
        target.drop_index()
    except Exception as e:
        print(f"No index to drop or error: {e}")
    else:
        print("Dropped existing index")

    # Build the replacement index on the vector field
    target.create_index(
        field_name="embedding",
        index_params={
            "index_type": "IVF_FLAT",
            "metric_type": "COSINE",
            "params": {"nlist": 128},
        },
    )
    print("Created new index with COSINE metric type")

    # Read the indexes back so the new metric type shows up in the output
    rebuilt = target.indexes
    print(f"New indexes: {rebuilt}")
    for index in rebuilt or ():
        print(f"New index metric type: {index.params.get('metric_type', 'Unknown')}")

    target.load()
    print("Collection loaded successfully\n")

def search_collection(collection_name: str, query: str, output_fields: list, k: int = 5):
    """
    Run a COSINE vector search for a text query against a Milvus collection.

    The query text is encoded with the module-level embedding model
    (normalized embeddings) and searched against the ``embedding`` field
    with ``nprobe=10``. The collection is loaded before searching.

    Args:
        collection_name: Name of the collection to search
        query: Search query text
        output_fields: List of field names to retrieve with each hit
        k: Maximum number of hits to request

    Returns:
        The raw Milvus hits for the single query vector

    Raises:
        ValueError: If the collection does not exist
    """
    connect_to_milvus()

    if not utility.has_collection(collection_name):
        raise ValueError(f"Collection '{collection_name}' does not exist")

    target = Collection(collection_name)
    target.load()

    # Encode the query text and convert it to a plain list for the search call
    query_vector = model.encode(query, normalize_embeddings=True).tolist()

    hits_per_query = target.search(
        data=[query_vector],
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=k,
        output_fields=output_fields,
    )

    # Exactly one query vector was submitted, so only the first entry matters
    return hits_per_query[0]

def search_publications(query: str, k: int = 5):
    """Search the publications collection and return plain dicts.

    Args:
        query: Search query text.
        k: Number of hits to request.

    Returns:
        One dict per hit with the keys ``PMC_code``, ``name``, ``authors``,
        ``date``, ``doi``, ``text`` (taken from the stored ``content``
        field) and ``score``.
    """
    # Output key -> stored field name; only "text" is renamed
    stored_field_for = {
        "PMC_code": "PMC_code",
        "name": "name",
        "authors": "authors",
        "date": "date",
        "doi": "doi",
        "text": "content",
    }

    hits = search_collection(
        "publications", query, list(stored_field_for.values()), k
    )

    return [
        {
            **{key: hit.entity.get(field) for key, field in stored_field_for.items()},
            "score": hit.score,
        }
        for hit in hits
    ]

def search_osdr(query: str, k: int = 5):
    """Search the osdr collection and return plain dicts.

    Args:
        query: Search query text.
        k: Number of hits to request.

    Returns:
        One dict per hit with the keys ``study_id``, ``name``, ``organisms``,
        ``authors``, ``doi``, ``link``, ``type``, ``protocol_name`` (taken
        from the stored ``protocole_name`` field), ``text`` and ``score``.
    """
    # Fields copied through under their stored names, in output order,
    # up to the one renamed field
    passthrough = ("study_id", "name", "organisms", "authors", "doi", "link", "type")

    def _as_record(hit):
        # Flatten one Milvus hit into a dict, renaming protocole_name
        record = {field: hit.entity.get(field) for field in passthrough}
        record["protocol_name"] = hit.entity.get("protocole_name")
        record["text"] = hit.entity.get("text")
        record["score"] = hit.score
        return record

    requested = [*passthrough, "protocole_name", "text"]
    hits = search_collection("osdr", query, requested, k)

    return [_as_record(hit) for hit in hits]

def main():
    """Check both collection indexes, then run one sample search on each.

    Everything is reported with ``print``. A failure inside either sample
    search is printed together with its traceback and does not stop the
    other search from running.
    """
    import traceback

    rule = "="*60

    # Detail lines are produced lazily so each one is formatted right
    # before it is printed
    def _publication_details(record):
        yield f"   PMC: {record['PMC_code']}"
        yield f"   Score: {record['score']:.4f}"
        yield f"   Text: {record['text'][:100]}..."

    def _osdr_details(record):
        yield f"   Study ID: {record['study_id']}"
        yield f"   Score: {record['score']:.4f}"
        yield f"   Organisms: {record['organisms']}"

    print(rule)
    print("STEP 1: Fixing collection indexes...")
    print(rule)
    for collection_name in ("publications", "osdr"):
        fix_collection_index(collection_name)

    print("\n" + rule)
    print("STEP 2: Testing searches...")
    print(rule)

    # (heading, error prefix, search function, sample query, detail lines)
    sections = (
        (
            "\n--- Testing Publications Collection ---",
            "Error searching publications: ",
            search_publications,
            "space radiation effects",
            _publication_details,
        ),
        (
            "\n--- Testing OSDR Collection ---",
            "Error searching OSDR: ",
            search_osdr,
            "microgravity gene expression",
            _osdr_details,
        ),
    )

    for heading, error_prefix, run_search, sample_query, detail_lines in sections:
        print(heading)
        try:
            records = run_search(sample_query, k=3)
            if not records:
                print("No results found")
                continue

            print(f"Found {len(records)} results:")
            for position, record in enumerate(records, 1):
                print(f"\n{position}. {record['name']}")
                for line in detail_lines(record):
                    print(line)
        except Exception as e:
            print(f"{error_prefix}{e}")
            traceback.print_exc()

# Entry point: run the index check and the sample searches when this code is
# executed directly rather than imported.
if __name__ == "__main__":
    main()

STEP 1: Fixing collection indexes...

=== Checking collection: publications ===
Current indexes: [<pymilvus.orm.index.Index object at 0x0000028617AA0E20>]
Index details: {'index_type': 'IVF_FLAT', 'metric_type': 'COSINE', 'params': {'nlist': 128}}
Current metric type: COSINE
Index already uses COSINE metric. No changes needed.

=== Checking collection: osdr ===
Current indexes: [<pymilvus.orm.index.Index object at 0x0000028617AA0C70>]
Index details: {'index_type': 'IVF_FLAT', 'metric_type': 'COSINE', 'params': {'nlist': 128}}
Current metric type: COSINE
Index already uses COSINE metric. No changes needed.

STEP 2: Testing searches...

--- Testing Publications Collection ---
Found 3 results:

1. NASA GeneLab Platform Utilized for Biological Response to Space Radiation in Animal Models
   PMC: PMC7072278
   Score: 0.7666
   Text: and changes in cardiovascular physiology [10]. The possibility that space radiation exposure to incr...

2. Simultaneous Exposure of Cultured Human Lymphoblasti

In [22]:
# Ad-hoc query against the osdr collection, using the default k=5
search_osdr("Mice in Bion-M 1 Space Mission")

[{'study_id': 'LSDS-71',
  'name': 'Toward countering muscle and bone loss with spaceflight: GSK3 as a potential target (Extensor Digitorum Longus, BION-M1, Western blot)',
  'organisms': 'Mus musculus',
  'authors': 'Ryan W. Baranowski,Jessica L. Braun,Briana L. Hockey,Jenalyn L. Yumol,Mia S. Geromella,Colton J.F. Watson,Nigel Kurgan,Holt N. Messner,Kennedy C. Whitley,Adam J. MacNeil,Guillemette Gauquelin-Koch,Fabrice Bertile,William Gittings,Rene Vandenboom,Wendy E. Ward,Val A. Fajardo',
  'doi': '10.26030/161z-1j19',
  'link': 'https://osdr.nasa.gov/bio/repo/data/studies/OSD-664',
  'type': 'protocole',
  'protocol_name': 'Animal Husbandry',
  'text': 'Bion-M1 mice (4-5 months of age) were flown in an unmanned, 30-day-long orbital spaceflight inside the Block Obespecheniya Soderzhaniya (BOS) rodent housing. Housing and climate parameters were replicated in the subsequent ground control (GC) experiment (July 26 to August 26, 2013). A total of 4 experimental groups were used for the f

{'collection_name': 'publications', 'auto_id': True, 'num_shards': 1, 'description': 'RAG collection with publication metadata', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'field_id': 102, 'name': 'PMC_code', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 20}}, {'field_id': 103, 'name': 'name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 300}}, {'field_id': 104, 'name': 'content', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2000}}, {'field_id': 105, 'name': 'authors', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2000}}, {'field_id': 106, 'name': 'date', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 20}}, {'field_id':