In [1]:
import json
import os
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Filter, FieldCondition, MatchValue, MatchText
from dotenv import load_dotenv

In [2]:
# Read connection settings from the process environment (after merging in any .env file)
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
QDRANT_HOST = os.environ.get("QDRANT_HOST")
QDRANT_PORT = os.environ.get("QDRANT_PORT")  # still a string here (or None when unset)

In [3]:
# Initialize clients
openai_client = OpenAI(api_key=OPENAI_API_KEY)
# QDRANT_PORT comes from os.getenv, so it is a string or None. int(None) would raise
# TypeError, so fall back to Qdrant's default port (6333) when the variable is unset/empty.
qdrant_client = QdrantClient(
    host=QDRANT_HOST,
    port=int(QDRANT_PORT) if QDRANT_PORT else 6333,
)

In [4]:
def load_patient_data(file_path):
    """Load and parse patient JSON data.

    Args:
        file_path: Path to a JSON file.

    Returns:
        The decoded JSON document (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding, which can mis-read non-ASCII text in the file.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [5]:
def split_text(text, max_length=500):
    """Break text into chunks of at most ``max_length`` whitespace-separated words.

    Words inside each chunk are re-joined with single spaces.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), max_length):
        window = tokens[start:start + max_length]
        chunks.append(' '.join(window))
    return chunks

In [6]:
def get_embedding(text):
    """Return the embedding of ``text`` from the "text-embedding-3-small" model.

    Sends one request through the module-level ``openai_client`` and returns the
    ``embedding`` attribute of the first item in the response's ``data``.
    """
    response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding

In [7]:
# Prepare Qdrant points
def _build_point(point_id, entry_type, content, patient):
    """Embed ``content`` and wrap it in a PointStruct that carries the patient record."""
    return PointStruct(
        id=point_id,
        vector=get_embedding(content),
        payload={
            "type": entry_type,
            "content": content,
            "patient": patient,  # Full patient details in payload
        },
    )


def prepare_points(patients):
    """Build one Qdrant point per embeddable piece of text for every patient.

    For each patient, in order, this emits:
      * one "medical_history" point per entry of ``patient["medical_history"]``,
      * one "lab_report" point per entry of ``patient["lab_reports"]``,
      * one "metadata" point embedding "<patient_name> <diagnosis> <gender>".

    Point IDs are consecutive integers starting at 1, assigned in emission order.
    Every payload stores the point type, the embedded text, and the full patient
    dict under "patient".

    Args:
        patients: Iterable of patient dicts with the keys "medical_history",
            "lab_reports", "patient_name", "diagnosis" and "gender".

    Returns:
        List of PointStruct objects.

    Raises:
        KeyError: If a patient dict lacks one of the keys above.
    """
    points = []

    for patient in patients:
        # Collect (type, text) pairs first so the point construction lives in one place.
        entries = [("medical_history", condition) for condition in patient["medical_history"]]
        entries.extend(("lab_report", report) for report in patient["lab_reports"])

        # Optional: combined patient metadata for holistic queries
        metadata = f"{patient['patient_name']} {patient['diagnosis']} {patient['gender']}"
        entries.append(("metadata", metadata))

        for entry_type, content in entries:
            # len(points) + 1 yields the same 1-based running ID as a manual counter.
            points.append(_build_point(len(points) + 1, entry_type, content, patient))

    return points

In [8]:
# Create collection and upload points
def upload_to_qdrant(points):
    """Ensure the "patients" collection exists and upsert ``points`` into it.

    The collection is created with 1536-dimensional cosine-distance vectors only
    when it is not already present, so calling this function again does not fail
    on the creation step.

    Args:
        points: List of PointStruct objects to store.
    """
    if not qdrant_client.collection_exists("patients"):
        qdrant_client.create_collection(
            collection_name="patients",
            vectors_config=VectorParams(size=1536, distance="Cosine")  # Size matches OpenAI embedding
        )

    # Nothing to send for an empty list; skip the request rather than issue an empty update.
    if points:
        qdrant_client.upsert(collection_name="patients", points=points)
    print(f"Uploaded {len(points)} points to Qdrant")

In [82]:
def search_patient(query_text=None, patient_id=None, patient_name=None, limit=5):
    """Look up patients in the "patients" collection by similarity and/or exact fields.

    Args:
        query_text: Free text to embed and use for vector similarity search.
            When falsy, no vector search is performed and points are fetched
            purely by filter.
        patient_id: If truthy, only points whose payload field
            ``patient.patient_id`` equals this value are considered.
        patient_name: If truthy, only points whose payload field
            ``patient.patient_name`` equals this value are considered.
        limit: Maximum number of points to fetch. Several points can belong to
            the same patient, so fewer than ``limit`` patients may be returned.

    Returns:
        List of patient dicts (the "patient" payload entry), one per distinct
        ``patient_id``, in the order the patients were first encountered.
    """
    # Exact-match conditions on the nested patient payload.
    conditions = []
    if patient_id:
        conditions.append(
            FieldCondition(key="patient.patient_id", match=MatchValue(value=patient_id))
        )
    if patient_name:
        conditions.append(
            FieldCondition(key="patient.patient_name", match=MatchValue(value=patient_name))
        )
    query_filter = Filter(must=conditions) if conditions else None

    if query_text:
        hits = qdrant_client.query_points(
            collection_name="patients",
            query=get_embedding(query_text),
            query_filter=query_filter,
            limit=limit,
        ).points
    else:
        # Filter-only lookup: scroll through matching points instead of searching
        # with an all-zero vector, which has no meaningful cosine similarity.
        hits, _next_offset = qdrant_client.scroll(
            collection_name="patients",
            scroll_filter=query_filter,
            limit=limit,
            with_payload=True,
        )

    # Deduplicate results by patient_id
    unique_patients = {}
    for hit in hits:
        patient = (hit.payload or {}).get("patient")
        if patient is None:
            # Point without a patient record: nothing to report for it.
            continue
        unique_patients[patient["patient_id"]] = patient

    return list(unique_patients.values())

In [83]:
def print_query_results(results):
    """Write a labelled, line-per-field summary of each patient dict to stdout."""
    # (label, payload key, whether the value is a list to be comma-joined)
    layout = (
        ("Patient ID", 'patient_id', False),
        ("Name", 'patient_name', False),
        ("Age", 'age', False),
        ("Medical History", 'medical_history', True),
        ("Lab Reports", 'lab_reports', True),
        ("Diagnosis", 'diagnosis', False),
    )
    for record in results:
        print("\n--- Patient Details ---")
        for label, key, is_list in layout:
            value = record[key]
            shown = ', '.join(value) if is_list else value
            print(f"{label}: {shown}")

In [84]:
# Populate the "patients" collection only on the first run; later runs reuse what is stored
if qdrant_client.collection_exists("patients"):
    print("Collection 'patients' already exists. Skipping data upload.")
else:
    patients = load_patient_data("patients_data.json")
    points = prepare_points(patients)
    upload_to_qdrant(points)

# Example queries: (heading to print, keyword arguments for search_patient)
example_queries = (
    ("Query 1: Search by medical history", {"query_text": "High blood pressure"}),
    ("\nQuery 2: Search by patient_id", {"patient_id": "PID98765"}),
    ("\nQuery 3: Search by patient_name", {"patient_name": "Sarah Johnson-Miller"}),
)
for heading, criteria in example_queries:
    print(heading)
    print_query_results(search_patient(**criteria))

Collection 'patients' already exists. Skipping data upload.
Query 1: Search by medical history

--- Patient Details ---
Patient ID: PID85647
Name: Sarah Johnson-Miller
Age: 54
Medical History: Coronary Artery Disease (since 10 years), Tuberculosis (active, treated with TBX therapy), Past heart attack at age 48, High blood pressure
Lab Reports: Elevated troponin levels on ECG ( suggestive of ischemic cardiomyopathy), Positive TB skin test in December 2022, Streptococcus pneumoniae throat culture negative as of March 2023
Diagnosis: Leukoencephalopathy from Tuberculosis Coronary Artery Disease

--- Patient Details ---
Patient ID: PID98765
Name: Emma Wilson
Age: 42
Medical History: Asthma (persistent), Hypertension, Snoring
Lab Reports: Blood oxygen levels: 98%, Chest X-ray shows wheezing and shortness of breath
Diagnosis: Asthma

--- Patient Details ---
Patient ID: PID789012
Name: Sarah Patricia Johnson
Age: 56
Medical History: Hypertension (since 2015), High blood pressure readings, Obe