In [None]:
from dotenv import load_dotenv
from groq import Groq
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
import json
import os
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
q_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

In [None]:
# 

In [3]:
q_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='health-care-facility-rag')])

In [25]:
doc_url = './0022lag_river_abuja_health_data.json'
with open(doc_url, 'rb') as doc:
    documents_raw = json.load(doc)

In [26]:
documents_raw[0]

{'text': 'Facility Name: Mamagi Primary Health Clinic\nState: Fct\nLGA: Abaji\nWard: Alumamagi\nFacility Level: Primary\nOwnership: Public (Local Government)\nStart Date: 1990-01-01\nOperational Status: Operational\nRegistration Status: Unknown\nLicense Status: Not Applicable\nHours of Operation: 12 hours\nDoctors: 0, Nurses: 0, Midwives: 0, Pharmacists: 0, Pharmacy Technicians: 0, Dentists: 0, Dental Technicians: 0, Nurses/Midwives: 0, Lab Technicians: 0, Lab Scientists: 0, Health Records/HIM Officers: 0, Community Health Officers: 0, Community Health Extension Workers: 1, Junior CHEWs: 1, Environmental Health Workers: 0, Health Attendants/Assistants: 3\nBeds: 8',
 'section': 'Facility Overview',
 'question': 'What is the summary of Mamagi Primary Health Clinic in Alumamagi, Abaji, Fct?'}

In [27]:
from fastembed import TextEmbedding
# TextEmbedding.list_supported_models()
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [17]:
collection_name='health-care-facility-rag'

In [18]:
q_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='health-care-facility-rag')])

In [35]:
from qdrant_client import models


q_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity sear
    ))

True

In [36]:
q_client.create_payload_index(
    collection_name=collection_name,
    field_name="facility_level",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
model_handle = "jinaai/jina-embeddings-v2-small-en"


In [38]:
points = []

for i, doc in enumerate(documents_raw):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


In [33]:
# points[2]
len(points)

128115

In [None]:

# === CONFIG ===
batch_size = 100
progress_file = "qdrant_progress.json"

# Load last progress if exists
start_index = 0
if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        data = json.load(f)
        start_index = data.get("last_index", 0)
        print(f"Resuming from index {start_index}")

total_points = len(points)

# Ensure each point has a unique ID
for idx, p in enumerate(points):
    p.id = idx  # or use uuid.uuid4().hex for random unique IDs

# Iterate in batches
for i in tqdm(range(start_index, total_points, batch_size), desc="Upserting to Qdrant"):
    batch = points[i:i + batch_size]

    # Upsert this batch and wait for commit
    q_client.upsert(
        collection_name=collection_name,
        points=batch,
        wait=True  # ensures points are stored before continuing
    )

    # Save progress
    with open(progress_file, "w") as f:
        json.dump({"last_index": i + batch_size}, f)

print("✅ Upload complete!")

# Optionally remove the progress file after success
if os.path.exists(progress_file):
    os.remove(progress_file)



Resuming from index 123000


Upserting to Qdrant:  83%|████████▎ | 43/52 [07:44<01:02,  6.98s/it]

In [42]:
q_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='health-care-facility-rag')])

In [None]:
# def search(query, limit=1):

#     results = q_client.query_points(
#         collection_name=collection_name,
#         query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
#             text=query,
#             model=model_handle 
#         ),
#         limit=limit, # top closest matches
#         with_payload=True #to get metadata in the results
#     )

#     return results



# # 


In [None]:
# q = "How many hospital are in Bwari LGA"
# result = search(query=q)

In [None]:
# result.points[0].payload

{'text': 'Facility Name: Bwari General Hospital  (Bwari)\nState: Fct\nLGA: Bwari\nWard: Bwari Central\nFacility Level: Secondary\nOwnership: Public (State Government)\nStart Date: 2002-03-01\nOperational Status: Operational\nRegistration Status: Registered\nLicense Status: Not Applicable\nHours of Operation: 24 hours\nDoctors: 12, Nurses: 20, Midwives: 1, Pharmacists: 10, Pharmacy Technicians: 1, Dentists: 1, Dental Technicians: 1, Nurses/Midwives: 35, Lab Technicians: 2, Lab Scientists: 15, Health Records/HIM Officers: 6, Community Health Officers: 4, Community Health Extension Workers: 2, Junior CHEWs: 1, Environmental Health Workers: 0, Health Attendants/Assistants: 10\nBeds: 60',
 'section': 'Facility Type',
 'question': 'What type of facility is Bwari General Hospital  (Bwari)?'}

In [18]:
def search(query, limit=1):
    """Search the FAQ database using Qdrant client."""
    try:
        results = q_client.query_points(
            collection_name=collection_name,
            query=models.Document(
                text=query,
                model=model_handle
            ),
            limit=limit,
            with_payload=True
        )
        return results
    except Exception as e:
        print(f"Search error: {str(e)}")
        return None

def build_prompt(query, search_results):
    """Build a prompt for the LLM using search results."""
    prompt_template = """
You are a health facility information assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

Provide a clear and concise answer to the QUESTION using only the facts from the CONTEXT provided.
Do not include any assumptions or external information.

If the CONTEXT is empty or does not contain sufficient information to answer the QUESTION,
respond with: "No relevant information found to answer the question."

QUESTION: {question}

CONTEXT: {context}
""".strip()

    context = ""

    # Handle empty or invalid search results
    if not search_results:
        return prompt_template.format(question=query, context="No relevant information found.")

    # Check if search_results has points (Qdrant QueryResponse)
    if hasattr(search_results, 'points'):
        points = search_results.points
    else:
        # Assume search_results is a tuple or list of dictionaries
        points = search_results

    # Build context from points
    for point in points:
        try:
            # If point is a dict (tuple/list case)
            if isinstance(point, dict):
                section = point.get('section', 'N/A')
                question = point.get('question', 'N/A')
                text = point.get('text', 'N/A')
            # If point is a Qdrant Point with payload
            else:
                section = point.payload.get('section', 'N/A')
                question = point.payload.get('question', 'N/A')
                text = point.payload.get('text', 'N/A')
            context += f"Section: {section}\nQuestion: {question}\nAnswer: {text}\n\n"
        except (AttributeError, KeyError) as e:
            context += f"Invalid data format in search results: {str(e)}\n\n"

    # If no valid context was built, return default message
    if not context.strip():
        context = "No relevant information found."

    return prompt_template.format(question=query, context=context.strip())

def llm(prompt):
    """Generate a response using the LLM."""
    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile"
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"LLM error: {str(e)}"

def rag(query):
    """Run the RAG pipeline: search, build prompt, and generate answer."""
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [None]:
q = "i'm in Ijegun which public health facilities close to me, do they have doctor the?"
answer = rag(q)
print(answer)

There is a public health facility close to you, which is the Ijegun Primary Health Centre. However, according to the information, it does not have any doctors. It has other healthcare personnel such as 2 Nurses/Midwives, 1 Community Health Extension Worker, and 1 Health Attendant/Assistant.
