In [None]:
# Mount Google Drive (Colab only) so the paths under /content/drive used below are reachable
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import json
import pandas as pd
import os

In [None]:
# Source dataset (JSONL) and the Drive folder that receives the saved FAISS and BM25 indexes
input_file = '/content/drive/MyDrive/HealthCareMagic-100k-en.jsonl'
output_dir = '/content/drive/MyDrive/healthcare_rag_chunks'

# Create output folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

## **Load Conversations from JSONL**

In [None]:
def load_conversations(file_path, limit=5000):
    """Read conversation payloads from a JSONL file.

    Every non-blank line is parsed as JSON. A JSON string is taken as the
    conversation itself; for a JSON object the first of the keys ``'text'``,
    ``'conversation'`` or ``'content'`` that is present supplies the
    conversation. Lines that yield no conversation (other JSON types, or
    objects without any of those keys) are skipped.

    Args:
        file_path: Path to the JSONL file (read as UTF-8).
        limit: Maximum number of conversations to return. ``None`` reads the
            whole file.

    Returns:
        List of conversation payloads in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    conversations = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # The limit applies to conversations actually collected, so skipped
            # lines do not eat into the budget.
            if limit is not None and len(conversations) >= limit:
                break

            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not valid JSON
                continue

            data = json.loads(line)
            count_before = len(conversations)

            # Extract conversation text from JSON
            if isinstance(data, str):
                conversations.append(data)
            elif isinstance(data, dict):
                # Try common field names to locate the conversation content
                for field in ('text', 'conversation', 'content'):
                    if field in data:
                        conversations.append(data[field])
                        break  # Stop once we find the relevant field

            # Status update every 1000 conversations that were really loaded
            if len(conversations) > count_before and len(conversations) % 1000 == 0:
                print(f"Loaded {len(conversations)} conversations...")

    print(f"Total loaded: {len(conversations)}")
    return conversations

# Load data: read the dataset file with a cap of 5000 (result is reused by the chunking cell)
conversations = load_conversations(input_file, limit=5000)

Loaded 1000 conversations...
Loaded 2000 conversations...
Loaded 3000 conversations...
Loaded 4000 conversations...
Loaded 5000 conversations...
Total loaded: 5000


## **Chunk a Single Conversation**

In [None]:
def chunk_conversation(text, conv_id):
    """Break one conversation into question/answer chunks.

    The text is cut in front of every ``<human>:`` marker. Each resulting
    segment that contains the marker becomes one chunk: the question is the
    text between ``<human>:`` and ``<bot>:`` (or the end of the segment), the
    answer is whatever follows ``<bot>:`` (empty string when there is none).

    Args:
        text: Conversation string using ``<human>:`` / ``<bot>:`` markers.
        conv_id: Identifier stored on every chunk and embedded in ``chunk_id``.

    Returns:
        List of dicts with keys ``chunk_id``, ``conversation_id``,
        ``question``, ``answer`` and ``full_text``.
    """
    human_marker = '<human>:'
    question_pattern = r'<human>:\s*(.*?)(?=<bot>:|$)'
    answer_pattern = r'<bot>:\s*(.*?)(?=<human>:|$)'

    records = []

    # turn_idx counts every split piece, including pieces that get skipped below
    for turn_idx, raw_segment in enumerate(re.split(r'(?=<human>:)', text)):
        segment = raw_segment.strip()

        # An empty segment cannot contain the marker, so one test covers both cases
        if human_marker not in segment:
            continue

        # re.DOTALL lets '.' cross newlines, so multi-line questions/answers work
        question_hit = re.search(question_pattern, segment, re.DOTALL)
        if question_hit is None:
            continue

        answer_hit = re.search(answer_pattern, segment, re.DOTALL)
        reply = "" if answer_hit is None else answer_hit.group(1).strip()

        records.append({
            'chunk_id': f"conv_{conv_id}_turn_{turn_idx}",
            'conversation_id': conv_id,
            'question': question_hit.group(1).strip(),
            'answer': reply,
            'full_text': segment
        })

    return records

In [None]:
# Chunk every loaded conversation; a conversation's position in the list is its id
all_chunks = []

print("Processing conversations...")
for conv_number, conv_text in enumerate(conversations, start=1):
    all_chunks += chunk_conversation(conv_text, conv_id=conv_number - 1)

    # Progress line after every 500 conversations
    if conv_number % 500 == 0:
        print(f"Processed {conv_number} conversations → {len(all_chunks)} chunks")

print(f"\n✅ Total chunks created: {len(all_chunks)}")

Processing conversations...
Processed 500 conversations → 500 chunks
Processed 1000 conversations → 1000 chunks
Processed 1500 conversations → 1500 chunks
Processed 2000 conversations → 2000 chunks
Processed 2500 conversations → 2500 chunks
Processed 3000 conversations → 3000 chunks
Processed 3500 conversations → 3500 chunks
Processed 4000 conversations → 4000 chunks
Processed 4500 conversations → 4500 chunks
Processed 5000 conversations → 5000 chunks

✅ Total chunks created: 5000


In [None]:
# Preview: print question and answer of (at most) the first two chunks
print("\n--- PREVIEW ---")
for i, chunk in enumerate(all_chunks[:2]):
    # An empty answer string is shown as a placeholder
    answer_text = chunk['answer'] or 'No answer'
    print(f"\nChunk {i+1}:")
    print(f"Question: {chunk['question']}")
    print(f"Answer: {answer_text}")


--- PREVIEW ---

Chunk 1:
Question: I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!
Answer: Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV), a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with movements. Accompanying nausea and vomiting are common. The condition is due to problem in the ear, and improves in a fe

In [None]:
# Install required packages
!pip install sentence-transformers faiss-cpu rank-bm25 transformers torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25, faiss-cpu
Successfully installed faiss-cpu-1.12.0 rank-bm25-0.2.2


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import pickle
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import json
import os

## **Embedding**

In [None]:
# Choose embedding model (the same model object is reused later to encode search queries)
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

print(f"\nLoading embedding model: {EMBEDDING_MODEL}")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
# Vector width reported by the model; only used for the status line below
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"✅ Model loaded! Embedding dimension: {embedding_dim}")


#Prepare texts for embedding (combining question + answer)
def prepare_texts_for_embedding(chunks):
    """Render each chunk as a single "Question: ... / Answer: ..." string.

    Args:
        chunks: Iterable of dicts that have ``'question'`` and ``'answer'`` keys.

    Returns:
        List of strings, one per chunk, in the same order as ``chunks``.
    """
    return [
        f"Question: {item['question']}\nAnswer: {item['answer']}"
        for item in chunks
    ]

# One text per chunk, index-aligned with all_chunks; reused for both the embeddings and the BM25 corpus
texts_to_embed = prepare_texts_for_embedding(all_chunks)
print(f"\nPrepared {len(texts_to_embed)} texts for embedding")


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded! Embedding dimension: 384

Prepared 5000 texts for embedding


In [None]:
# Generate embeddings for every prepared text (row i corresponds to texts_to_embed[i])
print("\nGenerating embeddings...")
embeddings = embedding_model.encode(
    texts_to_embed,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # Normalize for cosine similarity
)

print(f"✅ Embeddings generated!")
print(f"Shape: {embeddings.shape}")
# nbytes is converted from bytes to MiB for display
print(f"Memory size: {embeddings.nbytes / (1024**2):.2f} MB")


Generating embeddings...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Embeddings generated!
Shape: (5000, 384)
Memory size: 7.32 MB


In [None]:
# ========================================================
# STORE EMBEDDINGS IN FAISS AND CREATE INDEX
# ========================================================


# Create FAISS index
# Vector width is read from the embedding matrix (second axis of embeddings.shape)
dimension = embeddings.shape[1]

# Using IndexFlatIP for Inner Product (cosine similarity with normalized vectors)
faiss_index = faiss.IndexFlatIP(dimension)

# Add embeddings to index (cast to float32 before handing them to FAISS)
faiss_index.add(embeddings.astype('float32'))
print(f"✅ FAISS index created with {faiss_index.ntotal} vectors")

# Save FAISS index into the output folder on Drive
faiss_index_path = os.path.join(output_dir, 'faiss_index.bin')
faiss.write_index(faiss_index, faiss_index_path)
print(f"✅ FAISS index saved to: {faiss_index_path}")

✅ FAISS index created with 5000 vectors
✅ FAISS index saved to: /content/drive/MyDrive/healthcare_rag_chunks/faiss_index.bin


In [None]:
# ========================================================
# PREPARE BM25 FOR KEYWORD SEARCH
# ========================================================


# Tokenize texts for BM25
def simple_tokenize(text):
    """Tokenize text for BM25: lowercase, then keep runs of word characters.

    Punctuation is dropped so that "vertigo." and "vertigo," both become the
    token "vertigo"; a plain whitespace split would keep them as distinct
    tokens that never match each other. Relies on the ``re`` module imported
    at the top of the notebook.

    Args:
        text: Input string (document or query).

    Returns:
        List of lowercase tokens; empty list for empty or punctuation-only text.
    """
    return re.findall(r"\w+", text.lower())

# Tokenize the same texts that were embedded, so BM25 document i and FAISS vector i refer to the same chunk
tokenized_corpus = [simple_tokenize(text) for text in texts_to_embed]
print(f"Tokenized {len(tokenized_corpus)} documents")

# Create BM25 index
bm25 = BM25Okapi(tokenized_corpus)
print("✅ BM25 index created")

# Save BM25 index
# NOTE: the file is a pickle; only ever unpickle it from a location you trust
bm25_path = os.path.join(output_dir, 'bm25_index.pkl')
with open(bm25_path, 'wb') as f:
    pickle.dump(bm25, f)
print(f"✅ BM25 index saved to: {bm25_path}")


Tokenized 5000 documents
✅ BM25 index created
✅ BM25 index saved to: /content/drive/MyDrive/healthcare_rag_chunks/bm25_index.pkl


In [None]:
# ========================================================
#  LOAD RERANKER MODEL
# ========================================================

# Load cross-encoder for reranking (scores [query, chunk text] pairs in hybrid_search)
RERANKER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
print(f"Loading reranker: {RERANKER_MODEL}")
reranker = CrossEncoder(RERANKER_MODEL)
print("✅ Reranker loaded")

In [None]:
# ========================================================
# HYBRID SEARCH FUNCTION WITH RRF (Reciprocal Rank Fusion)
# ========================================================

def semantic_search(query, top_k=20):
    """Perform semantic search using FAISS.

    The query is encoded with the notebook-level ``embedding_model``
    (normalized, like the indexed vectors) and looked up in ``faiss_index``.

    Args:
        query: Query string.
        top_k: Maximum number of neighbours to return.

    Returns:
        List of dicts with ``chunk_idx`` (row in the index), ``score``
        (inner product), ``rank`` (1-based) and ``source='semantic'``.
        The list may be shorter than ``top_k`` and is empty when the index
        holds no vectors or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than the index holds
    top_k = min(top_k, faiss_index.ntotal)
    if top_k <= 0:
        return []

    # Encode query
    query_embedding = embedding_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # Search in FAISS
    scores, indices = faiss_index.search(query_embedding.astype('float32'), top_k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        # FAISS reports a missing neighbour as label -1. Used as a list index
        # that would silently select the LAST chunk, so drop it here.
        if idx < 0:
            continue
        results.append({
            'chunk_idx': int(idx),
            'score': float(score),
            'rank': len(results) + 1,  # Rank starts from 1, counted over kept results
            'source': 'semantic'
        })

    return results

def keyword_search(query, top_k=20):
    """Perform keyword search using BM25.

    The query is tokenized with ``simple_tokenize`` and scored against the
    notebook-level ``bm25`` index. Only documents with a non-zero score are
    returned: a score of exactly 0 means no query term occurs in the document,
    and ranking such documents would hand them reciprocal-rank credit in the
    fusion step for no reason.

    Args:
        query: Query string.
        top_k: Maximum number of documents to return.

    Returns:
        List of dicts with ``chunk_idx``, ``score``, ``rank`` (1-based) and
        ``source='keyword'``, best score first. May be shorter than ``top_k``
        (empty when nothing matches).
    """
    tokenized_query = simple_tokenize(query)
    scores = bm25.get_scores(tokenized_query)

    # Restrict to documents that matched at least one query term
    matched = np.flatnonzero(scores)

    # Get top-k indices among the matched documents, highest score first
    top_indices = matched[np.argsort(scores[matched])[::-1][:top_k]]

    results = []
    for rank, idx in enumerate(top_indices):
        results.append({
            'chunk_idx': int(idx),
            'score': float(scores[idx]),
            'rank': rank + 1,  # Rank starts from 1
            'source': 'keyword'
        })

    return results


def reciprocal_rank_fusion(semantic_results, keyword_results, k=60):
    """
    Merge two ranked result lists with Reciprocal Rank Fusion (RRF).

    For each document: score(d) = 1 / (k + semantic_rank) + 1 / (k + keyword_rank).
    A document that is missing from one list is given an infinite rank there,
    so that list contributes 0 to its score.

    Args:
        semantic_results: Results from semantic search (dicts with 'chunk_idx' and 'rank')
        keyword_results: Results from keyword search (dicts with 'chunk_idx' and 'rank')
        k: RRF constant (default: 60)

    Returns:
        List of dicts ('chunk_idx', 'rrf_score', 'semantic_rank', 'keyword_rank')
        sorted by RRF score, highest first; a rank is None when the document
        was absent from that list.
    """
    def rank_lookup(results):
        # chunk_idx -> rank; a repeated chunk_idx keeps the rank seen last
        return {entry['chunk_idx']: entry['rank'] for entry in results}

    semantic_ranks = rank_lookup(semantic_results)
    keyword_ranks = rank_lookup(keyword_results)

    not_ranked = float('inf')

    # Score every document that shows up in at least one of the two lists
    scored = {}
    for doc_id in set(semantic_ranks.keys()) | set(keyword_ranks.keys()):
        semantic_part = 1 / (k + semantic_ranks.get(doc_id, not_ranked))
        keyword_part = 1 / (k + keyword_ranks.get(doc_id, not_ranked))
        scored[doc_id] = semantic_part + keyword_part

    ordered = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)

    return [
        {
            'chunk_idx': doc_id,
            'rrf_score': score,
            'semantic_rank': semantic_ranks.get(doc_id, None),
            'keyword_rank': keyword_ranks.get(doc_id, None)
        }
        for doc_id, score in ordered
    ]

In [None]:
def hybrid_search(query, top_k_semantic=20, top_k_keyword=20, rrf_k=60, final_top_k=10):
    """
    Perform hybrid search combining semantic and keyword search using RRF + Cross-Encoder Reranking

    Pipeline:
    1. Semantic Search (FAISS) → top_k_semantic results
    2. Keyword Search (BM25) → top_k_keyword results
    3. Reciprocal Rank Fusion (RRF) → combine and rank
    4. Cross-Encoder Reranking → final ranking

    Args:
        query: Query string.
        top_k_semantic: Number of candidates requested from semantic search.
        top_k_keyword: Number of candidates requested from keyword search.
        rrf_k: Constant passed to reciprocal_rank_fusion.
        final_top_k: Number of results to return after reranking.

    Returns:
        List (at most final_top_k long, best rerank score first) of dicts with
        'chunk_idx', 'rrf_score', 'rerank_score', 'semantic_rank',
        'keyword_rank' and 'chunk' (the entry from all_chunks). Empty list
        when there is nothing to rerank.
    """
    print(f"\nQuery: '{query[:100]}...'")
    print("-"*60)

    # Step 1: Semantic search
    semantic_results = semantic_search(query, top_k=top_k_semantic)

    # Step 2: Keyword search (BM25)
    keyword_results = keyword_search(query, top_k=top_k_keyword)

    # Step 3: Reciprocal Rank Fusion (RRF)
    fused_results = reciprocal_rank_fusion(semantic_results, keyword_results, k=rrf_k)

    # Step 4: Cross-Encoder Reranking
    # Only indices that really address a chunk are kept: a negative index
    # would silently wrap around to the end of all_chunks.
    valid_results = [
        result for result in fused_results
        if 0 <= result['chunk_idx'] < len(all_chunks)
    ]

    # Rerank 3x the final amount so the cross-encoder can promote lower-ranked candidates
    candidates = valid_results[:max(final_top_k, 0) * 3]

    # Nothing retrieved: skip the cross-encoder instead of calling it with an empty batch
    if not candidates:
        return []

    rerank_pairs = []
    for result in candidates:
        chunk = all_chunks[result['chunk_idx']]
        chunk_text = f"Question: {chunk['question']}\nAnswer: {chunk['answer']}"
        rerank_pairs.append([query, chunk_text])

    # Get cross-encoder reranking scores (one score per pair, same order)
    rerank_scores = reranker.predict(rerank_pairs)

    # Combine RRF scores and reranker scores
    final_results = []
    for rrf_result, rerank_score in zip(candidates, rerank_scores):
        idx = rrf_result['chunk_idx']
        final_results.append({
            'chunk_idx': idx,
            'rrf_score': rrf_result['rrf_score'],
            'rerank_score': float(rerank_score),
            'semantic_rank': rrf_result['semantic_rank'],
            'keyword_rank': rrf_result['keyword_rank'],
            'chunk': all_chunks[idx]
        })

    # Sort by cross-encoder reranking score (final ranking)
    final_results.sort(key=lambda x: x['rerank_score'], reverse=True)

    # Return top-k after reranking
    return final_results[:final_top_k]


# print("✅ Hybrid search function with RRF defined")

In [None]:
# ========================================================
#  LOAD GENERATION MODEL (FLAN-T5)
# ========================================================


# Seq2seq model used by generate_answer to produce the final answer text
GENERATOR_MODEL = 'google/flan-t5-base'

print(f"Loading generator model: {GENERATOR_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
generator_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_MODEL)

# Move to GPU if available (tokenized inputs are moved to the same device in generate_answer)
device = "cuda" if torch.cuda.is_available() else "cpu"
generator_model = generator_model.to(device)
print(f"✅ Generator loaded on device: {device}")


Loading generator model: google/flan-t5-base


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Generator loaded on device: cpu


In [None]:
def generate_answer(query, context, max_length=1024):
    """
    Generate answer using FLAN-T5-base with improved prompt and fallback

    The doctor's reply is taken from ``context`` (the text after the last
    "Doctor's Answer:", "Doctor:" or "Doctor's Response:" label, otherwise
    the whole context), the boilerplate sentence "Thank you for posting your
    query" is removed, and the result is placed in a prompt together with the
    question. If the generated text fails the quality checks, the extracted
    doctor's reply is returned instead.

    Uses the notebook-level ``tokenizer``, ``generator_model`` and ``device``,
    and the ``re`` module imported at the top of the notebook.

    Args:
        query: Patient question (only the first 300 characters enter the prompt).
        context: Retrieved context text (only the first 1000 characters of the
            extracted reply enter the prompt).
        max_length: Maximum length passed to ``generator_model.generate``.

    Returns:
        The generated answer, or the extracted doctor's reply as a fallback.
    """
    # Extract doctor's answer from context; the first label that is present wins.
    # If no label is found, the entire context is assumed to be the answer.
    doctor_answer = context
    for label in ("Doctor's Answer:", "Doctor:", "Doctor's Response:"):
        if label in context:
            doctor_answer = context.split(label)[-1].strip()
            break

    # Clean up common artifacts.
    # The greeting sentence is REMOVED, not used as a cut point: replies
    # usually open with "Hi, Thank you for posting your query. ...", so cutting
    # at the phrase would keep only "Hi," and throw the actual advice away.
    doctor_answer = re.sub(
        r"thank you for posting your query\.?\s*",
        "",
        doctor_answer,
        flags=re.IGNORECASE
    ).strip()

    # If doctor's answer is too short, something went wrong - use extractive fallback
    if len(doctor_answer) < 50:
        print(f"⚠️  Doctor's answer too short ({len(doctor_answer)} chars), using full context")
        doctor_answer = context

    # Simplified prompt for FLAN-T5
    prompt = f"""Based on this medical information, answer the patient's question.
    Medical Information:
{doctor_answer[:1000]}

Patient Question: {query[:300]}

Medical Answer:"""

    # Tokenize (prompt is truncated to 1024 tokens)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    ).to(device)

    print(f"📊 Input tokens: {inputs['input_ids'].shape[1]}")

    # Generate: deterministic beam search, no gradients needed for inference
    with torch.no_grad():
        outputs = generator_model.generate(
            **inputs,
            max_length=max_length,
            min_length=50,
            num_beams=6,
            no_repeat_ngram_size=4,
            length_penalty=1.5,
            early_stopping=True,
            do_sample=False,
            repetition_penalty=1.3
        )

    # Decode
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Quality checks: too short, starts by echoing the question, or has no full sentence
    answer_lower = answer.lower()
    query_start = query[:50].lower()

    is_bad_generation = (
        len(answer) < 30 or
        answer_lower.startswith(query_start) or
        answer.count(".") == 0
    )

    if is_bad_generation:
        print("⚠️  Generation failed, returning doctor's answer directly")
        return doctor_answer.strip()

    return answer.strip()


# Cell-ran marker; at this point only generate_answer has been (re)defined by this cell
print("✅ Updated RAG pipeline defined")

✅ Updated RAG pipeline defined


In [None]:
def rag_pipeline(query, top_k=10, use_top_n_for_context=1):
    """
    Complete RAG pipeline with proper context handling

    Runs hybrid_search, builds a "Doctor's Answer: ..." context from the
    best-ranked chunk(s) and passes it to generate_answer.

    Args:
        query: Patient question.
        top_k: Number of reranked chunks requested from hybrid_search.
        use_top_n_for_context: How many of the best-ranked chunks contribute
            their answer to the context (values below 1 are treated as 1).
            With the default of 1 only the top chunk's answer is used.

    Returns:
        Dict with 'query', 'answer', 'context', 'method' and 'top_chunks'
        (up to five summaries with truncated question/answer text).
        When nothing is retrieved, 'context' is None, 'top_chunks' is empty
        and 'method' is 'no_results'.
    """
    print("\n" + "="*80)
    print("RAG PIPELINE EXECUTION")
    print("="*80)

    # Step 1: Hybrid search with reranking
    search_results = hybrid_search(
        query,
        top_k_semantic=20,
        top_k_keyword=20,
        final_top_k=top_k
    )

    # Step 2: Check if we have results
    if not search_results:
        return {
            'query': query,
            'answer': "I couldn't find relevant information to answer your question.",
            'context': None,
            'top_chunks': [],
            'method': 'no_results'
        }

    # Step 3: Build context from the FULL answers of the best-ranked chunk(s).
    # All answers go under a single "Doctor's Answer:" label because
    # generate_answer keeps only the text after the LAST such label.
    context_size = max(1, int(use_top_n_for_context))
    context_answers = [
        r['chunk']['answer']
        for r in search_results[:context_size]
        if r['chunk']['answer']
    ]
    doctor_answer = "\n\n".join(context_answers)

    full_context = f"Doctor's Answer: {doctor_answer}"

    print(f"🎯 Top result rerank score: {search_results[0]['rerank_score']:.4f}")

    # Step 4: Generate answer
    print(f"\n🤖 Generating answer with FLAN-T5-base...")
    answer = generate_answer(query, full_context, max_length=1024)

    # Determine method.
    # NOTE(review): this is a length heuristic only - generate_answer does not
    # report whether it fell back to the extracted doctor's answer.
    method = 'extracted' if len(answer) > 500 else 'generated'

    # Return results
    return {
        'query': query,
        'answer': answer,
        'context': full_context,
        'method': method,
        'top_chunks': [
            {
                'chunk_id': r['chunk']['chunk_id'],
                'question': r['chunk']['question'][:200] + "...",
                'answer': r['chunk']['answer'][:200] + "...",
                'rerank_score': r['rerank_score']
            }
            for r in search_results[:5]
        ]
    }


In [None]:
# ========================================================
# STEP 8: TEST THE COMPLETE SYSTEM
# ========================================================


# Test queries
test_queries = [
   "hi my nine year old son had a cough and flu symptons three months ago and the chesty sounding cough and green phlegm still remains. it did seem to get better but never totally went and has now picked up again...he has only ever had antibiotics once in his live which suggests he his generally fitand well and active...never short of breath or weezy...so why would this be...."]
for test_query in test_queries:
    print("\n" + "="*80)
    result = rag_pipeline(test_query, top_k=10, use_top_n_for_context=1)

    # rag_pipeline returns context=None when nothing was retrieved; slicing None would raise
    context_used = result['context'] or ''

    # print(f"\n📝 QUERY: {result['query']}")
    print(f"\n💡 GENERATED ANSWER:\n{result['answer']}")
    print(f"\n📚 CONTEXT USED:\n{context_used[:500]}...")
    print(f"\n🔍 TOP RETRIEVED CHUNKS:")
    for i, chunk in enumerate(result['top_chunks'][:3], 1):
        print(f"\n  {i}. Score: {chunk['rerank_score']:.4f}")
        print(f"     Q: {chunk['question'][:300]}...")
        print(f"     A: {chunk['answer'][:300]}...")




RAG PIPELINE EXECUTION

Query: 'hi my nine year old son had a cough and flu symptons three months ago and the chesty sounding cough ...'
------------------------------------------------------------
🎯 Top result rerank score: 8.3048

🤖 Generating answer with FLAN-T5-base...
📊 Input tokens: 207

💡 GENERATED ANSWER:
he has only ever had antibiotics once in his live which suggests he is generally fit and has a cough and flu symptoms three months ago and the chesty sounding cough and green phlegm still remains. It did seem to get better but never totally went and has now picked up again

📚 CONTEXT USED:
Doctor's Answer: Hi, If the symptoms persist that long this suggests the presence of an allergic element. To treat that kind of allergy you can give him an over the counter antihistamines once daily before going to bed. A cough suppressant as dextromethorphan and an expectorant as Murine will help reduce the cough. Make sure he Chat Doctor.  If the green phlegm persists he might require an

In [None]:
# =======================================================================
# STEP 8: TEST THE COMPLETE SYSTEM WITH DATASET QUERIES(only for 50 data)
# =======================================================================

import csv
from datetime import datetime

# Load the first 50 conversations and extract questions
# NOTE(review): these conversations come from the same file (and the same
# leading rows) that was indexed above, so each test question can retrieve
# its own chunk - scores from this run are not a held-out evaluation.
print("Loading conversations for testing...")
test_conversations = load_conversations(input_file, limit=50)

# Extract all questions from the first 50 conversations
test_queries = []
for conv_id, conversation in enumerate(test_conversations):
    chunks = chunk_conversation(conversation, conv_id)
    for chunk in chunks:
        if chunk['question']:  # Only add non-empty questions
            test_queries.append(chunk['question'])

print(f"\nExtracted {len(test_queries)} questions from first 50 conversations")
print(f"Testing first 50 questions...\n")

# Prepare CSV file (timestamped name, written to the current working directory)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"rag_test_results_{timestamp}.csv"

# Open CSV file for writing
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['query_number', 'question', 'generated_answer', 'context_snippet', 'top_score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Test the first 50 questions
    for idx, test_query in enumerate(test_queries[:50], 1):
        result = rag_pipeline(test_query, top_k=10, use_top_n_for_context=1)

        # rag_pipeline returns context=None when nothing was retrieved;
        # slicing None would abort the whole run with a TypeError
        context_used = result['context'] or ''

        # Write to CSV
        writer.writerow({
            'query_number': idx,
            'question': test_query,
            'generated_answer': result['answer'],
            'context_snippet': context_used[:500],  # First 500 chars of context
            'top_score': result['top_chunks'][0]['rerank_score'] if result['top_chunks'] else 0
        })

        # Flush to ensure data is written immediately
        csvfile.flush()

print("\n" + "="*80)
print(f"Testing complete! Processed {min(50, len(test_queries))} queries.")
print(f"Results saved to: {csv_filename}")
print("="*80)

Loading conversations for testing...
Total loaded: 50

Extracted 50 questions from first 50 conversations
Testing first 50 questions...


RAG PIPELINE EXECUTION

Query: 'I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bat...'
------------------------------------------------------------
🎯 Top result rerank score: 5.7516

🤖 Generating answer with FLAN-T5-base...
⚠️  Doctor's answer too short (3 chars), using full context
📊 Input tokens: 266
⚠️  Generation failed, returning doctor's answer directly

RAG PIPELINE EXECUTION

Query: 'My baby has been pooing 5-6 times a day for a week. In the last few days it has increased to 7 and t...'
------------------------------------------------------------
🎯 Top result rerank score: 8.7373

🤖 Generating answer with FLAN-T5-base...
📊 Input tokens: 220
⚠️  Generation failed, returning doctor's answer directly

RAG PIPELINE EXECUTION

Query: 'Hello, My husband is taking Oxycodone due to a broken leg/surger