In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install -q datasets sentence-transformers faiss-cpu transformers nltk accelerate keybert[gensim] sentencepiece scikit-learn

In [5]:
import os, sys, time, json
import warnings
warnings.filterwarnings("ignore")

from datasets import load_dataset
import numpy as np
import torch
from sklearn.model_selection import train_test_split

# NLP imports
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, CrossEncoder, util
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, pipeline
from keybert import KeyBERT

def log(message):
    print(f"[{time.strftime('%H:%M:%S')}] {message}")

log("All imports successful!")

2025-09-01 15:18:19.254030: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756739899.608921      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756739899.709990      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[15:18:40] All imports successful!


In [6]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

log("Loading medical QA dataset...")

dataset = load_dataset("Malikeh1375/medical-question-answering-datasets", name="all-processed", split="train")
qa_data = [{"question": row["input"], "answer": row["output"]} for row in dataset]

train_data, test_data = train_test_split(qa_data, test_size=0.2, random_state=42)

log(f"Dataset loaded: {len(train_data)} train, {len(test_data)} test samples")
print("Sample QA:", train_data[0])


CUDA available: True
GPU name: Tesla T4
GPU memory: 15.8GB
[15:18:40] Loading medical QA dataset...


README.md: 0.00B [00:00, ?B/s]

all-processed/train-00000-of-00001-9bfe4(…):   0%|          | 0.00/160M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/246678 [00:00<?, ? examples/s]

[15:18:53] Dataset loaded: 197342 train, 49336 test samples
Sample QA: {'question': 'Dr., This is Himadri Dhar, my son Adriz is 3 years old and today around 4-5 hours ago we have taken cheese popcorn from outside stall and right now he is crying like anything for stomach pain. We are also suffering from gas formation. He does not have any other problem and not taking any other medicines at present. Can you please advise medicine for him?', 'answer': 'hi, welcome to chatbot. i see similar cases in my clinic every day. your child probably has food poisoning. if i was your treating doctor, i would have given syrup cyclops 5 ml po three times a day for 2 days and syrup zoxakind-o 3.5 ml po three times a day for 3 days. however since this is a prescription medicine, i advice you to meet the local doctor to confirm the diagnosis. also, avoid food from that stall to prevent similar problem in the future. i hope this has helped you. take care. regards - chatbot.'}


In [7]:

# Sentence-window chunks for retrieval
def create_sentence_chunks(data, window_size=3, stride=1, min_chars=50):
    chunks = []
    for idx, item in enumerate(data):
        text = item.get('answer') or item.get('text', '')
        sentences = sent_tokenize(text)

        if not sentences:
            if len(text) > min_chars:
                chunks.append({"chunk": text, "source_idx": idx})
            continue

        if len(sentences) <= window_size:
            chunks.append({"chunk": " ".join(sentences), "source_idx": idx})
            continue

        for i in range(0, max(1, len(sentences) - window_size + 1), stride):
            chunk = " ".join(sentences[i:i+window_size])
            chunks.append({"chunk": chunk, "source_idx": idx})

    return chunks

try:
    sent_tokenize("This is a test sentence.")
except LookupError:
    nltk.download('punkt_tab', quiet=True)
    nltk.download('punkt', quiet=True)

sentence_chunks = create_sentence_chunks(train_data, window_size=3, stride=1)
log(f"Created {len(sentence_chunks)} sentence chunks")

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

log(f"Loading embedding model: {EMBED_MODEL}")
embedder = SentenceTransformer(EMBED_MODEL, device="cuda" if torch.cuda.is_available() else "cpu")

chunk_texts = [chunk["chunk"] for chunk in sentence_chunks]
log("Encoding chunks (this may take a while)...")
chunk_embeddings = embedder.encode(
    chunk_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32
)

dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(chunk_embeddings.astype("float32"))

log(f"FAISS index built with {index.ntotal} vectors, dimension: {dimension}")


[15:19:08] Created 754364 sentence chunks
[15:19:08] Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[15:19:16] Encoding chunks (this may take a while)...


Batches:   0%|          | 0/23574 [00:00<?, ?it/s]

[15:28:02] FAISS index built with 754364 vectors, dimension: 384


In [8]:
HYDE_MODEL = "gpt2"

log(f"Loading HyDE model: {HYDE_MODEL}")
hyde_tokenizer = AutoTokenizer.from_pretrained(HYDE_MODEL)
if hyde_tokenizer.pad_token is None:
    hyde_tokenizer.pad_token = hyde_tokenizer.eos_token

hyde_model = AutoModelForCausalLM.from_pretrained(HYDE_MODEL)
hyde_model = hyde_model.to("cuda" if torch.cuda.is_available() else "cpu")

def get_hyde_answer(query, max_length=80):
    prompt = f"Question: {query}\nAnswer:"

    inputs = hyde_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(hyde_model.device if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = hyde_model.generate(
            **inputs,
            max_length=inputs['input_ids'].shape[1] + max_length,
            do_sample=False,  # FIXED: Removed temperature/top_p
            repetition_penalty=1.2,  # FIXED: Added to reduce repetition
            pad_token_id=hyde_tokenizer.eos_token_id
        )

    text = hyde_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.split("Answer:")[-1].strip()

log("HyDE model loaded successfully")


[15:28:02] Loading HyDE model: gpt2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[15:28:06] HyDE model loaded successfully


In [9]:
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

log(f"Loading reranker: {RERANKER_MODEL}")
reranker = CrossEncoder(RERANKER_MODEL, device="cuda" if torch.cuda.is_available() else "cpu")

def retrieve_with_hyde(query, k=10):
    hyde_answer = get_hyde_answer(query)
    hyde_emb = embedder.encode([hyde_answer], normalize_embeddings=True, convert_to_numpy=True)
    distances, indices = index.search(hyde_emb.astype("float32"), k)
    retrieved_chunks = [sentence_chunks[i]["chunk"] for i in indices[0]]
    return retrieved_chunks, indices[0], distances[0]

def rerank_chunks(query, chunks, top_k=5):
    if not chunks:
        return []
    pairs = [[query, chunk] for chunk in chunks]
    scores = reranker.predict(pairs)
    ranked_chunks = [chunk for _, chunk in sorted(zip(scores, chunks), key=lambda x: x[0], reverse=True)]
    return ranked_chunks[:top_k]

log("Reranker loaded successfully")

[15:28:06] Loading reranker: cross-encoder/ms-marco-MiniLM-L-6-v2


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

[15:28:08] Reranker loaded successfully


In [10]:
GEN_MODEL = "microsoft/DialoGPT-medium"

log(f"Loading generation model: {GEN_MODEL}")
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
if gen_tokenizer.pad_token is None:
    gen_tokenizer.pad_token = gen_tokenizer.eos_token

gen_model = AutoModelForCausalLM.from_pretrained(GEN_MODEL)
gen_model = gen_model.to("cuda" if torch.cuda.is_available() else "cpu")

def generate_answer(query, context, max_new_tokens=100):
    prompt = f"""Answer the question using the context provided.

Context: {context}

Question: {query}

Answer:"""

    inputs = gen_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(gen_model.device if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # FIXED: Removed temperature/top_p
            pad_token_id=gen_tokenizer.eos_token_id
        )

    text = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = text.split("Answer:")[-1].strip()
    return answer if answer else "Unable to generate answer."  # FIXED: Handle empty output

log("Generation model loaded successfully")


[15:28:08] Loading generation model: microsoft/DialoGPT-medium


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[15:28:16] Generation model loaded successfully


In [11]:
kw_model = KeyBERT(model=embedder)
STOPWORDS = set(stopwords.words('english'))

def extract_keywords(text, top_n=8):
    if not text or len(text.split()) < 3:
        return []
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=top_n
    )
    return [kw for kw, score in keywords]

def retrieval_relevance_score(query, top_k=5):
    q_emb = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    distances, indices = index.search(q_emb.astype("float32"), top_k)
    scores = np.clip(distances[0], -1.0, 1.0)
    similarity_scores = (scores + 1.0) / 2.0
    return float(similarity_scores.mean())

def answer_completeness_score(answer, retrieved_chunks, sim_threshold=0.65):
    all_facts = []
    for chunk in retrieved_chunks:
        facts = extract_keywords(chunk, top_n=4)
        all_facts.extend(facts)

    unique_facts = list(dict.fromkeys([f.lower() for f in all_facts if f]))
    if not unique_facts:
        return 0.0

    answer_emb = embedder.encode([answer], normalize_embeddings=True, convert_to_numpy=True)
    fact_embs = embedder.encode(unique_facts, normalize_embeddings=True, convert_to_numpy=True)

    similarities = util.cos_sim(answer_emb, fact_embs).cpu().numpy()[0]
    covered_facts = (similarities >= sim_threshold).sum()

    return float(covered_facts / len(unique_facts))

def faithfulness_score(answer, context_chunks, sentence_threshold=0.5):
    if not answer or not context_chunks:
        return 0.0

    answer_sentences = sent_tokenize(answer)
    if not answer_sentences:
        return 0.0

    context_text = "\n".join(context_chunks)
    context_sentences = sent_tokenize(context_text)
    if not context_sentences:
        return 0.0

    try:
        answer_sentence_embeddings = embedder.encode(answer_sentences, normalize_embeddings=True, convert_to_numpy=True)
        context_sentence_embeddings = embedder.encode(context_sentences, normalize_embeddings=True, convert_to_numpy=True)

        similarity_matrix = util.cos_sim(answer_sentence_embeddings, context_sentence_embeddings).cpu().numpy()
        max_similarities = np.max(similarity_matrix, axis=1)
        faithful_sentences_count = (max_similarities >= sentence_threshold).sum()
        score = faithful_sentences_count / len(answer_sentences)
        return float(score)
    except Exception as e:
        log(f"Error calculating faithfulness score: {e}")
        return 0.0

log("Evaluation metrics set up")


[15:28:16] Evaluation metrics set up


In [12]:
def rag_pipeline_with_metrics(query, retrieve_k=10, rerank_k=5, context_chunks=3):
    log(f"Processing query: {query[:100]}...")

    retrieved_chunks, indices, distances = retrieve_with_hyde(query, k=retrieve_k)
    reranked_chunks = rerank_chunks(query, retrieved_chunks, top_k=rerank_k)
    context = "\n\n".join(reranked_chunks[:context_chunks])
    answer = generate_answer(query, context)

    retrieval_score = retrieval_relevance_score(query, top_k=retrieve_k)
    completeness_score = answer_completeness_score(answer, reranked_chunks[:context_chunks])

    context_emb = embedder.encode([context], normalize_embeddings=True, convert_to_numpy=True)
    answer_emb = embedder.encode([answer], normalize_embeddings=True, convert_to_numpy=True)
    faithfulness_score_val = float(util.cos_sim(context_emb, answer_emb).cpu().numpy()[0][0])

    weights = {"retrieval": 0.4, "completeness": 0.3, "faithfulness": 0.3}
    composite_score = (
        weights["retrieval"] * retrieval_score +
        weights["completeness"] * completeness_score +
        weights["faithfulness"] * faithfulness_score_val
    )

    return {
        "query": query,
        "answer": answer,
        "context": reranked_chunks[:context_chunks],
        "retrieval_score": retrieval_score,
        "completeness_score": completeness_score,
        "faithfulness_score": faithfulness_score_val,
        "composite_score": composite_score,
        "hyde_answer": get_hyde_answer(query)
    }

log("Complete RAG pipeline ready!")


[15:28:21] Complete RAG pipeline ready!


In [17]:
def test_pipeline():
    test_queries = [
      "What is the MRC grading scale for muscle power?"
    ]

    results = []
    for query in test_queries:
        try:
            result = rag_pipeline_with_metrics(query)
            results.append(result)

            print(f"\n{'='*50}")
            print(f"QUERY: {result['query']}")
            print(f"\nHyDE Answer: {result['hyde_answer'][:200]}...")
            print(f"\nGenerated Answer: {result['answer']}")
            print(f"\nScores:")
            print(f"  Retrieval: {result['retrieval_score']:.3f}")
            print(f"  Completeness: {result['completeness_score']:.3f}")
            print(f"  Faithfulness: {result['faithfulness_score']:.3f}")
            print(f"  Composite: {result['composite_score']:.3f}")

        except Exception as e:
            print(f"Error processing query '{query}': {e}")

    return results

test_results = test_pipeline()


[15:38:49] Processing query: What is the MRC grading scale for muscle power?...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


QUERY: What is the MRC grading scale for muscle power?

HyDE Answer: The MRR (Muscle Power Rating Scale) was developed by Dr. Robert J. Hirsch, MD and has been used to evaluate strength in men with muscular dystrophy since its inception as a clinical trial of anabolic ...

Generated Answer: Unable to generate answer.

Scores:
  Retrieval: 0.750
  Completeness: 0.000
  Faithfulness: 0.001
  Composite: 0.300


In [18]:
q = "What is the MRC grading scale for muscle power?"
log("Running pipeline (this may take a while on first load)...")
result = rag_pipeline_with_metrics(q, retrieve_k=10, rerank_k=5, context_chunks=3)
print("\n=== RESULT ===")
print("Query:", result["query"])
print("\nContext (top chunks):")
for c in result["context"]:
    print("-", c[:300].replace("\n", " "), "...")
print("\nGenerated Answer:\n", result["answer"])
print("\nScores: Retrieval=%.3f, Completeness=%.3f, Faithfulness=%.3f, Composite=%.3f" % (
    result["retrieval_score"], result["completeness_score"], result["faithfulness_score"], result["composite_score"]))


[15:38:51] Running pipeline (this may take a while on first load)...
[15:38:51] Processing query: What is the MRC grading scale for muscle power?...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


=== RESULT ===
Query: What is the MRC grading scale for muscle power?

Context (top chunks):
- Muscle strength and force development in high- and low-functioning elderly men: Influence of muscular and neural factors. ...
- The gold standard diagnostic test for Duchenne muscular dystrophy is genetic studies. This involves analyzing the patient's DNA for mutations in the dystrophin gene, which is responsible for producing a protein essential for muscle function. Duchenne muscular dystrophy is a genetic disorder that pri ...
- hi, thank you for providing the brief history of you. a thorough neuromuscular assessment is advised. based on the assessment the muscular strength will be determined, also an mri will help us determine the status of the soft tissue injury. ...

Generated Answer:
 Unable to generate answer.

Scores: Retrieval=0.750, Completeness=0.000, Faithfulness=0.001, Composite=0.300


In [15]:
def evaluate_on_testset(test_data, sample_size=10):
    log(f"Evaluating on {sample_size} test samples...")

    results = []
    for i, sample in enumerate(test_data[:sample_size]):
        try:
            result = rag_pipeline_with_metrics(sample['question'])
            result['ground_truth'] = sample['answer']
            results.append(result)

            if (i + 1) % 5 == 0:
                log(f"Processed {i + 1}/{sample_size} samples")

        except RuntimeError as e:
            if "device-side assert triggered" in str(e):
                log(f"Skipping sample {i} due to CUDA error: {e}")
                continue
            else:
                log(f"Error on sample {i}: {e}")
                continue
        except Exception as e:
            log(f"Error on sample {i}: {e}")
            continue

    if results:
        avg_retrieval = np.mean([r['retrieval_score'] for r in results])
        avg_completeness = np.mean([r['completeness_score'] for r in results])
        avg_faithfulness = np.mean([r['faithfulness_score'] for r in results])
        avg_composite = np.mean([r['composite_score'] for r in results])

        print(f"\n{'='*50}")
        print("EVALUATION RESULTS:")
        print(f"Average Retrieval Score: {avg_retrieval:.3f}")
        print(f"Average Completeness Score: {avg_completeness:.3f}")
        print(f"Average Faithfulness Score: {avg_faithfulness:.3f}")
        print(f"Average Composite Score: {avg_composite:.3f}")
        print(f"Total samples processed: {len(results)}")

    return results

eval_results = evaluate_on_testset(test_data, sample_size=10)


[15:28:27] Evaluating on 10 test samples...
[15:28:27] Processing query: my husband has hepatitis c but the liver biopsy was good. 0, 0, 1 I believe.  he is taking high dose...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:29] Processing query: Mina and Andersen, authors of the Perspectives in Science: COVID-19 Testing: One Size Does Not Fit A...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:32] Processing query: Q:A 2-month-old is brought to the physician for a well-child examination. She was born at 39 weeks g...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:34] Processing query: My grandmmother is 81 yrs old. Checked her bp several times over a 3hr period and always got 190/90....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:36] Processing query: What are the two main functions of the Vestibulocochlear nerve (CN VIII)?...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:38] Processed 5/10 samples
[15:28:38] Processing query: Background and importanceA greater benefit was suggested with early treatment with remdesivir agains...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:39] Processing query: Hello doctor,I have a few bumps on my penis skin. I do masturbate. I am concerned because I do not k...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:41] Processing query: my husband had throat cancer in 2004, hes had 2 mri an cat scan an bone scan, now the suregon saying...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:43] Processing query: What medical condition is suggested by the presence of headache, systemic symptoms such as fever or ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:45] Processing query:  Glaucoma Open-angle glaucoma Chronic glaucoma Chronic open-angle glaucoma Primary open-angle glauco...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[15:28:47] Processed 10/10 samples

EVALUATION RESULTS:
Average Retrieval Score: 0.848
Average Completeness Score: 0.062
Average Faithfulness Score: 0.205
Average Composite Score: 0.419
Total samples processed: 10
