In [1]:
!pip install spacy



In [2]:
# Download the small English spaCy pipeline
# (equivalent to running `python -m spacy download en_core_web_sm`).

import subprocess
import sys

try:
    # sys.executable is the interpreter running this kernel; a bare "python"
    # may resolve to a different installation on PATH.
    # stdout and stderr are captured together so that the failure branch has
    # real output to print (check_call() captures nothing, so e.output was
    # always None).
    completed = subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    print(completed.stdout)
except subprocess.CalledProcessError as e:
    print(f"Error downloading model: {e}")
    print(e.output)  # Combined stdout/stderr of the failed download command.

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 91.2 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## MedQuAD Dataset Sampling and Answer Cleaning

In [3]:
# Import necessary libraries
from datasets import load_dataset
import pandas as pd
import json
import spacy
from nltk.tokenize import sent_tokenize
import nltk
import random

# Make sure the NLTK Punkt sentence-tokenizer data is available for sent_tokenize
nltk.download('punkt')

# spaCy pipeline used below for part-of-speech tagging and tokenization
nlp = spacy.load("en_core_web_sm")

# Load the MedQuAD dataset and convert its train split to a pandas DataFrame
medquad_dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")
if 'train' not in medquad_dataset:
    print("Error: 'train' section not found in the dataset")
    medquad_df = pd.DataFrame()
else:
    medquad_df = medquad_dataset['train'].to_pandas()
    print(f"Original dataset size: {len(medquad_df)}")

# Fix the seed so the sample below is reproducible
random_seed = 42
random.seed(random_seed)

# Draw 10000 random rows; a smaller dataset is kept whole and in its original order
sample_size = 10000
sampled_df = (
    medquad_df.sample(n=sample_size, random_state=random_seed)
    if len(medquad_df) >= sample_size
    else medquad_df
)
print(f"Sampled dataset size: {len(sampled_df)}")

# Define cleaning function: extract sentences related to the question
def clean_sample(row):
    """Trim an answer down to the sentences that share a keyword with its question.

    Relies on the module-level ``nlp`` (spaCy pipeline) and ``sent_tokenize``
    (NLTK) objects.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing ``.get`` with
            "Question" and "Answer" entries.

    Returns:
        dict: ``{"Question": ..., "Answer": ...}``. The answer is the
        space-joined subset of answer sentences that contain at least one
        lower-cased noun/verb token of the question. Both values are returned
        unchanged when either field is missing, empty or not a string, when the
        question yields no keywords, or when no sentence matches.
    """
    question = row.get("Question")
    answer = row.get("Answer")

    # Missing values coming from pandas are float NaN, which is truthy: a plain
    # truthiness test lets them through and the tokenizers below then crash.
    if not (isinstance(question, str) and isinstance(answer, str) and question and answer):
        return {"Question": question, "Answer": answer}

    # Keywords are the nouns and verbs of the lower-cased question; the full
    # pipeline is required here because part-of-speech tags are needed.
    question_keywords = {
        token.text for token in nlp(question.lower()) if token.pos_ in ("NOUN", "VERB")
    }
    if not question_keywords:
        # Nothing to match against, so no sentence could be selected anyway.
        return {"Question": question, "Answer": answer}

    relevant_sentences = []
    for sent in sent_tokenize(answer):
        # Only token texts are compared, so the tokenizer alone is enough;
        # running the tagger/parser/NER on every sentence was wasted work.
        sent_tokens = {token.text for token in nlp.make_doc(sent.lower())}
        if not question_keywords.isdisjoint(sent_tokens):
            relevant_sentences.append(sent)

    # If no relevant sentences are found, keep the original answer
    cleaned_answer = " ".join(relevant_sentences) if relevant_sentences else answer
    return {"Question": question, "Answer": cleaned_answer}

# Run the cleaner on every sampled row; 'expand' turns each returned dict into columns
cleaned_df = sampled_df.apply(func=clean_sample, axis=1, result_type='expand')
print(f"Cleaning complete, processed sample size: {len(cleaned_df)}")

# Save to .jsonl file
def save_to_jsonl(df, filename):
    """Write ``df`` to ``filename`` in JSON Lines format.

    Each row becomes one JSON object on its own line. The file is written as
    UTF-8 and non-ASCII characters are kept verbatim rather than escaped.

    Args:
        df: DataFrame whose rows are serialized.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        for record in (series.to_dict() for _, series in df.iterrows()):
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

# Persist the cleaned sample as JSON Lines; later cells read this file as the knowledge base
save_to_jsonl(df=cleaned_df, filename="medquad_sampled_cleaned.jsonl")
print("Cleaned samples saved to medquad_sampled_cleaned.jsonl")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Original dataset size: 16407
Sampled dataset size: 10000
Cleaning complete, processed sample size: 10000
Cleaned samples saved to medquad_sampled_cleaned.jsonl


In [6]:
# Flag rows whose (Question, Answer) pair repeats an earlier row of the cleaned data
duplicates = cleaned_df.duplicated(subset=["Question", "Answer"])
print(f"Number of duplicate samples: {duplicates.sum()}")

Number of duplicate samples: 23


In [7]:
import json
from itertools import islice

# JSON Lines file produced by the cleaning cell above
knowledge_base_file = "medquad_sampled_cleaned.jsonl"

first_ten_entries = []

try:
    with open(knowledge_base_file, 'r', encoding='utf-8') as f:
        # Blank lines are skipped so a stray empty line cannot abort the preview;
        # islice stops after ten records without reading the rest of the file.
        parsed_entries = (json.loads(line) for line in f if line.strip())
        first_ten_entries = list(islice(parsed_entries, 10))

    if first_ten_entries:
        # Name the file that was actually read and the number of entries shown
        # (the previous message hard-coded a different file name).
        print(f"First {len(first_ten_entries)} entries from the {knowledge_base_file} file:")
        for entry in first_ten_entries:
            print(json.dumps(entry, ensure_ascii=False, indent=2))
    else:
        # This branch is only reached when no entry at all could be read.
        print(f"The file {knowledge_base_file} is empty.")

except FileNotFoundError:
    print(f"Error: The file {knowledge_base_file} was not found. Please ensure the file has been generated and is located in the current directory.")
except (OSError, ValueError) as e:
    # OSError: other I/O failures; ValueError: malformed JSON or undecodable bytes.
    print(f"An error occurred while reading the file: {e}")

First ten entries from the medquad_sampled.jsonl file:
{
  "Question": "What are the treatments for High Blood Pressure ?",
  "Answer": "Today, many different types of medicines are available to control high blood pressure. Some lower blood pressure by removing extra fluid and salt from your body. Others affect blood pressure by slowing down the heartbeat, or by relaxing and widening blood vessels. Here are the types of medicines used to treat high blood pressure. -   Diuretics (water or fluid Pills)  flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE inhibitors block this process, which stops the production of Angioten

## RAG with Cross-Encoder Reranking for Question Answering

In [8]:
!pip uninstall -y transformers
!pip install -q transformers
!pip install -q --upgrade sentence-transformers
!pip install -q faiss-cpu

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import faiss
import torch
import numpy as np
import json

# Load the retriever, the cross-encoder reranker and the seq2seq answer generator
print("Loading models...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

retriever = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Cross-encoder that scores (question, passage) pairs; moved to the target device right away
cross_encoder = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2").to(device)
cross_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Seq2seq model that writes the final answer
qa_tokenizer = AutoTokenizer.from_pretrained("Nin8520/MedQA_v2")
qa_model = AutoModelForSeq2SeqLM.from_pretrained("Nin8520/MedQA_v2").to(device)

# Read the knowledge base (one JSON object per line) and index its embeddings with FAISS
knowledge_base_file = "/kaggle/working/medquad_sampled_cleaned.jsonl"
knowledge_base = []
try:
    with open(knowledge_base_file, 'r', encoding='utf-8') as f:
        # Only the "Answer" field of each record is used as a document
        knowledge_base.extend(json.loads(line)["Answer"] for line in f)
except FileNotFoundError:
    print(f"Error: Could not find knowledge base file {knowledge_base_file}. Please ensure the file has been generated and is in the current directory.")
    knowledge_base = []
else:
    print(f"Successfully loaded {len(knowledge_base)} knowledge base documents.")

if not knowledge_base:
    index = None
    print("Knowledge base is empty, cannot build FAISS index.")
else:
    print("Building FAISS index...")
    # One embedding row per document
    document_embeddings = retriever.encode(knowledge_base, convert_to_numpy=True)
    # The index dimensionality must match the embedding width
    dimension = document_embeddings.shape[1]
    # Flat (exhaustive) L2 index over all document embeddings
    index = faiss.IndexFlatL2(dimension)
    index.add(document_embeddings)

# Retrieve + rerank
def retrieve_and_rerank(question, top_k=3):
    """Retrieve passages for a question and order them by cross-encoder score.

    Uses the module-level ``index``, ``retriever``, ``knowledge_base``,
    ``cross_tokenizer``, ``cross_encoder`` and ``device`` objects.

    Args:
        question: The question text.
        top_k: Number of nearest passages to fetch from the FAISS index.

    Returns:
        list[str]: At most ``top_k`` passages, highest cross-encoder score
        first. Empty when no index exists, ``top_k`` is not positive, or the
        search yields no valid hit.
    """
    if index is None or top_k <= 0:
        return []

    # Encode the question and look up its nearest document embeddings
    question_embedding = retriever.encode(question, convert_to_numpy=True)
    _, neighbor_ids = index.search(np.array([question_embedding]), top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are indexed;
    # without this filter -1 would silently select the LAST knowledge-base entry.
    retrieved_passages = [knowledge_base[i] for i in neighbor_ids[0] if i >= 0]
    if not retrieved_passages:
        return []

    # Encode the (question, passage) pairs as one batch. Calling the tokenizer
    # directly replaces the deprecated batch_encode_plus().
    encoded = cross_tokenizer(
        [question] * len(retrieved_passages),
        retrieved_passages,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        # One relevance logit per pair
        scores = cross_encoder(**encoded).logits.squeeze(-1)

    # Plain Python ints, highest score first
    ranking = torch.argsort(scores, descending=True).tolist()
    return [retrieved_passages[i] for i in ranking]

# QA Answer Generation
def answer_question_rerank(question):
    """Answer a question using retrieved, cross-encoder-reranked context.

    Uses the module-level ``retrieve_and_rerank``, ``qa_tokenizer``,
    ``qa_model`` and ``device`` objects.

    Args:
        question: The question text.

    Returns:
        str: The decoded generation for a prompt built from the question and
        the passages returned by ``retrieve_and_rerank(question, top_k=3)``.
    """
    # Fetch the reranked passages and join them into a single context string
    retrieved_passages = retrieve_and_rerank(question, top_k=3)
    context = "\n".join(retrieved_passages)

    prompt = f"Question: {question}\nContext: {context}\nAnswer (Briefly summarize and keep it concise):"

    # Truncate over-long prompts to 512 tokens. A single prompt needs no
    # padding: padding it to 512 only made the model process masked pad tokens.
    inputs = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        output = qa_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=1024,  # Upper bound on the generated sequence length
            num_beams=7,
            # Deterministic beam search. The sampling-only knobs (top_p,
            # temperature) were dropped: they have no effect without sampling.
            do_sample=False,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            eos_token_id=qa_tokenizer.eos_token_id,  # Stop at the end-of-sequence token
        )

    # Decode the first returned sequence, dropping special tokens
    return qa_tokenizer.decode(output[0], skip_special_tokens=True)

# Test
if __name__ == "__main__":
    # Smoke test: one question through the retrieve -> rerank -> generate path
    user_question = "Is surfactant dysfunction inherited ?"
    final_answer = answer_question_rerank(user_question)
    print(f"Q: {user_question}", f"A: {final_answer}", sep="\n")


Found existing installation: transformers 4.51.1
Uninstalling transformers-4.51.1:
  Successfully uninstalled transformers-4.51.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-05-09 07:28:55.792756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746775736.038540      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746775736.104870      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Successfully loaded 10000 knowledge base documents.
Building FAISS index...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Q: Is surfactant dysfunction inherited ?
A: Answer (Briefly summarize and keep it concise): Surfactant dysfunction is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. This condition results from abnormalities in the composition or function of surfactant, a mixture of certain fats (called phospholipids) and proteins that lines the lung tissue and makes breathing easy. When caused by mutations in the SFTPB or ABCA3 gene, this condition is characterized by signs and symptoms that range from mild to severe. In some cases, the cause of the condition is unknown. In others, the condition may be caused by a mutation in one or more of the following genes: SP-C deficiency, ABCA3, or SP-B. The cause of these mutations is unknown; however, it is thought to be related to a genetic cause. Some people with this condition have no history of the disorder in their family, while others have a history of a condition that has been described as a 

## RAG with Enhanced Prompt Engineering for Question Answering

In [9]:
# Install dependencies
!pip uninstall -y transformers
!pip install -q transformers
!pip install -q --upgrade sentence-transformers
!pip install -q faiss-cpu

# Import libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np
import json

# Only two models are needed in this variant: the retriever and the QA generator
print("Loading models...")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sentence-embedding model used for retrieval
retriever = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Seq2seq model that writes the answer, placed on the selected device
qa_tokenizer = AutoTokenizer.from_pretrained("Nin8520/MedQA_v2")
qa_model = AutoModelForSeq2SeqLM.from_pretrained("Nin8520/MedQA_v2")
qa_model = qa_model.to(device)



# Read the JSON Lines knowledge base and build a FAISS index over its embeddings
knowledge_base_file = "/kaggle/working/medquad_sampled_cleaned.jsonl"
knowledge_base = []
try:
    with open(knowledge_base_file, 'r', encoding='utf-8') as f:
        # The "Answer" field of every record serves as a knowledge-base document
        knowledge_base.extend(json.loads(line)["Answer"] for line in f)
except FileNotFoundError:
    print(f"Error: Knowledge base file {knowledge_base_file} not found. Please ensure the file has been generated and is located in the current directory.")
    knowledge_base = []
else:
    print(f"Successfully loaded {len(knowledge_base)} knowledge base documents.")

if not knowledge_base:
    index = None
    print("Knowledge base is empty, unable to build FAISS index.")
else:
    print("Building FAISS index...")
    document_embeddings = retriever.encode(knowledge_base, convert_to_numpy=True)
    # The index dimensionality is the embedding width
    dimension = document_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(document_embeddings)

# Retrieval (without using rerank)
def retrieve_documents(question, top_k=3):
    """Return the knowledge-base passages closest to the question.

    Uses the module-level ``index``, ``retriever`` and ``knowledge_base``
    objects; no reranking is applied.

    Args:
        question: The question text.
        top_k: Number of nearest passages to fetch from the FAISS index.

    Returns:
        list[str]: At most ``top_k`` passages in the order returned by the
        index search. Empty when no index exists or ``top_k`` is not positive.
    """
    if index is None or top_k <= 0:
        return []
    question_embedding = retriever.encode(question, convert_to_numpy=True)
    _, neighbor_ids = index.search(np.array([question_embedding]), top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are indexed;
    # without this filter -1 would silently select the LAST knowledge-base entry.
    return [knowledge_base[i] for i in neighbor_ids[0] if i >= 0]

# QA Generates the answer (using Prompt Engineering)
def answer_question_prompt(question):
    """Answer a question from retrieved context using an instruction-style prompt.

    Uses the module-level ``retrieve_documents``, ``qa_tokenizer``,
    ``qa_model`` and ``device`` objects.

    Args:
        question: The question text.

    Returns:
        str: The decoded generation for a prompt built from the instructions,
        the question and the passages returned by
        ``retrieve_documents(question, top_k=3)``.
    """
    retrieved_passages = retrieve_documents(question, top_k=3)
    context = "\n".join(retrieved_passages)

    # Instruction prompt: ask for a concise, context-grounded answer and an
    # explicit statement when the context does not contain it.
    prompt = f"""Answer the following question based on the provided context.
                        Maintain conciseness and answer the core points.
                        If the context does not contain the answer, state that the answer is not available in the provided context.

Question: {question}

Context:
{context}

Answer: """

    # Truncate over-long prompts to 512 tokens. A single prompt needs no
    # padding: padding it to 512 only made the model process masked pad tokens.
    inputs = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output = qa_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=512,
            num_beams=5,
            # Deterministic beam search. The sampling-only knobs (top_p,
            # temperature) were dropped: they have no effect without sampling.
            do_sample=False,
            no_repeat_ngram_size=3,
            repetition_penalty=1.1,
            eos_token_id=qa_tokenizer.eos_token_id,
        )
    # Decode the first returned sequence, dropping special tokens
    return qa_tokenizer.decode(output[0], skip_special_tokens=True)

# Test 
if __name__ == "__main__":
    # Smoke test: one question through the retrieve -> prompt -> generate path
    user_question = "Is surfactant dysfunction inherited ?"
    final_answer = answer_question_prompt(user_question)
    print(f"Q: {user_question}", f"A: {final_answer}", sep="\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading models...
Successfully loaded 10000 knowledge base documents.
Building FAISS index...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Q: Is surfactant dysfunction inherited ?
A: Answer: Surfactant dysfunction is a lung disorder that causes breathing problems. This condition results from abnormalities in the composition or function of surfactant, a mixture of certain fats (called phospholipids) and proteins that lines the lung tissue and makes breathing easy. Other types, known as SP-C dysfunction and ABCA3 deficiency, have signs and symptoms that range from mild to severe.


### Model output without RAG (for comparison)

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer, pick the device (GPU when available, else CPU), then load
# the model onto that device and switch it to evaluation mode
tokenizer = AutoTokenizer.from_pretrained("Nin8520/MedQA_v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained("Nin8520/MedQA_v2").to(device).eval()

def generate_answer_original(question: str, tokenizer, model, device):
    """Answer ``question`` with the bare model, i.e. without any retrieved context.

    Args:
        question: The question text; it is sent to the model as
            ``"Question: <question>"``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``generate``.
        device: Device the input tensors are moved to before generation.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Encode the prompt, padded/truncated to a fixed length of 128 tokens
    encoded = tokenizer(
        f"Question: {question}",
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # All generation arguments in one place; input tensors go to the target device
    generation_kwargs = dict(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=1024,
        num_beams=5,
        length_penalty=1.2,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        top_k=50,
        do_sample=True,
        early_stopping=True,
    )

    # Gradients are not needed for generation
    with torch.no_grad():
        output_ids = model.generate(**generation_kwargs)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage
if __name__ == "__main__":
    # Same question as in the RAG cells, answered without retrieved context
    question = "Is surfactant dysfunction inherited ?"
    answer = generate_answer_original(question=question, tokenizer=tokenizer, model=model, device=device)
    print(f"Q: {question}", f"A: {answer}", sep="\n")


Q: Is surfactant dysfunction inherited ?
A: Answer: Surfactant dysfunction is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. In some cases, the parents of an individual with this condition each carry one copy of the mutated gene, but they typically do not show signs and symptoms of the condition. mes Itstler gland glands to produce a hormone called surfactant (surfactant) phosphorylation, which is a protein that attaches to the surface of the skin and protects the skin from damage caused by contact with other parts of the body, such as the skin, hair, eyes, ears, and nails. Information about the prevalence of this condition is available from the National Institute of Neurological Disorders and Strokeizes the underlying genetic cause of the disorder, which may be inherited as a result of a mutation in the CYP1A gene.ia may occur in people with no history of this disorder in their family, or it may be caused by mutations in o

### Test: Generate RAG answers for the dataset

In [11]:
import pandas as pd

# Read the evaluation CSV, generate a RAG answer for every question and write the result
input_csv_path = "/kaggle/input/evaluate/bioasq2_dataset.csv" 
output_csv_path = "/kaggle/working/bioasq2_dataset_rag_prompt.csv"

try:
    df = pd.read_csv(input_csv_path)
    # The CSV must provide a column named "Question"
    if "Question" not in df.columns:
        print("错误：CSV 文件中必须有 'Question' 列！")
    else:
        # The cells above define answer_question_rerank() and
        # answer_question_prompt() but no answer_question(); calling the latter
        # raised a NameError that the generic handler below swallowed, so no
        # file was ever written. The output file name ("..._rag_prompt.csv")
        # indicates the prompt-engineering variant is the intended one.
        df["RAG Response"] = [answer_question_prompt(question) for question in df["Question"]]

        df.to_csv(output_csv_path, index=False)
        print(f"处理完毕！生成的文件保存到：{output_csv_path}")

except FileNotFoundError:
    print(f"错误：找不到输入 CSV 文件：{input_csv_path}")
except Exception as e:
    print(f"处理 CSV 文件时发生错误：{e}")

### Download the generated results

In [12]:
import os
from IPython.display import FileLink
import pandas as pd

# Check whether the generated CSV exists before offering it for download
file_path = '/kaggle/working/bioasq2_dataset_rag_prompt.csv'
if not os.path.exists(file_path):
    print("错误：文件未找到！请检查路径或文件是否生成成功。")
else:
    print("文件已生成，准备下载...")

    # Option 1: render a download link for the file directly (intended for Kaggle notebooks)
    display(FileLink(file_path, result_html_prefix="点击下载: "))

    # Option 2: round-trip the file through pandas and link to the re-written copy
    # (reading it back doubles as a check that the CSV parses)
    output_path = 'bioasq2_dataset_rag_prompt_download.csv'
    df = pd.read_csv(file_path)
    df.to_csv(output_path, index=False)
    display(FileLink(output_path))