In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

def create_medical_dataset(pdf_path, section_headers, output_file="medical_dataset.json"):
    """
    Create a question/answer dataset from a structured medical book PDF.

    The PDF is loaded, split into overlapping text chunks, and every chunk is
    scanned for the given section headers. Each header occurrence that is
    followed by at least one more line of text produces one record.

    Args:
        pdf_path (str): Path to the PDF file.
        section_headers (iterable of str): Keywords indicating the start of
            sections (e.g., medical terms or categories). Matching is a plain,
            case-sensitive substring search. Empty entries are ignored.
        output_file (str): Name of the output JSON file.

    Returns:
        list: One dict per extracted pair with the keys "query",
        "ground_truth_document" (the full chunk text) and
        "ground_truth_answer".
    """
    # Load the PDF content
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # Split the text into manageable chunks (chunk_size=1000, chunk_overlap=200).
    # NOTE: because adjacent chunks overlap, the same passage may yield
    # near-duplicate records.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(docs)

    # Materialise the headers once: this drops empty strings (str.split raises
    # ValueError on an empty separator) and lets callers pass a one-shot
    # iterable without it being exhausted after the first chunk.
    headers = [header for header in section_headers if header]

    dataset = []
    for chunk in chunks:
        dataset.extend(_extract_qa_pairs(chunk.page_content, headers))

    # Save the dataset to a JSON file. The explicit encoding keeps the result
    # independent of the platform's default locale encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

    print(f"Dataset created and saved to {output_file}.")
    return dataset


def _extract_qa_pairs(content, headers):
    """
    Extract question/answer records from the text of a single chunk.

    Args:
        content (str): Text of the chunk to scan.
        headers (list): Non-empty header strings to look for.

    Returns:
        list: Records in header order, then in order of occurrence in content.
    """
    records = []
    for header in headers:
        # split() returns a single element when the header is absent, so the
        # slice is empty and no separate membership test is needed. The first
        # element is the text before the first occurrence and is skipped.
        for part in content.split(header)[1:]:
            # The rest of the header's line completes the question; whatever
            # follows the first newline is the answer.
            question_tail, newline, answer = part.strip().partition("\n")
            if not newline:
                # Nothing after the header line in this chunk: no answer.
                continue
            records.append({
                "query": header.strip() + " " + question_tail.strip(),
                "ground_truth_document": content,
                "ground_truth_answer": answer.strip(),
            })
    return records

# Example usage: build the dataset from a local copy of the medical book.
pdf_path = "/Users/innovapathinc/Desktop/Gen Ai Topics /Retrieval_eval/ML_GenAI_Concepts/RAG/Data/Medical_book.pdf"
# Keywords that mark the start of a section; update based on your book's structure.
section_headers = [
    "What is",
    "Treatment",
    "Symptoms",
    "Causes",
]
dataset = create_medical_dataset(pdf_path=pdf_path, section_headers=section_headers)


Dataset created and saved to medical_dataset.json.
