In [1]:
import os
import json
from dotenv import load_dotenv
from langchain_core.documents import Document
from benchmark.protocol_parsing.soa import extract_and_save_soa_table

# Load environment variables (read from a .env file when one is found)
# NOTE(review): presumably this is where the LLM API credentials come from — confirm
load_dotenv()

# Example protocol content with SOA (Schedule of Assessments) tables.
# Each entry is a Document whose page_content is the raw text of one protocol
# page and whose metadata carries that page's page_number.
# Pages 15 and 16 hold grid-style SOA tables; pages 38, 52 and 87 only mention
# the schedule in prose or bullet lists.
# NOTE(review): the non-grid pages look like deliberate distractors for the
# table detector — confirm intent.
# NOTE: the "Drug Administration" row label is one character wider than its
# column in both grids. That is part of the fixture text the pipeline receives,
# so realigning it changes the input.
MOCK_PROTOCOL_PAGES = [
    # Page 15 — Table 1: grid covering Screening, Baseline, Week 2 and Week 4.
    Document(
        page_content="""
        4.1 Study Overview and Schedule
        
        The table below summarizes the study schedule. Details of procedures are provided in Section 5.

        Table 1: Schedule of Assessments (Screening through Week 4)
        +-------------------+-------------+-------------+-------------+-------------+
        | Visit Description | Screening   | Baseline    | Week 2      | Week 4      |
        |                   | (Day -28 to | (Day 1)     |             |             |
        |                   | Day -1)     |             |             |             |
        +-------------------+-------------+-------------+-------------+-------------+
        | Informed Consent  | X           |             |             |             |
        | Demographics      | X           |             |             |             |
        | Medical History   | X           |             |             |             |
        | Physical Exam     | X           | X           |             | X           |
        | Vital Signs       | X           | X           | X           | X           |
        | 12-Lead ECG       | X           | X           |             | X           |
        | Clinical Labs     | X           | X           | X           | X           |
        | Pregnancy Test    | X           | X           |             | X           |
        | Drug Administration|            | X           | X           | X           |
        | AE Assessment     |             | X           | X           | X           |
        +-------------------+-------------+-------------+-------------+-------------+
        """,
        metadata={"page_number": 15}
    ),
    # Page 16 — Table 2: grid covering Weeks 8, 12, 16 and 24 (End of Study).
    Document(
        page_content="""
        Table 2: Schedule of Assessments (Weeks 8 through 24)
        +-------------------+-------------+-------------+-------------+-------------+
        | Visit Description | Week 8      | Week 12     | Week 16     | Week 24     |
        |                   |             |             |             | (End of     |
        |                   |             |             |             | Study)      |
        +-------------------+-------------+-------------+-------------+-------------+
        | Physical Exam     | X           |             | X           | X           |
        | Vital Signs       | X           | X           | X           | X           |
        | 12-Lead ECG       | X           |             | X           | X           |
        | Clinical Labs     | X           | X           | X           | X           |
        | Pregnancy Test    | X           |             | X           | X           |
        | Drug Administration| X           | X           | X           |             |
        | AE Assessment     | X           | X           | X           | X           |
        | Quality of Life   | X           |             | X           | X           |
        | MRI               |             | X           |             | X           |
        +-------------------+-------------+-------------+-------------+-------------+
        """,
        metadata={"page_number": 16}
    ),
    # Page 38 — laboratory test descriptions; refers to the schedule but contains no table.
    Document(
        page_content="""
        7.2 Laboratory Tests
        
        The following laboratory tests will be performed at the timepoints indicated in the Schedule of Assessments:
        
        Hematology: CBC with differential, platelet count
        Chemistry: Sodium, potassium, chloride, BUN, creatinine, glucose, calcium, phosphorus, 
        magnesium, AST, ALT, GGT, alkaline phosphatase, total bilirubin, total protein, albumin
        Urinalysis: pH, specific gravity, glucose, protein, blood, ketones, bilirubin, 
        nitrite, leukocyte esterase, microscopic examination if dipstick abnormal
        
        Blood samples for clinical laboratory tests will be collected at Screening, Baseline, 
        Weeks 2, 4, 8, 12, 16, and 24.
        """,
        metadata={"page_number": 38}
    ),
    # Page 52 — statistical methods plus a one-line study flow chart; no table.
    Document(
        page_content="""
        9.1 Statistical Methods
        
        The primary analysis will be performed on the Intent-to-Treat (ITT) population, defined as 
        all randomized participants who receive at least one dose of study medication.
        
        Figure 1: Study Flow Chart
        
        [Screening] → [Randomization] → [Treatment Period (Weeks 1-16)] → [Follow-up (Week 24)]
        
        Participants will be assessed according to the Schedule of Assessments. Safety will be monitored 
        throughout the study by recording adverse events, clinical laboratory tests, vital signs, and 
        physical examinations.
        """,
        metadata={"page_number": 52}
    ),
    # Page 87 — amendment history plus "Table 3", a bullet-list summary of visits (not a grid).
    Document(
        page_content="""
        12.3 Amendment History
        
        The following is a list of all protocol amendments to date:
        
        Amendment 1 (08-Jan-2023): Updated Schedule of Assessments to add MRI assessment at Week 12 and Week 24.
        
        Amendment 2 (15-Mar-2023): Added Quality of Life questionnaire to be administered at Weeks 8, 16, and 24.
        
        Table 3: Summary of Visits and Procedures
        
        Screening Visit:
        - Informed Consent
        - Demographics
        - Medical History
        - Physical Examination
        - Vital Signs
        - 12-Lead ECG
        - Clinical Laboratory Tests
        - Pregnancy Test (WOCBP)
        
        Baseline Visit (Day 1):
        - Physical Examination
        - Vital Signs
        - 12-Lead ECG
        - Clinical Laboratory Tests
        - Pregnancy Test (WOCBP)
        - Drug Administration
        - AE Assessment
        """,
        metadata={"page_number": 87}
    )
]

def main():
    """Run the SOA table extraction pipeline on the mock protocol pages.

    Prints a banner, creates ``./extraction_results`` if it is missing, and
    passes ``MOCK_PROTOCOL_PAGES`` to ``extract_and_save_soa_table`` together
    with a JSON output path inside that directory. Returns ``None``.
    """
    print("Testing SOA table extraction pipeline...\n")

    # The results folder must exist before it is handed over as a destination.
    results_dir = "./extraction_results"
    os.makedirs(results_dir, exist_ok=True)
    json_target = results_dir + "/mock_soa_table.json"

    # A smaller model is used here because this is only a test run.
    extract_and_save_soa_table(
        documents=MOCK_PROTOCOL_PAGES,
        output_path=json_target,
        llm="gpt-4o-mini",
    )

In [2]:
# Run the demo defined in the previous cell; the text below is the captured output of this run.
main() 

Testing SOA table extraction pipeline...

Searching for SOA tables in 5 documents...
Found SOA table on page 15 (confidence: high)
Found SOA table on page 16 (confidence: high)
Found 2 SOA tables, merging...
Saved SOA table as CSV to ./extraction_results/mock_soa_table.csv
Saved SOA table as JSON to ./extraction_results/mock_soa_table.json

Summary: Found SOA table with 12 procedures across 8 timepoints
Source pages: [15, 16]

SOA Table Preview:
                    Screening Baseline Week 2 Week 4 Week 8 Week 12 Week 16  \
Informed Consent            X                                                 
Demographics                X                                                 
Medical History             X                                                 
Physical Exam               X        X             X                          
Vital Signs                 X        X      X      X                          
12-Lead ECG                 X        X             X                        