In [1]:
import pandas as pd

# Maximum number of CSV data rows to load (None loads the whole file).
limit_lines = 50

# `nrows` stops the parser after `limit_lines` data rows, so the rest of the
# file is never parsed just to be sliced away afterwards.
dataset = pd.read_csv("final_inclusion_in_SR.csv", nrows=limit_lines)

# One string per row: "<Title>:", a blank line, then "<Abstract>".
# Walking the two columns with zip avoids both the per-row Series that
# iterrows() builds and the nested double quotes inside a double-quoted
# f-string, which are a SyntaxError on Python versions before 3.12.
articles = [
    f"{title}:\n\n{abstract}"
    for title, abstract in zip(dataset["Title"], dataset["Abstract"])
]


In [2]:
# Research question that both the extraction and the screening prompts are framed around.
screening_question = (
    "What is the association between exposure to radiotherapy for prostate cancer "
    "and incidence/risk of second malignancy / second primary cancers?"
)

# Comma-separated list of reasons for which an article is screened out.
exclusion_criteria = ", ".join(
    (
        "non-clinical studies",
        "editorials",
        "review articles",
        "case reports",
        "conference abstracts",
        "basic science papers",
        "unclear comparator group",
        "metastatic tumors",
        "non-standard treatment for prostate cancer (such as cryotherapy)",
        "articles not dealing with radiation induced malignancy",
    )
)


In [4]:
from dotenv import load_dotenv
load_dotenv(override=True)
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

from typing import TypedDict, Annotated, Literal, Optional

# Structured-output schema for a single article; it is the schema handed to
# llm.with_structured_output(ExtractedStudyData, ...) when the extraction chain is built.
# Fields written as Annotated[type, ..., "text"] carry a per-field instruction string.
# NOTE(review): presumably langchain forwards that string as the field description
# in the generated JSON schema — confirm against the installed langchain version.
# Plain `#` comments are used instead of a docstring so the class object is unchanged.
class ExtractedStudyData(TypedDict):
    # Where the study's data came from (what/where/when/who).
    data_sources: Annotated[Optional[str], ..., "for each data collected, summarize what/where/when/who it was collected from"]
    # One of a fixed set of design labels, or None.
    # NOTE(review): the Literal contains an empty string "" between "Cross-sectional"
    # and "Experimental" — possibly a deliberate "blank" choice, possibly a leftover; confirm.
    study_design: Annotated[Optional[Literal["Cohort", "Case-control", "Cross-sectional", "", "Experimental", "Other"]], ..., "based on the data source, what is the type of study conducted?"]
    # NOTE(review): "accrural" looks like a misspelling of "accrual"; the key is part of
    # the schema and shows up verbatim in printed output, so it is left as-is here.
    study_accrural_periods: Optional[str]
    # An integer or None.
    sample_size: Optional[int]
    exposures: Annotated[Optional[str], ..., "exposures, with the sources of the exposures if available"]
    exposure_ascertainment: Annotated[Optional[str], ..., "i.e., how was the exposure measured / what was the data source?"]
    outcomes: Optional[str]
    outcome_ascertainment: Annotated[Optional[str], ..., "i.e., how was the outcome measured / what was the data source?"]
    # The instruction string asks for results verbatim, with all stats and metrics.
    all_results: Annotated[Optional[str], ..., "results verbatim, including all stats and metrics where available"]
    conclusions: Optional[str]

# Chat model instance; the chains in this notebook are built on top of it.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)

# Extraction prompt text. {screening_question} and {article} are placeholders
# filled in per input. The first line intentionally keeps its trailing space
# so the text is identical to the original triple-quoted string.
template = (
    "Please very carefully extract the following data from the article. \n"
    "Be very meticulous and specific for each category. If information is not available for a category, please leave it blank.\n"
    "\n"
    "Please extract data as it relates to this question of interest:\n"
    "{screening_question}\n"
    "\n"
    "<article>\n"
    "{article}\n"
    "</article>"
)

# Single system message built from the extraction template.
prompt = ChatPromptTemplate([("system", template)])

# Prompt piped into the model, with replies constrained to the ExtractedStudyData schema.
chain = prompt | llm.with_structured_output(
    ExtractedStudyData,
    method="json_schema",
    strict=True,
)

# Run the extraction over every article string, with max_concurrency set to 10.
extracted = chain.batch(
    [
        {"article": article_text, "screening_question": screening_question}
        for article_text in articles
    ],
    {"max_concurrency": 10},
)


In [None]:
# Print each extracted record as "key value" lines, followed by a blank-line separator.
for record in extracted:
    for pair in record.items():
        print(*pair)
    print("\n\n")


data_sources Retrospective analysis of cases diagnosed with prostate cancer (PCa) and urothelial cancer (TCC) between January 1996 and June 2003.
study_design Cohort
study_accrural_periods January 1996 to June 2003
sample_size 816
exposures Previous diagnosis of prostate cancer (PCa) or urothelial cancer (TCC).
exposure_ascertainment Diagnosis confirmed through abnormal digital rectal examination or increased prostate specific antigen for PCa; hematuria or irritative voiding symptoms for TCC with tissue diagnosis confirmation.
outcomes Incidence of second malignancies (bladder cancer and prostate cancer) in patients with a previous diagnosis of PCa or TCC.
outcome_ascertainment Standardized incidence ratio (SIR) calculated for TCC in patients with PCa and for PCa in patients with TCC.
all_results A total of 816 men were diagnosed with PCa and/or TCC. Of 673 men initially diagnosed with PCa, 21 had TCC. Of 149 men initially diagnosed with TCC, 18 had PCa. Average age at PCa diagnosis wa

In [5]:
# Screening prompt text. {article}, {screening_question} and {exclusion_criteria}
# are placeholders filled in per input.
screening_prompt = (
    "Given the following article information, please decide if it should be included in the analysis, based on the screening question and exclusion criteria.\n"
    "\n"
    "<article>\n"
    "{article}\n"
    "</article>\n"
    "\n"
    "<screening_question>\n"
    "{screening_question}\n"
    "</screening_question>\n"
    "\n"
    "<exclusion_criteria>\n"
    "{exclusion_criteria}\n"
    "</exclusion_criteria>"
)

# Single system message built from the screening prompt text.
sr_prompt = ChatPromptTemplate([("system", screening_prompt)])

# Screening verdict schema; it is the schema handed to
# llm.with_structured_output(Inclusion, ...) when the screening chain is built.
# Plain `#` comments are used instead of a docstring so the class object is unchanged.
class Inclusion(TypedDict):
    # Whether the article should be included in the analysis.
    include: bool
    # Free-text reason for an exclusion, or None; the captured output shows it
    # blank for records where include is True.
    reason_if_excluded: Optional[str]

# Screening chain: prompt piped into the model, with replies constrained to the Inclusion schema.
sr_chain = sr_prompt | llm.with_structured_output(
    Inclusion,
    method="json_schema",
    strict=True,
)

# Screen each record produced by the extraction step (the extracted fields,
# not the raw article string), with max_concurrency set to 10.
result = sr_chain.batch(
    [
        {
            "article": study,
            "screening_question": screening_question,
            "exclusion_criteria": exclusion_criteria,
        }
        for study in extracted
    ],
    {"max_concurrency": 10},
)



In [6]:
# Print each screening verdict as "key value" lines, followed by a blank-line separator.
for verdict in result:
    for field, value in verdict.items():
        print(field, value)
    print("\n\n")

include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include False
reason_if_excluded The study does not specifically focus on prostate cancer; it examines radiotherapy treatment for first primary invasive solid cancers in general.



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include False
reason_if_excluded The study focuses on radiation therapy for rectal cancer and its association with subsequent prostate cancer risk, rather than the association between radiotherapy for prostate cancer and second malignancies.



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 



include True
reason_if_excluded 
