In [4]:
from dotenv import load_dotenv
load_dotenv(override=True)
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

from typing import TypedDict, Annotated, Literal, Optional

# Allowed values for the `study_design` field. The empty string is one of the
# permitted literals.
# NOTE(review): "" looks like a deliberate "blank" option, but the field is
# also Optional — confirm both are wanted.
_StudyDesign = Literal[
    "Cohort",
    "Case-control",
    "Cross-sectional",
    "",
    "Experimental",
    "Other",
]


class ExtractedStudyData(TypedDict):
    # Fields to extract from a study abstract. Annotated fields use the
    # three-part form (type, `...`, description text); plain fields carry
    # only a type.
    # NOTE(review): presumably the `...` placeholder and the description follow
    # LangChain's TypedDict structured-output convention — confirm in the docs.
    # A `#` comment is used instead of a class docstring on purpose: a
    # docstring may be picked up as the schema description (TODO confirm).

    data_sources: Annotated[
        Optional[str],
        ...,
        "for each data collected, summarize what/where/when/who it was collected from",
    ]
    study_design: Annotated[
        Optional[_StudyDesign],
        ...,
        "based on the data source, what is the type of study conducted?",
    ]
    # NOTE(review): "accrural" is likely a typo for "accrual"; left as-is
    # because renaming would change the output key.
    study_accrural_periods: Optional[str]
    sample_size: Optional[int]
    exposures: Annotated[
        Optional[str],
        ...,
        "exposures, with the sources of the exposures if available",
    ]
    exposure_ascertainment: Annotated[
        Optional[str],
        ...,
        "i.e., how was the exposure measured / what was the data source?",
    ]
    outcomes: Optional[str]
    outcome_ascertainment: Annotated[
        Optional[str],
        ...,
        "i.e., how was the outcome measured / what was the data source?",
    ]
    all_results: Annotated[
        Optional[str],
        ...,
        "results verbatim, including all stats and metrics where available",
    ]
    conclusions: Optional[str]

# Chat model used to build the structured-output chain below.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

# Extraction instructions; the `{article}` placeholder is filled at invoke time.
# Written as explicit pieces so the trailing space after "article." and the
# blank line before the placeholder are visible.
template = (
    "Please very carefully extract the following data from the article. \n"
    "Be very meticulous and specific for each category. If information is not available for a category, please leave it blank.\n"
    "\n"
    "{article}"
)

# The whole instruction (article included) is sent as a single system message.
prompt = ChatPromptTemplate([("system", template)])

# Model wrapper configured to answer using the ExtractedStudyData schema.
extraction_llm = llm.with_structured_output(
    ExtractedStudyData,
    method="json_schema",
    strict=True,
)
chain = prompt | extraction_llm

# Sample abstract used to exercise the extraction chain.
sample_abstract = """
Purpose: In this study we evaluated the risk of a second malignancy of the bladder or prostate in patients with a previous diagnosis of prostate cancer (PCa) or urothelial cancer (TCC). Material(s) and Method(s): We retrospectively analyzed all cases of PCa and TCC diagnosed between January 1996 and June 2003. Only PCa diagnosed due to abnormal digital rectal examination or increased prostate specific antigen were included. All patients with TCC presented with hematuria or irritative voiding symptoms and the diagnoses were confirmed with a tissue diagnosis. The incidence of lung, colon and renal cancers was also analyzed. Result(s): A total of 816 men were diagnosed with PCa and/or TCC. Of 673 men initially diagnosed with PCa 21 had TCC. Of 149 men initially diagnosed with TCC 18 had PCa. Average age at PCa and TCC diagnosis +/- SD was 68.2 +/- 7.9 and 68.2 +/- 10.4 years, respectively. The standardized incidence ratio (SIR) of TCC in patients with PCa (SIR 4.31, 95% CI 2.411 to 7.110) and of PCa in patients with TCC (SIR 3.83, 95% CI 1.911 to 6.858) was significantly increased. There was no statistical significant difference in SIR for TCC in men with or without radiotherapy. SIR for lung, renal or colon cancer was not significantly different from what was expected. Conclusion(s): Patients with PCa have higher incidence of bladder cancer and those with bladder cancer have a higher incidence of PCa. This study has clinical implications in the care of these patients and it may stimulate research interest that may identify common pathways of carcinogenesis. © 2008 American Urological Association.
"""

# The bare string is passed straight to the chain; the prompt has a single
# placeholder (`article`).
result = chain.invoke(sample_abstract)

In [5]:
# Show every field of `result` on its own line as "<field> <value>".
for field_name, field_value in result.items():
    print(field_name, field_value)

data_sources Retrospective analysis of cases diagnosed with prostate cancer (PCa) and urothelial cancer (TCC) between January 1996 and June 2003.
study_design Cohort
study_accrural_periods January 1996 to June 2003
sample_size 816
exposures Previous diagnosis of prostate cancer (PCa) or urothelial cancer (TCC).
exposure_ascertainment Diagnosis confirmed through abnormal digital rectal examination or increased prostate specific antigen for PCa; hematuria or irritative voiding symptoms for TCC with tissue diagnosis confirmation.
outcomes Incidence of second malignancies (bladder cancer and prostate cancer) in patients with a previous diagnosis of PCa or TCC.
outcome_ascertainment Standardized incidence ratio (SIR) calculated for TCC in patients with PCa and vice versa.
all_results A total of 816 men were diagnosed with PCa and/or TCC. Of 673 men initially diagnosed with PCa, 21 had TCC. Of 149 men initially diagnosed with TCC, 18 had PCa. Average age at PCa and TCC diagnosis was 68.2 +/-

In [6]:
# Research question the article is screened against.
screening_question = "What is the association between exposure to radiotherapy for prostate cancer and incidence/risk of second malignancy / second primary cancers?"

# Comma-separated list of reasons an article should be excluded.
exclusion_criteria = "non-clinical studies, editorials, review articles, case reports, conference abstracts, basic science papers, unclear comparator group, metastatic tumors, non-standard treatment for prostate cancer (such as cryotherapy), articles not dealing with radiation induced malignancy"

# Screening instructions with three placeholders: article, screening_question
# and exclusion_criteria, each wrapped in its own XML-style tag.
screening_prompt = \
"""Given the following article information, please decide if it should be included in the analysis, based on the screening question and exclusion criteria.

<article>
{article}
</article>

<screening_question>
{screening_question}
</screening_question>

<exclusion_criteria>
{exclusion_criteria}
</exclusion_criteria>"""

sr_prompt = ChatPromptTemplate([
    ("system", screening_prompt),
])

class Inclusion(TypedDict):
    # Screening verdict schema: a single boolean flag.
    include: bool

sr_chain = sr_prompt | llm.with_structured_output(Inclusion, method="json_schema", strict=True)

# Keep the verdict under its own name. Assigning it back to `result` would
# overwrite the extracted study data that this very call takes as input, so
# re-running the cell would screen the previous verdict instead of the
# article, and earlier cells that read `result` would see the wrong object.
# NOTE(review): `result` is the extraction output from the earlier cell; it is
# presumably interpolated into the prompt via its string form — confirm.
screening_decision = sr_chain.invoke({
    "article": result,
    "screening_question": screening_question,
    "exclusion_criteria": exclusion_criteria
})

print(screening_decision)

{'include': False}
