In [16]:
from dotenv import load_dotenv

load_dotenv()

True

In [9]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import convert_to_dict
import json
import os

def extract_text_from_pdf(filepath):
    """Extract text from a PDF file with error handling"""
    try:
        elements = partition_pdf(filepath, strategy="auto")
        data = convert_to_dict(elements)
        return "\n".join([item["text"] for item in data if "text" in item])
    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        return None

# Process all PDFs in medical_papers directory
medical_data = {}
pdf_folder = "medical_papers"

for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pmcid = filename.split(".")[0]
        filepath = os.path.join(pdf_folder, filename)
        print(f"Processing {filename}...")
        
        text = extract_text_from_pdf(filepath)
        if text:
            medical_data[pmcid] = text

# Save results
with open(os.path.join(pdf_folder, "extracted_text.json"), "w") as f:
    json.dump(medical_data, f, indent=2)

print(f"Extracted text from {len(medical_data)} PDFs. Saved to medical_papers/extracted_text.json")

  from .autonotebook import tqdm as notebook_tqdm


Processing ai in radiology.pdf...
Processing cancer immunotherapy.pdf...
Processing covid-19.pdf...
Processing diabetes.pdf...
Processing Hypertension treatment.pdf...
Extracted text from 5 PDFs. Saved to medical_papers/extracted_text.json


In [10]:
import re

def clean_text(text):
    """Clean extracted text."""
    # Remove headers/footers
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    # Remove references section
    text = re.sub(r'References.*', '', text, flags=re.DOTALL)
    # Remove special characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.strip()

# Clean all extracted text
cleaned_data = {pmcid: clean_text(text) for pmcid, text in medical_data.items()}

# Save cleaned data
with open("medical_papers/cleaned_data.json", "w") as f:
    json.dump(cleaned_data, f)

print("Text cleaning complete. Data saved to 'medical_papers/cleaned_data.json'.")

Text cleaning complete. Data saved to 'medical_papers/cleaned_data.json'.


In [11]:
# Load cleaned data
with open("medical_papers/cleaned_data.json", "r") as f:
    cleaned_data = json.load(f)

# Print a sample
for pmcid, text in list(cleaned_data.items())[:1]:  # Inspect the first document
    print(f"PMCID: {pmcid}")
    print("Sample Text:\n", text[:500], "...")  # Show first 500 characters

PMCID: ai in radiology
Sample Text:
 Edited by: Yiannis Kyratsis, Vrije Universiteit Amsterdam, Netherlands
Reviewed by: Niamh Lennox-Chhugani, International Foundation for Integrated Care (IFIC), United Kingdom Ram Bajpai, Keele University, United Kingdom
Correspondence: Christina Malamateniou christina.malamateniou@city.ac.uk
 These authors have contributed equally to this work and share last authorship
Specialty section: This article was submitted to Health Technology Innovation, a section of the journal Frontiers in Digital Hea ...


In [15]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document
import json

# Load cleaned data from JSON file
with open("medical_papers/extracted_text.json", "r") as f:
    cleaned_data = json.load(f)

# Initialize node parser
node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)

# Create nodes from cleaned text
nodes = []
for pmcid, text in cleaned_data.items():
    document = Document(text=text, id_=pmcid)  # Use `id_` instead of `doc_id`
    nodes.extend(node_parser.get_nodes_from_documents([document]))

print(f"Created {len(nodes)} chunks from {len(cleaned_data)} documents.")

Created 469 chunks from 5 documents.


In [33]:
groq_api = os.getenv("GROQ_API_KEY")
from llama_index.core import VectorStoreIndex, Settings
from llama_index.llms.groq import Groq  # Make sure you have this installed

# Set up Groq LLM
llm = Groq(model="llama-3.1-8b-instant", api_key=groq_api)

# Configure global settings
Settings.llm = llm
Settings.embed_model = "local:BAAI/bge-small-en-v1.5"  # Local embedding model

# Create index from existing chunks
index = VectorStoreIndex(nodes)  # nodes = your pre-created chunks
print("Index built successfully!")

Index built successfully!


In [34]:
query_engine = index.as_query_engine()

# Query 1: Diabetes Treatment
response = query_engine.query(
    "What are the latest treatments for Type 2 Diabetes mentioned in these papers?"
)
print("Diabetes Treatments:\n", response)

# Query 2: Hypertension Risk Factors
response = query_engine.query(
    "What are the key risk factors for hypertension according to these papers?"
)
print("Hypertension Risk Factors:\n", response)

Diabetes Treatments:
 There is no information provided about the latest treatments for Type 2 Diabetes in the given context. The context appears to be related to diabetes insipidus, a condition distinct from Type 2 Diabetes.
Hypertension Risk Factors:
 According to the provided information, the key risk factors for hypertension that can affect disease outcomes include age, comorbidities, musculoskeletal disease, and other factors beyond pulmonary arterial hypertension (PAH) severity. Additionally, functional class, 6-minute walk distance (6MWD), and BNP or NT-proBNP levels are also considered important noninvasive prognostic measures.


In [35]:
# Query 1: Diabetes Treatment
response = query_engine.query(
    "What are the latest treatments for diabetes insipidus mentioned in these papers, particularly after pituitary surgery?"
)
print("Diabetes Insipidus Treatments:\n", response)

# Query 2: Hypertension Treatment
response = query_engine.query(
    "What are the recommended treatment algorithms for pulmonary arterial hypertension according to these papers?"
)
print("Pulmonary Arterial Hypertension Treatment:\n", response)

# Query 3: COVID-19 Vaccines
response = query_engine.query(
    "What are the latest updates on COVID-19 vaccine development, including variants and considerations for special populations?"
)
print("COVID-19 Vaccine Updates:\n", response)

# Query 4: Cancer Immunotherapy
response = query_engine.query(
    "How are nanoparticles being used in cancer immunotherapy and tumor microenvironment remodeling?"
)
print("Nanoparticles in Cancer Immunotherapy:\n", response)

# Query 5: AI in Radiology
response = query_engine.query(
    "What are the key applications of AI in radiology mentioned in these papers?"
)
print("AI in Radiology:\n", response)

Diabetes Insipidus Treatments:
 The latest treatments for diabetes insipidus mentioned in these papers, particularly after pituitary surgery, involve the use of desmopressin. Desmopressin is an analogue of arginine vasopressin, which helps to regulate the amount of water in the body by promoting water reabsorption in the kidneys. It is often used to treat central diabetes insipidus, a common complication after pituitary surgery.
Pulmonary Arterial Hypertension Treatment:
 Initial combination therapy with early and frequent reassessment is recommended. Medications targeting four pathways are now available, and maximal medical therapy is now four-drug therapy.
COVID-19 Vaccine Updates:
 The progress in vaccine development against SARS-CoV-2 has been remarkable due to advancements in molecular and biologic sciences. Various vaccine platforms, including whole virus vaccines, viral vector vaccines, and nucleic acid vaccines, are being developed to target the viral spike proteins and generat

In [36]:
import textwrap

# Function to print responses in a clean, wrapped format
def print_response(title, response):
    print(f"\n\033[1;36m{title}\033[0m")  # Cyan color for title
    print("\033[1;32m" + "=" * len(title) + "\033[0m")  # Green line under title
    wrapped_text = textwrap.fill(response, width=80)  # Wrap text to 80 characters
    print("\033[0;37m" + wrapped_text + "\033[0m")  # White text for response
    print("\n")

# Query 1: Diabetes Treatment
response = query_engine.query(
    "What are the latest treatments for diabetes insipidus mentioned in these papers, particularly after pituitary surgery?"
)
print_response("Diabetes Insipidus Treatments", str(response))

# Query 2: Hypertension Treatment
response = query_engine.query(
    "What are the recommended treatment algorithms for pulmonary arterial hypertension according to these papers?"
)
print_response("Pulmonary Arterial Hypertension Treatment", str(response))

# Query 3: COVID-19 Vaccines
response = query_engine.query(
    "What are the latest updates on COVID-19 vaccine development, including variants and considerations for special populations?"
)
print_response("COVID-19 Vaccine Updates", str(response))

# Query 4: Cancer Immunotherapy
response = query_engine.query(
    "How are nanoparticles being used in cancer immunotherapy and tumor microenvironment remodeling?"
)
print_response("Nanoparticles in Cancer Immunotherapy", str(response))

# Query 5: AI in Radiology
response = query_engine.query(
    "What are the key applications of AI in radiology mentioned in these papers?"
)
print_response("AI in Radiology", str(response))


[1;36mDiabetes Insipidus Treatments[0m
[0;37mThe latest treatments for diabetes insipidus mentioned in these papers,
particularly after pituitary surgery, involve the use of desmopressin.
Desmopressin is an analogue of arginine vasopressin, which helps to regulate
water balance in the body. It is often used to treat central diabetes insipidus,
a condition that can occur after pituitary surgery.[0m



[1;36mPulmonary Arterial Hypertension Treatment[0m
[0;37mInitial combination therapy with early and frequent reassessment is recommended.
Medications targeting four pathways are now available, and maximal medical
therapy is now four-drug therapy.[0m



[1;36mCOVID-19 Vaccine Updates[0m
[0;37mThe progress in vaccine development against SARS-CoV-2 has been remarkable due
to advancements in molecular and biologic sciences. Various vaccine platforms,
including whole virus vaccines, viral vector vaccines, and nucleic acid
vaccines, are being developed to target the viral spike prote

In [37]:
# Specific Detail Questions
response = query_engine.query(
    "What is the role of desmopressin in treating diabetes insipidus after pituitary surgery?"
)
print_response("Role of Desmopressin in Diabetes Insipidus", str(response))

response = query_engine.query(
    "What are the first-line medications recommended for pulmonary arterial hypertension?"
)
print_response("First-Line Medications for Pulmonary Arterial Hypertension", str(response))

# Broad Theme Questions
response = query_engine.query(
    "What are the long-term outcomes for patients with diabetes insipidus after pituitary surgery?"
)
print_response("Long-Term Outcomes for Diabetes Insipidus", str(response))

response = query_engine.query(
    "What are the common comorbidities associated with pulmonary arterial hypertension?"
)
print_response("Comorbidities in Pulmonary Arterial Hypertension", str(response))

# Cross-Document Questions
response = query_engine.query(
    "Are there any overlapping treatment strategies for diabetes insipidus and pulmonary arterial hypertension?"
)
print_response("Overlapping Treatments for Diabetes and Hypertension", str(response))

response = query_engine.query(
    "How has AI been used to improve COVID-19 vaccine development or distribution?"
)
print_response("AI in COVID-19 Vaccine Development", str(response))


[1;36mRole of Desmopressin in Diabetes Insipidus[0m
[0;37mDesmopressin is used to manage diabetes insipidus by replacing the deficient
antidiuretic hormone (ADH) and thereby reducing urine production. It is
administered intranasally or subcutaneously, and has been shown to be effective
in treating central diabetes insipidus, particularly in infants and young
children.[0m



[1;36mFirst-Line Medications for Pulmonary Arterial Hypertension[0m
[0;37mThe first-line medications recommended for pulmonary arterial hypertension
typically involve a combination of therapies targeting different pathways. These
include endothelin-1 receptor antagonists, such as ambrisentan, bosentan, and
macitentan, and guanylyl cyclase stimulators, like riociguat.[0m



[1;36mLong-Term Outcomes for Diabetes Insipidus[0m
[0;37mClose monitoring is required to evaluate the response to treatment and to
determine whether the diabetes insipidus is transient or permanent. This
suggests that the long-term out

In [38]:
# Diabetes Insipidus
response = query_engine.query(
    "What are the recommended dosages and administration methods for desmopressin in treating diabetes insipidus after pituitary surgery?"
)
print_response("Desmopressin Dosages and Administration", str(response))

# Pulmonary Arterial Hypertension
response = query_engine.query(
    "What are the specific first-line medications for pulmonary arterial hypertension, and how do they differ in mechanism of action?"
)
print_response("First-Line Medications for Pulmonary Arterial Hypertension", str(response))

# COVID-19 Vaccines
response = query_engine.query(
    "What are the specific challenges in developing COVID-19 vaccines for immunocompromised populations, and how are they addressed?"
)
print_response("Challenges in COVID-19 Vaccine Development", str(response))

# Cancer Immunotherapy
response = query_engine.query(
    "What are the specific types of nanoparticles used in cancer immunotherapy, and how do they differ in effectiveness?"
)
print_response("Nanoparticles in Cancer Immunotherapy", str(response))

# AI in Radiology
response = query_engine.query(
    "What are the specific AI algorithms mentioned for analyzing medical imaging, and how accurate are they?"
)
print_response("AI Algorithms in Radiology", str(response))


[1;36mDesmopressin Dosages and Administration[0m
[0;37mDesmopressin is typically administered orally or via injection, and its efficacy
and safety have been demonstrated in patients with central diabetes insipidus.
The duration of antidiuretic action of desmopressin can vary, and high doses may
require more frequent administration. A shortened duration of action has been
observed in patients requiring high doses of peroral antidiuretic drugs.[0m



[1;36mFirst-Line Medications for Pulmonary Arterial Hypertension[0m
[0;37mMedications targeting four pathways are now available for the treatment of
pulmonary arterial hypertension. These pathways include endothelin-1, nitric
oxide, prostacyclin, and bone morphogenetic protein/activin signalling.   The
specific first-line medications for pulmonary arterial hypertension typically
involve a combination of therapies targeting these pathways. However, the exact
medications used can vary depending on the individual patient's needs and the

In [39]:
# Diabetes Insipidus
response = query_engine.query(
    "What are the recommended dosages and administration methods for desmopressin in pediatric patients with diabetes insipidus, and are there any side effects or contraindications mentioned?"
)
print_response("Desmopressin in Pediatric Diabetes Insipidus", str(response))

# Pulmonary Arterial Hypertension
response = query_engine.query(
    "What are the success rates of combination therapies (e.g., endothelin receptor antagonists + phosphodiesterase-5 inhibitors) for pulmonary arterial hypertension, and are there any specific guidelines for pediatric patients?"
)
print_response("Combination Therapies for Pulmonary Arterial Hypertension", str(response))

# COVID-19 Vaccines
response = query_engine.query(
    "What are the specific strategies mentioned for improving COVID-19 vaccine efficacy in immunocompromised populations, and are there any clinical trial results supporting these strategies?"
)
print_response("Strategies for COVID-19 Vaccines in Immunocompromised Populations", str(response))

# Cancer Immunotherapy
response = query_engine.query(
    "What are the specific clinical outcomes of using biomimetic nanocarriers in cancer immunotherapy, and are there any ongoing clinical trials mentioned?"
)
print_response("Biomimetic Nanocarriers in Cancer Immunotherapy", str(response))

# AI in Radiology
response = query_engine.query(
    "What are the specific applications of convolutional neural networks (CNNs) in radiology, and how accurate are they in detecting abnormalities in medical imaging?"
)
print_response("CNNs in Radiology", str(response))


[1;36mDesmopressin in Pediatric Diabetes Insipidus[0m
[0;37mDesmopressin is a synthetic analogue of vasopressin, commonly used to treat
diabetes insipidus. In pediatric patients, the efficacy and safety of
desmopressin have been evaluated in various studies.  For infants with central
diabetes insipidus, subcutaneous administration of desmopressin has been found
to be effective. A study published in the Journal of Pediatric Endocrinology and
Metabolism reported the use of subcutaneous desmopressin in infants, with a
dosage of 0.05-0.1 mg per day.  Intranasal administration of desmopressin has
also been studied in pediatric patients. A review article mentioned the efficacy
and safety of intranasal desmopressin in infants with neurogenic diabetes
insipidus.  Regarding dosages, a multicenter open-label dose-titration study
found that orally disintegrating tablets of desmopressin were effective in
patients with central diabetes insipidus, with a starting dose of 0.05 mg per
day.  As for