In [1]:
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.parse
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from dask.delayed import delayed
from tqdm import tqdm

In [2]:
# NCBI Entrez E-utilities base URL; endpoint names such as efetch.fcgi are
# appended to it when request URLs are built.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Medical imaging techniques to search for. Each entry is used both as the
# search phrase and as the category label of the rows produced for it.
techniques = [
    "X-ray",
    "Magnetic Resonance Imaging (MRI)",
    "Computed Tomography (CT) Scan",
    "Positron Emission Tomography (PET) Scan",
    "Ultrasound",
    "Endoscopy",
    "Histology"
]

In [3]:
def retrieve_technique_pmc_ids(techniques, limit_per_category=None):
    """Search PubMed Central for each technique and collect the matching IDs.

    For every entry of ``techniques`` an E-utilities ``esearch`` request with
    the term ``"<technique> [Title/Abstract]"`` is sent against ``db=pmc``.
    Results are paged through 100 IDs at a time with a one-second pause
    between pages.

    Args:
        techniques: Iterable of search phrases; each becomes a key of the
            returned dictionary.
        limit_per_category: Maximum number of IDs kept per technique, or
            ``None`` to keep paging until the reported result count is reached.

    Returns:
        dict mapping each technique to the list of ID strings taken from the
        ``idlist`` field of the responses, in the order they were returned.

    Raises:
        requests.RequestException: If a request fails or times out.
        Whatever ``response.json()`` raises for a non-JSON body propagates.
    """
    # PubMed API base URL
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_endpoint = f"{base_url}esearch.fcgi"

    page_size = 100       # Number of results to retrieve per request
    request_timeout = 30  # Seconds before a stalled request is abandoned

    # Dictionary to store technique PMC IDs
    technique_dictionary = {}

    for technique in techniques:
        print("Retrieving ids for: ", technique)
        collected = []
        technique_dictionary[technique] = collected
        retstart = 0  # Starting index of the next page

        while True:
            # Passing the query through ``params`` lets requests percent-encode
            # it, so characters such as '&', '#' or '+' inside a technique
            # name cannot corrupt the query string.
            response = requests.get(
                search_endpoint,
                params={
                    "db": "pmc",
                    "term": f"{technique} [Title/Abstract]",
                    "retmode": "json",
                    "retstart": retstart,
                    "retmax": page_size,
                },
                timeout=request_timeout,
            )
            data = response.json()

            # Payloads without a search result carry nothing to collect.
            if "esearchresult" not in data:
                break
            result = data["esearchresult"]

            pmc_ids = result.get("idlist", [])
            collected.extend(pmc_ids)
            retstart += page_size

            limit_reached = (
                limit_per_category is not None
                and len(collected) >= limit_per_category
            )
            # A result without "count" is treated as having no further pages
            # instead of raising KeyError.
            total_available = int(result.get("count", 0))

            # An empty page means further paging cannot add anything, so stop
            # rather than keep requesting until ``count`` is reached.
            if limit_reached or not pmc_ids or retstart >= total_available:
                break

            # Introduce a delay between requests
            time.sleep(1)

        # The last page may overshoot the limit; trim the surplus.
        if limit_per_category is not None and limit_per_category < len(collected):
            technique_dictionary[technique] = collected[:limit_per_category]

    return technique_dictionary

In [4]:
def download_image(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    The file is written only when the server answers with HTTP 200. Any other
    status, a network error or a timeout is reported on stdout and no file is
    written.

    Args:
        url: Address of the image to fetch.
        filename: Destination path; opened in binary write mode.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    }

    try:
        # Without a timeout a single stalled connection blocks forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        # Report network failures the same best-effort way as HTTP failures.
        print("Failed to download the image. Error:", error)
        return

    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
    else:
        print("Failed to download the image. Status code:", response.status_code)

In [5]:
def retrieve_information_from_xml(xml_content):
    """Pull article-level metadata out of an article XML document.

    Args:
        xml_content: XML markup handed to BeautifulSoup's ``'xml'`` parser.

    Returns:
        Tuple ``(pmid, doi, title, abstract, subject_terms)``. All five are
        ``None`` when the document has no ``<article-meta>`` element.
        Otherwise:

        * ``pmid`` / ``doi``: text of the ``<article-id>`` whose
          ``pub-id-type`` is ``pmid`` / ``doi``, or ``None`` if absent.
        * ``title``: text of ``<article-title>``, or ``None`` if absent.
        * ``abstract``: stripped texts of every ``<abstract>`` in the document
          joined by single spaces, or ``None`` when there is none.
        * ``subject_terms``: texts of the ``<subject>`` elements found inside
          ``<subj-group subj-group-type="heading">`` elements, or ``None``
          when that list would be empty.
    """
    document = BeautifulSoup(xml_content, 'xml')

    meta = document.find('article-meta')
    if meta is None:
        return (None,) * 5

    def text_or_none(element):
        # A missing element maps to None rather than to an empty string.
        return element.text if element else None

    pmid = text_or_none(meta.find('article-id', {'pub-id-type': 'pmid'}))
    doi = text_or_none(meta.find('article-id', {'pub-id-type': 'doi'}))
    title = text_or_none(meta.find('article-title'))

    # Abstracts are searched in the whole document, not only in article-meta.
    abstract_parts = [node.text.strip() for node in document.find_all('abstract')]
    joined = ' '.join(abstract_parts)
    abstract = joined.strip() if joined else None

    heading_subjects = [
        subject.text
        for group in document.find_all('subj-group')
        if group.get('subj-group-type') == 'heading'
        for subject in group.find_all('subject')
    ]

    return pmid, doi, title, abstract, (heading_subjects if heading_subjects else None)

In [6]:
def retrieve_images_and_save_to_dataframe(category, base_url, article_id):
    """Fetch one article's XML and build a table with one row per figure.

    The article is requested from ``{base_url}efetch.fcgi?db=pmc&id=...``.
    Every ``<fig>`` that contains a ``<graphic>`` with an ``xlink:href``
    attribute yields one row combining the article metadata with the figure's
    image URL and caption. The folder ``image/<category>`` is created as a
    side effect. Images themselves are not downloaded.

    Args:
        category: Label stored in the ``Category`` column and used as the
            folder name under ``image``.
        base_url: E-utilities base URL, ending with a slash.
        article_id: Article identifier placed in the efetch, article and image
            URLs (presumably a PMC ID as returned by the ID search; verify
            against the caller).

    Returns:
        pandas.DataFrame with the columns ``PMCID, PMID, DOI, Title, Abstract,
        Subject Terms, Article URL, Image URL, Caption, Category, Image_Name,
        xml_url``. It has zero rows when the article has no usable figure.

    Raises:
        requests.RequestException: If the efetch request fails or times out.
    """
    columns = [
        'PMCID', 'PMID', 'DOI', 'Title', 'Abstract', 'Subject Terms',
        'Article URL', 'Image URL', 'Caption', 'Category', 'Image_Name',
        'xml_url',
    ]

    category_folder = os.path.join("image", category)
    os.makedirs(category_folder, exist_ok=True)

    xml_url = f"{base_url}efetch.fcgi?db=pmc&id={article_id}"
    print(xml_url)
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(xml_url, timeout=60)
    xml_content = response.content

    pmid, doi, title, abstract, subject_terms = retrieve_information_from_xml(xml_content)

    soup = BeautifulSoup(xml_content, 'xml')
    article_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}"

    rows = []
    for fig in soup.find_all('fig'):
        graphic_element = fig.find('graphic')
        if not graphic_element:
            continue
        href = graphic_element.get('xlink:href')
        if not href:
            continue

        image_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/bin/{href}.jpg"
        caption_element = fig.find('caption')
        caption = caption_element.get_text() if caption_element else ""
        image_name = os.path.basename(image_url)

        # Downloading is currently disabled. To re-enable it, call
        #   download_image(image_url, os.path.join(
        #       category_folder, f"{category}_{article_id}_figure_{image_name}"))
        rows.append({
            'PMCID': article_id,
            'PMID': pmid,
            'DOI': doi,
            'Title': title,
            'Abstract': abstract,
            'Subject Terms': subject_terms,
            'Article URL': article_url,
            'Image URL': image_url,
            'Caption': caption,
            'Category': category,
            'Image_Name': image_name,
            'xml_url': xml_url,
        })

        # One-second pause per figure, originally meant to space out image
        # downloads. NOTE(review): with downloads disabled it only slows the
        # task down; kept because it may also throttle requests -- confirm
        # before removing.
        time.sleep(1)

    # Passing ``columns`` keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [7]:
# Upper bound on the number of PMC IDs collected for each technique.
limit_per_category = 3000

# Look up the PMC IDs matching every technique.
technique_pmc_ids = retrieve_technique_pmc_ids(techniques, limit_per_category)

# NOTE(review): fixed six-second pause whose purpose is not evident from the
# notebook -- presumably a gap between the search requests above and the
# article requests issued later; confirm.
time.sleep(6)

# Queue one lazy task per (technique, article) pair. Nothing is fetched here;
# the progress bars only track how many tasks have been queued.
delayed_calls = []
for technique, pmc_ids in technique_pmc_ids.items():
    for pmc_id in tqdm(pmc_ids, desc=f"Processing {technique}"):
        delayed_calls.append(
            delayed(retrieve_images_and_save_to_dataframe)(technique, base_url, pmc_id)
        )




Retrieving ids for:  X-ray
Retrieving ids for:  Magnetic Resonance Imaging (MRI)
Retrieving ids for:  Computed Tomography (CT) Scan
Retrieving ids for:  Positron Emission Tomography (PET) Scan
Retrieving ids for:  Ultrasound
Retrieving ids for:  Endoscopy
Retrieving ids for:  Histology


Processing X-ray: 100%|██████████| 3000/3000 [00:00<00:00, 8325.20it/s]
Processing Magnetic Resonance Imaging (MRI): 100%|██████████| 893/893 [00:00<00:00, 21881.45it/s]
Processing Computed Tomography (CT) Scan: 100%|██████████| 1818/1818 [00:00<00:00, 21276.92it/s]
Processing Positron Emission Tomography (PET) Scan: 100%|██████████| 563/563 [00:00<00:00, 18320.00it/s]
Processing Ultrasound: 100%|██████████| 3000/3000 [00:00<00:00, 12171.79it/s]
Processing Endoscopy: 100%|██████████| 3000/3000 [00:00<00:00, 21068.68it/s]
Processing Histology: 100%|██████████| 3000/3000 [00:00<00:00, 20967.99it/s]


In [8]:
# Number of distinct PMC IDs collected per technique; repeated IDs within one
# technique are counted once.
unique_counts_per_key = {
    technique: len(set(ids)) for technique, ids in technique_pmc_ids.items()
}

print(unique_counts_per_key)

{'X-ray': 3000, 'Magnetic Resonance Imaging (MRI)': 893, 'Computed Tomography (CT) Scan': 1818, 'Positron Emission Tomography (PET) Scan': 563, 'Ultrasound': 3000, 'Endoscopy': 3000, 'Histology': 3000}


In [9]:
# Start a local dask cluster with its default settings and connect a client
# to it. The compute cell below requests scheduler='distributed', and both
# objects are closed again in the notebook's last cell.
cluster = LocalCluster()
client = Client(cluster)

In [10]:

# Run every queued task on the distributed scheduler. The result holds one
# DataFrame per task, in the order the tasks were queued.
dfs = dask.compute(*delayed_calls, scheduler='distributed')

# Stack the per-article frames into a single table. ignore_index=True already
# gives the result a fresh 0..n-1 index, so no separate in-place reset_index
# is needed afterwards.
df_combined = pd.concat(dfs, ignore_index=True)

In [11]:
# (rows, columns) of the combined table; each row describes one figure.
df_combined.shape

(29321, 12)

In [12]:
# Distinct PMIDs per category. nunique() ignores missing values by default, so
# rows whose PMID is None are not counted; articles that produced no figure
# rows do not appear in the table at all.
df_combined.groupby("Category")["PMID"].nunique()

Category
Computed Tomography (CT) Scan               610
Endoscopy                                   962
Histology                                   756
Magnetic Resonance Imaging (MRI)            339
Positron Emission Tomography (PET) Scan     215
Ultrasound                                 1292
X-ray                                      1299
Name: PMID, dtype: int64

In [13]:
# Preview the first rows of the combined table.
df_combined.head()

Unnamed: 0,PMCID,PMID,DOI,Title,Abstract,Subject Terms,Article URL,Image URL,Caption,Category,Image_Name,xml_url
0,10363893,37469184,10.1177/15330338231189593,X-ray and MR Contrast Bearing Nanoparticles En...,Introduction\nRadiation therapy for head and n...,"[Image-guided drug delivery in cancer, Origina...",https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,\nThe theranostic nanoparticle (TNP) synthesis...,X-ray,10.1177_15330338231189593-fig1.jpg,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/...
1,10363893,37469184,10.1177/15330338231189593,X-ray and MR Contrast Bearing Nanoparticles En...,Introduction\nRadiation therapy for head and n...,"[Image-guided drug delivery in cancer, Origina...",https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,\nThe theranostic nanoparticle (TNP) distribut...,X-ray,10.1177_15330338231189593-fig2.jpg,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/...
2,10363893,37469184,10.1177/15330338231189593,X-ray and MR Contrast Bearing Nanoparticles En...,Introduction\nRadiation therapy for head and n...,"[Image-guided drug delivery in cancer, Origina...",https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,\nAnnexin V apoptosis assay. The Annexin V ass...,X-ray,10.1177_15330338231189593-fig3.jpg,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/...
3,10363893,37469184,10.1177/15330338231189593,X-ray and MR Contrast Bearing Nanoparticles En...,Introduction\nRadiation therapy for head and n...,"[Image-guided drug delivery in cancer, Origina...",https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,\nEffect of theranostic nanoparticles (TNPs) o...,X-ray,10.1177_15330338231189593-fig4.jpg,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/...
4,10363893,37469184,10.1177/15330338231189593,X-ray and MR Contrast Bearing Nanoparticles En...,Introduction\nRadiation therapy for head and n...,"[Image-guided drug delivery in cancer, Origina...",https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,https://www.ncbi.nlm.nih.gov/pmc/articles/1036...,\nTumor localization and image-guided radiatio...,X-ray,10.1177_15330338231189593-fig5.jpg,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/...


In [14]:
# Write the combined table to description.csv in the working directory,
# leaving out the index column.
df_combined.to_csv("description.csv", index=False)

In [15]:
# Disconnect the client first, then shut the local cluster down.
client.close()
cluster.close()