<a href="https://colab.research.google.com/github/Sug-ar-N-Spice/Dr.Chats/blob/main/Pub_med_dr_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import pandas as pd
import time
import xml.etree.ElementTree as ET
from typing import List, Dict
import numpy as np
from tqdm import tqdm

class PubMedCollector:
    def __init__(self, email: str, tool_name: str = "medical_nlp_project"):
        """
        Initialize the collector with your email (required by NCBI).
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.email = "tyzwhitt@gmail.com"
        self.tool = "Dr_chat"

    def search_articles(self, query: str, max_results: int = 1000) -> List[str]:
        """
        Search for articles matching the query and return their PMC IDs.
        """
        # First get the list of IDs
        search_url = f"{self.base_url}esearch.fcgi"
        params = {
            'db': 'pmc',
            'term': query,
            'retmax': max_results,
            'email': self.email,
            'tool': self.tool,
            'usehistory': 'y',
            'retmode': 'json'
        }

        response = requests.get(search_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        data = response.json()
        return data['esearchresult']['idlist']

    def fetch_article(self, pmc_id: str) -> Dict:
        """
        Fetch full text and metadata for a single article.
        """
        fetch_url = f"{self.base_url}efetch.fcgi" #api url
        params = {
            'db': 'pmc',
            'id': pmc_id,
            'retmode': 'xml', #try json
            'email': self.email,
            'tool': self.tool
        }

        response = requests.get(fetch_url, params=params)
        if response.status_code != 200:
            return None

        # Parse XML
        try:
            root = ET.fromstring(response.content) #

            # Extract article title
            title_element = root.find(".//article-title")
            title = title_element.text if title_element is not None else ""

            # Extract abstract
            abstract_elements = root.findall(".//abstract//p")
            abstract = " ".join([p.text for p in abstract_elements if p.text])

            # Extract main text
            body_elements = root.findall(".//body//p")
            full_text = " ".join([p.text for p in body_elements if p.text])

            return {
                'pmc_id': pmc_id,
                'title': title,
                'abstract': abstract,
                'full_text': full_text
            }
        except Exception as e:
            print(f"Error processing article {pmc_id}: {str(e)}")
            return None

    def collect_dataset(self, query: str, max_articles: int = 100) -> pd.DataFrame:
        """
        Collect a dataset of articles matching the query.
        """
        print(f"Searching for articles matching: {query}")
        article_ids = self.search_articles(query, max_articles)

        articles = []
        for article_id in tqdm(article_ids, desc="Fetching articles"):
            article_data = self.fetch_article(article_id)
            if article_data:
                articles.append(article_data)
            time.sleep(0.34)  # Respect NCBI's rate limit of 3 requests per second

        return pd.DataFrame(articles)

def preprocess_text(text: str) -> str:
    """
    Basic text preprocessing.
    """
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = " ".join(text.split())
    return text

def main():
    # Initialize collector with your email
    collector = PubMedCollector(email="tyzwhitt@gmail.com")

    # Example queries for medical text data
    queries = [
        '"clinical trials" AND "results"',
        '"clinical decision support"',
        '"electronic health records"',
        '"holistic"'
    ]

    # Collect data for each query
    all_data = []
    for query in queries:
        df = collector.collect_dataset(query, max_articles=100)
        all_data.append(df)

    # Combine all data
    final_df = pd.concat(all_data, ignore_index=True)

    # Preprocess the text
    final_df['processed_text'] = final_df['full_text'].apply(preprocess_text)

    # Save to CSV
    final_df.to_csv('medical_nlp_dataset.csv', index=False)
    print(f"Collected {len(final_df)} articles")

if __name__ == "__main__":
    main()

Searching for articles matching: "clinical trials" AND "results"


Fetching articles: 100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


Searching for articles matching: "clinical decision support"


Fetching articles: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s]


Searching for articles matching: "electronic health records"


Fetching articles: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s]


Searching for articles matching: "holistic"


Fetching articles: 100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


Collected 400 articles


In [None]:
#read in the CSV
df = pd.read_csv('medical_nlp_dataset.csv')

In [None]:
df

Unnamed: 0,pmc_id,title,abstract,full_text,processed_text
0,11456355,Obicetrapib as an Adjunct to Stable Statin The...,,ClinicalTrials.gov registration number: NCT054...,clinicaltrials.gov registration number: nct054...
1,11456350,Current Management and Therapy of Severe Aorti...,Intervention for severe aortic stenosis (AS) h...,\nThe incidence of aortic stenosis (AS) has be...,the incidence of aortic stenosis (as) has been...
2,11456347,Obicetrapib: There is still Life in the CETP I...,,\nIn the prevention of atherosclerotic cardiov...,in the prevention of atherosclerotic cardiovas...
3,11456343,Doxapram versus methylxanthine for apnea in pr...,"Recurrent apnea is common in preterm infants, ...",,
4,11456338,Achieving a Pathologic Complete Response for L...,Neo-adjuvant chemoradiotherapy (CRT) and perio...,Esophageal cancer (EC) ranks as the seventh mo...,esophageal cancer (ec) ranks as the seventh mo...
...,...,...,...,...,...
395,11448822,Nationwide Survey of Cystic Fibrosis Knowledge...,,Cystic fibrosis (CF) is commonly described in ...,cystic fibrosis (cf) is commonly described in ...
396,11448492,Efficacy of Intensified Hygiene Measures with ...,"Lymphedema, hydrocele, and acute adenolymphang...",Lymphatic filariasis (LF) is an incapacitating...,lymphatic filariasis (lf) is an incapacitating...
397,11448359,Factors associated with facility childbirth an...,Despite evidence of the beneficial effects of ...,"In recent years, coordinated efforts between t...","in recent years, coordinated efforts between t..."
398,11448346,Dual career policy at federal universities in ...,"Internationally, the implementation of holisti...",It takes an average of five to ten years for a...,it takes an average of five to ten years for a...


In [None]:
df['processed_text'][0]

'clinicaltrials.gov registration number: nct05421078 atherosclerosis resulting from plaque accumulation in the arterial wall progresses over decades and may lead to clinical events such as myocardial infarction and stroke obicetrapib is a novel, highly selective, cholesteryl ester transfer protein (cetp) inhibitor in development for the treatment of hypercholesterolemia and reduction of cardiovascular risk the present study was a placebo-controlled, double-blind, randomized, phase 2 trial of obicetrapib as an adjunct to stable statin therapy in japanese subjects (nct05421078). the trial was conducted from june 2022 to april 2023; nine clinical research sites in japan enrolled participants (see list of investigators in tomomi hakoda nippon kokan fukuyama hospital fukuyama, japan satoshi kuroyanagi kishiwada tokushu-kai hospital osaka, japan yoshimitsu yamasaki kyosokai amc nishi-umeda clinic osaka, japan kenshi fujii sakurabashi watanabe hospital osaka, japan atsushi sueyoshi uji tokush

In [None]:
df.head()

Unnamed: 0,pmc_id,title,abstract,full_text,processed_text
0,11456355,Obicetrapib as an Adjunct to Stable Statin The...,,ClinicalTrials.gov registration number: NCT054...,clinicaltrials.gov registration number: nct054...
1,11456350,Current Management and Therapy of Severe Aorti...,Intervention for severe aortic stenosis (AS) h...,\nThe incidence of aortic stenosis (AS) has be...,the incidence of aortic stenosis (as) has been...
2,11456347,Obicetrapib: There is still Life in the CETP I...,,\nIn the prevention of atherosclerotic cardiov...,in the prevention of atherosclerotic cardiovas...
3,11456343,Doxapram versus methylxanthine for apnea in pr...,"Recurrent apnea is common in preterm infants, ...",,
4,11456338,Achieving a Pathologic Complete Response for L...,Neo-adjuvant chemoradiotherapy (CRT) and perio...,Esophageal cancer (EC) ranks as the seventh mo...,esophageal cancer (ec) ranks as the seventh mo...


In [None]:
df.columns

Index(['pmc_id', 'title', 'abstract', 'full_text', 'processed_text'], dtype='object')

In [None]:
#ideas:
#summary model
#train chatnot model on our text (lang chain)
#conversational

In [7]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import urllib.request
import os
from tqdm import tqdm
import tarfile
import shutil
import time

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = email

    def search_articles(self, query: str) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': query,
            'retmax': 1000,  # Maximum results to return
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        # Parse the XML response
        root = ET.fromstring(response.content)

        # Extract PMC IDs
        id_list = root.findall('.//IdList/Id')
        pmc_ids = [f"PMC{id_elem.text}" for id_elem in id_list]

        return pmc_ids

    def get_file_list(self, format: str = "xml", pmc_ids: list = None) -> list:
        """
        Get list of available files in the Open Access subset.
        format: can be 'xml' or 'txt'
        pmc_ids: list of PMC IDs to filter for
        """
        params = {
            'format': format
        }

        response = requests.get(self.base_url, params=params)
        if response.status_code != 200:
            raise Exception(f"API request failed with status code: {response.status_code}")

        # Parse the XML response
        root = ET.fromstring(response.content)

        # Extract file information
        files = []
        for record in root.findall('.//record'):
            record_id = record.find('id').text

            # If we have specific PMC IDs to filter for
            if pmc_ids:
                if any(pmc_id in record_id for pmc_id in pmc_ids):
                    files.append({
                        'id': record_id,
                        'link': record.find('link').text,
                        'format': format
                    })
            else:
                files.append({
                    'id': record_id,
                    'link': record.find('link').text,
                    'format': format
                })

        return files

    def collect_dataset(self, queries: list, max_articles_per_query: int = 100) -> pd.DataFrame:
        """
        Collect and process articles from the PMC OA subset based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles matching the query
            pmc_ids = self.search_articles(query)
            print(f"Found {len(pmc_ids)} matching articles")

            # Get list of available files containing these PMC IDs
            files = self.get_file_list(format='xml', pmc_ids=pmc_ids)
            print(f"Found {len(files)} files containing matching articles")

            articles_collected = 0
            for file_info in files:
                if articles_collected >= max_articles_per_query:
                    break

                # Download and extract the tar.gz file
                extract_dir = self.download_and_extract(file_info)

                # Process XML files
                xml_files = list(Path(extract_dir).rglob('*.nxml'))
                print(f"Processing articles from {file_info['id']}...")

                for xml_path in tqdm(xml_files):
                    if articles_collected >= max_articles_per_query:
                        break

                    article_data = self.parse_article(str(xml_path))
                    if article_data:
                        article_data['search_query'] = query  # Add the query used to find this article
                        all_articles.append(article_data)
                        articles_collected += 1

                # Clean up extracted files
                shutil.rmtree(extract_dir)

                # Respect NCBI's rate limits
                time.sleep(0.34)

        return pd.DataFrame(all_articles)

    def download_and_extract(self, file_info: dict) -> str:
        """
        Download and extract a tar.gz file from PMC OA subset.
        """
        filename = file_info['link'].split('/')[-1]
        tar_path = self.save_dir / filename

        # Download the tar.gz file
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(file_info['link'], tar_path)

        # Create extraction directory
        extract_dir = self.save_dir / file_info['id']
        extract_dir.mkdir(exist_ok=True)

        # Extract the tar.gz file
        print("Extracting files...")
        with tarfile.open(tar_path, 'r:gz') as tar:
            tar.extractall(path=extract_dir)

        # Clean up tar file
        os.remove(tar_path)

        return str(extract_dir)

    def parse_article(self, xml_path: str) -> dict:
        """
        Parse a single PMC XML article file.
        """
        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()

            # Extract article ID
            article_id = root.find(".//article-id[@pub-id-type='pmc']")
            article_id = article_id.text if article_id is not None else ""

            # Extract title
            title_element = root.find(".//article-title")
            title = title_element.text if title_element is not None else ""

            # Extract abstract
            abstract_elements = root.findall(".//abstract//p")
            abstract = " ".join([p.text for p in abstract_elements if p is not None and p.text])

            # Extract main text
            body_elements = root.findall(".//body//p")
            full_text = " ".join([p.text for p in body_elements if p is not None and p.text])

            # Extract keywords
            kwd_elements = root.findall(".//kwd")
            keywords = [k.text for k in kwd_elements if k is not None and k.text]

            return {
                'pmc_id': article_id,
                'title': title,
                'abstract': abstract,
                'full_text': full_text,
                'keywords': keywords
            }
        except Exception as e:
            print(f"Error processing {xml_path}: {str(e)}")
            return None

def main():
    # Initialize collector
    collector = PMCOpenAccessCollector(
        save_dir="pmc_oa_data",
        email="tyzwhitt@gmail.com"  # Required for NCBI E-utilities
    )

    # Define your queries
    queries = [
        '"clinical trials" AND "results"',
        '"clinical decision support"',
        '"electronic health records"'
        #'"holistic"'
    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # This will collect up to 25 articles per query
    )

    # Save to CSV
    output_file = "pmc_oa_dataset.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df.columns)
    #print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']]).head()

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: "clinical trials" AND "results"
Found 1000 matching articles
Found 0 files containing matching articles

Processing query: "clinical decision support"
Found 1000 matching articles
Found 0 files containing matching articles

Processing query: "electronic health records"
Found 1000 matching articles
Found 0 files containing matching articles

Dataset Statistics:
Total articles: 0

Articles per query:
RangeIndex(start=0, stop=0, step=1)

Sample titles:


KeyError: "None of [Index(['title', 'search_query'], dtype='object')] are in the [columns]"

In [10]:
pmc_oa_data.columns()


NameError: name 'pmc_oa_data' is not defined

In [5]:
df = pd.read_csv("pmc_oa_dataset.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'pmc_oa_dataset.csv'