In [None]:
#ideas:
#summary model
#train chatnot model on our text (lang chain)
#conversational

In [33]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = email or "tyzwhitt@gmail.com"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main():
    # Initialize collector
    collector = PMCOpenAccessCollector(
        save_dir="pmc_oa_data",
        email="tyzwhitt@gmail.com"  # Replace with your email
    )

    # Define queries
    queries = [
        '"clinical trials" AND "results"',
        '"clinical decision support"',
        '"electronic health records"'

    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: "clinical trials" AND "results"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:18<00:00,  1.38it/s]



Processing query: "clinical decision support"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:17<00:00,  1.43it/s]



Processing query: "electronic health records"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:16<00:00,  1.56it/s]


Dataset Statistics:
Total articles: 75

Articles per query:
search_query
"clinical trials" AND "results"    25
"clinical decision support"        25
"electronic health records"        25
Name: count, dtype: int64

Sample titles:
                                               title  \
0  Obicetrapib as an Adjunct to Stable Statin The...   
1  Current Management and Therapy of Severe Aorti...   
2  Obicetrapib: There is still Life in the CETP I...   
3  Achieving a Pathologic Complete Response for L...   
4  Assessing the Efficacy of Thoracic Erector Spi...   

                      search_query  
0  "clinical trials" AND "results"  
1  "clinical trials" AND "results"  
2  "clinical trials" AND "results"  
3  "clinical trials" AND "results"  
4  "clinical trials" AND "results"  





In [35]:
df = pd.read_csv("pmc_dataset.csv")

In [39]:
df

Unnamed: 0,pmc_id,title,abstract,full_text,keywords,publication_date,journal,search_query
0,11456355,Obicetrapib as an Adjunct to Stable Statin The...,,ClinicalTrials.gov registration number: NCT054...,"['Obicetrapib', 'Cholesteryl ester transfer pr...",2024-4-3,Journal of Atherosclerosis and Thrombosis,"""clinical trials"" AND ""results"""
1,11456350,Current Management and Therapy of Severe Aorti...,Intervention for severe aortic stenosis (AS) h...,\nThe incidence of aortic stenosis (AS) has be...,"['Aortic stenosis', 'Preventive therapy']",2024-8-8,Journal of Atherosclerosis and Thrombosis,"""clinical trials"" AND ""results"""
2,11456347,Obicetrapib: There is still Life in the CETP I...,,\nIn the prevention of atherosclerotic cardiov...,[],2024-6-15,Journal of Atherosclerosis and Thrombosis,"""clinical trials"" AND ""results"""
3,11456338,Achieving a Pathologic Complete Response for L...,Neo-adjuvant chemoradiotherapy (CRT) and perio...,Esophageal cancer (EC) ranks as the seventh mo...,"['dosimetric parameters', 'locally advanced es...",,Cureus,"""clinical trials"" AND ""results"""
4,11456309,Assessing the Efficacy of Thoracic Erector Spi...,Background and aims Optimal postoperative care...,Major spine surgery causes severe postoperativ...,"['erector spinae plane block', 'general anaest...",,Cureus,"""clinical trials"" AND ""results"""
...,...,...,...,...,...,...,...,...
70,11452430,Health-related quality of life is an independe...,Transthyretin amyloid cardiomyopathy (ATTR-CM)...,Transthyretin amyloidosis is an infiltrative d...,"['Health-related quality of life', 'Quality of...",2024-8-6,Quality of Life Research,"""electronic health records"""
71,11452373,An intelligent learning system based on electr...,Stroke has a negative impact on people’s lives...,A stroke occurs when the blood supply to vario...,"['Stroke', 'Feature extraction', 'Machine lear...",2024-10-4,Scientific Reports,"""electronic health records"""
72,11452286,Development of a Multifaceted Program for Phar...,"In 2019, Indiana University launched the Preci...","Pharmacogenomics (PGx), the practice of tailor...",[],2024-8-21,Clinical pharmacology and therapeutics,"""electronic health records"""
73,11452046,Identifying individuals at high risk for demen...,Health policy in the UK and globally regarding...,Aging populations in many parts of the world a...,[],2024-10-4,PLOS ONE,"""electronic health records"""


In [37]:
df.columns

Index(['pmc_id', 'title', 'abstract', 'full_text', 'keywords',
       'publication_date', 'journal', 'search_query'],
      dtype='object')

In [38]:
df['full_text'][0]

'ClinicalTrials.gov registration number: NCT05421078 \nAtherosclerosis resulting from plaque accumulation in the arterial wall progresses over decades and may lead to clinical events such as myocardial infarction and stroke \nObicetrapib is a novel, highly selective, cholesteryl ester transfer protein (CETP) inhibitor in development for the treatment of hypercholesterolemia and reduction of cardiovascular risk \nThe present study was a placebo-controlled, double-blind, randomized, phase 2 trial of obicetrapib as an adjunct to stable statin therapy in Japanese subjects (NCT05421078). The trial was conducted from June 2022 to April 2023; nine clinical research sites in Japan enrolled participants (see list of Investigators in\n Tomomi Hakoda Nippon Kokan Fukuyama Hospital Fukuyama, Japan Satoshi Kuroyanagi Kishiwada Tokushu-Kai Hospital Osaka, Japan Yoshimitsu Yamasaki Kyosokai AMC NISHI-UMEDA Clinic Osaka, Japan Kenshi Fujii Sakurabashi Watanabe Hospital Osaka, Japan Atsushi Sueyoshi Uj

In [None]:
#preprocessing: readable 8th grade level
#transformers