In [3]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = email or "tyzwhitt@gmail.com"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main(): #why are we calling it main? can we call somthing else?
    # Initialize collector
    collector = PMCOpenAccessCollector(
        save_dir="pmc_oa_data",
        email="tyzwhitt@gmail.com"  # Replace with your email #replace with .env variable
    )

    # Define queries #possibly list of specific STIs to get better information #testing chlamydia first

    # sexually transmitted desease as a querey results in 0 articles found

    queries = [
        '"sexually transmitted infection"',
        '"chlamydia"',
        '"std"',
        '"sti"',

    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: "sexually transmitted infection"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:54<00:00,  2.17s/it]



Processing query: "chlamydia"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [01:06<00:00,  2.66s/it]



Processing query: "std"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:45<00:00,  1.81s/it]



Processing query: "sti"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [11:52<00:00, 28.48s/it]  


Dataset Statistics:
Total articles: 100

Articles per query:
search_query
"sexually transmitted infection"    25
"chlamydia"                         25
"std"                               25
"sti"                               25
Name: count, dtype: int64

Sample titles:
                                               title  \
0  Management of Gastroenteritis in an AIDS Patie...   
1  Barriers and facilitators to women’s access to...   
2  Risk of Clade II Mpox Associated with Intimate...   
3  Exploring cultural competence barriers in the ...   
4  Barriers and enablers that influence the uptak...   

                       search_query  
0  "sexually transmitted infection"  
1  "sexually transmitted infection"  
2  "sexually transmitted infection"  
3  "sexually transmitted infection"  
4  "sexually transmitted infection"  





In [4]:
df = pd.read_csv("pmc_dataset.csv")

In [5]:
df

Unnamed: 0,pmc_id,title,abstract,full_text,keywords,publication_date,journal,search_query
0,11469308,Management of Gastroenteritis in an AIDS Patie...,This case report explores the challenges assoc...,Human immunodeficiency virus (HIV) is predomin...,"['bipolar', 'depression\xa0', 'integrative med...",,Cureus,"""sexually transmitted infection"""
1,11468210,Barriers and facilitators to women’s access to...,Accessing sexual and reproductive health (SRH)...,Access to healthcare is a multifaceted indicat...,"['Sexual and reproductive health', 'Health ser...",2024-10-11,BMC Health Services Research,"""sexually transmitted infection"""
2,11466375,Risk of Clade II Mpox Associated with Intimate...,,Monkeypox virus can spread through intimate or...,[],2024-10-10,Morbidity and Mortality Weekly Report,"""sexually transmitted infection"""
3,11465850,Exploring cultural competence barriers in the ...,"Immigrant populations, especially women, conti...",Despite efforts made over the last two decades...,"['Cultural competence', 'Health inequities', '...",2024-10-9,International Journal for Equity in Health,"""sexually transmitted infection"""
4,11463776,Barriers and enablers that influence the uptak...,Heterosexual migrant men and women in the Neth...,While much has been accomplished in the past f...,[],2024-10-9,PLOS ONE,"""sexually transmitted infection"""
...,...,...,...,...,...,...,...,...
95,11460426,Prevalence of syphilis and associated factors ...,This systematic review accompanied by a meta-a...,Syphilis is considered a serious public health...,"['Syphilis', 'Pregnant women', 'Pregnancy comp...",2024-5-27,Revista Brasileira de Ginecologia e Obstetrícia,"""sti"""
96,11459845,Cervical cancer screening service utilisation ...,Cervical cancer is the predominant form of can...,Human papillomavirus (HPV) is the primary etio...,"['Attitude', 'Cervical cancer utilization', 'K...",2024-10-7,BMC Infectious Diseases,"""sti"""
97,11458712,Flood risk assessment under the shared socioec...,This study evaluates flood susceptibility and ...,"Over the years, various forms of natural and a...","['Flood modeling', 'SSPs', 'Buly supply points...",2024-10-7,Discover Water,"""sti"""
98,11458460,Time-course effects of exercise intervention o...,This study was to investigate the developmenta...,"Globally, there is a persistent public health ...","['obese adolescents', 'executive functions', '...",2024-9-24,Frontiers in Psychology,"""sti"""


In [6]:
df.columns

Index(['pmc_id', 'title', 'abstract', 'full_text', 'keywords',
       'publication_date', 'journal', 'search_query'],
      dtype='object')

In [7]:
df['full_text'][0]

"Human immunodeficiency virus (HIV) is predominantly a sexually transmitted infection that destroys CD4+ T lymphocyte cells, while acquired immunodeficiency syndrome (AIDS) is the final disease stage of the original viral infection. A diagnosis of AIDS is based on a CD4+ T lymphocyte count of <200 and/or the presence of an AIDS-defining illness. The rapid progression of HIV to AIDS can be due to poor treatment adherence [ The patient is a 36-year-old woman with a past medical history of Monkeypox, COVID-19, HIV, and syphilis. She also has a past psychiatric history of MDD, anxiety, and polysubstance abuse.\xa0She was diagnosed with depression and anxiety a year prior and denied taking any medication for either. She presented to the behavioral hospital on an involuntary admission for two weeks. The patient was brought in by police after she was found wandering and incoherent on the street. A drug test on admission was positive for marijuana and cocaine. The patient admitted to regular a

In [9]:
from collections import Counter
# Basic statistics
print(f"Total articles: {len(df)}")
print(f"Articles per query:\n{df['search_query'].value_counts()}")
print(f"Average text length: {df['full_text'].str.len().mean()}")

# Analyze keywords frequency
all_keywords = [kw for kws in df['keywords'] for kw in eval(kws)]
keyword_freq = Counter(all_keywords).most_common(20)

Total articles: 100
Articles per query:
search_query
"sexually transmitted infection"    25
"chlamydia"                         25
"std"                               25
"sti"                               25
Name: count, dtype: int64
Average text length: 15562.949494949495


In [14]:
all_keywords

['bipolar',
 'depression\xa0',
 'integrative medicine',
 'medical compliance',
 'poor adherence',
 'schizophrenia',
 'Sexual and reproductive health',
 'Health service',
 'Access',
 'Systematic review',
 'Delivery of healthcare',
 'Health equity',
 'Rural health',
 'Barriers and facilitators',
 'Women’s health',
 'Cultural competence',
 'Health inequities',
 'Immigrants',
 'Sexual and reproductive health',
 'Health service research',
 'Commercial sex work',
 'Prostitution',
 'Transactional sex',
 'Low-middle income countries',
 'Apicomplexan',
 'Kinetoplastid',
 'MDH',
 'parasitic protozoa',
 'SARS-CoV-2',
 'COVID-19',
 'RT-PCR',
 'Surveillance',
 'Human papillomavirus (HPV)',
 'HPV prevalence',
 'Viral genotyping',
 'Sexually transmitted infections',
 'Attitude',
 'Cervical cancer utilization',
 'Knowledge',
 'Women living with HIV',
 'schistosomiasis',
 'pregnancy',
 'parasites',
 'praziquantel',
 'South Africa',
 'Female sex workers',
 'Inconsistent condom use',
 'Risky sexual behav

In [13]:
keyword_freq

[('Sexual and reproductive health', 5),
 ('HIV', 4),
 ('Health service', 3),
 ('Access', 3),
 ('Systematic review', 3),
 ('Delivery of healthcare', 3),
 ('Health equity', 3),
 ('Rural health', 3),
 ('Barriers and facilitators', 3),
 ('Women’s health', 3),
 ('sexual health', 3),
 ('Cultural competence', 2),
 ('Health inequities', 2),
 ('Immigrants', 2),
 ('Health service research', 2),
 ('Commercial sex work', 2),
 ('Prostitution', 2),
 ('Transactional sex', 2),
 ('Low-middle income countries', 2),
 ('Attitude', 2)]