In [None]:
#ideas:
#summary model
#train chatnot model on our text (lang chain)
#conversational

In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = email or "tyzwhitt@gmail.com"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'email': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main():
    # Initialize collector
    collector = PMCOpenAccessCollector(
        save_dir="pmc_oa_data",
        email="tyzwhitt@gmail.com"  # Replace with your email
    )

    # Define queries
    queries = [
        '"sexually transmitted infection"',
        '"sexually transmitted desease"',
        '"std"',
        '"sti"',

    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: "sexually transmitted infection"
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:54<00:00,  2.17s/it]



Processing query: "sexually transmitted desease"
Found 0 matching articles
Fetching articles...


0it [00:00, ?it/s]


Processing query: "std"





Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:40<00:00,  1.63s/it]



Processing query: "sti"
Found 25 matching articles
Fetching articles...


 12%|█▏        | 3/25 [00:05<00:41,  1.87s/it]

Failed to fetch PMCID 11462318


100%|██████████| 25/25 [00:43<00:00,  1.72s/it]


Dataset Statistics:
Total articles: 74

Articles per query:
search_query
"sexually transmitted infection"    25
"std"                               25
"sti"                               24
Name: count, dtype: int64

Sample titles:
                                               title  \
0  Barriers and enablers that influence the uptak...   
1  Female transactional sex workers’ experiences ...   
2                                            Allplex   
3  Malate dehydrogenase in parasitic protozoans: ...   
4  Extraction Free RT-PCR Surveillance Testing an...   

                       search_query  
0  "sexually transmitted infection"  
1  "sexually transmitted infection"  
2  "sexually transmitted infection"  
3  "sexually transmitted infection"  
4  "sexually transmitted infection"  





In [2]:
df = pd.read_csv("pmc_dataset.csv")

In [3]:
df

Unnamed: 0,pmc_id,title,abstract,full_text,keywords,publication_date,journal,search_query
0,11463776,Barriers and enablers that influence the uptak...,Heterosexual migrant men and women in the Neth...,While much has been accomplished in the past f...,[],2024-10-9,PLOS ONE,"""sexually transmitted infection"""
1,11462721,Female transactional sex workers’ experiences ...,"For a variety of reasons related to biology, b...",Female transactional sexual behaviour is asso...,"['Commercial sex work', 'Prostitution', 'Trans...",2024-10-9,BMC Public Health,"""sexually transmitted infection"""
2,11462318,Allplex,,"Sir, Sexually transmitted infections (STI) by ...",[],2024-7-01,Revista Española de Quimioterapia,"""sexually transmitted infection"""
3,11461325,Malate dehydrogenase in parasitic protozoans: ...,The role of malate dehydrogenase (MDH) in the ...,Protozoan parasites are responsible for a vari...,"['Apicomplexan', 'Kinetoplastid', 'MDH', 'para...",2024-10-03,Essays in Biochemistry,"""sexually transmitted infection"""
4,11460792,Extraction Free RT-PCR Surveillance Testing an...,"The COVID-19 pandemic necessitated sensitive, ...",The emergence of SARS-CoV-2 in late 2019 leadi...,"['SARS-CoV-2', 'COVID-19', 'RT-PCR', 'Surveill...",2023-7-21,COVID,"""sexually transmitted infection"""
...,...,...,...,...,...,...,...,...
69,11451046,The Tulumbe! Partnership: a case study in deve...,"Grassroots, community organizations are truste...",Communities have been at the forefront of orga...,"['community-engaged research', 'HIV/AIDS', 'pa...",2024-9-20,Frontiers in Public Health,"""sti"""
70,11449374,Breaking the silence of female genital schisto...,Female Genital Schistosomiasis (FGS) remains a...,One of the most prevalent Neglected Tropical D...,[],2024-9-23,PLOS Neglected Tropical Diseases,"""sti"""
71,11449042,You can have your cake and eat it too: Ectopic...,,One of my favorite sayings is “You can’t have ...,[],2024-8-14,The Plant Cell,"""sti"""
72,11448872,Transport of β-amyloid from brain to eye cause...,The article identifies a direct connection bet...,Increasing evidence indicates that the patholo...,[],2024-9-24,The Journal of Experimental Medicine,"""sti"""


In [37]:
df.columns

Index(['pmc_id', 'title', 'abstract', 'full_text', 'keywords',
       'publication_date', 'journal', 'search_query'],
      dtype='object')

In [4]:
df['full_text'][0]

'While much has been accomplished in the past four decades to reduce transmission of Human Immunodeficiency Virus (HIV) infections, it remains a global health concern with over 39 million people affected, of whom 2.3 million people include the World Health Organization (WHO) European Region [ Certain vulnerable groups being affected by HIV, such as heterosexual migrants, require more attention to ensure the UNAIDS’ 95-95-95 targets are reached. Stigma and discrimination continue to play roles in the low uptake of HIV testing. Gender also plays a role, with an overall higher proportion of migrant women testing for HIV compared with migrant men [ About one-third of all migrants that arrive in the Netherlands each year are from outside the EU [ Most migrants in Europe who live with HIV are often infected postmigration, which demonstrates a need for prevention and testing in countries of arrival. Studies conducted in Europe (including the Netherlands) showed that over 50% of heterosexual m

In [None]:
#preprocessing: readable 8th grade level
#transformers