In [12]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = "NCBI_API_KEY"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main():
    collector = PMCOpenAccessCollector()
    queries = [
        "STI",
        "HIV",
        "AIDS",
        "STD",
        "Gonorrhea",
        "Chlamydia",
    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset_simon.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: STI
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:18<00:00,  1.37it/s]



Processing query: HIV
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:16<00:00,  1.50it/s]



Processing query: AIDS
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:20<00:00,  1.19it/s]



Processing query: STD
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:19<00:00,  1.29it/s]



Processing query: Gonorrhea
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:19<00:00,  1.28it/s]



Processing query: Chlamydia
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Dataset Statistics:
Total articles: 150

Articles per query:
search_query
STI          25
HIV          25
AIDS         25
STD          25
Gonorrhea    25
Chlamydia    25
Name: count, dtype: int64

Sample titles:
                                               title search_query
0  Examining concordance of sexual-related factor...          STI
1  Halotolerant phosphate solubilizing bacteria i...          STI
2  Methamphetamine abuse impairs sequential worki...          STI
3  Barriers and facilitators to women’s access to...          STI
4  8th Public Health Palliative Care Internationa...          STI





In [3]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = "NCBI_API_KEY"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main(): #why are we calling it main? can we call somthing else?
    # Initialize collector
    collector = PMCOpenAccessCollector(
        save_dir="pmc_oa_data",
        emails="/Users/simonponce/Desktop/AI Bootcamp /06-Sourcing-AI-Project-Data/1/02-Ins_Pandas_Read_HTML/Solved/thekeys.env"  
        # Replace with your email #replace with .env variable
    )

    # Define queries #possibly list of specific STIs to get better information #testing chlamydia first

    # sexually transmitted desease as a querey results in 0 articles found

    queries = [
        '"sexually transmitted infection"',
        '"chlamydia"',
        '"std"',
        '"sti"',

    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

TypeError: PMCOpenAccessCollector.__init__() got an unexpected keyword argument 'emails'