# Data Collection From PubMed

### Abstract Collection
For this project, abstracts from a variety of scientific research papers were collected from PubMed to simulate a real-world scenario.

The papers were selected based on specific topics relevant to common healthcare concerns, including:

- Diabetes Management
- Cardiovascular Risk
- Oncology Treatment Updates
- Thyroid Disorders
- Respiratory Care
- Vaccination Awareness
  
These topics were chosen randomly while considering their prevalence and relevance in current healthcare research.

In [11]:
# import the required libraries

from Bio import Entrez
from time import sleep
import pandas as pd
from bs4 import BeautifulSoup

In [13]:
#

Entrez.email = "bhandarirhea0697@gmail.com"

# Define search terms
TOPICS = ["Diabetes Management", "Cardiovascular Risk", "Oncology Treatment Update", "Thyroid Disorders", "Respiratory Care", "Vaccination Awareness"]
START_YEAR = "2020"
END_YEAR = "2025"
MAX_PER_TOPIC = 100


# fetching scientific research articles topics from year 2020 to 2025
def fetch_for_topic(term):
    query = f'{term}[Title/Abstract] AND ("{START_YEAR}"[PDAT] : "{END_YEAR}"[PDAT])'
    handle = Entrez.esearch(db="pubmed", term=query, retmax=MAX_PER_TOPIC)
    record = Entrez.read(handle)
    pmids = record.get("IdList", [])
    if not pmids:
        return []

    handle = Entrez.efetch(db="pubmed", id=",".join(pmids), rettype="abstract", retmode="xml")
    soup = BeautifulSoup(handle.read(), "lxml")
    results = []
    for art in soup.find_all("pubmedarticle"):
        pmid = art.find("pmid").text if art.find("pmid") else ""
        title = art.find("articletitle").text if art.find("articletitle") else ""
        abstract = art.find("abstracttext").text if art.find("abstracttext") else ""
        results.append({"PMID": pmid, "Title": title, "Abstract": abstract})
    return results

# Based on topics all the data were as dataframe and exported as csv
def main():
    all_data = []
    for topic in TOPICS:
        print(f"Fetching: {topic}")
        data = fetch_for_topic(topic)
        print(f"  → {len(data)} abstracts")
        for d in data:
            d["Topic"] = topic
        all_data.extend(data)
        sleep(0.4)

    df = pd.DataFrame(all_data)
    df.to_csv("pubmed_abstracts_2020_2025.csv", index=False)
    print("Saved results to CSV")

if __name__ == "__main__":
    main()

Fetching: Diabetes Management


  soup = BeautifulSoup(handle.read(), "lxml")


  → 100 abstracts
Fetching: Cardiovascular Risk
  → 100 abstracts
Fetching: Oncology Treatment Update
  → 11 abstracts
Fetching: Thyroid Disorders
  → 99 abstracts
Fetching: Respiratory Care
  → 99 abstracts
Fetching: Vaccination Awareness
  → 100 abstracts
Saved results to CSV
