# Connecting to crossref, pubmed and others
## Imports

In [1]:
from datasets import load_dataset
import csv
from tqdm import tqdm
import pandas as pd



## Downloading Pubmed

In [2]:
# Stream the dataset
dataset = load_dataset("Datasets/pubmed.py", streaming=True,split='train')

In [3]:
first_example = next(iter(dataset))
print(first_example)

{'MedlineCitation': {'PMID': 1, 'DateCompleted': {'Year': 1976, 'Month': 1, 'Day': 16}, 'NumberOfReferences': 0, 'DateRevised': {'Year': 2024, 'Month': 1, 'Day': 9}, 'Article': {'Abstract': {'AbstractText': ''}, 'ArticleTitle': 'Formate assay in body fluids: application in methanol poisoning.', 'AuthorList': {'Author': {'LastName': ['Makar', 'McMartin', 'Palese', 'Tephly'], 'ForeName': ['A B', 'K E', 'M', 'T R'], 'Initials': ['AB', 'KE', 'M', 'TR'], 'CollectiveName': ['', '', '', '']}}, 'Language': 'eng', 'GrantList': {'Grant': {'GrantID': ['F32 AG064886', 'MC_UU_12013/5'], 'Agency': ['NIA NIH HHS', 'MRC'], 'Country': ['United States', 'United Kingdom']}}, 'PublicationTypeList': {'PublicationType': ['Journal Article', "Research Support, U.S. Gov't, P.H.S."]}}, 'MedlineJournalInfo': {'Country': 'United States'}, 'ChemicalList': {'Chemical': {'RegistryNumber': ['0', '142M471B3J', 'EC 1.2.-', 'Y4S76JWI15'], 'NameOfSubstance': ['Formates', 'Carbon Dioxide', 'Aldehyde Oxidoreductases', 'Met

In [22]:
output_file = "H:/Datasets/pubmed_since_2005.csv"
fieldnames = [
    "pmid", "title", "abstract", "authors", "language", "year", "month",
    "publication_type", "country", "agency", "mesh_terms", "doi"
]

def get_first_doi(article_id_list):
    try:
        return article_id_list["ArticleId"][1][1]
    except:
        return ""

def get_year(example):
    try:
        return int(example["MedlineCitation"]["DateCompleted"]["Year"])
    except:
        return 0

stream = iter(dataset)
row_count = 0
skipped_rows = 0

with open(output_file, mode="w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    with tqdm(total=32_500_000, desc="Streaming PubMed", unit="records") as pbar:
        while True:
            try:
                example = next(stream)
                pbar.update(1)
            except StopIteration:
                break
            except Exception:
                continue  # skip broken rows

            year = get_year(example)

            # Heuristic skipping
            if year < 2005:
                if year < 1995:
                    skip_n = 50
                elif year < 2000:
                    skip_n = 5
                else:
                    skip_n = 10

                for _ in range(skip_n):
                    try:
                        next(stream)
                        pbar.update(1)
                        skipped_rows += 1
                    except:
                        break
                continue

            citation = example.get("MedlineCitation", {})
            article = citation.get("Article", {})
            pubmed_data = example.get("PubmedData", {})

            # Extract fields
            pmid = citation.get("PMID", "")
            title = article.get("ArticleTitle", "")
            abstract = article.get("Abstract", {}).get("AbstractText", "")
            language = article.get("Language", "")
            doi = get_first_doi(pubmed_data.get("ArticleIdList", {}))

            date_completed = citation.get("DateCompleted", {})
            month = date_completed.get("Month", "")

            authors_raw = article.get("AuthorList", {}).get("Author", {})
            last_names = authors_raw.get("LastName", [])
            fore_names = authors_raw.get("ForeName", [])
            authors = [f"{fn} {ln}" for fn, ln in zip(fore_names, last_names)]

            mesh = citation.get("MeshHeadingList", {}).get("MeshHeading", {})
            mesh_terms = mesh.get("DescriptorName", []) if isinstance(mesh, dict) else []

            publication_type = article.get("PublicationTypeList", {}).get("PublicationType", [])
            publication_type_str = "; ".join(publication_type)

            country = citation.get("MedlineJournalInfo", {}).get("Country", "")

            grant = article.get("GrantList", {}).get("Grant", {})
            agencies = grant.get("Agency", []) if isinstance(grant, dict) else []
            agencies_str = "; ".join(agencies)

            writer.writerow({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "authors": "; ".join(authors),
                "language": language,
                "year": year,
                "month": month,
                "publication_type": publication_type_str,
                "country": country,
                "agency": agencies_str,
                "mesh_terms": "; ".join(mesh_terms),
                "doi": doi
            })
            row_count += 1

Streaming PubMed:  13%|█▎        | 4379869/32500000 [1:23:02<3:41:13, 2118.51records/s]Got disconnected from remote data host. Retrying in 5sec [1/20]
Streaming PubMed:  30%|███       | 9899840/32500000 [3:06:46<2:23:37, 2622.46records/s]Got disconnected from remote data host. Retrying in 5sec [1/20]
Streaming PubMed:  42%|████▏     | 13529947/32500000 [4:18:00<2:03:32, 2559.31records/s]Got disconnected from remote data host. Retrying in 5sec [1/20]
Streaming PubMed:  52%|█████▏    | 16830000/32500000 [5:27:42<2:10:30, 2001.13records/s]Got disconnected from remote data host. Retrying in 5sec [1/20]
Streaming PubMed:  60%|█████▉    | 19428606/32500000 [6:33:00<2:05:30, 1735.80records/s]Ignoring field PublicationTypeList it's a <class 'str'> and we expect a <class 'dict'>
Streaming PubMed:  68%|██████▊   | 22215511/32500000 [7:49:38<1:28:39, 1933.18records/s]Ignoring field PublicationTypeList it's a <class 'str'> and we expect a <class 'dict'>
Streaming PubMed:  75%|███████▍  | 24291008/

In [None]:
output_file = "H:/Datasets/pubmed_full.csv"
fieldnames = [
    "pmid", "title", "abstract", "authors", "language", "year", "month",
    "publication_type", "country", "agency", "mesh_terms", "doi"
]

def get_first_doi(article_id_list):
    try:
        return article_id_list["ArticleId"][1][1]
    except:
        return ""

def get_year(example):
    try:
        return int(example["MedlineCitation"]["DateCompleted"]["Year"])
    except:
        return 0

stream = iter(dataset)
row_count = 0

with open(output_file, mode="w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    with tqdm(total=32_500_000, desc="Streaming PubMed", unit="records") as pbar:
        while True:
            try:
                example = next(stream)
                pbar.update(1)
            except StopIteration:
                break
            except Exception:
                continue  # skip broken rows

            year = get_year(example)

            citation = example.get("MedlineCitation", {})
            article = citation.get("Article", {})
            pubmed_data = example.get("PubmedData", {})

            # Extract fields
            pmid = citation.get("PMID", "")
            title = article.get("ArticleTitle", "")
            abstract = article.get("Abstract", {}).get("AbstractText", "")
            language = article.get("Language", "")
            doi = get_first_doi(pubmed_data.get("ArticleIdList", {}))

            date_completed = citation.get("DateCompleted", {})
            month = date_completed.get("Month", "")

            authors_raw = article.get("AuthorList", {}).get("Author", {})
            last_names = authors_raw.get("LastName", [])
            fore_names = authors_raw.get("ForeName", [])
            authors = [f"{fn} {ln}" for fn, ln in zip(fore_names, last_names)]

            mesh = citation.get("MeshHeadingList", {}).get("MeshHeading", {})
            mesh_terms = mesh.get("DescriptorName", []) if isinstance(mesh, dict) else []

            publication_type = article.get("PublicationTypeList", {}).get("PublicationType", [])
            publication_type_str = "; ".join(publication_type)

            country = citation.get("MedlineJournalInfo", {}).get("Country", "")

            grant = article.get("GrantList", {}).get("Grant", {})
            agencies = grant.get("Agency", []) if isinstance(grant, dict) else []
            agencies_str = "; ".join(agencies)

            writer.writerow({
                "pmid": pmid,
                "title": title,
                "abstract": abstract,
                "authors": "; ".join(authors),
                "language": language,
                "year": year,
                "month": month,
                "publication_type": publication_type_str,
                "country": country,
                "agency": agencies_str,
                "mesh_terms": "; ".join(mesh_terms),
                "doi": doi
            })
            row_count += 1

## Browsaing PubMed

In [19]:
df_t = pd.read_csv("Datasets/pubmed_since_2005.csv")

In [20]:
df_t

Unnamed: 0,pmid,title,abstract,authors,language,year,month,publication_type,country,agency,mesh_terms,doi
0,10393939,Species richness and resource availability: a ...,The data on the number of species of insects a...,C K Kelly; T R Southwood,eng,2008,4,Journal Article,United States,,,PMC22179
1,11038575,Paul-Straubel-Kingdon trap for true zero-point...,A modification of the Paul-Straubel trap previ...,H Dehmelt; N Yu,eng,2008,4,Journal Article,United States,,,14192
2,11038577,The cloud-ionosphere discharge: a newly observ...,This paper deals with a luminous electric disc...,J R Winckler,eng,2008,4,Journal Article,United States,,,10848
3,11038576,Implications of a possible clustering of highe...,"Recently, a possible clustering of a subset of...",G Sigl; D N Schramm; S Lee; C T Hill,eng,2008,4,Journal Article,United States,,,10846
4,11038578,Ordinary representations and modular forms.,,C M Skinner; A J Wiles,eng,2008,4,Journal Article,United States,,,10849
...,...,...,...,...,...,...,...,...,...,...,...,...
1663747,38065126,European Court of Human Rights.,,Joseph Dute; Tom Goffin,eng,2023,12,Case Reports; Journal Article,Netherlands,,,10.1163/15718093-bja10116
1663748,38085332,Impact of the intra-aortic balloon pump on the...,,Nien-Hsun Wu; Tsung-Han Hsieh; Chun-Yu Chang; ...,eng,2024,2,Letter,Japan,,"Humans; Coronary Artery Bypass, Off-Pump; Repr...",10.1007/s00380-023-02347-5
1663749,38085330,"SHP2 clinical phenotype, cancer, or RASopathie...",SHP2 phosphatase promotes full activation of t...,Yonglan Liu; Wengang Zhang; Hyunbum Jang; Ruth...,eng,2023,12,Journal Article,Switzerland,,"Humans; Protein Tyrosine Phosphatase, Non-Rece...",10.1007/s00018-023-05052-8
1663750,38085333,SMARCB1-deficient sinonasal adenocarcinoma: a ...,SMARCB1-deficient sinonasal adenocarcinoma is ...,Alena Skálová; Touraj Taheri; Martina Bradová;...,eng,2024,8,Journal Article,Germany,,Humans; SMARCB1 Protein; Middle Aged; Male; Fe...,10.1007/s00428-023-03650-2


## Connecting to APIs
### PubMed

In [7]:
from Bio import Entrez
import time
import datetime
import pandas as pd

In [21]:
# Set email
Entrez.email = "perlinski.h@gmail.com"

# Calculate date range: last 30 days
end_date = datetime.date.today()
start_date = end_date - datetime.timedelta(days=30)
start_date_str = start_date.strftime("%Y/%m/%d")
end_date_str = end_date.strftime("%Y/%m/%d")

# Perform the search
handle = Entrez.esearch(
    db="pubmed",
    term="",
    datetype="pdat",
    mindate=start_date_str,
    maxdate=end_date_str,
    retmode="xml"
)
results = Entrez.read(handle)
handle.close()

# Output the count
print(f"Number of PubMed articles published from {start_date_str} to {end_date_str}: {results['Count']}")


Number of PubMed articles published from 2025/04/05 to 2025/05/05: 129749


128k articles per month is a lot. The requrests through Entrez allow 10k max, which on some days is exceeded. An option is to download using pre-made day files. 

In [None]:
import ftplib, requests, gzip, xml.etree.ElementTree as ET
import datetime, time, csv

# ── CONFIGURATION ─────────────────────────────────────────────────────────────
FTP_HOST   = "ftp.ncbi.nlm.nih.gov"
FTP_DIR    = "pubmed/updatefiles"
BASE_URL   = f"https://{FTP_HOST}/{FTP_DIR}/"
OUTPUT_CSV = "Datasets/pubmed_recent2.csv"

# Set your desired window here (inclusive; format YYYY-MM-DD)
START_DATE = "2025-04-25"
END_DATE   = "2025-05-05"

FIELDNAMES = [
    "pmid", "title", "abstract", "authors", "language",
    "pub_year", "pub_month",      # ← re-added month here
    "publication_type", "country",
    "agency", "mesh_terms", "doi"
]

# ── HELPER: iterate dates ─────────────────────────────────────────────────────
def daterange(start, end):
    days = (end - start).days + 1
    for i in range(days):
        yield start + datetime.timedelta(i)

# ── HELPER: list only .xml.gz for a date ───────────────────────────────────────
def list_xml_gz_for_date(day):
    ftp = ftplib.FTP(FTP_HOST)
    ftp.login()
    ftp.cwd(FTP_DIR)
    files = ftp.nlst()
    matched = []
    for fn in files:
        if not fn.endswith(".xml.gz"):
            continue
        try:
            ts = ftp.sendcmd(f"MDTM {fn}").split()[1]
            if datetime.datetime.strptime(ts, "%Y%m%d%H%M%S").date() == day:
                matched.append(fn)
        except:
            pass
    ftp.quit()
    return matched

# ── PARSE & FILTER ON YEAR ONLY ────────────────────────────────────────────────
def parse_updatefile(url, writer, start_year, end_year):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with gzip.GzipFile(fileobj=r.raw) as gz:
            for _, elem in ET.iterparse(gz, events=("end",)):
                if elem.tag != "PubmedArticle":
                    continue

                mc = elem.find("MedlineCitation")
                ai = mc.find("Article") if mc is not None else None
                pd = elem.find("PubmedData")

                # Extract publication year & month
                pub_year = pub_month = None
                if ai is not None:
                    pdj = ai.find("Journal/JournalIssue/PubDate")
                    if pdj is not None:
                        y = pdj.findtext("Year","")
                        m = pdj.findtext("Month","")
                        if y and y.isdigit():
                            pub_year = int(y)
                        # Month may be numeric or abbrev
                        if m and m.isdigit():
                            pub_month = int(m)
                        else:
                            MONTH_MAP = {
                                "Jan":1,"Feb":2,"Mar":3,"Apr":4,
                                "May":5,"Jun":6,"Jul":7,"Aug":8,
                                "Sep":9,"Oct":10,"Nov":11,"Dec":12
                            }
                            pub_month = MONTH_MAP.get(m, None)

                # Filter on year only
                if pub_year is None or not (start_year <= pub_year <= end_year):
                    elem.clear()
                    continue

                # Extract other fields, including pub_month
                pmid  = mc.findtext("PMID","")
                title = ai.findtext("ArticleTitle","") if ai is not None else ""
                abstract = ""
                if ai is not None:
                    ab = ai.find("Abstract")
                    if ab is not None:
                        abstract = "".join(ab.itertext())
                authors = ""
                if ai is not None:
                    authors = "; ".join(
                        f"{a.findtext('ForeName','')} {a.findtext('LastName','')}"
                        for a in ai.findall("AuthorList/Author")
                    )
                language = ai.findtext("Language","") if ai is not None else ""
                mesh_terms = "; ".join(
                    mh.findtext("DescriptorName","")
                    for mh in mc.findall("MeshHeadingList/MeshHeading")
                ) if mc is not None else ""
                publication_type = "; ".join(
                    pt.text for pt in ai.findall("PublicationTypeList/PublicationType")
                ) if ai is not None else ""
                country = mc.findtext("MedlineJournalInfo/Country","")
                agency = "; ".join(
                    g.findtext("Agency","") for g in ai.findall("GrantList/Grant")
                ) if ai is not None else ""
                doi = ""
                if pd is not None:
                    for aid in pd.findall("ArticleIdList/ArticleId"):
                        if aid.get("IdType")=="doi" and aid.text:
                            doi = aid.text
                            break

                writer.writerow({
                    "pmid": pmid,
                    "title": title,
                    "abstract": abstract,
                    "authors": authors,
                    "language": language,
                    "pub_year": pub_year,
                    "pub_month": pub_month,    # ← and here
                    "publication_type": publication_type,
                    "country": country,
                    "agency": agency,
                    "mesh_terms": mesh_terms,
                    "doi": doi
                })

                elem.clear()

# ── MAIN EXECUTION ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    start_dt = datetime.datetime.strptime(START_DATE, "%Y-%m-%d").date()
    end_dt   = datetime.datetime.strptime(END_DATE,   "%Y-%m-%d").date()
    start_year, end_year = start_dt.year, end_dt.year

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as outf:
        writer = csv.DictWriter(outf, FIELDNAMES)
        writer.writeheader()

        for day in daterange(start_dt, end_dt):
            print(f"Processing {day}…")
            for fn in list_xml_gz_for_date(day):
                url = BASE_URL + fn
                print(f"  ↳ {fn}")
                try:
                    parse_updatefile(url, writer, start_year, end_year)
                except Exception as e:
                    print(f"    ! parse error: {e}")
                time.sleep(0.1)

    print(f"\n✅ Done – data with pub_month saved to {OUTPUT_CSV}")


Processing 2025-04-25…
  ↳ pubmed25n1385.xml.gz
Processing 2025-04-26…
  ↳ pubmed25n1386.xml.gz
Processing 2025-04-27…
Processing 2025-04-28…
  ↳ pubmed25n1387.xml.gz
  ↳ pubmed25n1388.xml.gz
Processing 2025-04-29…
  ↳ pubmed25n1389.xml.gz
  ↳ pubmed25n1390.xml.gz
Processing 2025-04-30…
  ↳ pubmed25n1391.xml.gz
Processing 2025-05-01…
  ↳ pubmed25n1392.xml.gz
Processing 2025-05-02…
  ↳ pubmed25n1393.xml.gz
Processing 2025-05-03…
  ↳ pubmed25n1394.xml.gz
Processing 2025-05-04…
Processing 2025-05-05…

✅ Done – data with pub_month saved to Datasets/pubmed_recent.csv


In [36]:
df = pd.read_csv("Datasets/pubmed_recent.csv")
df

Unnamed: 0,pmid,title,abstract,authors,language,pub_year,pub_month,publication_type,country,agency,mesh_terms,doi
0,33534651,Reconstruction of elbow flexion after sarcoma ...,\n The latissimus dorsi flap is widel...,Matthieu Garcia; Yohan Legallois; Eberhard Sto...,eng,2025,4.0,Journal Article; Case Reports,England,,Humans; Surgical Flaps; Sarcoma; Superficial B...,10.1080/00015458.2021.1883392
1,34590439,Association Between Rheumatoid Arthritis Disea...,\n To analyze the effect of tooth los...,Katinka Albrecht; Paola de Pablo; Thorsten Eid...,eng,2025,2.0,Journal Article; Observational Study,United States,Deutsche Rheuma-Liga Bundesverband e.V.; Pfize...,"Humans; Arthritis, Rheumatoid; Male; Tooth Los...",10.1002/acr.24799
2,34075844,Let us ask the patient: psychological well-bei...,\n The onset of the COVID-19 pandemic...,Ashley Welch; Jan Nijs; Ines Van Loo; Marina M...,eng,2025,4.0,Journal Article,England,,Humans; COVID-19; Male; Female; Cardiac Surgic...,10.1080/00015458.2021.1917749
3,35437059,The Effect of EDS-FLU on Objective and Patient...,\n Exhalation delivery system with fl...,Randall A Ow; John P McGinnis; Harry J Sacks; ...,eng,2025,2.0,Journal Article; Randomized Controlled Trial,United States,,Humans; Sinusitis; Nasal Polyps; Rhinitis; Chr...,10.1177/01455613221088698
4,35487204,Two cases of extraluminal migration of fishbon...,\n Laryngopharyngeal or cervical pain...,Tae-Hun Lee; Sang-Wook Park; Somi Ryu; Ki Ju C...,eng,2025,2.0,Journal Article; Case Reports,United States,,Animals; Female; Humans; Middle Aged; Fishes; ...,10.1177/01455613221098787
...,...,...,...,...,...,...,...,...,...,...,...,...
191674,40317095,Bridging acute and chronic stress effects on i...,\n Acute stress triggers adaptive phy...,Lennart Seizer; Anja Pascher; Sonja Branz; Nad...,eng,2025,5.0,Journal Article,England,,"Humans; Longitudinal Studies; Stress, Psycholo...",10.1186/s40359-025-02777-y
191675,40317093,Activated alpha 9 integrin expression enables ...,\n Full recovery from spinal cord inj...,Katerina Stepankova; Barbora Smejkalova; Lucia...,eng,2025,5.0,Journal Article,England,"Grantová Agentura, Univerzita Karlova; Grantov...",Animals; Spinal Cord Injuries; Nerve Regenerat...,10.1186/s40478-025-01995-0
191676,40317094,"Redlining, reinvestment, and racial segregatio...","\n In the United States, firearm-rela...",Gia Barboza-Salerno; Brittany Liebhard; Sharef...,eng,2025,5.0,Journal Article,England,,,10.1186/s40621-025-00579-9
191677,40317097,Prevention of Heart Failure Induced by Doxorub...,"\n Dexrazoxane, a putative iron chela...",Hui-Ming Chang; Jinn-Yuan Hsu; Chul Ahn; Edwar...,eng,2025,5.0,Journal Article,England,NIH HHS; NIH HHS; NIH HHS; NIH HHS,,10.1186/s40959-025-00339-0


### Crossref

In [None]:
#!/usr/bin/env python3
import requests, csv, time, datetime

# ── CONFIG ─────────────────────────────────────────────────────────────────────
START_DATE = "2025-04-05"   # inclusive
END_DATE   = "2025-05-05"   # inclusive
OUTPUT_CSV = "Datasets/crossref_last_month2.csv"

# Fields to extract
FIELDNAMES = [
    "doi", "title", "abstract", "authors", "language",
    "pub_year", "pub_month", "type", "publisher",
    "funders", "subjects", "url"
]

# Base API endpoint
API_URL = "https://api.crossref.org/works"
# Build the date filter
date_filter = f"from-pub-date:{START_DATE},until-pub-date:{END_DATE}"

# ── 1) Get the total count of matching works -------------------------------
# rows=0 returns no items but populates message["total-results"]
resp = requests.get(API_URL, params={
    "filter": date_filter,
    "rows": 0
})
resp.raise_for_status()
message = resp.json()["message"]
total = message["total-results"]    # approximate total count :contentReference[oaicite:3]{index=3}
print(f"Total works in period: {total}")

# ── 2) Page through all records with cursor pagination ----------------------
cursor = "*"    # initial cursor for deep paging :contentReference[oaicite:4]{index=4}
rows   = 1000   # max items per batch

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, FIELDNAMES)
    writer.writeheader()
    fetched = 0

    while fetched < total:
        params = {
            "filter": date_filter,
            "rows": rows,
            "cursor": cursor,
        }
        r = requests.get(API_URL, params=params)
        r.raise_for_status()
        msg = r.json()["message"]

        items = msg.get("items", [])
        if not items:
            break  # no more data

        # Extract and write each item
        for item in items:
            doi = item.get("DOI", "")
            title = " ".join(item.get("title", []))
            abstract = item.get("abstract", "").replace("<jats:p>","").replace("</jats:p>","")
            authors = "; ".join(
                f"{a.get('given','')} {a.get('family','')}"
                for a in item.get("author", [])
                if a.get("given") and a.get("family")
            )
            language = item.get("language", "")
            issued = item.get("issued", {}).get("date-parts", [])
            pub_year = issued[0][0] if issued and issued[0] else ""
            pub_month = issued[0][1] if issued and len(issued[0]) > 1 else ""
            typ = item.get("type", "")
            publisher = item.get("publisher", "")
            funders = "; ".join(f.get("name","") for f in item.get("funder", []))
            subjects = "; ".join(item.get("subject", []))
            url = item.get("URL", "")

            writer.writerow({
                "doi": doi, "title": title, "abstract": abstract,
                "authors": authors, "language": language,
                "pub_year": pub_year, "pub_month": pub_month,
                "type": typ, "publisher": publisher,
                "funders": funders, "subjects": subjects, "url": url
            })

        fetched += len(items)
        print(f"Fetched {fetched}/{total} works…")

        # Prepare next cursor
        next_cursor = msg.get("next-cursor")
        if not next_cursor:
            break
        cursor = next_cursor  # update for next loop

        time.sleep(0.4)  # polite pacing: ~1 request/sec :contentReference[oaicite:5]{index=5}

print(f"\n✅ Done! Retrieved {fetched} works into {OUTPUT_CSV}")


Total works in period: 418221
Fetched 1000/418221 works…
Fetched 2000/418221 works…
Fetched 3000/418221 works…
Fetched 4000/418221 works…
Fetched 5000/418221 works…
Fetched 6000/418221 works…
Fetched 7000/418221 works…
Fetched 8000/418221 works…
Fetched 9000/418221 works…
Fetched 10000/418221 works…
Fetched 11000/418221 works…
Fetched 12000/418221 works…
Fetched 13000/418221 works…
Fetched 14000/418221 works…
Fetched 15000/418221 works…
Fetched 16000/418221 works…
Fetched 17000/418221 works…
Fetched 18000/418221 works…
Fetched 19000/418221 works…
Fetched 20000/418221 works…
Fetched 21000/418221 works…
Fetched 22000/418221 works…
Fetched 23000/418221 works…
Fetched 24000/418221 works…
Fetched 25000/418221 works…
Fetched 26000/418221 works…
Fetched 27000/418221 works…
Fetched 28000/418221 works…
Fetched 29000/418221 works…
Fetched 30000/418221 works…
Fetched 31000/418221 works…
Fetched 32000/418221 works…
Fetched 33000/418221 works…
Fetched 34000/418221 works…
Fetched 35000/418221 works…

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Datasets/crossref_last_month.csv")
df

Unnamed: 0,doi,title,abstract,authors,language,pub_year,pub_month,type,publisher,funders,subjects,url
0,10.1093/oso/9780197743720.003.0008,"What Should Enforcement Do, and Not Do?",<jats:title>Abstract</jats:title>\n ...,Hiroshi Motomura,en,2025,4,book-chapter,Oxford University PressNew York,,,https://doi.org/10.1093/oso/9780197743720.003....
1,10.1016/j.ijbiomac.2025.141672,Magnesium-gallate MOF integrated conductive cr...,,Guangyu Li; Yue Wang; Yanyun Pang; Xinyu Wang;...,en,2025,5,journal-article,Elsevier BV,Sinopec Ministry of Science and Technology Bas...,,https://doi.org/10.1016/j.ijbiomac.2025.141672
2,10.1016/j.fusengdes.2025.114900,Steady-state Superconducting Advanced Spherica...,,Yoshio Nagayama; Takaaki Fujita,en,2025,5,journal-article,Elsevier BV,Nagoya Institute of Technology; National Insti...,,https://doi.org/10.1016/j.fusengdes.2025.114900
3,10.1016/j.solener.2025.113344,Deriving the orientation of existing solar ene...,,David Lingfors; Robert Johansson; Johan Lindahl,en,2025,5,journal-article,Elsevier BV,STandUP for Energy; Swedish Energy Agency,,https://doi.org/10.1016/j.solener.2025.113344
4,10.1016/j.solener.2025.113381,Cyclostationary analysis for fault detection i...,,Mohammed Telidjane; Benaoumeur Bakhti,en,2025,5,journal-article,Elsevier BV,,,https://doi.org/10.1016/j.solener.2025.113381
...,...,...,...,...,...,...,...,...,...,...,...,...
418216,10.1016/j.ajog.2025.04.062,Robotic-assisted surgery for endometrial cance...,,Elina KIVEKÄS; Synnöve STAFF; Minna M. MÄENPÄÄ,en,2025,5,journal-article,Elsevier BV,,,https://doi.org/10.1016/j.ajog.2025.04.062
418217,10.4103/jiaps.jiaps_292_24,Urinary Bladder/ Posterior Urethra Morphology ...,,Ramesh Babu,en,2025,4,journal-article,Ovid Technologies (Wolters Kluwer Health),,,https://doi.org/10.4103/jiaps.jiaps_292_24
418218,10.1021/acs.macromol.4c03231,Phase Behavior and Thermal Properties of Preci...,,Michael Patrick Blatt; Cecilia Hansen; Victori...,en,2025,5,journal-article,American Chemical Society (ACS),Vehicle Technologies Office; National Science ...,,https://doi.org/10.1021/acs.macromol.4c03231
418219,10.21511/imfi.22(2).2025.09,The moderating role of investor sentiment on p...,Common market anomalies tested in developed ma...,Zaida Rizqi Zainul; Khaira Amalia Fachrudin; N...,,2025,4,journal-article,LLC CPC Business Perspectives,,,https://doi.org/10.21511/imfi.22(2).2025.09


### arXiv

In [8]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time, csv, datetime
import xml.etree.ElementTree as ET
from tqdm import tqdm

# ── USER CONFIGURATION ─────────────────────────────────────────────────────────
START_DATE   = "2025-04-25"   # inclusive, YYYY-MM-DD
END_DATE     = "2025-05-05"   # inclusive
OUTPUT_CSV   = "Datasets/arxiv_recent2.csv"

FIELDNAMES = [
    "arxiv_id", "title", "abstract", "authors",
    "pub_year", "pub_month", "doi",
    "primary_category", "categories", "pdf_url"
]

# OAI-PMH endpoint & metadata format
OAI_URL       = "http://export.arxiv.org/oai2"
METADATA_PREF = "arXiv"          
PAUSE_SECONDS = 5

# parse ISO date
def iso_to_date(s):
    return datetime.datetime.strptime(s, "%Y-%m-%d").date()

start_dt = iso_to_date(START_DATE)
end_dt   = iso_to_date(END_DATE)

# XML namespaces
ns = {
    "oai":   "http://www.openarchives.org/OAI/2.0/",
    "arxiv": "http://arxiv.org/OAI/arXiv/"
}

session = requests.Session()
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[500,502,503,504])
session.mount("http://", HTTPAdapter(max_retries=retries))

# OPEN OUTPUT CSV 
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvf:
    writer = csv.DictWriter(csvf, FIELDNAMES, quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()

    resumption_token = None
    fetched = 0
    pbar = None

    while True:
        # build params
        if resumption_token:
            params = {
                "verb":            "ListRecords",
                "resumptionToken": resumption_token
            }
        else:
            params = {
                "verb":           "ListRecords",
                "metadataPrefix": METADATA_PREF,
                "from":           START_DATE,
                "until":          END_DATE
            }

        resp = session.get(OAI_URL, params=params)
        resp.raise_for_status()
        root = ET.fromstring(resp.text)

        # initialize progress bar once we know the total size
        if not pbar:
            token_el = root.find(".//oai:resumptionToken", ns)
            if token_el is not None:
                size = token_el.attrib.get("completeListSize")
                if size and size.isdigit():
                    pbar = tqdm(total=int(size), desc="records", unit="rec")
            if not pbar:
                pbar = tqdm(desc="records", unit="rec")

        for rec in root.findall(".//oai:record", ns):
            md = rec.find("oai:metadata/arxiv:arXiv", ns)
            if md is None:
                continue  # skip deletions or empty

            # true submission date
            created = md.findtext("arxiv:created", namespaces=ns)
            if not created:
                continue
            cdate = datetime.datetime.strptime(created, "%Y-%m-%d").date()
            if cdate < start_dt or cdate > end_dt:
                continue

            # extract fields
            aid    = md.findtext("arxiv:id",       namespaces=ns).strip()
            title  = md.findtext("arxiv:title",    namespaces=ns).strip()
            abstr  = md.findtext("arxiv:abstract", namespaces=ns).strip()

            # build full author names
            authors = []
            for a in md.findall("arxiv:authors/arxiv:author", ns):
                fore = a.findtext("arxiv:forenames", namespaces=ns) or ""
                key  = a.findtext("arxiv:keyname",   namespaces=ns) or ""
                full = " ".join(p for p in (fore, key) if p)
                if full:
                    authors.append(full)
            authors = "; ".join(authors)

            py, pm = cdate.year, cdate.month
            doi = md.findtext("arxiv:doi", namespaces=ns) or ""

            # categories is its own element
            cat_text   = md.findtext("arxiv:categories", namespaces=ns) or ""
            categories = cat_text.split()
            primary    = categories[0] if categories else ""

            pdf_url = f"http://arxiv.org/pdf/{aid}.pdf"

            writer.writerow({
                "arxiv_id":         aid,
                "title":            title,
                "abstract":         abstr,
                "authors":          authors,
                "pub_year":         py,
                "pub_month":        pm,
                "doi":              doi,
                "primary_category": primary,
                "categories":       "; ".join(categories),
                "pdf_url":          pdf_url
            })
            fetched += 1
            pbar.update(1)

        # handle resumptionToken
        token_el = root.find(".//oai:resumptionToken", ns)
        if token_el is None or not token_el.text:
            break
        resumption_token = token_el.text
        time.sleep(PAUSE_SECONDS)

    if pbar:
        pbar.close()

print(f"\n✅ Done – harvested {fetched} new arXiv records into {OUTPUT_CSV}")

records:  45%|████▍     | 5049/11315 [02:27<03:02, 34.25rec/s] 


✅ Done – harvested 5049 new arXiv records into Datasets/arxiv_recent2.csv





In [13]:
df = pd.read_csv("Datasets/arxiv_recent.csv")
df

Unnamed: 0,arxiv_id,title,abstract,authors,pub_year,pub_month,doi,primary_category,categories,pdf_url
0,2504.18055,Why Does My Transaction Fail? A First Look at ...,"Solana is an emerging blockchain platform, rec...",Xiaoye Zheng; Zhiyuan Wan; David Lo; Difan Xie...,2025,4,10.1145/3728943,cs.SE,cs.SE,http://arxiv.org/pdf/2504.18055.pdf
1,2504.18056,Range-based 6-DoF Monte Carlo SLAM with Gradie...,This paper presents range-based 6-DoF Monte Ca...,Takumi Nakao; Kenji Koide; Aoki Takanose; Shuj...,2025,4,,cs.RO,cs.RO,http://arxiv.org/pdf/2504.18056.pdf
2,2504.18057,Opportunistic Collaborative Planning with Larg...,Navigating autonomous vehicles in open scenari...,Jiayi Chen; Shuai Wang; Guoliang Li; Wei Xu; G...,2025,4,,cs.RO,cs.RO; cs.AI,http://arxiv.org/pdf/2504.18057.pdf
3,2504.18058,Exploring Personality-Aware Interactions in Sa...,The integration of dialogue agents into the sa...,Sijia Cheng; Wen-Yu Chang; Yun-Nung Chen,2025,4,,cs.CL,cs.CL; cs.AI,http://arxiv.org/pdf/2504.18058.pdf
4,2504.18059,POET: Prompt Offset Tuning for Continual Human...,As extended reality (XR) is redefining how use...,Prachi Garg; Joseph K J; Vineeth N Balasubrama...,2025,4,10.1007/978-3-031-73039-9_25,cs.CV,cs.CV,http://arxiv.org/pdf/2504.18059.pdf
...,...,...,...,...,...,...,...,...,...,...
5044,2505.01421,Galaxy Zoo CEERS: Bar fractions up to z~4.0,We study the evolution of the bar fraction in ...,Tobias Géron; R. J. Smethurst; Hugh Dickinson;...,2025,5,,astro-ph.GA,astro-ph.GA,http://arxiv.org/pdf/2505.01421.pdf
5045,2505.01422,Neutrino mass generation in asymptotically saf...,There exist several distinct phenomenological ...,Gustavo P. de Brito; Astrid Eichhorn; Antonio ...,2025,5,,hep-ph,hep-ph; gr-qc; hep-th,http://arxiv.org/pdf/2505.01422.pdf
5046,2505.01423,Negative Stepsizes Make Gradient-Descent-Ascen...,Efficient computation of min-max problems is a...,Henry Shugart; Jason M. Altschuler,2025,5,,math.OC,math.OC; cs.DS; cs.LG,http://arxiv.org/pdf/2505.01423.pdf
5047,2505.01424,"Computational, Data-Driven, and Physics-Inform...",Metal additive manufacturing enables unprecede...,D. Patel; R. Sharma; Y. B. Guo,2025,5,,cs.LG,cs.LG,http://arxiv.org/pdf/2505.01424.pdf
