In [1]:
import json
import time
from urllib.parse import quote

import pandas as pd
import requests

# Input CSV, the column in it that holds the DOIs, and the output CSV.
INPUT_CSV = "scopus_DOI.csv"
OUTPUT_CSV = "openalex_works_full.csv"
DOI_COLUMN = "DOI"

# Contact address sent as the `mailto` query parameter with every request.
MAILTO = "mirco.senes@gmail.com"
# Pause (in seconds) after every request, successful or not.
SLEEP_SECONDS = 0.2

# Fixed column order, so the output CSV has a header even when no DOI is found.
OUTPUT_COLUMNS = [
    "input_doi",
    "openalex_id",
    "doi",
    "title",
    "publication_year",
    "type",
    "cited_by_count",
    "journal",
    "authors",
    "institutions",
    "raw_json",
]


def fetch_work(doi):
    """Fetch the OpenAlex work record for *doi*.

    Returns the decoded JSON object, or None when the request fails, the
    response status is not 200, or the body is not valid JSON.
    """
    # Percent-encode characters such as '#', '?' and '%' that would otherwise
    # truncate or corrupt the URL; '/' stays literal because it is part of
    # the DOI path.
    encoded_doi = quote(doi, safe="/")
    url = f"https://api.openalex.org/works/https://doi.org/{encoded_doi}"
    try:
        r = requests.get(url, params={"mailto": MAILTO}, timeout=30)
        if r.status_code != 200:
            return None
        return r.json()
    except (requests.RequestException, ValueError) as exc:
        # A single timeout / connection error / malformed body must not abort
        # the whole run and lose the rows collected so far.
        print(f"Request failed for DOI {doi}: {exc}")
        return None


def build_row(doi, w):
    """Flatten the OpenAlex work object *w* into one output row (a dict)."""
    authors = []
    institutions = set()
    for a in (w.get("authorships") or []):
        name = (a.get("author") or {}).get("display_name")
        if name:
            authors.append(name)
        for inst in (a.get("institutions") or []):
            dn = inst.get("display_name")
            if dn:
                institutions.add(dn)

    # `host_venue` is deprecated in the OpenAlex API; the venue now lives in
    # `primary_location.source`. Keep `host_venue` as a fallback for payloads
    # that still carry it.
    source = (w.get("primary_location") or {}).get("source") or {}
    journal = (
        source.get("display_name")
        or (w.get("host_venue") or {}).get("display_name")
    )

    return {
        "input_doi": doi,
        "openalex_id": w.get("id"),
        "doi": w.get("doi"),
        "title": w.get("title"),
        "publication_year": w.get("publication_year"),
        "type": w.get("type"),
        "cited_by_count": w.get("cited_by_count"),
        "journal": journal,
        "authors": "; ".join(authors),
        # Sorted so the joined string is deterministic across runs.
        "institutions": "; ".join(sorted(institutions)),
        "raw_json": json.dumps(w, ensure_ascii=False),
    }


df = pd.read_csv(INPUT_CSV)

# Unique, non-null, whitespace-stripped DOIs in first-seen order.
dois = (
    df[DOI_COLUMN].dropna().astype(str).str.strip().drop_duplicates().tolist()
)

rows = []
found = 0
skipped = 0

for i, doi in enumerate(dois, start=1):
    w = fetch_work(doi)
    if w is None:
        skipped += 1
    else:
        found += 1
        rows.append(build_row(doi, w))

    # Progress report and throttling run for every DOI, including skipped
    # ones, so failures neither hide progress lines nor bypass the rate limit.
    if i % 50 == 0:
        print(f"{i}/{len(dois)} DOI processati")

    time.sleep(SLEEP_SECONDS)

out = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
out.to_csv(OUTPUT_CSV, index=False)


print(f"unique DOIs: {len(dois)}")
print(f"Found on OpenAlex: {found}")
print(f"DOI discarded/not found: {skipped}")


100/243 DOI processati
150/243 DOI processati
200/243 DOI processati
unique DOIs: 243
Found on OpenAlex: 236
DOI discarded/not found: 7
