In [5]:
import os, csv, re, time, json, pathlib, requests
from urllib.parse import quote
from datetime import datetime

API_KEY    = "2666c3270101dc6d670514f592d567a8"      
INST_TOKEN = ""   # EPFL token (optional until you receive it)
BASE = "https://api.elsevier.com"
OUT_DIR = pathlib.Path("xml_membranes"); OUT_DIR.mkdir(exist_ok=True)
MANIFEST = OUT_DIR / "manifest.csv"

HEADERS_JSON = {"Accept": "application/json", "X-ELS-APIKey": API_KEY}
HEADERS_XML  = {"Accept": "text/xml",         "X-ELS-APIKey": API_KEY}
if INST_TOKEN:
    HEADERS_JSON["X-ELS-Insttoken"] = INST_TOKEN
    HEADERS_XML["X-ELS-Insttoken"]  = INST_TOKEN

# Configurable filters
ALLOW_JOURNALS = {
    "Journal of Membrane Science",
    "Separation and Purification Technology",
    "Microporous and Mesoporous Materials",
    "Chemical Engineering Journal",
    "Journal of Power Sources",
    "Carbon",
}
MIN_YEAR = 2000  # Minimum publication year to consider
SKIP_IF_TITLE_HAS = {"review", "perspective", "editorial"}  
PROPERTY_REGEXES = [
    r"\bBarrer(s)?\b", r"\bGPU\b", r"permeabilit(y|ies)", r"permeance",
    r"selectivit(y|ies)", r"\bflux\b", r"water flux", r"salt rejection",
    r"ionic conductivity", r"proton conductivity", r"\bmS\s*cm-?1\b",
    r"\bpore size\b", r"\bporosity\b", r"free volume", r"Young.?s modulus",
    r"\bÅ\b|\bnm\b"
]
PROP_PAT = re.compile("|".join(PROPERTY_REGEXES), re.IGNORECASE)

def score_entry(e):
    # Extract fields safely
    title = (e.get("dc:title") or "").strip()
    journal = (e.get("prism:publicationName") or "").strip()
    date = (e.get("prism:coverDate") or "")  # YYYY-MM-DD
    year = int(date[:4]) if date[:4].isdigit() else 0
    subtype = (e.get("subtypeDescription") or e.get("prism:aggregationType") or "").lower()
    abstract = (e.get("dc:description") or "").strip()

    s = 0
    if journal in ALLOW_JOURNALS: s += 2
    if year >= MIN_YEAR: s += 1
    text = f"{title}\n{abstract}"
    if len(re.findall(PROP_PAT, text)) >= 2: s += 2
    if any(k in title.lower() for k in SKIP_IF_TITLE_HAS): s -= 2
    if "review" in subtype: s -= 2
    return s

def save_manifest_row(row):
    newfile = not MANIFEST.exists()
    with MANIFEST.open("a", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        if newfile:
            w.writerow(["doi","pii","title","journal","date","score","status","path_or_reason"])
        w.writerow(row)

def fetch_xml(doi=None, pii=None):
    if doi:
        url = f"{BASE}/content/article/doi/{quote(doi)}?httpAccept=text/xml"
        fname = OUT_DIR / f"{doi.replace('/', '_')}.xml"
    else:
        url = f"{BASE}/content/article/pii/{quote(pii)}?httpAccept=text/xml"
        fname = OUT_DIR / f"{pii}.xml"
    if fname.exists() and fname.stat().st_size > 0:
        return str(fname), "cached"

    r = requests.get(url, headers=HEADERS_XML, timeout=60)
    if r.status_code == 200 and r.text.strip():
        fname.write_text(r.text, encoding="utf-8")
        return str(fname), "downloaded"
    return None, f"http {r.status_code}"

def harvest(query, max_records=500, page_size=25, sleep_s=0.25, preview=False, min_score=2):
    # Paginate search
    for offset in range(0, max_records, page_size):
        url = (f"{BASE}/content/search/sciencedirect"
               f"?query={quote(query)}&show=all&count={page_size}&offset={offset}")
        r = requests.get(url, headers=HEADERS_JSON, timeout=30)
        if r.status_code != 200:
            print(f"Search error {r.status_code}: {r.text[:200]}")
            break
        data = r.json()
        entries = data.get("search-results", {}).get("entry", []) or []
        if not entries:
            break

        for e in entries:
            doi = e.get("prism:doi")
            pii = e.get("pii") or (e.get("dc:identifier","").replace("PII:","") if "PII:" in (e.get("dc:identifier","")) else None)
            title = (e.get("dc:title") or "").strip()
            journal = (e.get("prism:publicationName") or "").strip()
            date = (e.get("prism:coverDate") or "")
            sc = score_entry(e)

            if sc < min_score:
                save_manifest_row([doi,pii,title,journal,date,sc,"filtered_low_score",""])
                continue

            if preview:
                save_manifest_row([doi,pii,title,journal,date,sc,"preview_only",""])
            else:
                path, status = fetch_xml(doi=doi) if doi else fetch_xml(pii=pii)
                if path:
                    save_manifest_row([doi,pii,title,journal,date,sc,status,path])
                    print(f"{status.upper()}: {title[:80]} -> {path}")
                else:
                    save_manifest_row([doi,pii,title,journal,date,sc,"skipped",status])
                    print(f"SKIP ({status}): {title[:80]}")
            time.sleep(sleep_s)

if __name__ == "__main__":
    # Start broad; refine later
    qry = 'TITLE-ABSTR-KEY("membrane") AND (permeability OR permeance OR selectivity OR "water flux" OR "salt rejection" OR "ionic conductivity" OR porosity OR "pore size") AND NOT doc-subtype(corr)'
    # First run in preview mode to tune filters without downloading:
    # harvest(qry, max_records=200, preview=True)
    # When happy, download:
    harvest(qry, max_records=600, page_size=25, preview=False, min_score=2)
    print(f"Manifest at: {MANIFEST.resolve()}\nXML dir: {OUT_DIR.resolve()}")


Search error 400: {"service-error":{"status":{"statusCode":"INVALID_INPUT","statusText":"Unable to translate query provided. Error=[Bad Field!]"}}}
Manifest at: /Users/oscarrosseneu/Desktop/EPFL/Summer_2025_LAS/MEX/src/xml_membranes/manifest.csv
XML dir: /Users/oscarrosseneu/Desktop/EPFL/Summer_2025_LAS/MEX/src/xml_membranes


In [3]:
import os, requests
API_KEY = (os.getenv("ELSEVIER_API_KEY") or "").strip()
print("key_present:", bool(API_KEY), "len:", len(API_KEY), "sample:", API_KEY[:6])

url = "https://api.elsevier.com/content/search/sciencedirect"
r = requests.get(url, params={"query":"membrane", "count":"1"},
                 headers={"Accept":"application/json", "X-ELS-APIKey": API_KEY}, timeout=20)
print("status:", r.status_code)
print("body:", r.text[:300])


key_present: False len: 0 sample: 
status: 401
body: {"service-error":{"status":{"statusCode":"AUTHENTICATION_ERROR","statusText":"Invalid API Key"}}}
