In [4]:
#pip install 'requests<2.29' 'urllib3<2' 'charset_normalizer<3'
import os
import requests
import pandas as pd
from urllib.parse import quote

In [6]:
# Folder that receives the downloaded PDFs; created below if it is absent.
download_dir = "/mmfs1/home/a.patronigranda/Documents/6043.004US1_MRR"
# CSV file holding the reference list to process.
csv_path = "/mmfs1/home/a.patronigranda/Documents/6043.004US1_MRR/References 6043.004US1 MRR.csv"

# exist_ok=True keeps this cell safe to re-run once the folder exists.
os.makedirs(name=download_dir, exist_ok=True)

In [7]:
# Load the reference list from the CSV at `csv_path` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [8]:
# Download helper: fetch a URL and store the response only if it really is a PDF
def try_download_pdf(url, filename):
    """Attempt to download a PDF from a direct URL.

    The response is saved as ``<download_dir>/<filename>.pdf`` only when all of
    the following hold:

    * the request succeeded (no 4xx/5xx status),
    * the server labels it as a PDF (Content-Type mentions ``pdf``) or the
      final URL after redirects ends in ``.pdf``,
    * the body actually carries the ``%PDF-`` signature.

    Args:
        url: Address to fetch; redirects are followed.
        filename: File name without extension, relative to the module-level
            ``download_dir``.

    Returns:
        True if a PDF was written to disk, False otherwise. Exceptions raised
        while fetching or writing are reported with ``print`` and turned into
        False so that a batch run keeps going.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=20)

        # An error page must never be stored as a paper, even if its URL ends in .pdf.
        if not response.ok:
            return False

        content_type = response.headers.get('Content-Type', '').lower()
        advertised_as_pdf = 'pdf' in content_type or response.url.lower().endswith('.pdf')

        # Publishers often answer a ".pdf" URL with an HTML landing/login page.
        # The signature is looked for in the first 1024 bytes because some
        # producers emit a few junk bytes before the "%PDF-" header.
        body = response.content
        if advertised_as_pdf and b"%PDF-" in body[:1024]:
            file_path = os.path.join(download_dir, f"{filename}.pdf")
            with open(file_path, "wb") as f:
                f.write(body)
            return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # fall through to its next download strategy.
        print(f"Download error for {filename}: {e}")
    return False

def get_pdf_from_semantic_scholar(title, author=""):
    """Look up a paper on Semantic Scholar by title and return its open-access PDF URL.

    Only the first search hit is considered.

    Args:
        title: Paper title used as the search query.
        author: Accepted for call-site compatibility; not used in the query.

    Returns:
        The open-access PDF URL of the first hit, or None when the request
        fails, the API answers with a non-200 status, nothing matches, or the
        hit has no open-access PDF.
    """
    api_url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={quote(title)}&limit=1&fields=title,authors,year,url,openAccessPdf"
    try:
        r = requests.get(api_url, timeout=10)
        if r.status_code != 200:
            # Surface throttling (HTTP 429) and similar instead of hiding it
            # behind an indistinguishable "no result".
            print(f"Semantic Scholar returned HTTP {r.status_code} for: {title}")
            return None

        data = r.json()
        matches = data.get("data") or []
        if not matches:
            return None

        # The API sends "openAccessPdf": null for papers without an open copy,
        # so a dict default on .get() is not enough to guard the next lookup.
        open_access = matches[0].get("openAccessPdf") or {}
        return open_access.get("url") or None
    except Exception as e:
        print(f"Semantic Scholar lookup failed: {e}")
    return None


In [9]:
# Running tallies for the download loop
success, failed = 0, 0
# Titles of the references for which no PDF could be fetched
fail_log = list()

In [10]:
# Process each reference

def _cell_text(row, column, default=""):
    """Return the cell as stripped text, or `default` when it is missing or NaN."""
    value = row.get(column)
    # Empty CSV cells arrive as NaN; str(NaN) would be the truthy string "nan".
    if value is None or pd.isna(value):
        return default
    return str(value).strip()


def _bare_doi(doi):
    """Strip a leading resolver URL or 'doi:' label so only the DOI itself remains."""
    lowered = doi.lower()
    for prefix in ("https://doi.org/", "http://doi.org/",
                   "https://dx.doi.org/", "http://dx.doi.org/", "doi:"):
        if lowered.startswith(prefix):
            return doi[len(prefix):].strip()
    return doi


# Start the tallies from zero here so that re-running this cell on its own
# does not add to the numbers of a previous run.
success = 0
failed = 0
fail_log = []

for idx, row in df.iterrows():
    # Fall back to a positional name when the title is absent or empty.
    title = _cell_text(row, "Title") or f"paper_{idx}"
    title_safe = title[:80].replace("/", "_").replace(" ", "_")
    doi = _bare_doi(_cell_text(row, "DOI"))
    url = _cell_text(row, "Website")
    author = _cell_text(row, "Authors").split(",")[0]  # Use first author
    downloaded = False

    # Try DOI
    if doi:
        downloaded = try_download_pdf(f"https://doi.org/{quote(doi)}", title_safe)

    # Try URL
    if not downloaded and url.startswith("http"):
        downloaded = try_download_pdf(url, title_safe)

    # Try Semantic Scholar search
    if not downloaded:
        print(f"🔍 Searching Semantic Scholar for: {title}")
        pdf_link = get_pdf_from_semantic_scholar(title, author)
        if pdf_link:
            downloaded = try_download_pdf(pdf_link, title_safe)

    if downloaded:
        print(f"✅ Downloaded: {title}")
        success += 1
    else:
        print(f"❌ Failed: {title}")
        failed += 1
        fail_log.append(title)

🔍 Searching Semantic Scholar for: Explainable Explainable Artificial Intelligence (XAI): Concepts, taxonomies, opportunities and challenges toward responsible AI,
✅ Downloaded: Explainable Explainable Artificial Intelligence (XAI): Concepts, taxonomies, opportunities and challenges toward responsible AI,
🔍 Searching Semantic Scholar for: A Particle Swarm Optimization Backtracking Technique Inspired by Science-Fiction Time Travel,
❌ Failed: A Particle Swarm Optimization Backtracking Technique Inspired by Science-Fiction Time Travel,
🔍 Searching Semantic Scholar for: A blackboard architecture for perception planning in autonomous vehicles,
❌ Failed: A blackboard architecture for perception planning in autonomous vehicles,
🔍 Searching Semantic Scholar for: Medical Expert Systems Survey,
❌ Failed: Medical Expert Systems Survey,
🔍 Searching Semantic Scholar for: Fundamentals of Expert Systems,
❌ Failed: Fundamentals of Expert Systems,
🔍 Searching Semantic Scholar for: Constructing an expert

In [11]:
# Summary: report how many references were fetched and how many were not
print(
    "\n=== Download Summary ===",
    f"✅ Successful downloads: {success}",
    f"❌ Failed downloads: {failed}",
    sep="\n",
)


=== Download Summary ===
✅ Successful downloads: 27
❌ Failed downloads: 95
