In [5]:
import os
import argparse
import pandas as pd
import requests
from urllib.parse import urlparse, unquote

def download_files(
    csv_path="Researchpapers_PhD_Synthetic_Data - Maths.csv",
    output_dir="docs/data"
) -> None:
    """
    Reads a CSV with a 'Link' column and downloads each URL to output_dir,
    saving each file with a .pdf extension.
    """
    # Load CSV
    df = pd.read_csv(csv_path)
    # print columns for debugging
    print("Columns in CSV:", df.columns.tolist())
    if 'Link' not in df.columns:
        raise ValueError("CSV must contain a 'Link' column.")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    total = len(df['Link'].dropna())
    for idx, url in enumerate(df['Link'].dropna(), start=1):
        try:
            # Parse URL and derive a safe filename
            parsed = urlparse(url)
            raw_name = os.path.basename(parsed.path)
            raw_name = unquote(raw_name)  # decode URL-encoded characters

            if raw_name:
                name, ext = os.path.splitext(raw_name)
                # If there's no .pdf extension, force it
                filename = f"{name}.pdf" if ext.lower() != '.pdf' else raw_name
            else:
                filename = f"file_{idx}.pdf"

            out_path = os.path.join(output_dir, filename)

            # Stream download
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(out_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            print(f"[{idx}/{total}] Downloaded → {filename}")

        except Exception as e:
            print(f"[{idx}/{total}] FAILED  → {url}\n    {e}")

if __name__ == "__main__":
    download_files()


Columns in CSV: ['Research title', 'Link']
[1/20] Downloaded → 2505.pdf
[2/20] Downloaded → 2505.pdf
[3/20] Downloaded → 2505.pdf
[4/20] Downloaded → 2505.pdf
[5/20] Downloaded → 2505.pdf
[6/20] Downloaded → 2505.pdf
[7/20] Downloaded → 2505.pdf
[8/20] Downloaded → 2505.pdf
[9/20] Downloaded → 2505.pdf
[10/20] Downloaded → 2505.pdf
[11/20] Downloaded → 2505.pdf
[12/20] Downloaded → 2505.pdf
[13/20] Downloaded → 2505.pdf
[14/20] Downloaded → 2505.pdf
[15/20] Downloaded → 2505.pdf
[16/20] Downloaded → 2505.pdf
[17/20] Downloaded → 2404.pdf
[18/20] Downloaded → 2310.pdf
[19/20] Downloaded → 2403.pdf
[20/20] Downloaded → 2407.pdf
