In [2]:
import pandas as pd
import os
import urllib.request
import time

# Load the citation table; later code reads its 'pdf_url' and 'result_id' columns.
df = pd.read_csv('cites.csv')

# Directory that receives the downloaded files; created only when missing.
pdf_folder = 'pdfs'
folder_present = os.path.exists(pdf_folder)
if folder_present:
    print(f"Folder already exists: {pdf_folder}")
else:
    os.makedirs(pdf_folder)
    print(f"Created folder: {pdf_folder}")

# Keep only rows that carry a PDF link. The copy decouples the subset from df.
has_pdf_url = df['pdf_url'].notna()
df_with_pdfs = df[has_pdf_url].copy()

print(f"\nTotal PDFs to download: {len(df_with_pdfs)}")
print("\nStarting downloads...\n")

# Running tallies for the summary printed after the loop.
success_count = 0
failed_count = 0
failed_list = []  # one dict per failure: result_id, url, truncated error text

# Browser-like User-Agent sent with every request; hoisted because it never changes.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

# Download each PDF into pdf_folder as "<result_id>.pdf".
# idx is the row's index label in the original frame, so the printed numbers
# can have gaps where rows without a pdf_url were filtered out.
for idx, row in df_with_pdfs.iterrows():
    pdf_url = row['pdf_url']
    result_id = row['result_id']

    filename = f"{result_id}.pdf"
    filepath = os.path.join(pdf_folder, filename)
    # Data is staged here and only promoted to filepath once fully written,
    # so an interrupted download can never look like a finished file.
    partial_path = filepath + '.part'

    # A file at the final path is treated as a completed earlier download.
    if os.path.exists(filepath):
        print(f"[{idx+1}] Skipped (already exists): {filename}")
        success_count += 1
        continue

    try:
        req = urllib.request.Request(pdf_url, headers=headers)

        # Read the whole body before touching the disk: a network failure
        # here leaves no file behind.
        with urllib.request.urlopen(req, timeout=30) as response:
            data = response.read()

        with open(partial_path, 'wb') as out_file:
            out_file.write(data)

        # Promote the finished file in one step (overwrites nothing here,
        # since filepath was just checked not to exist).
        os.replace(partial_path, filepath)

        print(f"[{idx+1}] Downloaded: {filename}")
        success_count += 1

        # Short pause between successful requests to avoid hammering hosts.
        time.sleep(0.5)

    except Exception as e:
        # Best-effort removal of a half-written staging file; a cleanup
        # failure must not abort the remaining downloads.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass

        print(f"[{idx+1}] Failed: {filename} - Error: {str(e)[:50]}")
        failed_count += 1
        failed_list.append({'result_id': result_id, 'url': pdf_url, 'error': str(e)[:100]})

# Final report: totals first, then one line per failed download (if any).
print(f"\n{'='*60}")
print(f"Download Summary:")
print(f"  Successfully downloaded/existing: {success_count}")
print(f"  Failed: {failed_count}")
print(f"  Total: {len(df_with_pdfs)}")
print(f"{'='*60}")

if len(failed_list) > 0:
    print(f"\nFailed downloads:")
    for failure in failed_list:
        # Each entry stores the id and the truncated error message.
        print(f"  - {failure['result_id']}: {failure['error']}")

Folder already exists: pdfs

Total PDFs to download: 40

Starting downloads...

[1] Skipped (already exists): L0bsiVYsSXAJ.pdf
[2] Skipped (already exists): 8GGORR0MaSsJ.pdf
[6] Downloaded: BBM06Azg7dsJ.pdf
[8] Failed: 7kQ9ek3YM_wJ.pdf - Error: HTTP Error 403: Forbidden
[10] Failed: DRUrGSfWIm8J.pdf - Error: HTTP Error 403: Forbidden
[11] Failed: s2BzHa5iEXIJ.pdf - Error: HTTP Error 403: Forbidden
[13] Skipped (already exists): 4BYBPucAl94J.pdf
[19] Skipped (already exists): 92wqEI9nj3kJ.pdf
[21] Skipped (already exists): DV5jL8OjxvAJ.pdf
[23] Downloaded: jC68NnJ36K8J.pdf
[24] Failed: 2QLuf7u6GTIJ.pdf - Error: <urlopen error _ssl.c:983: The handshake operation
[25] Failed: 4fxhbChYrKwJ.pdf - Error: <urlopen error [Errno 104] Connection reset by pee
[27] Skipped (already exists): t5gKJgsKyx4J.pdf
[32] Skipped (already exists): upmOB6l0mqcJ.pdf
[34] Downloaded: dM9--Pf9JwYJ.pdf
[35] Skipped (already exists): tEdhwBtai2UJ.pdf
[37] Skipped (already exists): DqVgd_mTbkAJ.pdf
[38] Skipped (a