### Install the Necessary Libraries Before Running the Crawler

In [1]:
# pip install requests
# pip install pymupdf
# pip install selenium

### Importing Libraries

In [2]:
import os
import json
import re
import time
import requests  # For downloading PDF
import fitz      # PyMuPDF for PDF text extraction

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

### Setup

In [None]:
# User-editable settings for this crawl.
CONFERENCE_URL = "https://openreview.net/..."  # change this to the conference URL
OUTPUT_FILE = "output.json"  # change this to the output file name
ERROR_FILE = "errors.json"  # change this to the error file name

# One shared Chrome session is used by every cell below.
chrome_options = webdriver.ChromeOptions()
# The `headless` property setter on Options was deprecated in Selenium 4.8
# and is gone from later 4.x releases, where assigning it merely creates an
# unused attribute and Chrome opens a visible window. Passing the browser
# flag directly works regardless of the Selenium version.
chrome_options.add_argument("--headless=new")
driver = webdriver.Chrome(options=chrome_options)

### Checkpoint Settings

In [4]:
# In-memory state shared by the cells below: successful papers, recorded
# errors, and the forum ids that can be skipped on this run.
papers_data = []
errors_data = []
processed_ids = set()

# Resume from a previous run: every paper already stored in OUTPUT_FILE is
# kept and its id is remembered so the main loop can skip it.
if os.path.exists(OUTPUT_FILE):
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as checkpoint_fh:
            saved_papers = json.load(checkpoint_fh)
        if isinstance(saved_papers, list):
            papers_data = saved_papers
            processed_ids.update(
                entry["id"] for entry in papers_data if "id" in entry
            )
        print(f"[INFO] Loaded {len(processed_ids)} papers from {OUTPUT_FILE}, will skip these.")
    except (json.JSONDecodeError, OSError):
        # Unreadable or corrupt checkpoint: discard it and start over.
        print("[WARN] Could not parse existing success file. Starting fresh.")
        papers_data = []
        processed_ids = set()

# Errors from earlier runs are carried over when the file holds a JSON list;
# otherwise errors_data stays the empty list created above.
if os.path.exists(ERROR_FILE):
    try:
        with open(ERROR_FILE, 'r', encoding='utf-8') as errors_fh:
            saved_errors = json.load(errors_fh)
        if isinstance(saved_errors, list):
            errors_data = saved_errors
    except (json.JSONDecodeError, OSError):
        print("[WARN] Could not parse existing error file. Starting fresh for errors.")
        errors_data = []

### Gathering Links

In [None]:
# XPath shared by the initial wait and every subsequent link lookup.
forum_link_xpath = "//a[contains(@href, '/forum?id=')]"

print(f"\n[INFO] Opening main tab page: {CONFERENCE_URL}")
driver.get(CONFERENCE_URL)

# Without at least one forum link there is nothing to crawl, so the browser
# is shut down and the original exception is re-raised.
try:
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, forum_link_xpath))
    )
except Exception:
    print("[ERROR] Forum links did not load; quitting.")
    driver.quit()
    raise

# Keep scrolling until four consecutive scrolls reveal no additional links.
link_elems = driver.find_elements(By.XPATH, forum_link_xpath)
scroll_attempts = 0
while scroll_attempts <= 3:
    driver.execute_script("window.scrollBy(0, 1000);")
    time.sleep(2)
    new_elems = driver.find_elements(By.XPATH, forum_link_xpath)
    if len(new_elems) > len(link_elems):
        link_elems = new_elems
        scroll_attempts = 0
    else:
        scroll_attempts += 1

# Collapse the anchors to one URL per forum id, keeping first-seen order.
forum_urls = []
seen_forum_ids = set()
for href in (anchor.get_attribute('href') for anchor in link_elems):
    if not href or "/forum?id=" not in href:
        continue
    fid = href.split("id=")[-1]
    if fid in seen_forum_ids:
        continue
    seen_forum_ids.add(fid)
    forum_urls.append(href)

print(f"[INFO] Found {len(forum_urls)} unique forum links.")

### PDF Download and Extraction

In [6]:
def download_pdf(pdf_url, output_path="temp.pdf", timeout=60):
    """
    Downloads a PDF from pdf_url, saving it locally to output_path.

    Args:
        pdf_url: URL of the PDF to fetch.
        output_path: Local file path the response body is written to.
        timeout: Seconds `requests` waits for the server to connect/respond
            before giving up. Without a timeout a stalled server would block
            the whole crawl indefinitely.

    Returns:
        The path to the saved file (same value as output_path).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    print(f"[INFO] Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()  # error status codes raise an HTTPError
    with open(output_path, 'wb') as f:
        f.write(response.content)
    print(f"[INFO] Saved PDF to {output_path}")
    return output_path

def extract_text_from_pdf(pdf_path):
    """
    Opens the local PDF using PyMuPDF and extracts text from all pages.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        The plain text of every page, joined with newlines.
    """
    print(f"[INFO] Extracting text from PDF: {pdf_path}")
    # The context manager closes the document even when a page fails to
    # parse, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def extract_emails_from_text(text):
    """
    Extract e-mail addresses from free text.

    Handles both standard addresses and bracketed variants like
    {name1, name2}@domain.com, whose local parts are split on commas and
    each combined with the shared domain.

    Args:
        text: The text to scan.

    Returns:
        A list of unique addresses in order of first appearance in the text
        (or an empty list). The order is deterministic, which matters to
        callers that pick addresses by position.
    """
    # Pattern for bracketed local parts
    bracketed_pat = r"\{([^}@]+)\}@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})"
    # Pattern for standard addresses
    standard_pat = r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"

    # (position in text, address) pairs so both kinds can be merged in
    # document order afterwards.
    hits = []

    # 1) bracketed pattern matches, e.g. {user1, user2}@domain.com
    for match in re.finditer(bracketed_pat, text):
        domain = match.group(2)
        for local_part in match.group(1).split(','):
            local_part = local_part.strip()
            # A stray comma ("{a, }@x.com") would otherwise yield "@x.com".
            if local_part:
                hits.append((match.start(), f"{local_part}@{domain}"))

    # 2) standard pattern matches, e.g. user@domain.com
    for match in re.finditer(standard_pat, text):
        hits.append((match.start(), match.group(0)))

    # The sort is stable, so addresses from one bracket group keep their
    # written order; dict.fromkeys de-duplicates while preserving order.
    hits.sort(key=lambda hit: hit[0])
    return list(dict.fromkeys(email for _, email in hits))

def extract_emails_from_pdf(pdf_url, output_path="temp.pdf"):
    """
    Fetch a PDF and return the e-mail addresses found in its text.

    The PDF is downloaded from pdf_url to output_path, its text is extracted
    with PyMuPDF and scanned for bracketed and standard addresses. Once the
    download has succeeded, the local file is removed again regardless of
    whether extraction succeeds or any address is found.

    Returns a list of emails, or an empty list if none exist.
    """
    saved_pdf = download_pdf(pdf_url, output_path)
    try:
        return extract_emails_from_text(extract_text_from_pdf(saved_pdf))
    finally:
        # Clean up the temporary download on success and on failure alike.
        if os.path.exists(saved_pdf):
            os.remove(saved_pdf)
            print(f"[INFO] Deleted temporary PDF: {saved_pdf}")

### Scraping Author Profile

In [7]:
def scrape_author_profile(profile_url):
    """
    Opens the author's profile in a new tab, extracts name and email domain.

    Args:
        profile_url: URL of the author's profile page; a falsy value yields
            an all-None result without touching the browser.

    Returns:
        A dict with keys "name", "id" and "email_domain". When scraping
        fails, an additional "error" key describes the failure. The tab
        opened for the profile is closed and the driver is switched back to
        the window that was active on entry.
    """
    profile_data = {"name": None, "id": None, "email_domain": None}
    if not profile_url:
        return profile_data

    profile_data["id"] = profile_url.split("id=")[-1]

    # Captured before the try block so the cleanup in `finally` can always
    # rely on them, even when opening the tab fails straight away.
    main_window = driver.current_window_handle
    handles_before = set(driver.window_handles)

    try:
        driver.execute_script("window.open(arguments[0], '_blank');", profile_url)
        WebDriverWait(driver, 10).until(
            lambda d: len(d.window_handles) > len(handles_before)
        )
        new_handle = next(
            handle for handle in driver.window_handles
            if handle not in handles_before
        )
        driver.switch_to.window(new_handle)

        print(f"[INFO] Scraping author profile: {profile_url}")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # The name is taken from the first <h1>, falling back to the first <h2>.
        try:
            h1_elem = driver.find_element(By.TAG_NAME, 'h1')
            profile_data["name"] = h1_elem.text.strip()
        except NoSuchElementException:
            try:
                h2_elem = driver.find_element(By.TAG_NAME, 'h2')
                profile_data["name"] = h2_elem.text.strip()
            except NoSuchElementException:
                profile_data["name"] = None

        # Prefer a domain marked "(Confirmed)"; otherwise take the first
        # "@domain" that appears anywhere in the page text.
        page_text = driver.find_element(By.TAG_NAME, "body").text
        email_match = re.search(r'@([\w\.-]+\.\w+)\s+\(Confirmed\)', page_text)
        if not email_match:
            email_match = re.search(r'@([\w\.-]+\.\w+)', page_text)
        if email_match:
            profile_data["email_domain"] = email_match.group(1)

        print(f"[INFO] Author name: {profile_data['name']}, domain: {profile_data['email_domain']}")
    except Exception as e:
        profile_data["error"] = f"Author profile error: {str(e)}"
        print(f"[WARN] Could not scrape author profile: {e}")
    finally:
        # Close only the tab(s) this call opened, never the window that was
        # active on entry, then hand control back to that window.
        for handle in driver.window_handles:
            if handle not in handles_before:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)

    return profile_data

### Scraping Official Reviews

In [8]:
def parse_official_review_text(text_block):
    """
    Split the visible text of one official review into its labelled fields.

    A line is treated as a field heading when it starts (case-insensitively)
    with one of the known heading texts and either contains a colon or
    consists of the heading text alone. Text after the first colon becomes
    the start of the field; following non-heading lines are appended to it,
    separated by single spaces. A line that merely begins with a heading
    word but has no colon (e.g. "Rating the baselines is hard") is ordinary
    content of the current field. Lines before the first heading are
    ignored, and a heading that occurs again replaces the earlier content.

    Args:
        text_block: The review text. A first line containing
            "Official Review of Submission" is dropped.

    Returns:
        A dict with the keys "summary", "strengths", "weaknesses",
        "questions", "ethics", "rating", "confidence" and "code_of_conduct";
        fields that were not found are empty strings.
    """
    headings_map = {
        "summary": "summary",
        "strengths": "strengths",
        "weaknesses": "weaknesses",
        "questions": "questions",
        "flag for ethics review": "ethics",
        "rating": "rating",
        "confidence": "confidence",
        "code of conduct": "code_of_conduct"
    }
    # Fragments are collected per field and joined once at the end, which
    # avoids stray leading or doubled spaces in the values.
    collected = {key: [] for key in headings_map.values()}

    lines = text_block.splitlines()
    if lines and "Official Review of Submission" in lines[0]:
        lines = lines[1:]

    current_field = None
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        lower_line = stripped.lower()
        _, colon, remainder = stripped.partition(":")

        matched_heading = None
        for heading_text, key in headings_map.items():
            if lower_line.startswith(heading_text):
                # Demand a colon or an exact match so that prose starting
                # with a heading word is not mistaken for a heading.
                if colon or lower_line == heading_text:
                    matched_heading = key
                break

        if matched_heading is not None:
            current_field = matched_heading
            remainder = remainder.strip()
            collected[matched_heading] = [remainder] if remainder else []
        elif current_field is not None:
            collected[current_field].append(stripped)

    return {key: " ".join(parts) for key, parts in collected.items()}

### Scraping Core Paper

In [9]:
def extract_paper_data(forum_url):
    """
    Scrape a single paper's forum page and return its data as a dict.

    Navigates the shared `driver` to forum_url and collects the title,
    abstract, decision, profiles of the first/second/last author, the PDF
    link, e-mail addresses found in the PDF, and the parsed official reviews.

    Args:
        forum_url: URL of the paper's forum page; everything after the last
            "id=" is used as the paper id.

    Returns:
        A dict with the keys "id", "url", "title", "abstract", "decision",
        "first_author", "second_author", "last_author", "reviews",
        "pdf_link" and "pdf_emails". With one or two authors, "last_author"
        is the same dict object as "first_author"/"second_author".

    Raises:
        Exception: if the title cannot be extracted. Errors raised by the
            fallback wait for forum content also propagate to the caller.
    """
    paper_info = {
        "id": forum_url.split("id=")[-1],
        "url": forum_url,
        "title": None,
        "abstract": None,
        "decision": None,
        "first_author": None,
        "second_author": None,
        "last_author": None,
        "reviews": [],
        "pdf_link": None,
        "pdf_emails": []
    }

    print(f"[INFO] Navigating to forum: {forum_url}")
    driver.get(forum_url)

    # Wait for the forum content or fallback: if a "Loading" placeholder is
    # present wait for it to go away, otherwise wait for typical content.
    try:
        loading_elem = driver.find_element(By.XPATH, "//*[text()='Loading']")
        WebDriverWait(driver, 15).until(EC.staleness_of(loading_elem))
    except NoSuchElementException:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'Abstract') or contains(text(),'Official Review')]"))
        )
    except Exception as e:
        print(f"[WARN] Timeout waiting for forum content. {e}")

    # Scrape title (the first <h2>); a paper without title counts as failed.
    print("[INFO] Scraping title...")
    try:
        title_elem = driver.find_element(By.TAG_NAME, 'h2')
        title_text = title_elem.text.strip()
        if "Download PDF" in title_text:
            title_text = title_text.split("[")[0].strip()
        paper_info["title"] = title_text
    except Exception as e:
        paper_info["title"] = None
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Failed to extract title: {e}") from e

    # Scrape abstract: text of the parent of the first element mentioning
    # "abstract" (case-insensitive), minus the label itself.
    print("[INFO] Scraping abstract...")
    abstract_text = ""
    try:
        abstract_label = driver.find_element(
            By.XPATH,
            "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'abstract')]"
        )
        parent_text = abstract_label.find_element(By.XPATH, "..").text
        abstract_text = parent_text.replace("Abstract:", "").replace("ABSTRACT:", "").strip()
    except NoSuchElementException:
        print("[WARN] Could not locate an element containing 'Abstract'. Abstract set to None.")
    paper_info["abstract"] = abstract_text

    # Scrape decision: prefer an explicit "Decision:" label, otherwise look
    # for a bare Accept (...)/Reject/Withdrawn anywhere in the page text.
    print("[INFO] Scraping decision...")
    try:
        body_text = driver.find_element(By.TAG_NAME, "body").text
        decision_match = re.search(r'Decision:\s*(.+)', body_text)
        if decision_match:
            decision_str = decision_match.group(1).split('\n')[0].strip()
            paper_info["decision"] = decision_str
        else:
            pattern = re.search(r'Accept\s*\([\w\s]+\)|Reject|Withdrawn', body_text)
            paper_info["decision"] = pattern.group(0) if pattern else None
    except Exception as e:
        print(f"[WARN] Could not parse decision: {e}")
        paper_info["decision"] = None

    # Scrape authors: links inside <h3> are taken to be author profiles.
    print("[INFO] Scraping authors...")
    author_elems = driver.find_elements(By.XPATH, "//h3//a")
    author_urls = [elem.get_attribute('href') for elem in author_elems]
    num_authors = len(author_urls)

    if num_authors >= 1:
        paper_info["first_author"] = scrape_author_profile(author_urls[0])
    if num_authors >= 2:
        paper_info["second_author"] = scrape_author_profile(author_urls[1])
    # Only scrape a separate "last" profile when it is a third person;
    # otherwise reuse the dict that was already scraped.
    if num_authors == 1:
        paper_info["last_author"] = paper_info["first_author"]
    elif num_authors == 2:
        paper_info["last_author"] = paper_info["second_author"]
    elif num_authors >= 3:
        paper_info["last_author"] = scrape_author_profile(author_urls[-1])

    # Scrape PDF link
    print("[INFO] Scraping PDF link...")
    try:
        pdf_elem = driver.find_element(By.XPATH, "//a[@class='citation_pdf_url']")
        paper_info["pdf_link"] = pdf_elem.get_attribute("href")
    except NoSuchElementException:
        paper_info["pdf_link"] = None

    # If PDF link found, download PDF and extract emails
    if paper_info["pdf_link"]:
        try:
            print("[INFO] Downloading and parsing PDF for emails...")
            # The helper downloads, parses and then deletes the temporary
            # file, so no PDF is left behind on disk.
            emails_in_pdf = extract_emails_from_pdf(
                paper_info["pdf_link"], "temp_openreview.pdf"
            )
            paper_info["pdf_emails"] = emails_in_pdf

            print(f"[INFO] Found {len(emails_in_pdf)} email(s) in PDF.")

            # Attach an email to first, second, and last author by position
            # in the extracted list (a heuristic, not a verified mapping).
            if emails_in_pdf:
                first_author = paper_info["first_author"]
                second_author = paper_info["second_author"]
                last_author = paper_info["last_author"]

                # first author gets the first email
                if first_author:
                    first_author["full_email"] = emails_in_pdf[0]

                # second author gets the second if available
                if len(emails_in_pdf) >= 2 and second_author:
                    second_author["full_email"] = emails_in_pdf[1]

                # last author gets the last in the list if we have 3 or more.
                # With one or two authors last_author is the very same dict
                # as first/second author; writing to it would overwrite the
                # email assigned just above, so it is skipped in that case.
                if (
                    len(emails_in_pdf) >= 3
                    and last_author
                    and last_author is not first_author
                    and last_author is not second_author
                ):
                    last_author["full_email"] = emails_in_pdf[-1]

        except Exception as e:
            # Best effort: a missing or unparsable PDF must not fail the paper.
            print(f"[WARN] Could not download/parse PDF: {e}")

    # Scrape reviews
    print("[INFO] Scraping reviews...")
    reviews_data = []
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                "//h4/span[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'official review of submission')]"
            ))
        )
    except Exception:
        # No review header appeared: dump some page details to help debugging
        # and return the paper with an empty review list.
        print("[WARN] No 'Official Review of Submission' found after 10 seconds.")
        page_html = driver.page_source
        print("[DEBUG] Partial Page Source (first 2000 chars):")
        print(page_html[:2000])
        all_spans = driver.find_elements(By.XPATH, "//h4/span")
        print(f"[DEBUG] Found {len(all_spans)} <h4><span> elements. Listing their text:")
        for i, sp in enumerate(all_spans, start=1):
            print(f"  Span #{i} text = {repr(sp.text)}")
    else:
        # Each review header is followed by a sibling content container whose
        # visible text holds the labelled review fields.
        review_blocks = driver.find_elements(
            By.XPATH,
            "//h4/span[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'official review of submission')]/../../following-sibling::div[contains(@class,'note-content') or contains(@class,'note-content-container')]"
        )
        for block in review_blocks:
            reviews_data.append(parse_official_review_text(block.text.strip()))

    paper_info["reviews"] = reviews_data

    print("[INFO] Finished scraping this paper.")
    return paper_info

### Main Loop

In [None]:
def _write_json_atomic(path, payload):
    """Write payload as indented JSON to path without ever leaving it half-written.

    The data goes to a sibling temporary file first and is then moved over
    the target with os.replace, so an interruption during the write cannot
    corrupt an existing checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, path)


# Errors collected during this run; merged into errors_data at the end.
errors_list = []

print(f"[INFO] Found {len(forum_urls)} forum links. Beginning scraping...")

try:
    try:
        for i, link in enumerate(forum_urls, start=1):
            fid = link.split('id=')[-1]
            if fid in processed_ids:
                print(f"[{i}/{len(forum_urls)}] Skipping already processed paper (Forum ID={fid}).")
                continue

            print(f"\n[{i}/{len(forum_urls)}] Processing forum: {fid}")
            partial_data = {"id": fid, "url": link}

            try:
                paper_data = extract_paper_data(link)
                papers_data.append(paper_data)
                processed_ids.add(fid)

                # Checkpoint after every paper so an aborted run can resume.
                _write_json_atomic(OUTPUT_FILE, papers_data)
                print(f"[INFO] Successfully saved data for forum {fid}.")
            except Exception as e:
                # One failing paper must not stop the crawl; record and go on.
                partial_data["error_message"] = str(e)
                errors_list.append(partial_data)
                print(f"[ERROR] Encountered an error with forum {fid}: {str(e)}")

            # Pause between papers to avoid hammering the site.
            time.sleep(5)
    finally:
        # Persist the errors even when the loop is interrupted
        # (e.g. KeyboardInterrupt), so they are not lost.
        errors_data.extend(errors_list)
        if errors_data:
            _write_json_atomic(ERROR_FILE, errors_data)
            print(f"[INFO] Wrote {len(errors_data)} total errors to {ERROR_FILE}.")
finally:
    # Always release the browser, whatever happened above.
    driver.quit()

print(f"\n[INFO] All done! Processed {len(papers_data)} papers successfully, with {len(errors_data)} errors.")