In [1]:
import numpy as np
import pandas as pd

import requests
import time
import openreview

In [2]:
##### CHECK THIS CODE ######
# https://github.com/berenslab/iclr-dataset/blob/main/scripts/01-dk-scrape-openreview.ipynb

In [3]:
from tqdm import tqdm
tqdm.pandas()


In [4]:
def fetch_json_with_retry(url, max_retries=5, sleep_seconds=30):
    """Robust helper to fetch JSON, handling empty/non-JSON responses."""
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.JSONDecodeError:
            # OpenReview occasionally returns empty bodies; wait and retry
            print(f"JSON decode failed for {url}. Retrying in {sleep_seconds}s (attempt {attempt}/{max_retries})...")
        except (requests.exceptions.RequestException, ValueError) as exc:
            print(f"Request error for {url}: {exc}. Retrying in {sleep_seconds}s (attempt {attempt}/{max_retries})...")
        time.sleep(sleep_seconds)
    raise RuntimeError(f"Failed to fetch JSON after {max_retries} attempts: {url}")


In [None]:
YEAR = 2024
API_URL = 'https://api2.openreview.net'

client = openreview.api.OpenReviewClient(baseurl=API_URL)
print(f"Fetching ICLR {YEAR} submissions via API v2...")

submissions = client.get_all_notes(
    invitation=f'ICLR.cc/{YEAR}/Conference/-/Submission',
    details='directReplies'
)
print(f"Found {len(submissions)} submissions.")


Downloading submission metadata across years...
2017: Request error for https://api.openreview.net/notes?invitation=ICLR.cc%2F2017%2Fconference%2F-%2Fsubmission&offset=0: 429 Client Error: Too Many Requests for url: https://api.openreview.net/notes?invitation=ICLR.cc%2F2017%2Fconference%2F-%2Fsubmission&offset=0. Retrying in 30s (attempt 1/5)...
Request error for https://api.openreview.net/notes?invitation=ICLR.cc%2F2017%2Fconference%2F-%2Fsubmission&offset=0: 429 Client Error: Too Many Requests for url: https://api.openreview.net/notes?invitation=ICLR.cc%2F2017%2Fconference%2F-%2Fsubmission&offset=0. Retrying in 30s (attempt 2/5)...


In [None]:
def parse_score(val):
    if val is None:
        return None
    if isinstance(val, (int, float)):
        return float(val)
    if isinstance(val, str):
        head = val.split(':')[0].strip()
        try:
            return float(head)
        except ValueError:
            return None
    return None


def get_paper_number(note):
    number = getattr(note, 'number', None)
    if number:
        return number
    return note.content.get('submission_number', {}).get('value')


def fetch_reviews_for_note(note, sleep_seconds=1.0, max_retries=5):
    paper_number = get_paper_number(note)
    if not paper_number:
        return []

    invitation = f"ICLR.cc/{YEAR}/Conference/Paper{paper_number}/-/Official_Review"

    for attempt in range(1, max_retries + 1):
        try:
            reviews = client.get_all_notes(invitation=invitation)
            time.sleep(sleep_seconds)
            return reviews
        except Exception as exc:
            message = str(exc)
            print(f"Error fetching reviews for Paper {paper_number}: {message}. Attempt {attempt}/{max_retries}")
            if 'Too Many Requests' in message or '429' in message:
                time.sleep(30)
                continue
            time.sleep(5)
    print(f"Giving up on Paper {paper_number} after {max_retries} attempts.")
    return []


In [None]:
records = []

for note in tqdm(submissions, desc="Processing submissions"):
    content = note.content
    keywords = content.get('keywords', {}).get('value', []) or []
    authors = content.get('authors', {}).get('value', []) or []

    row = {
        'year': YEAR,
        'id': note.id,
        'forum': note.forum,
        'title': content.get('title', {}).get('value', '').strip(),
        'abstract': content.get('abstract', {}).get('value', '').strip(),
        'authors': ', '.join(authors),
        'venue': content.get('venue', {}).get('value', 'Unknown'),
        'decision': '',
        'scores': [],  # keep legacy column name
        'score': [],
        'soundness_score': [],
        'presentation_score': [],
        'contribution_score': [],
        'keywords': [k.lower() for k in keywords],
    }

    replies = note.details.get('directReplies', []) or []
    for reply in replies:
        invitation = reply.get('invitation', '')
        if 'Decision' in invitation:
            row['decision'] = reply.get('content', {}).get('decision', {}).get('value', '')
            break

    if not row['decision']:
        venue_lower = row['venue'].lower()
        if 'withdrawn' in venue_lower:
            row['decision'] = 'Withdrawn'
        elif 'desk reject' in venue_lower:
            row['decision'] = 'Desk Reject'
        elif 'accept' in venue_lower:
            row['decision'] = 'Accept'

    reviews = fetch_reviews_for_note(note)
    for review in reviews:
        review_content = review.content
        rating = parse_score(review_content.get('rating', {}).get('value'))
        soundness = parse_score(review_content.get('soundness', {}).get('value'))
        presentation = parse_score(review_content.get('presentation', {}).get('value'))
        contribution = parse_score(review_content.get('contribution', {}).get('value'))

        if rating is not None:
            row['scores'].append(rating)
            row['score'].append(rating)
        if soundness is not None:
            row['soundness_score'].append(soundness)
        if presentation is not None:
            row['presentation_score'].append(presentation)
        if contribution is not None:
            row['contribution_score'].append(contribution)

    records.append(row)

iclr = pd.DataFrame(records)
print(f"Processed {len(iclr)} submissions.")


In [None]:
iclr


In [None]:
records = []

for note in tqdm(submissions, desc="Processing submissions"):
    content = note.content
    keywords = content.get('keywords', {}).get('value', []) or []
    authors = content.get('authors', {}).get('value', []) or []

    row = {
        'year': YEAR,
        'id': note.id,
        'forum': note.forum,
        'title': content.get('title', {}).get('value', '').strip(),
        'abstract': content.get('abstract', {}).get('value', '').strip(),
        'authors': ', '.join(authors),
        'venue': content.get('venue', {}).get('value', 'Unknown'),
        'decision': '',
        'score': [],
        'soundness_score': [],
        'presentation_score': [],
        'contribution_score': [],
        'contributions': [k.lower() for k in keywords],
    }

    replies = note.details.get('directReplies', []) or []
    for reply in replies:
        invitation = reply.get('invitation', '')
        if 'Decision' in invitation:
            row['decision'] = reply.get('content', {}).get('decision', {}).get('value', '')
            break

    if not row['decision']:
        venue_lower = row['venue'].lower()
        if 'withdrawn' in venue_lower:
            row['decision'] = 'Withdrawn'
        elif 'desk reject' in venue_lower:
            row['decision'] = 'Desk Reject'
        elif 'accept' in venue_lower:
            row['decision'] = 'Accept'

    reviews = fetch_reviews_for_note(note)
    for review in reviews:
        review_content = review.content
        rating = parse_score(review_content.get('rating', {}).get('value'))
        soundness = parse_score(review_content.get('soundness', {}).get('value'))
        presentation = parse_score(review_content.get('presentation', {}).get('value'))
        contribution = parse_score(review_content.get('contribution', {}).get('value'))

        if rating is not None:
            row['score'].append(rating)
        if soundness is not None:
            row['soundness_score'].append(soundness)
        if presentation is not None:
            row['presentation_score'].append(presentation)
        if contribution is not None:
            row['contribution_score'].append(contribution)

    records.append(row)

iclr = pd.DataFrame(records)


Processing submissions: 100%|██████████| 7404/7404 [2:11:49<00:00,  1.07s/it]  


In [None]:
iclr


Unnamed: 0,year,id,forum,title,abstract,authors,venue,decision,score,soundness_score,presentation_score,contribution_score,contributions
0,2024,zzv4Bf50RW,zzv4Bf50RW,Learning SO(3)-Invariant Correspondence via Po...,Establishing accurate dense 3D correspondences...,"Chunghyun Park, Seungwook Kim, Jaesik Park, Mi...",ICLR 2024 Conference Withdrawn Submission,Withdrawn,[],[],[],[],"[point cloud understanding, 3d dense correspon..."
1,2024,zzqn5G9fjn,zzqn5G9fjn,Breaking Physical and Linguistic Borders: Mult...,Pretrained large language models (LLMs) have e...,"Wanru Zhao, Yihong Chen, Royson Lee, Xinchi Qi...",ICLR 2024 poster,,[],[],[],[],"[multilingual federated learning, natural lang..."
2,2024,zz61V8bIab,zz61V8bIab,Stochastic Adversarial Networks for Multi-Doma...,Adversarial training has played a pivotal role...,"Xu Wang, Yuan Wu",ICLR 2024 Conference Withdrawn Submission,Withdrawn,[],[],[],[],"[multi-domain text classification, adversarial..."
3,2024,zyBJodMrn5,zyBJodMrn5,On the generalization capacity of neural netwo...,The advent of the Transformer has led to the d...,"Takuya Ito, Soham Dan, Mattia Rigotti, James K...",ICLR 2024 poster,,[],[],[],[],"[compositional generalization, compositionalit..."
4,2024,zxPDdw8koz,zxPDdw8koz,CLIP meets Model Zoo Experts: Pseudo-Supervisi...,Contrastive language image pretraining (CLIP) ...,"Mohammadreza Salehi, Mehrdad Farajtabar, Maxwe...",ICLR 2024 Conference Withdrawn Submission,Withdrawn,[],[],[],[],"[contrastive learning, clip, distillation, den..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7399,2024,02Ug9N8DCI,02Ug9N8DCI,GateLoop: Fully Data-Controlled Linear Recurre...,Linear Recurrence has proven to be a powerful ...,Tobias Katsch,Submitted to ICLR 2024,,[],[],[],[],"[data-controlled, linear recurrence, sequence ..."
7400,2024,01ep65umEr,01ep65umEr,TeLLMe what you see: Using LLMs to Explain Neu...,As the role of machine learning models continu...,"Leon Guertler, M Ganesh Kumar, Anh Tuan Luu, C...",Submitted to ICLR 2024,,[],[],[],[],"[explainable ai, explaining neurons in vision ..."
7401,2024,01Yi8rzoNs,01Yi8rzoNs,Visual Chain of Thought: Bridging Logical Gaps...,Recent advances in large language models elici...,"Daniel Philip Rose, Vaishnavi Himakunthala, An...",ICLR 2024 Conference Withdrawn Submission,Withdrawn,[],[],[],[],"[chain of thought, vision and language, large ..."
7402,2024,014CgNPAGy,014CgNPAGy,On the Role of Momentum in the Implicit Bias o...,Momentum is a widely adopted and crucial modif...,"Bochen Lyu, Zhanxing Zhu",ICLR 2024 Conference Withdrawn Submission,Withdrawn,[],[],[],[],"[gd, momentum, implicit bias, linear networks]"


In [None]:
# Kicking out nonsense abstracts

mask = np.array([len(a) >= 100 for a in iclr.abstract])

print(f'Removing {np.sum(~mask)} submissions with abstract length below 100 characters:')
for abstract in iclr[~mask]['abstract'].values:
    print('  ' + abstract)    
iclr = iclr[mask].reset_index(drop=True)


Removing 3 submissions with abstract length below 100 characters:
  test
  xxx
  test


In [None]:
iclr.shape


(7401, 13)

In [None]:
# Scores and decisions already populated above during submission processing.
# Additional aggregation or exports can be added below if needed.


Processing forum IDs:   0%|          | 0/7401 [00:00<?, ?it/s]

Request error for https://api2.openreview.net/notes?forum=zzv4Bf50RW: 429 Client Error: Too Many Requests for url: https://api2.openreview.net/notes?forum=zzv4Bf50RW. Retrying in 30s (attempt 1/5)...
Request error for https://api2.openreview.net/notes?forum=zzv4Bf50RW: 429 Client Error: Too Many Requests for url: https://api2.openreview.net/notes?forum=zzv4Bf50RW. Retrying in 30s (attempt 2/5)...
Request error for https://api2.openreview.net/notes?forum=zzv4Bf50RW: 429 Client Error: Too Many Requests for url: https://api2.openreview.net/notes?forum=zzv4Bf50RW. Retrying in 30s (attempt 3/5)...
Request error for https://api2.openreview.net/notes?forum=zzv4Bf50RW: 429 Client Error: Too Many Requests for url: https://api2.openreview.net/notes?forum=zzv4Bf50RW. Retrying in 30s (attempt 4/5)...
Request error for https://api2.openreview.net/notes?forum=zzv4Bf50RW: 429 Client Error: Too Many Requests for url: https://api2.openreview.net/notes?forum=zzv4Bf50RW. Retrying in 30s (attempt 5/5)...


Processing forum IDs:   0%|          | 0/7401 [02:30<?, ?it/s]


RuntimeError: Failed to fetch JSON after 5 attempts: https://api2.openreview.net/notes?forum=zzv4Bf50RW

In [None]:
# Sort alphabetically by id within year
iclr = iclr.sort_values(by=['year', 'id']).reset_index(drop=True)
iclr.to_parquet('iclr24.parquet')
