In [1]:
import numpy as np
import pandas as pd

import requests
import time

In [None]:
##### CHECK THIS CODE ######
# https://github.com/berenslab/iclr-dataset/blob/main/scripts/01-dk-scrape-openreview.ipynb

In [10]:
# Progress-bar helper; wrapped around the forum-id loop further down.
from tqdm import tqdm
# NOTE(review): enables tqdm's pandas integration, but no `progress_apply`
# call is visible in this notebook -- confirm whether this line is still needed.
tqdm.pandas()

In [4]:
# Scrape the submission listings for one ICLR year from the OpenReview API.
# The result is a set of parallel lists (one entry per submission) that later
# cells assemble into a DataFrame.
year = 2025

titles = []
abstracts = []
years = []
forum_ids = []
decisions = []
authors = []
keywords = []

# Venue-id suffix -> decision label recorded for every note listed under it.
# The empty suffix is the main conference venue, i.e. the accepted papers;
# their decision is left blank here.
venue_decisions = {
    'Withdrawn_Submission': 'Withdrawn',
    'Rejected_Submission': 'Reject',
    'Desk_Rejected_Submission': 'Desk rejected',
    '': '',  # this is the accepted one
}

for venue_suffix, decision in venue_decisions.items():
    venue_path = f'/{venue_suffix}' if venue_suffix else ''
    url = f'https://api2.openreview.net/notes?content.venueid=ICLR.cc/{year}/Conference{venue_path}'

    # Page through the listing in steps of 1000.
    # NOTE: this stops after 10 pages, so at most 10,000 notes per venue are fetched.
    for offset in range(0, 10_000, 1000):
        page_url = url + f'&offset={offset}'

        # Retry while the API reports a rate limit, instead of crashing on the
        # missing 'notes' key of the error payload.
        while True:
            payload = requests.get(page_url).json()
            if payload.get('name') == 'RateLimitError':
                print("Rate limit hit. Sleeping for 30 seconds...")
                time.sleep(30)
            else:
                break

        df = pd.DataFrame(payload['notes'])

        # An empty page means this venue is exhausted.
        if len(df) == 0:
            break

        print(len(df), end=' ')

        for content in df['content'].values:
            titles.append(content['title']['value'].strip())
            abstracts.append(content['abstract']['value'].strip())
            keywords.append(content['keywords']['value'])
            # Decide per note whether authors are present; checking only the
            # first note of a page would mis-handle pages that mix both kinds.
            if 'authors' in content:
                authors.append(', '.join(content['authors']['value']))
            else:
                authors.append('')

        years     += [year] * len(df)
        forum_ids += list(df.forum)
        decisions += [decision] * len(df)

1000 1000 956 1000 1000 1000 1000 942 70 1000 1000 1000 704 

In [5]:
# Assemble the scraped parallel lists into one DataFrame, one row per submission.
n_submissions = len(abstracts)

iclr = pd.DataFrame.from_dict({
    'year': np.array(years).astype(int),
    'id': forum_ids,
    'title': titles,
    'abstract': abstracts,
    'authors': authors,
    'decision': decisions,

    # Placeholder score lists, filled in by the per-forum query further down.
    # Each row gets its OWN empty list: `[[]] * n` would store the same list
    # object in every row, so an in-place append on one row would show up in all.
    'score': [[] for _ in range(n_submissions)],
    'soundness_score': [[] for _ in range(n_submissions)],
    'presentation_score': [[] for _ in range(n_submissions)],
    'contribution_score': [[] for _ in range(n_submissions)],

    # NOTE(review): this column holds the lower-cased keywords, not reviewer
    # "contribution" scores -- confirm whether it was meant to be named 'keywords'.
    'contributions': [[kk.lower() for kk in k] for k in keywords],
})

In [6]:
# Preview the assembled table; the four score columns are still empty lists here.
iclr

Unnamed: 0,year,id,title,abstract,authors,decision,score,soundness_score,presentation_score,contribution_score,contributions
0,2025,5sRnsubyAK,Neuroacoustic Patterns: Constant Q Cepstral Co...,Early identification of neurodegenerative dise...,"Aastha Kachhi, Shashank Ojha, Megha Pandey, Aj...",Withdrawn,[],[],[],[],"[neurodegenerative disorder, constant q cepstr..."
1,2025,J1SGf2lyr6,A Feature-Aware Federated Learning Framework f...,The expansion of 5G networks has led to remark...,Saeid Sheikhi,Withdrawn,[],[],[],[],"[federated learning, anomaly detection, 5g net..."
2,2025,49ti6LOUw5,UnoLoRA: Single Low-Rank Adaptation for Effici...,Recent advances in Parameter-Efficient Fine-Tu...,"Anirudh Lakhotia, Akash Kamalesh, Prerana Sanj...",Withdrawn,[],[],[],[],"[lora, multi-task learning, peft]"
3,2025,LJWPYzjDz4,Extending Flexibility of Image Coding Enhancem...,"Neural image compression, necessary in various...","Yu Mao, Jingzong LI, Jun Wang, Hong Xu, Tei-We...",Withdrawn,[],[],[],[],"[data compression, iot infrastructure, edge co..."
4,2025,Y0kmI2zqqi,Stochastic Sparse Sampling: A Framework for Va...,ile the majority of time series classification...,"Xavier Mootoo, Alan Arnoldo Diaz Montiel, Mila...",Withdrawn,[],[],[],[],"[time series, healthcare, medicine, epilepsy, ..."
...,...,...,...,...,...,...,...,...,...,...,...
11667,2025,asR9FVd4eL,Understanding and Enhancing the Transferabilit...,Jailbreaking attacks can effectively manipulat...,"Runqi Lin, Bo Han, Fengwang Li, Tongliang Liu",,[],[],[],[],"[jailbreaking attack, black-box transferable a..."
11668,2025,z1nSpA2dAW,FLOPS: Forward Learning with OPtimal Sampling,"Given the limitations of backpropagation, pert...","Tao Ren, Zishi Zhang, Jinyang Jiang, Guanghao ...",,[],[],[],[],"[stochastic optimization, zeroth-order optimiz..."
11669,2025,JZLon6cvx8,Storybooth: Training-Free Multi-Subject Consis...,Consistent text-to-image generation depicting ...,"Jaskirat Singh, Junshen K Chen, Jonas K Kohler...",,[],[],[],[],"[consistent text-to-image generation, visual s..."
11670,2025,Q6a9W6kzv5,PhysBench: Benchmarking and Enhancing Vision-L...,Understanding the physical world is a fundamen...,"Wei Chow, Jiageng Mao, Boyi Li, Daniel Seita, ...",,[],[],[],[],"[vision-language, multi-modal understanding]"


In [7]:
# Drop placeholder abstracts: anything shorter than 100 characters is treated
# as nonsense ("xx", "We have withdrawn our paper.", ...) and removed.

abstract_lengths = iclr['abstract'].map(len)
mask = (abstract_lengths >= 100).to_numpy()

# Show what is about to be discarded so the cut-off can be sanity-checked.
print(f'Removing {np.sum(~mask)} submissions with abstract length below 100 characters:')
discarded = iclr.loc[~mask, 'abstract']
for text in discarded.values:
    print('  ' + text)

# Keep the survivors and renumber rows 0..n-1 (later cells index rows by position).
iclr = iclr.loc[mask].reset_index(drop=True)

Removing 9 submissions with abstract length below 100 characters:
  Abstract.
  We have withdrawn our paper.
  We have withdrawn our paper.
  -
  We decide to withdraw our work from the conference.
  none
  ...
  xx
  xx


In [8]:
# (rows, columns) after the short-abstract filter above.
iclr.shape

(11663, 11)

In [None]:

# Query the accept/reject decisions and scores. Warning: TAKES MANY HOURS
# API cuts you off every 60 queries, then the code sleeps for 30 seconds
#
# For every submission, fetch all notes of its forum and then
#   * fill in `decision` for rows where it is still blank, and
#   * collect the review scores from every note that carries a 'rating' field.
#
# `json`, `i` and the score lists are deliberately left bound after the loop:
# later cells in this notebook inspect them.

for num, forum_id in enumerate(tqdm(iclr.id, desc="Processing forum IDs")):
    forum_url = f'https://api2.openreview.net/notes?forum={forum_id}'

    # One request per attempt. (A second, unconditional request issued before
    # this loop had its result overwritten immediately and only doubled the
    # number of API calls.)
    while True:
        json = requests.get(forum_url).json()

        if 'name' in json and json['name'] == 'RateLimitError':
            print("Rate limit hit. Sleeping for 30 seconds...")
            time.sleep(30)
        else:
            break  # Exit the loop when no longer rate-limited

    notes = json['notes']

    # Only rows whose decision is still blank need a lookup.
    if iclr.at[num, 'decision'] == '':
        for note in notes:
            if 'decision' in note['content']:
                # The first note with a decision field wins.
                iclr.at[num, 'decision'] = note['content']['decision']['value']
                break
        else:
            print(f'No decision found: {num}, {forum_id}')

    scores = []
    soundness_scores = []
    presentation_scores = []
    contribution_scores = []

    score_field = 'rating'
    for i, note in enumerate(notes):
        content = note['content']
        # A note counts as a review if it has a rating; the other three
        # fields are then read from the same note.
        if score_field in content:
            scores.append(content[score_field]['value'])
            soundness_scores.append(content['soundness']['value'])
            presentation_scores.append(content['presentation']['value'])
            contribution_scores.append(content['contribution']['value'])

    iclr.at[num, 'score'] = scores
    iclr.at[num, 'soundness_score'] = soundness_scores
    iclr.at[num, 'presentation_score'] = presentation_scores
    iclr.at[num, 'contribution_score'] = contribution_scores

print('')


Processing forum IDs:   0%|          | 58/11663 [00:10<33:24,  5.79it/s]

Rate limit hit. Sleeping for 30 seconds...


Processing forum IDs:   1%|          | 114/11663 [00:51<32:34,  5.91it/s]  

Rate limit hit. Sleeping for 30 seconds...
Rate limit hit. Sleeping for 30 seconds...


Processing forum IDs:   1%|▏         | 166/11663 [02:00<32:15,  5.94it/s]   

Rate limit hit. Sleeping for 30 seconds...
Rate limit hit. Sleeping for 30 seconds...


Processing forum IDs:   2%|▏         | 219/11663 [03:09<30:45,  6.20it/s]   

Rate limit hit. Sleeping for 30 seconds...
Rate limit hit. Sleeping for 30 seconds...


In [None]:
# Order rows by year, then alphabetically by forum id, renumber the index
# from 0, and write the result to a parquet file.
iclr = iclr.sort_values(['year', 'id'], ignore_index=True)
iclr.to_parquet('iclr25.parquet')

In [21]:
# Debug check: `presentation_scores` is a leftover loop variable from the
# decision/score query cell, holding the values for the forum processed most recently.
presentation_scores

[2, 3, 3, 2, 3]

In [19]:
# Debug check: relies on `json` and `i` left over from the decision/score
# query loop, so it only works after that cell has run in the same kernel.
json['notes'][i]['content']['soundness']['value']

3