In [1]:
from pathlib import Path
import requests
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Conference identifier and year; both are used to build the output file name.
conference = "iclr"
year = 2023

# Directory that receives the pickled paper metadata.
data_path = Path("../papers_info")
# Create the directory together with any missing parents. `exist_ok=True`
# makes this a no-op when it is already there, which avoids the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
data_path.mkdir(parents=True, exist_ok=True)

In [3]:
%%time
titles = []
abstracts = []
years = []
forum_ids = []
decisions = []
authors = []
keywords = []

#for year in [2018, 2019, 2020, 2021, 2022, 2023]:
for year in [2023]:
    for query in ['Blind_Submission', 'Withdrawn_Submission', 'Desk_Rejected_Submission']:
        url = f'https://api.openreview.net/notes?invitation=ICLR.cc%2F{year}%2FConference%2F-%2F{query}'
    
        for offset in [0, 1000, 2000, 3000, 4000]:
            df = pd.DataFrame(requests.get(url + f'&offset={offset}').json()['notes'])
            if len(df) > 0:
                titles    += [d['title'].strip() for d in df['content'].values]
                abstracts += [d['abstract'].strip() for d in df['content'].values]
                keywords  += [d['keywords'] for d in df['content'].values]
                authors   += [', '.join(d['authors']) for d in df['content']]
                years     += [year] * len(df)
                forum_ids += list(df.forum)
                                                
                if query == 'Withdrawn_Submission':
                    decisions += ['Withdrawn'] * len(df)
                elif query == 'Desk_Rejected_Submission':
                    decisions += ['Desk rejected'] * len(df)
                else:
                    decisions += [''] * len(df)
print('')
print(f'Found {len(titles)} papers')

iclr = pd.DataFrame.from_dict({
    'year': np.array(years).astype(int), 
    'id': forum_ids, 
    'title': titles, 
    'abstract': abstracts,
    'authors': authors,
    'decision': decisions,
    'scores': [[]] * len(abstracts),
    'keywords': [[kk.lower() for kk in k] for k in keywords],
    'conference': conference,
})

mask = np.array([len(a) >= 200 for a in iclr.abstract])
iclr = iclr[mask].reset_index(drop=True)
print(f'Removing {np.sum(~mask)} submissions with abstract length below 200 characters.')

iclr.to_pickle(data_path / f'{conference}_{year}.pickle.zip')


Found 4955 papers
Removing 0 submissions with abstract length below 200 characters.
CPU times: user 1.1 s, sys: 129 ms, total: 1.23 s
Wall time: 20.1 s


In [4]:
%%time
# Query the accept/reject decisions and scores. Warning: TAKES A COUPLE OF HOURS

for num, forum_id in enumerate(tqdm(iclr.id)): 
    forum_url = f'https://api.openreview.net/notes?forum={forum_id}'
    json = requests.get(forum_url).json()

    if iclr.decision[num] == '':
        for i in range(len(json['notes'])):
            if 'decision' in json['notes'][i]['content']:
                decision = json['notes'][i]['content']['decision']
        iclr.at[num, 'decision'] = decision
        
    if iclr.decision[num] != 'Desk rejected':
        scores = []
        for i in range(len(json['notes'])):
            if iclr.year[num] < 2023:
                score_field = 'rating'
            else:
                score_field = 'recommendation'
            if score_field in json['notes'][i]['content']:
                score = int(json['notes'][i]['content'][score_field].split(':')[0])
                scores.append(score)
        iclr.at[num, 'scores'] = scores

print('')

iclr.to_pickle(data_path / f'{conference}_{year}.pickle.zip')

  0%|          | 0/4955 [00:00<?, ?it/s]


CPU times: user 2min 20s, sys: 13.7 s, total: 2min 34s
Wall time: 55min
