In [1]:
# Requirements: openreview-py, requests, tqdm, pdfplumber (pandas is also needed by the analysis cells)

In [26]:
import openreview
import requests
import os
import re
import csv
import pdfplumber
from tqdm import tqdm

def clean_filename(name: str) -> str:
    """Turn an arbitrary title into a string usable as a file name.

    Characters that are reserved in file names (``< > : " / \\ | ? *``) are
    removed, every run of whitespace becomes a single underscore, and
    underscores at either end of the result are trimmed.
    """
    without_reserved = re.sub(r'[<>:"/\\|?*]', '', name)
    underscored = re.sub(r'\s+', '_', without_reserved)
    return underscored.strip('_')

def download_neurips_articles():
    """Download the NeurIPS 2024 Datasets & Benchmarks papers and build a metadata CSV.

    Steps:
      1. Query OpenReview for every note whose ``venueid`` equals the track id.
      2. Download each note's PDF into ``OUTPUT_DIR`` (files that already exist
         are not downloaded again).
      3. Extract the text of every page with pdfplumber.
      4. Write one row per paper to ``metadata.csv`` with the columns
         ``title``, ``keywords``, ``openreview_url``, ``pdf_filename`` and
         ``fulltext``.

    Error handling is best-effort: a paper that fails to download or process is
    reported with ``tqdm.write`` and left out of the CSV; a PDF whose text
    cannot be extracted is kept with the placeholder fulltext
    ``"Could not extract fulltext"``; any other error is printed and ends the
    run. The function returns ``None`` in all cases.
    """
    try:
        # initialize openreview client
        client = openreview.api.OpenReviewClient(baseurl='https://api2.openreview.net')

        # set up constants
        ACCEPTED_VENUE_ID = 'NeurIPS.cc/2024/Datasets_and_Benchmarks_Track'
        OUTPUT_DIR = 'NeurIPS2024_Datasets_and_Benchmarks'
        METADATA_CSV_FILE = os.path.join(OUTPUT_DIR, 'metadata.csv')
        # (connect, read) timeouts in seconds, so one stalled request cannot
        # hang the whole run indefinitely.
        REQUEST_TIMEOUT = (10, 60)

        # make output directory
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # get all accepted articles
        print(f"Querying openreview for venue ID: {ACCEPTED_VENUE_ID}")
        notes_iterator = openreview.tools.iterget_notes(client, content={'venueid': ACCEPTED_VENUE_ID})
        all_notes = list(notes_iterator)
        print(f"Total notes retrieved: {len(all_notes)}")

        if not all_notes:
            print("No accepted articles found")
            return

        # downloading pdfs and metadata, extracting fulltext to .csv
        all_metadata = []
        print(f"Downloading {len(all_notes)} articles to '{OUTPUT_DIR}'")

        for note in tqdm(all_notes):
            try:
                title = note.content['title']['value']
                pdf_url_relative = note.content['pdf']['value']

                pdf_url_full = f"https://openreview.net{pdf_url_relative}"
                # NOTE(review): the file name is derived from the title only, so two
                # papers whose titles sanitize to the same name would share one PDF.
                pdf_filename = f"{clean_filename(title)}.pdf"
                pdf_path = os.path.join(OUTPUT_DIR, pdf_filename)

                # download pdf
                if not os.path.exists(pdf_path):
                    # Stream into a ".part" file and rename it only after the last
                    # chunk is written. Otherwise an interrupted download would
                    # leave a truncated PDF that the exists-check above treats as
                    # complete on the next run.
                    partial_path = f"{pdf_path}.part"
                    try:
                        with requests.get(pdf_url_full, stream=True, timeout=REQUEST_TIMEOUT) as response:
                            response.raise_for_status()
                            with open(partial_path, 'wb') as f:
                                for chunk in response.iter_content(chunk_size=8192):
                                    f.write(chunk)
                        os.replace(partial_path, pdf_path)
                    finally:
                        # Still present only when the download did not complete.
                        if os.path.exists(partial_path):
                            os.remove(partial_path)

                # extract fulltext
                fulltext = ""
                try:
                    with pdfplumber.open(pdf_path) as pdf:
                        # extract_text() may return None for a page; use "" instead.
                        fulltext = "\n".join(page.extract_text() or "" for page in pdf.pages)
                except Exception as e:
                    tqdm.write(f"Error extracting fulltext from {pdf_path}: {e}")
                    # Placeholder (not an empty string): checks for blank fulltext
                    # will not flag these rows.
                    fulltext = "Could not extract fulltext"

                # collect metadata
                article_metadata = {
                    'title': title,
                    'keywords': note.content.get('keywords', {}).get('value', []),
                    'openreview_url': f"https://openreview.net/forum?id={note.id}",
                    'pdf_filename': pdf_filename,
                    'fulltext': fulltext
                }
                all_metadata.append(article_metadata)

            except Exception as e:
                # Best-effort: report the failing paper and carry on with the rest.
                tqdm.write(f"Error processing article '{note.content.get('title', {}).get('value', 'Unknown')}': {e}")

        if not all_metadata:
            print("No metadata found")
            return

        # save metadata to .csv
        with open(METADATA_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=all_metadata[0].keys())
            writer.writeheader()
            writer.writerows(all_metadata)

        print(f"Metadata saved to '{METADATA_CSV_FILE}'")

    except Exception as e:
        print(f"Error processing articles: {e}")

# Run the download/extraction pipeline, then print a completion marker.
if __name__ == '__main__':
    download_neurips_articles()
    print("Done")







Querying openreview for venue ID: NeurIPS.cc/2024/Datasets_and_Benchmarks_Track
Total notes retrieved: 459
Downloading 459 articles to 'NeurIPS2024_Datasets_and_Benchmarks'


  1%|          | 3/459 [00:18<46:17,  6.09s/it]Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray stroke color because /'P13' is an invalid float value
Cannot set gray non-stroke color because /'P13' is an invalid float value
Cannot set gray stroke color because /'P14' is an invalid float value
Cannot set gray non-stroke color because /'P14' is an invalid float value
Cannot set gray stroke color because /'P15' is an invalid float value
Cannot set gray non-stroke color because /'P15' is an invalid float value
Cannot set gray stroke color because /'P16' is an invalid float value
Cannot set gray non-strok

Metadata saved to 'NeurIPS2024_Datasets_and_Benchmarks/metadata.csv'
Done


In [27]:
import pandas as pd
# Load the metadata CSV from the download folder and preview the first rows.
df = pd.read_csv('NeurIPS2024_Datasets_and_Benchmarks/metadata.csv')
df.head()

Unnamed: 0,title,keywords,openreview_url,pdf_filename,fulltext
0,DevBench: A multimodal developmental benchmark...,"['multimodal', 'developmental', 'language', 'e...",https://openreview.net/forum?id=zogaeVpbaE,DevBench_A_multimodal_developmental_benchmark_...,DEVBENCH: A multimodal developmental benchmark...
1,Point Cloud Matters: Rethinking the Impact of ...,"['Point Cloud', 'RGB-D images', 'Robot Learnin...",https://openreview.net/forum?id=zgSnSZ0Re6,Point_Cloud_Matters_Rethinking_the_Impact_of_D...,Point Cloud Matters: Rethinking the Impact of\...
2,XLand-MiniGrid: Scalable Meta-Reinforcement Le...,"['reinforcement learning', 'meta-reinforcement...",https://openreview.net/forum?id=zg8dpAGl1I,XLand-MiniGrid_Scalable_Meta-Reinforcement_Lea...,XLand-MiniGrid: Scalable Meta-Reinforcement\nL...
3,kGym: A Platform and Dataset to Benchmark Larg...,"['Benchmarks', 'Datasets', 'Natural Language P...",https://openreview.net/forum?id=zQ3qU0xWZ5,kGym_A_Platform_and_Dataset_to_Benchmark_Large...,KGYM: A Platform and Dataset to Benchmark Larg...
4,$E^3$: Exploring Embodied Emotion Through A La...,['emotion analysis; egocentric datasets ; vide...,https://openreview.net/forum?id=zGfKPqunJG,$E^3$_Exploring_Embodied_Emotion_Through_A_Lar...,E3\n: Exploring Embodied Emotion Through\nA La...


In [28]:
# Column overview: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           459 non-null    object
 1   keywords        459 non-null    object
 2   openreview_url  459 non-null    object
 3   pdf_filename    459 non-null    object
 4   fulltext        459 non-null    object
dtypes: object(5)
memory usage: 18.1+ KB


In [32]:
# checking for failed fulltext extractions
# A failed extraction can appear in two ways: as blank text (pdfplumber found
# no text on any page) or as the placeholder string that the download step
# stores when pdfplumber raises an error. Both are flagged here.
EXTRACTION_FAILED_PLACEHOLDER = "Could not extract fulltext"
fulltext_stripped = df['fulltext'].fillna('').str.strip()
empty_mask = fulltext_stripped.isin(['', EXTRACTION_FAILED_PLACEHOLDER])
missing_fulltext = df.loc[empty_mask, ['title', 'pdf_filename']]

if missing_fulltext.empty:
    print("No failed fulltext extractions")
else:
    print("Failed fulltext extractions:")
    print(missing_fulltext)

Failed fulltext extractions:
                                                 title  \
180  LINGOLY: A Benchmark of Olympiad-Level Linguis...   

                                          pdf_filename  
180  LINGOLY_A_Benchmark_of_Olympiad-Level_Linguist...  


In [30]:
# Disabled cell: the OCR helper below is wrapped in a string literal, so none of
# it runs; the only effect of executing this cell is that the string is echoed
# as the cell output. To use it, remove the surrounding triple quotes
# (pytesseract and pdf2image would then have to be installed).
"""
# The failed fulltext extraction seems to be image-based. Will use tesseract to extract text instead.
# I will manually input the fulltext into the .csv file, since it is only one article.
import pytesseract
from pdf2image import convert_from_path
import os

def extract_text_from_image_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)
        full_text = ""
        for i, image in enumerate(images):
            print(f"   - Reading page {i + 1}/{len(images)}")
            text = pytesseract.image_to_string(image)
            full_text += text + "\n\n" # Add page breaks

        print(full_text)

    except Exception as e:
        print(f"An error occurred during OCR processing: {e}")
        print("Please ensure Tesseract is installed and accessible in your system's PATH.")


if __name__ == "__main__":
    filename = "LINGOLY_A_Benchmark_of_Olympiad-Level_Linguistic_Reasoning_Puzzles_in_Low_Resource_and_Extinct_Languages.pdf"
    folder = "NeurIPS2024_Datasets_and_Benchmarks"
    file_to_process = os.path.join(folder, filename)

    extract_text_from_image_pdf(file_to_process)
"""

'\n# The failed fulltext extraction seems to be image-based. Will use tesseract to extract text instead.\n# I will manually input the fulltext into the .csv file, since it is only one article.\nimport pytesseract\nfrom pdf2image import convert_from_path\nimport os\n\ndef extract_text_from_image_pdf(pdf_path):\n    try:\n        images = convert_from_path(pdf_path)\n        full_text = ""\n        for i, image in enumerate(images):\n            print(f"   - Reading page {i + 1}/{len(images)}")\n            text = pytesseract.image_to_string(image)\n            full_text += text + "\n\n" # Add page breaks\n\n        print(full_text)\n\n    except Exception as e:\n        print(f"An error occurred during OCR processing: {e}")\n        print("Please ensure Tesseract is installed and accessible in your system\'s PATH.")\n\n\nif __name__ == "__main__":\n    filename = "LINGOLY_A_Benchmark_of_Olympiad-Level_Linguistic_Reasoning_Puzzles_in_Low_Resource_and_Extinct_Languages.pdf"\n    folder = 

In [33]:
# Save a backup copy of the current metadata frame (written without the index
# column) before the row with the empty fulltext is removed.
df.to_csv('NeurIPS2024_Datasets_and_Benchmarks/backup_metadata.csv', index=False)

In [37]:
# Print the title of the row at position 180 (the row reported by the
# failed-extraction check) and show the frame summary.
row = df.iloc[180]
print(row['title'])
df.info()

LINGOLY: A Benchmark of Olympiad-Level Linguistic Reasoning Puzzles in Low Resource and Extinct Languages
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           459 non-null    object
 1   keywords        459 non-null    object
 2   openreview_url  459 non-null    object
 3   pdf_filename    459 non-null    object
 4   fulltext        459 non-null    object
dtypes: object(5)
memory usage: 18.1+ KB


In [38]:
# MAKE SURE IT IS THE RIGHT ROW: select it by its blank fulltext instead of a
# hard-coded position, so re-running this cell (or a changed row order) cannot
# drop a different article. The original index labels are kept.
# NOTE(review): this only changes the in-memory `df`; metadata.csv on disk is
# not rewritten here, so code that re-reads the CSV still sees the row.
df = df[df['fulltext'].fillna('').str.strip() != '']

In [39]:
# Re-check which article now sits at position 180 and confirm the new row count.
row = df.iloc[180]
print(row['title'])
df.info()

WebUOT-1M: Advancing Deep Underwater Object Tracking with A Million-Scale Benchmark
<class 'pandas.core.frame.DataFrame'>
Int64Index: 458 entries, 0 to 458
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           458 non-null    object
 1   keywords        458 non-null    object
 2   openreview_url  458 non-null    object
 3   pdf_filename    458 non-null    object
 4   fulltext        458 non-null    object
dtypes: object(5)
memory usage: 21.5+ KB


In [42]:
def add_word_column(file_path=None, search_terms=None):
    """Add 0/1 indicator columns marking which papers mention given word stems.

    Reads the metadata CSV, adds one integer column per search term (1 when the
    regular expression matches anywhere in ``fulltext``, case-insensitively,
    otherwise 0), writes the frame back to the same file, and prints the first
    rows of the new columns.

    Args:
        file_path: CSV to read and overwrite. Defaults to ``metadata.csv`` in
            the ``NeurIPS2024_Datasets_and_Benchmarks`` folder.
        search_terms: Mapping of new column name -> regex pattern. Defaults to
            the stems ``represent``, ``divers`` and ``similar``.

    Returns:
        None. A missing file or any other error is printed, not raised.
    """
    if file_path is None:
        file_path = os.path.join('NeurIPS2024_Datasets_and_Benchmarks', 'metadata.csv')

    if search_terms is None:
        search_terms = {
            'representation_mentioned': r'represent',
            'diversity_mentioned': r'divers',
            'similarity_mentioned': r'similar',
        }

    try:
        print(f"Reading file: {file_path}")
        df = pd.read_csv(file_path)

        for column_name, pattern in search_terms.items():
            # na=False: a missing fulltext counts as "not mentioned". Without it
            # str.contains returns NaN for such rows and astype(int) raises.
            df[column_name] = df['fulltext'].str.contains(
                pattern,
                case=False,
                regex=True,
                na=False
            ).astype(int)

        df.to_csv(file_path, index=False)

        #checking the results
        print(df[['title'] + list(search_terms.keys())].head())

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Add the word-mention indicator columns to the metadata CSV on disk.
if __name__ == "__main__":
    add_word_column()

Reading file: NeurIPS2024_Datasets_and_Benchmarks/metadata.csv
                                               title  \
0  DevBench: A multimodal developmental benchmark...   
1  Point Cloud Matters: Rethinking the Impact of ...   
2  XLand-MiniGrid: Scalable Meta-Reinforcement Le...   
3  kGym: A Platform and Dataset to Benchmark Larg...   
4  $E^3$: Exploring Embodied Emotion Through A La...   

   representation_mentioned  diversity_mentioned  similarity_mentioned  
0                         1                    1                     1  
1                         1                    1                     1  
2                         1                    1                     1  
3                         1                    1                     1  
4                         1                    1                     0  


In [45]:
# Re-read the metadata CSV from disk (now including the *_mentioned columns)
# into a separate frame and show its summary.
df2 = pd.read_csv('NeurIPS2024_Datasets_and_Benchmarks/metadata.csv')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   title                     459 non-null    object
 1   keywords                  459 non-null    object
 2   openreview_url            459 non-null    object
 3   pdf_filename              459 non-null    object
 4   fulltext                  459 non-null    object
 5   representation_mentioned  459 non-null    int64 
 6   diversity_mentioned       459 non-null    int64 
 7   similarity_mentioned      459 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 28.8+ KB


In [48]:
#checking how many articles contain each of the three words and their frequency
word_columns = ['representation_mentioned', 'diversity_mentioned', 'similarity_mentioned']
total_articles = len(df2)

for col in word_columns:
    count = df2[col].sum()
    percentage = (count / total_articles) * 100
    print(f"Number of articles containing '{col}': {count}")
    print(f"Percentage of articles containing '{col}': {percentage:.2f}%")

combination_counts = df2.groupby(word_columns).size().reset_index(name='article_count')
combination_counts = combination_counts.sort_values(by='article_count', ascending=False)

print("Combination of word mentions (1=mentioned, 0=not mentioned):")
print(combination_counts.to_string(index=False))

Number of articles containing 'representation_mentioned': 452
Percentage of articles containing 'representation_mentioned': 98.47%
Number of articles containing 'diversity_mentioned': 428
Percentage of articles containing 'diversity_mentioned': 93.25%
Number of articles containing 'similarity_mentioned': 436
Percentage of articles containing 'similarity_mentioned': 94.99%
Combination of word mentions (1=mentioned, 0=not mentioned):
 representation_mentioned  diversity_mentioned  similarity_mentioned  article_count
                        1                    1                     1            407
                        1                    0                     1             24
                        1                    1                     0             17
                        1                    0                     0              4
                        0                    1                     1              3
                        0                    0              

In [50]:
#checking the frequency of different keywords from article metadata
from collections import Counter

cleaned_keywords = (
    df2['keywords']
    .str.lower()  #lowercase
    .str.replace(r"[\[\]']", "", regex=True)  #remove brackets and quotes
    .str.split(',')
    .explode()
    .str.strip()  #remove leading/trailing whitespace
)

#counting frequencies
keyword_counts = cleaned_keywords.value_counts()
keyword_counts = keyword_counts[keyword_counts.index != '']
print("keyword frequencies:")
print(keyword_counts.head(20))

keyword frequencies:
benchmark                           88
large language models               40
dataset                             40
evaluation                          20
large language model                16
reinforcement learning              15
llms                                15
multimodal                          14
llm                                 13
benchmarks                          12
computer vision                     12
machine learning                    10
vision-language models               9
remote sensing                       8
datasets                             8
benchmarking                         8
multimodal large language models     7
synthetic data                       7
graph neural networks                7
healthcare                           6
Name: keywords, dtype: int64


In [None]:
#TODO
#FINISH ARTICLE CODING/ANNOTATION
#ADD COLUMN WITH PARAGRAPHS WHERE WORDS ARE MENTIONED
#WRITE ARTICLE :))