In [None]:
! pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
import numpy as np
import pandas as pd
import pylab as plt

import requests
import time

import requests
from bs4 import BeautifulSoup
import re
import fitz

from google.colab import files

In [None]:
# Collect the hash id of every paper listed on one year's proceedings page.
# works for 2018/2019/2020/2021/2022/2023/2024
year = 2024
base_url = "https://papers.neurips.cc"
book_url = f"{base_url}/paper_files/paper/{year}"

headers = {
    "User-Agent": "Mozilla/5.0"
}

# The timeout keeps a stalled connection from hanging the cell, and
# raise_for_status() stops an HTTP error page from being parsed as an
# (empty) paper listing.
resp = requests.get(book_url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

papers = []
for li in soup.select("ul.paper-list > li"):
    a = li.find("a", attrs={"title": "paper title"})
    i = li.find("i")

    # Only list entries that carry both a title link and an <i> element are kept.
    if not (a and i):
        continue

    # The 32-character hex hash in the link is what the per-paper URLs are built from.
    match = re.search(r"hash/([a-f0-9]{32})", a["href"])
    if match:
        papers.append({
            "hash_id": match.group(1),
            'year' : year
        })

print(f"Extracted {len(papers)} papers")
# Guarded so that an empty result is reported instead of raising IndexError.
print("Sanity check:", papers[0] if papers else "no papers found")

Extracted 4494 papers
Sanity check: {'hash_id': '000f947dcaff8fbffcc3f53a1314f358', 'year': 2024}


In [None]:
# Fetch one paper's abstract page and extract its metadata, as a probe of the
# page layout before scraping in bulk.
# works for 2018/2019/2020/2021/2022/2023/2024
year = 2024
hash_id = '000f947dcaff8fbffcc3f53a1314f358'
base_url = "https://papers.neurips.cc"
url = f"{base_url}/paper_files/paper/{year}/hash/{hash_id}-Abstract.html"
headers = {"User-Agent": "Mozilla/5.0"}

# Bound the wait and fail loudly on an HTTP error instead of parsing the error page.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Title and PDF link are read from the page's citation_* <meta> tags.
title_meta = soup.find("meta", attrs={"name": "citation_title"})
title = title_meta["content"] if title_meta else "N/A"

pdf_meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
pdf_url = pdf_meta["content"] if pdf_meta else "N/A"

# Authors: the <i> inside the <p> that follows the first <h4> mentioning "Authors".
authors = "N/A"
authors_heading = next((h4 for h4 in soup.find_all("h4") if "Authors" in h4.text), None)
if authors_heading is not None:
    p_tag = authors_heading.find_next_sibling("p")
    i_tag = p_tag.find("i") if p_tag else None
    if i_tag:
        authors = i_tag.get_text(strip=True)

# Abstract: the <p> that follows the first <h4> mentioning "Abstract".
abstract = "N/A"
abstract_heading = next((h4 for h4 in soup.find_all("h4") if "Abstract" in h4.text), None)
if abstract_heading is not None:
    p = abstract_heading.find_next_sibling("p")
    if p:
        abstract = p.text.strip()

# The "Bibtex" link's href is appended to base_url to form the full URL.
bib_button = soup.find("a", string="Bibtex")
bib_url = base_url + bib_button["href"] if bib_button else "N/A"

print("Title:", title)
print("Authors:", authors)
print("PDF URL:", pdf_url)
print("BibTeX URL:", bib_url)
print("Abstract:", abstract)


Title: MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence
Authors: Ionut-Vlad Modoranu, Mher Safaryan, Grigory Malinovsky, Eldar Kurtic, Thomas Robert, Peter Richtárik, Dan Alistarh
PDF URL: https://proceedings.neurips.cc/paper_files/paper/2024/file/000f947dcaff8fbffcc3f53a1314f358-Paper-Conference.pdf
BibTeX URL: https://papers.neurips.cc/paper_files/paper/23213-/bibtex
Abstract: We propose a new variant of the Adam optimizer called MicroAdam that specifically minimizes memory overheads, while maintaining theoretical convergence guarantees. We achieve this by compressing the gradient information before it is fed into the optimizer state, thereby reducing its memory footprint significantly. We control the resulting compression  error via a novel instance of the classical error feedback mechanism from distributed optimization in which the error correction information is itself compressed to allow for practical memory gains. We prove that the resul

In [None]:
# Build the work list for the bulk scrape: one {"hash_id", "year"} record per
# paper found on each year's proceedings page.
years = [2021, 2022, 2023, 2024]
base_url = "https://papers.neurips.cc"
headers = {"User-Agent": "Mozilla/5.0"}

papers = []

# A single session reuses the connection to the host across the per-year requests.
with requests.Session() as session:
    session.headers.update(headers)

    for year in years:
        book_url = f"{base_url}/paper_files/paper/{year}"
        # Bound the wait and refuse to parse an HTTP error page as a listing.
        resp = session.get(book_url, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        count_before = len(papers)
        for li in soup.select("ul.paper-list > li"):
            a = li.find("a", attrs={"title": "paper title"})
            if not a:
                continue
            # The 32-character hex hash in the link identifies the paper's pages.
            match = re.search(r"hash/([a-f0-9]{32})", a["href"])
            if match:
                papers.append({
                    "hash_id": match.group(1),
                    "year": year
                })

        # A year that contributes nothing would otherwise go unnoticed in the total.
        if len(papers) == count_before:
            print(f"Warning: no papers found for {year}")

print(f"Extracted {len(papers)} papers")
# Guarded so that an empty result is reported instead of raising IndexError.
print("Sanity check:", papers[0] if papers else "no papers found")

Extracted 13202 papers
Sanity check: {'hash_id': '000c076c390a4c357313fca29e390ece', 'year': 2021}


In [None]:
! pip install tqdm



In [None]:
from tqdm import tqdm

# Network settings for the bulk scrape. With no timeout a single stalled
# connection can block the loop for minutes before failing.
REQUEST_TIMEOUT = 15    # seconds allowed per request attempt
MAX_ATTEMPTS = 3        # tries per page before the paper is recorded as failed
RETRY_BACKOFF = 2.0     # seconds; multiplied by the attempt number between tries

base_url = "https://papers.neurips.cc"
headers = {"User-Agent": "Mozilla/5.0"}


def _fetch_html(session, url):
    """Return the body of `url`, retrying on connection errors and timeouts.

    Raises requests.HTTPError immediately on a 4xx/5xx response, and re-raises
    the last connection/timeout error once MAX_ATTEMPTS tries have failed.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            resp = session.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except (requests.ConnectionError, requests.Timeout):
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(RETRY_BACKOFF * attempt)


def _paragraph_after_heading(soup, keyword):
    """Return the <p> sibling after the first <h4> whose text contains `keyword`, or None."""
    for h4 in soup.find_all("h4"):
        if keyword in h4.text:
            return h4.find_next_sibling("p")
    return None


def _parse_paper_page(html, year, hash_id):
    """Extract one paper's metadata from the HTML of its abstract page.

    Returns a dict with keys year, hash_id, title, authors, abstract, pdf_url
    and bib_url; any field not found on the page is the string "N/A".
    """
    soup = BeautifulSoup(html, "html.parser")

    title_meta = soup.find("meta", attrs={"name": "citation_title"})
    pdf_meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
    bib_button = soup.find("a", string="Bibtex")

    # Authors sit in an <i> inside the paragraph under the "Authors" heading.
    authors = "N/A"
    authors_p = _paragraph_after_heading(soup, "Authors")
    if authors_p:
        i_tag = authors_p.find("i")
        if i_tag:
            authors = i_tag.get_text(strip=True)

    abstract_p = _paragraph_after_heading(soup, "Abstract")

    return {
        "year": year,
        "hash_id": hash_id,
        "title": title_meta["content"] if title_meta else "N/A",
        "authors": authors,
        "abstract": abstract_p.text.strip() if abstract_p else "N/A",
        "pdf_url": pdf_meta["content"] if pdf_meta else "N/A",
        "bib_url": base_url + bib_button["href"] if bib_button else "N/A",
    }


paper_data = []
# Papers that could not be saved, each with the reason, so that they can be
# retried later instead of silently disappearing from the dataset.
failed_papers = []
total = len(papers)

# A single session reuses the connection to the host across all requests.
with requests.Session() as session:
    session.headers.update(headers)

    for hashy in tqdm(papers, desc="Scraping NeurIPS Papers"):
        year = hashy['year']
        hash_id = hashy['hash_id']
        url = f"{base_url}/paper_files/paper/{year}/hash/{hash_id}-Abstract.html"

        try:
            record = _parse_paper_page(_fetch_html(session, url), year, hash_id)
        except Exception as e:
            # Best effort: one bad page must not abort a multi-hour run.
            print(f"Error processing {hash_id}: {e}")
            failed_papers.append({"year": year, "hash_id": hash_id, "error": str(e)})
            continue

        # Papers without a PDF link are left out of the dataset, but are logged.
        if record["pdf_url"] == 'N/A':
            failed_papers.append({
                "year": year,
                "hash_id": hash_id,
                "error": "no citation_pdf_url meta tag on page",
            })
            continue

        paper_data.append(record)

papers_df = pd.DataFrame(paper_data)
# index=False: the positional index carries no information and would come back
# as a stray "Unnamed: 0" column when the file is read again.
papers_df.to_csv('2021_2024_neurIPS_papers.csv', index=False)

print(f"Saved {len(paper_data)} of {total} papers; {len(failed_papers)} recorded in failed_papers")

Scraping NeurIPS Papers:   3%|▎         | 402/13202 [06:14<141:28:58, 39.79s/it]

Error processing 2a10665525774fa2501c2c8c4985ce61: HTTPSConnectionPool(host='papers.neurips.cc', port=443): Max retries exceeded with url: /paper_files/paper/2021/hash/2a10665525774fa2501c2c8c4985ce61-Abstract.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x79b9ca4f55d0>, 'Connection to papers.neurips.cc timed out. (connect timeout=None)'))


Scraping NeurIPS Papers: 100%|██████████| 13202/13202 [2:18:00<00:00,  1.59it/s]


In [None]:
# Reload the scraped table from disk and look for the paper whose request
# timed out during the bulk scrape (an empty result means it was not saved).
df = pd.read_csv('2021_2024_neurIPS_papers.csv')
df.loc[df['hash_id'].eq('2a10665525774fa2501c2c8c4985ce61')]

Unnamed: 0.1,Unnamed: 0,year,hash_id,title,authors,abstract,pdf_url,bib_url


In [None]:
import requests
from bs4 import BeautifulSoup

# Re-fetch the one paper that the bulk scrape lost to a connection timeout and
# assemble its record in `data`.
hash_id = "2a10665525774fa2501c2c8c4985ce61"
year = 2021
headers = {"User-Agent": "Mozilla/5.0"}
base_url = "https://papers.neurips.cc"

url = f"{base_url}/paper_files/paper/{year}/hash/{hash_id}-Abstract.html"

try:
    resp = requests.get(url, headers=headers, timeout=10)
    # Without this check an HTTP error page would be parsed into an all-"N/A"
    # record, which the following cell would then patch into the dataset.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Title and PDF link are read from the page's citation_* <meta> tags.
    title_meta = soup.find("meta", attrs={"name": "citation_title"})
    title = title_meta["content"] if title_meta else "N/A"

    pdf_meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
    pdf_url = pdf_meta["content"] if pdf_meta else "N/A"

    # Authors: the <i> inside the <p> that follows the first <h4> mentioning "Authors".
    authors = "N/A"
    authors_heading = next((h4 for h4 in soup.find_all("h4") if "Authors" in h4.text), None)
    if authors_heading is not None:
        p_tag = authors_heading.find_next_sibling("p")
        i_tag = p_tag.find("i") if p_tag else None
        if i_tag:
            authors = i_tag.get_text(strip=True)

    # Abstract: the <p> that follows the first <h4> mentioning "Abstract".
    abstract = "N/A"
    abstract_heading = next((h4 for h4 in soup.find_all("h4") if "Abstract" in h4.text), None)
    if abstract_heading is not None:
        p = abstract_heading.find_next_sibling("p")
        if p:
            abstract = p.text.strip()

    bib_button = soup.find("a", string="Bibtex")
    bib_url = base_url + bib_button["href"] if bib_button else "N/A"

    # Same keys as the records produced by the bulk scrape.
    data = {
        "year": year,
        "hash_id": hash_id,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "pdf_url": pdf_url,
        "bib_url": bib_url
    }

    print(" Recovered data:\n", data)

except Exception as e:
    print("Failed again:", e)


 Recovered data:
 {'year': 2021, 'hash_id': '2a10665525774fa2501c2c8c4985ce61', 'title': 'Learning where to learn: Gradient sparsity in meta and continual learning', 'authors': 'Johannes von Oswald, Dominic Zhao, Seijin Kobayashi, Simon Schug, Massimo Caccia, Nicolas Zucchet, João Sacramento', 'abstract': 'Finding neural network weights that generalize well from small datasets is difficult. A promising approach is to learn a weight initialization such that a small number of weight changes results in low generalization error. We show that this form of meta-learning can be improved by letting the learning algorithm decide which weights to change, i.e., by learning where to learn. We find that patterned sparsity emerges from this process, with the pattern of sparsity varying on a problem-by-problem basis. This selective sparsity results in better generalization and less interference in a range of few-shot and continual learning problems. Moreover, we find that sparse learning also emerges

In [None]:
# Append the recovered record to the table. Rows whose hash_id is already
# present are left out, so re-running this cell does not add a duplicate.
df_patch = pd.DataFrame([data])
new_rows = df_patch[~df_patch['hash_id'].isin(df['hash_id'])]
if not new_rows.empty:
    df = pd.concat([df, new_rows], ignore_index=True)

df.head()

Unnamed: 0.1,Unnamed: 0,year,hash_id,title,authors,abstract,pdf_url,bib_url
0,0.0,2021,000c076c390a4c357313fca29e390ece,Beyond Value-Function Gaps: Improved Instance-...,"Christoph Dann, Teodor Vanislavov Marinov, Meh...",We provide improved gap-dependent regret bound...,https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/11...
1,1.0,2021,003dd617c12d444ff9c80f717c3fa982,Learning One Representation to Optimize All Re...,"Ahmed Touati, Yann Ollivier",We introduce the forward-backward (FB) represe...,https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/11...
2,2.0,2021,007ff380ee5ac49ffc34442f5c2a2b86,Matrix factorisation and the interpretation of...,"Nick Whiteley, Annie Gray, Patrick Rubin-Delanchy","Given a graph or similarity matrix, we conside...",https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/11...
3,3.0,2021,0084ae4bc24c0795d1e6a4f58444d39b,UniDoc: Unified Pretraining Framework for Docu...,"Jiuxiang Gu, Jason Kuen, Vlad I Morariu, Hando...",Document intelligence automates the extraction...,https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/11...
4,4.0,2021,008bd5ad93b754d500338c253d9c1770,Finding Discriminative Filters for Specific De...,"Liangbin Xie, Xintao Wang, Chao Dong, Zhongang...",Recent blind super-resolution (SR) methods typ...,https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/11...


In [None]:
# Confirm that the recovered paper is now present in the table.
df.loc[df['hash_id'].eq('2a10665525774fa2501c2c8c4985ce61')]

Unnamed: 0.1,Unnamed: 0,year,hash_id,title,authors,abstract,pdf_url,bib_url
13201,,2021,2a10665525774fa2501c2c8c4985ce61,Learning where to learn: Gradient sparsity in ...,"Johannes von Oswald, Dominic Zhao, Seijin Koba...",Finding neural network weights that generalize...,https://proceedings.neurips.cc/paper_files/pap...,https://papers.neurips.cc/paper_files/paper/12...


In [None]:
# Write the completed table. The positional index is not data, so it is not
# written (index=False), and a leftover "Unnamed: 0" column from an earlier
# index round-trip through CSV is dropped if present.
df.drop(columns=['Unnamed: 0'], errors='ignore').to_csv('2021-2024_NIPS_papers.csv', index=False)

In [None]:
# Summary statistics for the numeric columns of the table; the `count` entry of
# the `year` column is the number of rows with a year value.
df.describe()

# Sanity check: the `year` count should be 13202, i.e. every paper listed for
# 2021, 2022, 2023 and 2024 is present.

Unnamed: 0.1,Unnamed: 0,year
count,13201.0,13202.0
mean,6600.0,2022.772156
std,3810.944787,1.100187
min,0.0,2021.0
25%,3300.0,2022.0
50%,6600.0,2023.0
75%,9900.0,2024.0
max,13200.0,2024.0
