In [None]:
! pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
import numpy as np
import pandas as pd
import pylab as plt

import requests
import time

import requests
from bs4 import BeautifulSoup
import re
import fitz

from google.colab import files

In [None]:
# Collect the hash IDs of all papers listed on one year's proceedings page.
# works for 2017-2000-1987
year = 1987
base_url = "https://papers.neurips.cc"
book_url = f"{base_url}/paper_files/paper/{year}"

headers = {
    "User-Agent": "Mozilla/5.0"
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as an empty listing.
resp = requests.get(book_url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

papers = []
for li in soup.select("ul.paper-list > li"):
    title_link = li.find("a", attrs={"title": "paper title"})
    authors_tag = li.find("i")

    # Only entries that carry both a title link and an <i> element are kept.
    if not (title_link and authors_tag):
        continue

    # The 32-character hex hash embedded in the link identifies the paper.
    match = re.search(r"hash/([a-f0-9]{32})", title_link["href"])
    if match:
        papers.append({
            "hash_id": match.group(1),
            "year": year,
        })

print(f"Extracted {len(papers)} papers")
# Guard against an IndexError when the selector matched nothing.
print("Sanity check:", papers[0] if papers else "no papers found")

Extracted 90 papers
Sanity check: {'hash_id': '03004620ea802b9118dd44d69f07af56', 'year': 1987}


In [None]:
# Trial run: scrape the metadata of a single paper from its abstract page.
year = 1987
hash_id = '03004620ea802b9118dd44d69f07af56'
base_url = "https://papers.neurips.cc"
url = f"{base_url}/paper_files/paper/{year}/hash/{hash_id}-Abstract.html"
headers = {"User-Agent": "Mozilla/5.0"}

# Fail fast on a stalled connection or an HTTP error instead of parsing an error page.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Title and PDF link are read from the page's citation <meta> tags.
title_meta = soup.find("meta", attrs={"name": "citation_title"})
title = title_meta["content"] if title_meta else "N/A"

pdf_meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
pdf_url = pdf_meta["content"] if pdf_meta else "N/A"

# Authors: the <i> element inside the <p> that follows the first <h4>
# whose text contains "Authors".
authors = "N/A"
for h4 in soup.find_all("h4"):
    if "Authors" in h4.text:
        p_tag = h4.find_next_sibling("p")
        if p_tag:
            i_tag = p_tag.find("i")
            if i_tag:
                authors = i_tag.get_text(strip=True)
        break

# Abstract: the text of the <p> that follows the first <h4> whose text
# contains "Abstract".
abstract = "N/A"
for h4 in soup.find_all("h4"):
    if "Abstract" in h4.text:
        p = h4.find_next_sibling("p")
        if p:
            abstract = p.text.strip()
        break

# The BibTeX link's href is appended to the site root to build the full URL.
bib_button = soup.find("a", string="Bibtex")
bib_url = base_url + bib_button["href"] if bib_button else "N/A"

print("Title:", title)
print("Authors:", authors)
print("PDF URL:", pdf_url)
print("BibTeX URL:", bib_url)
print("Abstract:", abstract)


Title: Synchronization in Neural Nets
Authors: Jacques J. Vidal, John Haggerty
PDF URL: https://proceedings.neurips.cc/paper_files/paper/1987/file/03004620ea802b9118dd44d69f07af56-Paper.pdf
BibTeX URL: https://papers.neurips.cc/paper_files/paper/1987/file/03004620ea802b9118dd44d69f07af56-Bibtex.bib
Abstract: The  paper  presents  an  artificial  neural  network  concept  (the  Synchronizable Oscillator Networks)  where the instants of individual  firings  in  the  form  of  point  processes  constitute  the  only  form  of  information  transmitted  between  joining  neurons.  This  type  of  communication contrasts with  that which  is  assumed  in most  other  models  which  typically  are  continuous  or  discrete  value-passing  networks.  Limiting the messages received  by each processing unit to  time  markers that signal  the firing  of other units  presents  significant  implemen tation advantages. 
In  our  model,  neurons  fire  spontaneously  and  regularly  in  the  absence

In [None]:
# Gather the hash IDs of every paper listed for 1987-2017, newest year first.
years = list(range(2017, 1986, -1))  # 2017, 2016, ..., 1987
base_url = "https://papers.neurips.cc"
headers = {"User-Agent": "Mozilla/5.0"}

papers = []

# One session reuses the underlying connection across the per-year requests.
with requests.Session() as session:
    session.headers.update(headers)

    for year in years:
        book_url = f"{base_url}/paper_files/paper/{year}"
        # Fail loudly on a timeout or HTTP error: otherwise a broken year page
        # would silently contribute zero papers to the list.
        resp = session.get(book_url, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        for li in soup.select("ul.paper-list > li"):
            a = li.find("a", attrs={"title": "paper title"})
            if not a:
                continue
            # The 32-character hex hash embedded in the link identifies the paper.
            match = re.search(r"hash/([a-f0-9]{32})", a["href"])
            if match:
                papers.append({
                    "hash_id": match.group(1),
                    "year": year,
                })

print(f"Extracted {len(papers)} papers")
# Guard against an IndexError when nothing was extracted.
print("Sanity check:", papers[0] if papers else "no papers found")

Extracted 7243 papers
Sanity check: {'hash_id': '0060ef47b12160b9198302ebdb144dcf', 'year': 2017}


In [None]:
! pip install tqdm



In [None]:
from tqdm import tqdm

# Scrape title, authors, abstract and PDF/BibTeX links for every entry in
# `papers`, then save the collected records to a CSV file.
base_url = "https://papers.neurips.cc"
headers = {"User-Agent": "Mozilla/5.0"}
REQUEST_TIMEOUT = (10, 60)  # (connect, read) timeouts in seconds


def _paragraph_after_heading(soup, heading):
    """Return the <p> sibling following the first <h4> whose text contains `heading`.

    Returns None when no such heading exists or when it has no <p> sibling after it.
    """
    for h4 in soup.find_all("h4"):
        if heading in h4.text:
            return h4.find_next_sibling("p")
    return None


def _scrape_paper(session, year, hash_id):
    """Fetch one paper's abstract page and extract its metadata.

    Args:
        session: `requests.Session` used to issue the GET request.
        year: Proceedings year used to build the page URL.
        hash_id: 32-character hex identifier of the paper.

    Returns:
        A dict with keys year, hash_id, title, authors, abstract, pdf_url and
        bib_url (missing fields are "N/A"), or None when the page advertises
        no PDF link.

    Raises:
        requests.RequestException: on connection errors, timeouts or an HTTP
            error status.
    """
    url = f"{base_url}/paper_files/paper/{year}/hash/{hash_id}-Abstract.html"
    resp = session.get(url, timeout=REQUEST_TIMEOUT)
    # Without this, an HTTP error page would be parsed and silently look like
    # a paper that has no PDF.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    title_meta = soup.find("meta", attrs={"name": "citation_title"})
    title = title_meta["content"] if title_meta else "N/A"

    authors = "N/A"
    authors_p = _paragraph_after_heading(soup, "Authors")
    if authors_p:
        i_tag = authors_p.find("i")
        if i_tag:
            authors = i_tag.get_text(strip=True)

    pdf_meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
    pdf_url = pdf_meta["content"] if pdf_meta else "N/A"

    abstract = "N/A"
    abstract_p = _paragraph_after_heading(soup, "Abstract")
    if abstract_p:
        abstract = abstract_p.text.strip()

    bib_button = soup.find("a", string="Bibtex")
    bib_url = base_url + bib_button["href"] if bib_button else "N/A"

    # Papers without a PDF link are left out of the dataset.
    if pdf_url == 'N/A':
        return None

    return {
        "year": year,
        "hash_id": hash_id,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "pdf_url": pdf_url,
        "bib_url": bib_url,
    }


paper_data = []
failed_papers = []  # entries of `papers` whose page could not be fetched or parsed

# A single session reuses connections across the thousands of requests; the
# adapter retries failed connection attempts before the error is reported.
with requests.Session() as session:
    session.headers.update(headers)
    session.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))

    for hashy in tqdm(papers, desc="Scraping NeurIPS Papers"):
        year = hashy['year']
        hash_id = hashy['hash_id']

        try:
            record = _scrape_paper(session, year, hash_id)
        except Exception as e:
            # Best-effort scrape: report the failure, remember the paper so it
            # can be retried later, and carry on with the rest.
            print(f"Error processing {hash_id}: {e}")
            failed_papers.append(hashy)
            continue

        if record is not None:
            paper_data.append(record)

if failed_papers:
    print(f"{len(failed_papers)} papers failed and are listed in `failed_papers`")

papers_df = pd.DataFrame(paper_data)
papers_df.to_csv('1987_2017_neurIPS_papers.csv')

Scraping NeurIPS Papers:  17%|█▋        | 1243/7243 [06:14<64:56:03, 38.96s/it]

Error processing fe40fb944ee700392ed51bfe84dd4e3d: HTTPSConnectionPool(host='papers.neurips.cc', port=443): Max retries exceeded with url: /paper_files/paper/2016/hash/fe40fb944ee700392ed51bfe84dd4e3d-Abstract.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x79ad4e170150>, 'Connection to papers.neurips.cc timed out. (connect timeout=None)'))


Scraping NeurIPS Papers: 100%|██████████| 7243/7243 [26:00<00:00,  4.64it/s]
