A simple parser for the author affiliations of articles from five top machine-learning conferences: **NeurIPS/ICML/ICLR/CVPR/ACL**. It collects statistics on the affiliation of research groups (**academic** or **commercial** research) and their impact on the ML community. Note: the scraping code below currently implements a parser for the ICML proceedings only.

In [None]:
!pip install fitz

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl (36 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.0.0-py3-none-any.whl (16 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl (421 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━

In [None]:
!pip install -U PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import fitz  # PyMuPDF
import os
import re

# Proceedings landing page of each top AI conference, keyed by conference name
conferences = dict(
    NeurIPS="https://proceedings.neurips.cc/",
    ICML="https://proceedings.mlr.press/v139/",
    ICLR="https://openreview.net/group?id=ICLR.cc/2023/Conference",
    CVPR="http://openaccess.thecvf.com/CVPR2023.py",
    ACL="https://aclanthology.org/events/acl-2023/",
)

# Names of known commercial entities; an affiliation mentioning any of them
# is treated as commercial research
commercial_entities = [
    "Google",
    "Microsoft",
    "OpenAI",
    "Facebook",
    "Amazon",
    "IBM",
    "Apple",
    "NVIDIA",
    "DeepMind",
    "Salesforce",
    "Alibaba",
    "Tencent",
    "Baidu",
]

# Function to classify affiliation
def classify_affiliation(affiliations, entities=None):
    """Classify a paper as "Commercial" or "Academic" from its affiliation strings.

    Args:
        affiliations: Iterable of affiliation strings (a single string is
            treated as one affiliation). ``None`` is treated as empty.
        entities: Optional iterable of commercial entity names. Defaults to the
            module-level ``commercial_entities`` list.

    Returns:
        "Commercial" if any affiliation mentions one of the entities
        (case-insensitively), otherwise "Academic".
    """
    if entities is None:
        entities = commercial_entities
    if affiliations is None or not entities:
        return "Academic"
    if isinstance(affiliations, str):
        # Iterating a bare string would compare single characters.
        affiliations = [affiliations]

    # Match entity names as whole names: a plain substring test would flag e.g.
    # "Amazonas" or "Pineapple". Only letters count as boundaries, so footnote
    # markers glued to the name ("1Google") still match.
    names = "|".join(re.escape(entity) for entity in entities)
    entity_pattern = re.compile(rf"(?<![A-Za-z])(?:{names})(?![A-Za-z])", re.IGNORECASE)

    if any(entity_pattern.search(affiliation) for affiliation in affiliations):
        return "Commercial"
    return "Academic"

def find_doi_by_title(title, timeout=30):
    """Look up the DOI of the best Crossref match for a paper title.

    Args:
        title: Paper title used as a bibliographic query.
        timeout: Seconds to wait for the Crossref API before giving up.

    Returns:
        The DOI string of the first search result, or ``None`` when the title
        is empty, nothing is found, or the request fails.
    """
    # An empty query would make Crossref return an arbitrary, unrelated work.
    if not title or not str(title).strip():
        print("DOI not found for the given title.")
        return None

    url = "https://api.crossref.org/works"
    params = {
        "query.bibliographic": title,
        "rows": 1  # Limit to one result for simplicity
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: Unable to fetch data. {e}")
        return None

    if response.status_code != 200:
        print(f"Error: Unable to fetch data. Status code {response.status_code}")
        return None

    try:
        items = response.json().get("message", {}).get("items", [])
    except ValueError:
        # Body was not valid JSON despite the 200 status.
        print("Error: Unable to fetch data. Response is not valid JSON.")
        return None

    if not items:
        print("DOI not found for the given title.")
        return None
    return items[0].get("DOI")

def get_citation_count(doi, timeout=30):
    """Fetch the Crossref "is-referenced-by-count" value for a DOI.

    Args:
        doi: DOI string, e.g. as returned by ``find_doi_by_title``.
        timeout: Seconds to wait for the Crossref API before giving up.

    Returns:
        The citation count reported by Crossref, or ``None`` when the DOI is
        missing, the request fails, or the field is absent from the response.
    """
    # find_doi_by_title returns None on failure; do not query ".../works/None".
    if not doi:
        print(f"Error: Unable to fetch data for DOI {doi}")
        return None

    url = f"https://api.crossref.org/works/{doi}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: Unable to fetch data for DOI {doi} ({e})")
        return None

    if response.status_code != 200:
        print(f"Error: Unable to fetch data for DOI {doi}")
        return None

    try:
        data = response.json()
    except ValueError:
        print(f"Error: Unable to fetch data for DOI {doi}")
        return None

    # .get() instead of indexing so a record without the field yields None
    # rather than a KeyError.
    return data.get("message", {}).get("is-referenced-by-count")

# Function to extract affiliation text from the first page of the PDF
def extract_affiliations_from_pdf(pdf_url, timeout=30):
    """Download a PDF and extract affiliation-like strings from its first page.

    Only the bottom-left region of the first page (lower 20% of the height,
    left half of the width) is scanned. The PDF is parsed from memory, so no
    temporary file is left behind on failure.

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        A list of matched affiliation strings; an empty list if the download
        or the PDF parsing fails for any reason (the error is printed).
    """
    try:
        pdf_response = requests.get(pdf_url, timeout=timeout)
        # Fail early on HTTP errors instead of feeding an error page to MuPDF.
        pdf_response.raise_for_status()

        # Open the PDF directly from the downloaded bytes
        document = fitz.open(stream=pdf_response.content, filetype="pdf")
        try:
            # Select the first page
            first_page = document[0]

            # Define the region to extract text from (left down corner)
            # Adjust the rectangle coordinates as necessary
            rect = fitz.Rect(0, first_page.rect.height * 0.8, first_page.rect.width * 0.5, first_page.rect.height)

            # Extract text from the specified region
            affiliation_text = first_page.get_text("text", clip=rect)
        finally:
            # Always release the document, even if text extraction fails.
            document.close()

        # Re-join words hyphenated across line breaks, then flatten to one line
        affiliation_text = re.sub(r'-\n', '', affiliation_text)
        affiliation_text = re.sub(r'\n', ' ', affiliation_text)

        # NOTE(review): matching is case-insensitive and has no word boundaries,
        # so short keywords such as "Inc" also match inside ordinary words
        # (e.g. "include"); tighten if the results are too noisy.
        keywords = (
            r"University|Institute|Company|Corporation|Inc|Ltd|LLC|Laboratories|"
            r"Google|Microsoft|OpenAI|Facebook|Amazon|IBM|Apple|NVIDIA|DeepMind|"
            r"Salesforce|Alibaba|Tencent|Baidu"
        )

        # Regular expression to find affiliations based on keywords
        affiliation_pattern = re.compile(rf"([A-Za-z0-9\s,]+(?:{keywords})[A-Za-z0-9\s,]*)", re.IGNORECASE)

        # Find all matches in the text
        return affiliation_pattern.findall(affiliation_text)
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole crawl.
        print(f"Error extracting PDF: {e}")
        return []

# Function to extract data from conference proceedings
def extract_papers(conference, url, timeout=30):
    """Scrape a proceedings page and collect affiliation data for each paper.

    Only conferences with a known page layout are parsed (currently ICML);
    any other conference is skipped and yields an empty list.

    Args:
        conference: Conference name, e.g. "ICML".
        url: URL of the proceedings listing page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys "title", "pdf_url", "affiliations" and
        "conference", one per paper that has a link on the listing page.
    """
    from urllib.parse import urljoin

    # CSS class of the per-paper <div> for each supported proceedings layout.
    paper_div_classes = {"ICML": "paper"}
    if conference not in paper_div_classes:
        # Previously `papers` was left undefined here, raising UnboundLocalError.
        print(f"Skipping {conference}: no parser implemented for this conference.")
        return []

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    papers = soup.find_all("div", class_=paper_div_classes[conference])

    paper_data = []
    for paper in papers:
        title_element = paper.find("p", class_="title")
        title = title_element.text if title_element else "N/A"

        paper_link = paper.find("a", href=True)
        if not paper_link:
            continue

        # Resolve relative links against the page they were found on.
        paper_page_url = urljoin(url, paper_link['href'])
        if conference == "ICML":
            # Get the correct PDF link from the paper's detail page
            try:
                paper_detail_response = requests.get(paper_page_url, timeout=timeout)
            except requests.RequestException as e:
                # One unreachable detail page should not abort the whole crawl.
                print(f"Error fetching {paper_page_url}: {e}")
                continue
            paper_detail_soup = BeautifulSoup(paper_detail_response.content, 'html.parser')
            # `string=` replaces the deprecated `text=` argument.
            pdf_link = paper_detail_soup.find("a", href=True, string="Download PDF")
            if pdf_link:
                paper_page_url = urljoin(paper_page_url, pdf_link['href'])

        affiliations = extract_affiliations_from_pdf(paper_page_url)
        paper_data.append({
            "title": title,
            "pdf_url": paper_page_url,
            "affiliations": affiliations,
            "conference": conference,
        })
    return paper_data

# Gather the paper records of every configured conference into one flat list
all_papers = []
for conference, url in conferences.items():
    papers = extract_papers(conference, url)
    if not papers:
        continue
    all_papers.extend(papers)

if all_papers:
    # Build a table with one row per extracted paper
    papers_df = pd.DataFrame(all_papers)

    # Label every paper via classify_affiliation, based on its affiliation list
    papers_df['affiliation'] = papers_df['affiliations'].apply(classify_affiliation)

    # Analysis: number of papers per affiliation label
    affiliation_counts = papers_df['affiliation'].value_counts()
    print(affiliation_counts)
    print(papers_df[['title', 'affiliation']])
else:
    # Nothing was scraped, so there is nothing to analyse
    print("No papers extracted.")

  pdf_link = paper_detail_soup.find("a", href=True, text="Download PDF")


MuPDF error: syntax error: cannot find ExtGState resource 'TRP1'

affiliation
Academic      845
Commercial    338
Name: count, dtype: int64


KeyError: "['citations'] not in index"