In [2]:
!pip install playwright
!playwright install

Collecting playwright
  Downloading playwright-1.51.0-py3-none-macosx_11_0_arm64.whl.metadata (3.5 kB)
Collecting pyee<13,>=12 (from playwright)
  Downloading pyee-12.1.1-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet<4.0.0,>=3.1.1 (from playwright)
  Downloading greenlet-3.2.0-cp313-cp313-macosx_11_0_universal2.whl.metadata (4.1 kB)
Downloading playwright-1.51.0-py3-none-macosx_11_0_arm64.whl (38.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading greenlet-3.2.0-cp313-cp313-macosx_11_0_universal2.whl (269 kB)
Downloading pyee-12.1.1-py3-none-any.whl (15 kB)
Installing collected packages: pyee, greenlet, playwright
Successfully installed greenlet-3.2.0 playwright-1.51.0 pyee-12.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m

In [None]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import os
import csv
import difflib
from urllib.parse import quote
from PyPDF2 import PdfReader
import io
import requests
from script_2 import extract_text_from_pdf, clean_text #script_2 is the code that parses the pdfs

In [66]:
base_url = "https://www.aimodels.fyi"
papers_page = "/papers?search=&selectedTimeRange=thisYear&page={}"
PDF_DIR = 'arxiv_pdfs_new'
os.makedirs(PDF_DIR, exist_ok=True)
paper_urls = []

In [68]:
# Function to get paper URLs
async def get_paper_urls():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Set to True to run headlessly
        page = await browser.new_page()

        # Iterate through pages (adjust as needed)
        for page_num in range(1, 70):
            url = base_url + papers_page.format(page_num)
            await page.goto(url)
            await page.wait_for_timeout(5000)  # Wait for the page to load

            # Get page content and pass to BeautifulSoup for parsing
            page_content = await page.content()
            soup = BeautifulSoup(page_content, 'html.parser')

            # Find all paper links
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('/papers/arxiv/'):
                    full_url = base_url + href
                    if full_url not in paper_urls:
                        paper_urls.append(full_url)

        await browser.close()

    print(f"Found {len(paper_urls)} paper URLs.")

In [69]:
# Function to extract summary and heading from each paper page
async def extract_summary_and_pdf(paper_url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        await page.goto(paper_url)
        await page.wait_for_timeout(3000)

        # Get page content and pass to BeautifulSoup for parsing
        page_content = await page.content()
        soup = BeautifulSoup(page_content, 'html.parser')

        # Extract the paper's heading
        head_div = soup.find('div', class_='css-b1ilzc')
        heading = head_div.find('h1').get_text(" ", strip=True)

        # Extract the summary (or article body)
        summary_div = soup.find('div', class_='css-79elbk')

        if not summary_div:
            return 'No summary found.', heading, paper_url

        # Replace all <h2> and <p> tags with plain text in a continuous format
        parts = []
        for element in summary_div.find_all(['h2', 'p', 'li']):
            if element.name == 'h2':
                text = element.get_text(" ", strip=True)
                text = '**' + text + '**'
                parts.append(text)
            else:
                text = element.get_text(" ", strip=True)
                parts.append(text)

        summary = ' '.join(parts)  # Join all parts with a space

        await browser.close()

    return summary, heading, paper_url

In [70]:
# Function to find arxiv_id by title (using fuzzy matching)
def find_arxiv_id_by_title(title, similarity_threshold=0.8):
    ARXIV_API_URL = "http://export.arxiv.org/api/query?search_query=ti:\"{}\"&max_results=1"
    query_url = ARXIV_API_URL.format(quote(title))
    response = requests.get(query_url)
    if response.status_code != 200:
        print(f"Failed to search arXiv for: {title}")
        return None

    try:
        import xml.etree.ElementTree as ET
        root = ET.fromstring(response.text)
        entries = root.findall("{http://www.w3.org/2005/Atom}entry")
        best_match = None
        best_score = 0

        for entry in entries:
            arxiv_title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
            score = difflib.SequenceMatcher(None, title.strip().lower(), arxiv_title.lower()).ratio()
            if score > best_score:
                best_score = score
                best_match = entry

        if best_match and best_score >= similarity_threshold:
            arxiv_id_url = best_match.find("{http://www.w3.org/2005/Atom}id").text
            arxiv_id = arxiv_id_url.split('/abs/')[-1]
            print(f"Fuzzy match found (score={best_score:.2f}): {arxiv_id}")
            return arxiv_id
        else:
            print(f"No good match found for: {title} (best score: {best_score:.2f})")
            return None

    except Exception as e:
        print(f"Error parsing arXiv response for title '{title}': {e}")
        return None

In [71]:
# Function to download PDF
def download_pdf(heading, arxiv_id):
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url)
    if response.status_code == 200:
        file_path = os.path.join(PDF_DIR, f"{heading}.pdf")
        print(f"Downloading {file_path}")
        with open(file_path, 'wb') as f:
            f.write(response.content)
        return file_path, io.BytesIO(response.content)
    return None, None

In [72]:
# Function to count the number of pages in a PDF
def count_pdf_pages(pdf_stream):
    try:
        reader = PdfReader(pdf_stream)
        print(f"Found {len(reader.pages)} pages")
        return len(reader.pages)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

In [None]:
# Main function to orchestrate everything
async def main():
    await get_paper_urls()  # Get all paper URLs
    print(f"Found {len(paper_urls)} paper URLs.")

    # Open CSV file for writing
    with open('papers_summary.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['link', 'heading', 'arxiv_id', 'file_path', 'page_count', 'pdf_text', 'summary']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Process each paper URL
        for paper_url in paper_urls:
            summary, heading, paper_url = await extract_summary_and_pdf(paper_url)
            print(f"Processing {heading}...")

            arxiv_id = find_arxiv_id_by_title(heading)
            file_path, pdf = download_pdf(heading, arxiv_id)

            num_pages = 0
            if pdf:
                num_pages = count_pdf_pages(pdf)
                print(f"{num_pages} pages")

            cleaned_pdf_text = ""
            if file_path and os.path.exists(file_path) and num_pages < 50:
                print(f"Extracting and cleaning PDF: {file_path}")
                raw_text = extract_text_from_pdf(file_path)
                if raw_text:
                    cleaned_pdf_text = clean_text(raw_text)
                else:
                    print(f"Could not extract text from PDF: {file_path}")

            # Write data to CSV
            writer.writerow({
                'link': paper_url,  
                'heading': heading,
                'arxiv_id': arxiv_id,
                'file_path': file_path,
                'page_count': num_pages,
                'pdf_text': cleaned_pdf_text,
                'summary': summary
            })
            print(f"Entry completed for {heading}")
            print("---------------------------------------------------------------------------------------")


In [97]:
# Run the script using asyncio
await main()

Processing Words or Vision: Do Vision-Language Models Have Blind Faith in Text?...
No good match found for: Words or Vision: Do Vision-Language Models Have Blind Faith in Text? (best score: 0.00)
Entry completed for Words or Vision: Do Vision-Language Models Have Blind Faith in Text?
---------------------------------------------------------------------------------------
Processing MedSAM2: Segment Anything in 3D Medical Images and Videos...


  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=1.00): 2504.03600v1
Downloading arxiv_pdfs_new/MedSAM2: Segment Anything in 3D Medical Images and Videos.pdf
Found 13 pages
13 pages
Extracting and cleaning PDF: arxiv_pdfs_new/MedSAM2: Segment Anything in 3D Medical Images and Videos.pdf
Initial text length: 65087 characters
Removing contributors to reduce text length (65087 characters)
Removing references to reduce text length (38077 characters)
Removing appendix to reduce text length (22681 characters)
Removing acknowledgments to reduce text length (22681 characters)
Removing citations to reduce text length (22681 characters)
Removing emails to reduce text length (22556 characters)
Removing page_numbers to reduce text length (22556 characters)
Still over 30k after removing sections, truncating (22533 characters)
Final text length: 15993 characters
Entry completed for MedSAM2: Segment Anything in 3D Medical Images and Videos
-------------------------------------------------------------------------------------

  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=1.00): 2411.05821v2
Downloading arxiv_pdfs_new/Benchmarking Vision, Language, & Action Models on Robotic Learning Tasks.pdf
Found 19 pages
19 pages
Extracting and cleaning PDF: arxiv_pdfs_new/Benchmarking Vision, Language, & Action Models on Robotic Learning Tasks.pdf
Initial text length: 69218 characters
Removing contributors to reduce text length (69218 characters)
Cleaned text too short (1912 chars), reverting to truncated original
Final text length: 15998 characters
Entry completed for Benchmarking Vision, Language, & Action Models on Robotic Learning Tasks
---------------------------------------------------------------------------------------
Processing The Model Openness Framework: Promoting Completeness and Openness for Reproducibility, Transparency, and Usability in Artificial Intelligence...


  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=0.99): 2403.13784v6
Downloading arxiv_pdfs_new/The Model Openness Framework: Promoting Completeness and Openness for Reproducibility, Transparency, and Usability in Artificial Intelligence.pdf
Found 28 pages
28 pages
Extracting and cleaning PDF: arxiv_pdfs_new/The Model Openness Framework: Promoting Completeness and Openness for Reproducibility, Transparency, and Usability in Artificial Intelligence.pdf
Initial text length: 99465 characters
Removing contributors to reduce text length (99465 characters)
Removing references to reduce text length (32120 characters)
Cleaned text too short (7886 chars), reverting to truncated original
Final text length: 15998 characters
Entry completed for The Model Openness Framework: Promoting Completeness and Openness for Reproducibility, Transparency, and Usability in Artificial Intelligence
---------------------------------------------------------------------------------------
Processing RLEF: Grounding Code LLMs in Execution F

  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=0.99): 2410.02089v2
Downloading arxiv_pdfs_new/RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning.pdf
Found 23 pages
23 pages
Extracting and cleaning PDF: arxiv_pdfs_new/RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning.pdf
Initial text length: 73241 characters
Removing contributors to reduce text length (73241 characters)
Cleaned text too short (48 chars), reverting to truncated original
Final text length: 15976 characters
Entry completed for RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning
---------------------------------------------------------------------------------------
Processing Learning to Move Like Professional Counter-Strike Players...
No good match found for: Learning to Move Like Professional Counter-Strike Players (best score: 0.00)
Entry completed for Learning to Move Like Professional Counter-Strike Players
------------------------------------------------------------

  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=0.99): 2502.20391v1
Downloading arxiv_pdfs_new/Point Policy: Unifying Observations and Actions with Key Points for Robot Manipulation.pdf
Found 16 pages
16 pages
Extracting and cleaning PDF: arxiv_pdfs_new/Point Policy: Unifying Observations and Actions with Key Points for Robot Manipulation.pdf
Initial text length: 64064 characters
Removing contributors to reduce text length (64064 characters)
Cleaned text too short (690 chars), reverting to truncated original
Final text length: 15998 characters
Entry completed for Point Policy: Unifying Observations and Actions with Key Points for Robot Manipulation
---------------------------------------------------------------------------------------
Processing WebAssembly enables low latency interoperable augmented and virtual reality software...


  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=0.99): 2110.07128v2
Downloading arxiv_pdfs_new/WebAssembly enables low latency interoperable augmented and virtual reality software.pdf
Found 11 pages
11 pages
Extracting and cleaning PDF: arxiv_pdfs_new/WebAssembly enables low latency interoperable augmented and virtual reality software.pdf
Initial text length: 36301 characters
Removing contributors to reduce text length (36301 characters)
Final text length: 14709 characters
Entry completed for WebAssembly enables low latency interoperable augmented and virtual reality software
---------------------------------------------------------------------------------------
Processing Exploring GPU-to-GPU Communication: Insights into Supercomputer Interconnects...
No good match found for: Exploring GPU-to-GPU Communication: Insights into Supercomputer Interconnects (best score: 0.00)
Entry completed for Exploring GPU-to-GPU Communication: Insights into Supercomputer Interconnects
----------------------------------------

  if best_match and best_score >= similarity_threshold:


Fuzzy match found (score=1.00): 2411.10109v1
Downloading arxiv_pdfs_new/Generative Agent Simulations of 1,000 People.pdf
Found 65 pages
65 pages
Entry completed for Generative Agent Simulations of 1,000 People
---------------------------------------------------------------------------------------
Processing MambaByte: Token-free Selective State Space Model...
No good match found for: MambaByte: Token-free Selective State Space Model (best score: 0.00)
Entry completed for MambaByte: Token-free Selective State Space Model
---------------------------------------------------------------------------------------


In [78]:
paper_urls

['https://www.aimodels.fyi/papers/arxiv/difuzcam-replacing-camera-lens-mask-diffusion-model',
 'https://www.aimodels.fyi/papers/arxiv/deepseek-r1-incentivizing-reasoning-capability-llms-via',
 'https://www.aimodels.fyi/papers/arxiv/elements-differentiable-programming',
 'https://www.aimodels.fyi/papers/arxiv/chemputer-chemputation-universal-chemical-compound-synthesis-machine',
 'https://www.aimodels.fyi/papers/arxiv/xlstmtime-long-term-time-series-forecasting-xlstm',
 'https://www.aimodels.fyi/papers/arxiv/q-sparse-all-large-language-models-can',
 'https://www.aimodels.fyi/papers/arxiv/distilling-system-2-into-system-1',
 'https://www.aimodels.fyi/papers/arxiv/automated-design-agentic-systems',
 'https://www.aimodels.fyi/papers/arxiv/learning-to-learn-at-test-time-rnns',
 'https://www.aimodels.fyi/papers/arxiv/meta-rewarding-language-models-self-improving-alignment',
 'https://www.aimodels.fyi/papers/arxiv/to-code-or-not-to-code-exploring',
 'https://www.aimodels.fyi/papers/arxiv/llm-

In [95]:
[(i, url) for i, url in enumerate(paper_urls) if 'vision-language' in url.lower()]

[(61,
  'https://www.aimodels.fyi/papers/arxiv/toponets-high-performing-vision-language-models-brain'),
 (91,
  'https://www.aimodels.fyi/papers/arxiv/llava-cot-let-vision-language-models-reason'),
 (113,
  'https://www.aimodels.fyi/papers/arxiv/benchmarking-vision-language-models-optical-character-recognition'),
 (135,
  'https://www.aimodels.fyi/papers/arxiv/siglip-2-multilingual-vision-language-encoders-improved'),
 (141,
  'https://www.aimodels.fyi/papers/arxiv/smoldocling-ultra-compact-vision-language-model-end'),
 (255,
  'https://www.aimodels.fyi/papers/arxiv/unicorn-text-only-data-synthesis-vision-language'),
 (326,
  'https://www.aimodels.fyi/papers/arxiv/physbench-benchmarking-enhancing-vision-language-models-physical'),
 (381,
  'https://www.aimodels.fyi/papers/arxiv/enhancing-abnormality-grounding-vision-language-models-knowledge'),
 (465,
  'https://www.aimodels.fyi/papers/arxiv/where-do-large-vision-language-models-look'),
 (504,
  'https://www.aimodels.fyi/papers/arxiv/v

In [83]:
paper_urls[213:]

['https://www.aimodels.fyi/papers/arxiv/position-aiml-influencers-have-place-academic-process',
 'https://www.aimodels.fyi/papers/arxiv/how-numerical-precision-affects-mathematical-reasoning-capabilities',
 'https://www.aimodels.fyi/papers/arxiv/parametric-matrix-models',
 'https://www.aimodels.fyi/papers/arxiv/meissonic-revitalizing-masked-generative-transformers-efficient-high',
 'https://www.aimodels.fyi/papers/arxiv/operationalizing-threat-model-red-teaming-large-language',
 'https://www.aimodels.fyi/papers/arxiv/stop-overthinking-survey-efficient-reasoning-large-language',
 'https://www.aimodels.fyi/papers/arxiv/gold-medalist-performance-solving-olympiad-geometry-alphageometry2',
 'https://www.aimodels.fyi/papers/arxiv/janus-pro-unified-multimodal-understanding-generation-data',
 'https://www.aimodels.fyi/papers/arxiv/automated-capability-discovery-via-model-self-exploration',
 'https://www.aimodels.fyi/papers/arxiv/trustworthiness-generative-foundation-models-guideline-assessment