In [165]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import io
from PyPDF2 import PdfReader
import time
from urllib.parse import quote
import os
import csv
import difflib
import re
import os
import csv
import PyPDF2
from pathlib import Path
import argparse
import tqdm
import fitz  # PyMuPDF - better PDF extraction
from script import extract_text_from_pdf, clean_text

In [166]:
base_url = "https://www.aimodels.fyi"
papers_page = "/papers?search=&selectedTimeRange=thisYear&page={}"
PDF_DIR='arxiv_pdfs'
os.makedirs(PDF_DIR, exist_ok=True)
paper_urls = []

# Iterate through the first 5 pages (adjust as needed)
for page_num in range(1,10):
    url = base_url + papers_page.format(page_num)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all paper links
    for link in soup.find_all('a', href=True):
        href = link['href']
        #print(href)
        if href.startswith('/papers/arxiv/'):
            full_url = base_url + href
            #print(full_url)
            if full_url not in paper_urls:
                paper_urls.append(full_url)
        #print(paper_urls)

print(f"Found {len(paper_urls)} paper URLs.")

Found 108 paper URLs.


In [167]:
def extract_summary_and_pdf(paper_url):
    response = requests.get(paper_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    head_div = soup.find('div', class_= 'css-b1ilzc')
    heading = head_div.find('h1')
    heading = heading.get_text(" ", strip=True)

    summary_div = soup.find('div', class_='css-79elbk')

    if not summary_div:
        return 'No summary found.', paper_url

    # Replace all <h2> and <p> tags with plain text in a continuous format
    parts = []
    for element in summary_div.find_all(['h2', 'p', 'li']):
        if element.name == 'h2':
            text = element.get_text(" ", strip=True)
            text = '**'+text+'**'
            parts.append(text)
        else:
          text = element.get_text(" ", strip=True)
          parts.append(text)

    summary = ' '.join(parts)  # Join all parts with a space

    return summary, heading, paper_url

In [170]:
ARXIV_API_URL = "http://export.arxiv.org/api/query?search_query=ti:\"{}\"&max_results=1"

def find_arxiv_id_by_title(title, similarity_threshold=0.8):
    query_url = ARXIV_API_URL.format(quote(title))
    response = requests.get(query_url)
    if response.status_code != 200:
        print(f"Failed to search arXiv for: {title}")
        return None

    try:
        import xml.etree.ElementTree as ET
        root = ET.fromstring(response.text)
        entries = root.findall("{http://www.w3.org/2005/Atom}entry")
        best_match = None
        best_score = 0

        for entry in entries:
            arxiv_title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
            score = difflib.SequenceMatcher(None, title.strip().lower(), arxiv_title.lower()).ratio()
            if score > best_score:
                best_score = score
                best_match = entry

        if best_match and best_score >= similarity_threshold:
            arxiv_id_url = best_match.find("{http://www.w3.org/2005/Atom}id").text
            arxiv_id = arxiv_id_url.split('/abs/')[-1]
            print(f"Fuzzy match found (score={best_score:.2f}): {arxiv_id}")
            return arxiv_id
        else:
            print(f"No good match found for: {title} (best score: {best_score:.2f})")
            return None

    except Exception as e:
        print(f"Error parsing arXiv response for title '{title}': {e}")
        return None

In [171]:
def download_pdf(heading, arxiv_id):
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url)
    if response.status_code == 200:
      file_path = os.path.join(PDF_DIR, f"{heading}.pdf")
      print(f"Downloading {file_path}")
      with open(file_path, 'wb') as f:
          f.write(response.content)
      return file_path, io.BytesIO(response.content)
    return None, None

In [172]:
def count_pdf_pages(pdf_stream):
    try:
        reader = PdfReader(pdf_stream)
        print(f"Found {len(reader.pages)} pages")
        return len(reader.pages)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

In [180]:
with open('papers_summary.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['link','heading','arxiv_id','file_path','page_count','pdf_text','summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for paper_url in paper_urls:
        summary, heading, paper_url = extract_summary_and_pdf(paper_url)
        print(f"{heading}")

        arxiv_id = find_arxiv_id_by_title(heading)
        file_path, pdf = download_pdf(heading, arxiv_id)

        if pdf:
            num_pages = count_pdf_pages(pdf)
            print(f"{num_pages} pages")

        cleaned_pdf_text = ""
        if file_path and os.path.exists(file_path) and (num_pages<50):
            print(f"Extracting and cleaning PDF: {file_path}")
            raw_text = extract_text_from_pdf(file_path)
            if raw_text:
                cleaned_pdf_text = clean_text(raw_text)
            else:
                print(f"Could not extract text from PDF: {file_path}")

        if arxiv_id is not None:
            writer.writerow({
                'link': paper_url,
                'heading': heading,
                'arxiv_id': arxiv_id,
                'file_path': file_path,
                'page_count':num_pages,
                'pdf_text': cleaned_pdf_text,
                'summary': summary
            })
            print(f"Entry completed for {heading}")
            print("---------------------------------------------------------------------------------------")

DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model
Fuzzy match found (score=1.00): 2408.07541v1
Downloading arxiv_pdfs/DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model.pdf
Found 11 pages
11 pages
Extracting and cleaning PDF: arxiv_pdfs/DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model.pdf
Initial text length: 38777 characters
Removing contributors to reduce text length (38777 characters)
Final text length: 17149 characters
Entry completed for DifuzCam: Replacing Camera Lens with a Mask and a Diffusion Model
---------------------------------------------------------------------------------------
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning
Fuzzy match found (score=0.99): 2501.12948v1
Downloading arxiv_pdfs/DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.pdf
Found 22 pages
22 pages
Extracting and cleaning PDF: arxiv_pdfs/DeepSeek-R1: Incentivizing Reasoning Capability i

[0, [512, 0, 0, 250, 0, 408], 6, 9, 0, 10, [180, 333, 333, 500, 564, 250, 333, 250, 278], 19, 28, 500, 29, 30, 278, 31, [0, 564, 0, 444, 0, 722, 667, 667, 722, 611, 556, 722, 722, 333, 389, 722, 611, 889, 722, 722, 556, 722, 667, 556, 611, 722, 722, 944, 0, 719, 611, 333, 278, 333, 0, 500, 333, 444, 500, 444, 500, 444, 333, 500, 500, 278, 278, 500, 278, 778], 81, 84, 500, 85, [333, 389, 278, 500, 500, 722, 500, 500, 444, 480, 0, 480], 97, 199, 0, 200, [444], 201, 219, 0, 220, [444], 221, 285, 0, 286, [500], 287, 304, 0, 305, 306, 556, 307, [604, 821, 814, 0, 333], 312, 319, 0, 320, [333], 321, 336, 0, 337, [1000], 338, IndirectObject(548, 0, 135346077206736), 0, 343, 344, 444, 345, 391, 0, 392, [490]]


Initial text length: 81943 characters
Removing contributors to reduce text length (81943 characters)
Final text length: 29894 characters
Entry completed for LIMO: Less is More for Reasoning
---------------------------------------------------------------------------------------
LADDER: Self-Improving LLMs Through Recursive Problem Decomposition
No good match found for: LADDER: Self-Improving LLMs Through Recursive Problem Decomposition (best score: 0.00)
Alice in Wonderland: Simple Tasks Showing Complete Reasoning Breakdown in State-Of-the-Art Large Language Models
No good match found for: Alice in Wonderland: Simple Tasks Showing Complete Reasoning Breakdown in State-Of-the-Art Large Language Models (best score: 0.00)
Mind Your Step (by Step): Chain-of-Thought can Reduce Performance on Tasks where Thinking Makes Humans Worse
No good match found for: Mind Your Step (by Step): Chain-of-Thought can Reduce Performance on Tasks where Thinking Makes Humans Worse (best score: 0.00)
Aurora: A 



Initial text length: 40321 characters
Removing contributors to reduce text length (40321 characters)
Cleaned text too short (14097 chars), reverting to truncated original
Final text length: 29998 characters
Entry completed for Mixture of A Million Experts
---------------------------------------------------------------------------------------
StructuredRAG: JSON Response Formatting with Large Language Models
Fuzzy match found (score=1.00): 2408.11061v1
Downloading arxiv_pdfs/StructuredRAG: JSON Response Formatting with Large Language Models.pdf
Found 10 pages
10 pages
Extracting and cleaning PDF: arxiv_pdfs/StructuredRAG: JSON Response Formatting with Large Language Models.pdf
Initial text length: 27853 characters
Text is already under 30k characters, keeping all content.
Entry completed for StructuredRAG: JSON Response Formatting with Large Language Models
---------------------------------------------------------------------------------------
Better & Faster Large Language Models via M

In [181]:
import shutil
shutil.make_archive('arxiv_pdfs', 'zip', 'arxiv_pdfs')

'/content/arxiv_pdfs.zip'