In [1]:
# Install the required libraries (uncomment on first run).
# %pip targets the running kernel's environment, unlike !pip.
# %pip install requests beautifulsoup4 pandas

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Step 1: Root of the arXiv site; listing and paper URLs are built on top of it.
base_url = 'https://arxiv.org'

In [4]:
# Step 2: Fetch the list of papers
# The listing URL asks for the past week's cs entries, starting at offset 0
# and showing 100 entries per page.
recent_cs_url = f"{base_url}/list/cs/pastweek?skip=0&show=100"

# A timeout (in seconds) keeps the cell from hanging forever on a stalled
# connection; requests applies no timeout unless one is passed.
response = requests.get(recent_cs_url, timeout=30)

# Fail loudly on an HTTP error status instead of silently parsing an error
# page as if it were the paper listing.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Echo the listing URL so it can be checked by eye before relying on the response.
recent_cs_url

'https://arxiv.org/list/cs/pastweek?skip=0&show=100'

In [6]:
# Echo the Response object; its repr shows the HTTP status code of the listing request.
response

<Response [200]>

In [7]:
# Uncomment to inspect the parsed listing HTML (very long output):
# soup

In [8]:
# Accumulators for the scraping step:
#   papers - one dict of details per scraped paper
#   i      - running count of listing entries visited
papers, i = [], 0

In [9]:
# Step 3: Extract the required data
# Each listing entry is assumed to be a 'dt' tag containing an anchor titled
# 'Abstract' that links to the paper's detail page.
def _clean_text(node, label=''):
    """Return the stripped text of a parsed HTML node.

    Args:
        node: A BeautifulSoup element, or None when the lookup found nothing.
        label: Optional leading descriptor (e.g. 'Title:') to drop. Only a
            leading occurrence is removed, so the same word appearing later
            in the text is preserved.

    Returns:
        The cleaned text, or 'Not found' when ``node`` is None.
    """
    if node is None:
        return 'Not found'
    text = node.text.strip()
    if label and text.startswith(label):
        text = text[len(label):].strip()
    return text


# Start from an empty list so re-running this cell does not append duplicates.
papers = []

# One Session reuses the underlying connection across the per-paper requests.
with requests.Session() as session:
    # enumerate restarts the counter at 1 on every run of this cell.
    for i, dt in enumerate(soup.find_all('dt'), start=1):
        # Find the anchor tag with the title 'Abstract'
        a_tag = dt.find('a', title='Abstract')
        href = a_tag.get('href') if a_tag else None
        if not href:
            continue

        paper_url = base_url + href

        # Fetch the paper's detailed page; a failed or timed-out request is
        # reported and skipped rather than aborting the whole scrape or
        # recording an error page as a paper.
        try:
            paper_response = session.get(paper_url, timeout=30)
            paper_response.raise_for_status()
        except requests.RequestException as exc:
            print(i, ": ", "skipped", paper_url, "-", exc)
            continue
        paper_soup = BeautifulSoup(paper_response.text, 'html.parser')

        title = _clean_text(paper_soup.find('h1', class_='title'), 'Title:')
        print(i, ": ", title)

        # The authors are in 'div' with class 'authors', within 'a' tags
        authors_div = paper_soup.find('div', class_='authors')
        authors = (
            [a.text.strip() for a in authors_div.find_all('a')]
            if authors_div else []
        )

        # The abstract is in a 'blockquote' with class 'abstract'
        abstract = _clean_text(
            paper_soup.find('blockquote', class_='abstract'), 'Abstract:'
        )

        # The subjects are in a 'td' with class 'tablecell subjects'
        subjects = _clean_text(paper_soup.find('td', class_='tablecell subjects'))

        # Add to our papers list
        papers.append({
            'title': title,
            'subjects': subjects,
            'authors': authors,
            'abstract': abstract
        })

1 :  Point Could Mamba: Point Cloud Learning via State Space Model
2 :  Mitigating Reversal Curse via Semantic-aware Permutation Training
3 :  An Experimental Study of Low-Latency Video Streaming over 5G
4 :  AtP*: An efficient and scalable method for localizing LLM behaviour to components
5 :  Neural Acceleration of Incomplete Cholesky Preconditioners
6 :  Dialect prejudice predicts AI decisions about people's character, employability, and criminality
7 :  Happy Ending: An Empty Hexagon in Every Set of 30 Points
8 :  Can Transformers Capture Spatial Relations between Objects?
9 :  Cost-Effective Activity Control of Asymptomatic Carriers in Layered Temporal Social Networks
10 :  Few-Shot Relation Extraction with Hybrid Visual Evidence
11 :  Subhomogeneous Deep Equilibrium Models
12 :  MAIDR: Making Statistical Visualizations Accessible with Multimodal Data Representation
13 :  Adaptive Learning Rate for Follow-the-Regularized-Leader: Competitive Ratio Analysis and Best-of-Both-Worlds
1

In [10]:
# Step 4: Save to CSV
# Build a single DataFrame from the collected paper records, then write it
# out without the positional index column.
df = pd.DataFrame(data=papers)
df.to_csv(path_or_buf='arxiv_papers.csv', index=False)

In [11]:
# Number the rows from 1 instead of 0 for display, then show the frame.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df

Unnamed: 0,title,subjects,authors,abstract
1,Point Could Mamba: Point Cloud Learning via St...,Computer Vision and Pattern Recognition (cs.CV),"[Tao Zhang, Xiangtai Li, Haobo Yuan, Shunping ...","In this work, for the first time, we demonstra..."
2,Mitigating Reversal Curse via Semantic-aware P...,Computation and Language (cs.CL); Artificial I...,"[Qingyan Guo, Rui Wang, Junliang Guo, Xu Tan, ...",While large language models (LLMs) have achiev...
3,An Experimental Study of Low-Latency Video Str...,Multimedia (cs.MM); Performance (cs.PF),"[Imran Khan, Tuyen X. Tran, Matti Hiltunen, Th...",Low-latency video streaming over 5G has become...
4,AtP*: An efficient and scalable method for loc...,Machine Learning (cs.LG); Computation and Lang...,"[János Kramár, Tom Lieberum, Rohin Shah, Neel ...",Activation Patching is a method of directly co...
5,Neural Acceleration of Incomplete Cholesky Pre...,"Distributed, Parallel, and Cluster Computing (...","[Joshua Dennis Booth, Hongyang Sun, Trevor Gar...",The solution of a sparse system of linear equa...
...,...,...,...,...
96,Do Zombies Understand? A Choose-Your-Own-Adven...,Computation and Language (cs.CL),"[Ariel Goldstein, Gabriel Stanovsky]",Recent advances in LLMs have sparked a debate ...
97,"Graph Homomorphism, Monotone Classes and Bound...",Computational Complexity (cs.CC); Logic in Com...,"[Tala Eagling-Vose, Barnaby Martin, Daniel Pau...",A recent paper describes a framework for study...
98,Analyzing Divergence for Nondeterministic Prob...,Logic in Computer Science (cs.LO),"[Hao Wu, Yuxi Fu, Huan Long, Xian Xu, Wenbo Zh...",Branching and weak probabilistic bisimilaritie...
99,Multiple Ways of Working with Users to Develop...,Human-Computer Interaction (cs.HC); Robotics (...,"[Amal Nanavati, Max Pascher, Vinitha Ranganeni...",Despite the growth of physically assistive rob...
