In [58]:
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [67]:
# Column order of the returned DataFrame; also used to give an empty result
# the same schema as a populated one.
_PAPER_COLUMNS = ['Title', 'Abstract', 'Authors', 'Publication Date', 'Tags']

# The `start` offset advances by this many results per page.
# NOTE(review): assumes arXiv's search keeps serving 50 results per page
# when no explicit size is requested -- confirm if the page layout changes.
_RESULTS_PER_PAGE = 50


def _node_text(parent, name, css_class):
    """Return the stripped text of the first matching child, or '' if absent."""
    node = parent.find(name, class_=css_class)
    return node.text.strip() if node is not None else ''


def _parse_arxiv_result(result):
    """Turn one ``<li class="arxiv-result">`` element into a record dict."""
    # Tag spans carry the human-readable category name in `data-tooltip`;
    # spans without that attribute are skipped instead of raising KeyError.
    tag_spans = result.find_all('span', class_=lambda x: x and 'tag is-small' in x)
    tags = [span['data-tooltip'] for span in tag_spans if span.has_attr('data-tooltip')]
    return {
        'Title': _node_text(result, 'p', 'title is-5 mathjax'),
        'Abstract': _node_text(result, 'span', 'abstract-full has-text-grey-dark mathjax'),
        'Authors': _node_text(result, 'p', 'authors'),
        'Publication Date': _node_text(result, 'p', 'is-size-7').split(': ')[-1],
        'Tags': tags,
    }


def scrape_arxiv(query, num_pages, timeout=30):
    """Scrape arXiv search results into a DataFrame.

    Parameters
    ----------
    query : str
        Search term. It is passed through ``params=`` so that spaces, ``&``,
        ``#`` and other reserved characters are URL-encoded correctly.
    num_pages : int
        Maximum number of result pages to request. Values below 1 perform no
        request and yield an empty frame.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``Title``, ``Abstract``,
        ``Authors``, ``Publication Date`` and ``Tags`` (a list of strings).
        The columns are present even when no paper was found.

    Raises
    ------
    requests.HTTPError
        If arXiv answers with an error status (e.g. rate limiting), instead
        of silently parsing the error page into zero rows.
    """
    base_url = "https://arxiv.org"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    }

    all_papers = []
    for page in range(1, num_pages + 1):
        params = {
            'query': query,
            'searchtype': 'all',
            'source': 'header',
            'start': _RESULTS_PER_PAGE * (page - 1),
        }
        response = requests.get(
            f"{base_url}/search/", params=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        results = soup.find_all('li', class_='arxiv-result')
        if not results:
            # Ran past the last page of results; further requests would only
            # add load on arXiv without adding rows.
            break

        all_papers.extend(_parse_arxiv_result(result) for result in results)

    return pd.DataFrame(all_papers, columns=_PAPER_COLUMNS)

# Search term submitted to arXiv -- change this to scrape a different topic.
query = 'Engineering'

# Number of result PAGES to fetch (not papers): scrape_arxiv advances the
# result offset by 50 for every page.
num_pages = 10

df = scrape_arxiv(query=query, num_pages=num_pages)
print(df.shape)
df

(500, 5)


Unnamed: 0,Title,Abstract,Authors,Publication Date,Tags
0,Raman-phonon-polariton condensation in a trans...,Phonon polaritons are hybrid states of light a...,"Authors:\nAlexander N. Bourzutschky, \n \...","Submitted 8 May, 2024; \n originally anno...","[Mesoscale and Nanoscale Physics, Strongly Cor..."
1,Attention-Driven Training-Free Efficiency Enha...,Diffusion Models (DMs) have exhibited superior...,"Authors:\nHongjie Wang, \n \n Difan ...","Submitted 8 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Arti..."
2,SVDD Challenge 2024: A Singing Voice Deepfake ...,The rapid advancement of AI-generated singing ...,"Authors:\nYou Zhang, \n \n Yongyi Za...","Submitted 8 May, 2024; \n originally anno...","[Audio and Speech Processing, Artificial Intel..."
3,An LSTM-Based Chord Generation System Using Ch...,This paper proposes a system for chord generat...,Authors:\nJack Hardwick,"Submitted 8 May, 2024; \n originally anno...","[Sound, Machine Learning, Audio and Speech Pro..."
4,Cellular Traffic Prediction Using Online Predi...,The advent of 5G technology promises a paradig...,"Authors:\nHossein Mehri, \n \n Hao C...","Submitted 8 May, 2024; \n originally anno...","[Systems and Control, Machine Learning]"
...,...,...,...,...,...
495,Model Predictive Guidance for Fuel-Optimal Lan...,This paper introduces a landing guidance strat...,"Authors:\nKi-Wook Jung, \n \n Sang-D...","Submitted 2 May, 2024; \n originally anno...",[Systems and Control]
496,Causal Influence in Federated Edge Inference,"In this paper, we consider a setting where het...","Authors:\nMert Kayaalp, \n \n Yunus ...","Submitted 2 May, 2024; \n originally anno...","[Machine Learning, Multiagent Systems, Signal ..."
497,Towards Consistent Object Detection via LiDAR-...,As human-machine interaction continues to evol...,"Authors:\nKai Luo, \n \n Hao Wu, \n ...","Submitted 2 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Robo..."
498,Prompt engineering paradigms for medical appli...,Prompt engineering is crucial for harnessing t...,"Authors:\nJamil Zaghir, \n \n Marco ...","Submitted 2 May, 2024; \n originally anno...","[Computation and Language, Machine Learning]"
