In [80]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

def _text_or_empty(node):
    """Return the stripped text of a BeautifulSoup node, or "" if it is None."""
    return "" if node is None else node.text.strip()


def scrape_arxiv(base_url, subject, query, search_url):
    """Fetch one arXiv search-results page and extract its papers.

    Args:
        base_url: Site root used to resolve each paper's pdf link.
        subject: Label copied verbatim into every result's "subject" field.
        query: Search text copied verbatim into every result's "query" field.
        search_url: Fully built URL of the results page to download.

    Returns:
        A list with one dict per ``<li class="arxiv-result">`` element, with
        keys "title", "authors", "abstract", "tags", "subject", "query",
        "submitted", "comments", "pdf_url" and "arxiv_id". Text fields are
        "" when the corresponding element is absent. An empty list is
        returned (after printing an error) when the request fails or the
        response status is not 200.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(search_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: Unable to fetch arXiv data. ({exc})")
        return []

    if response.status_code != 200:
        print("Error: Unable to fetch arXiv data.")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    papers = []

    for paper in soup.find_all("li", class_="arxiv-result"):
        # Each "tag" span's data-tooltip attribute is collected as-is; spans
        # without that attribute contribute None.
        tags = [tag.get("data-tooltip") for tag in paper.find_all("span", class_="tag")]

        # Look the pdf anchor up once; a result without one yields "".
        pdf_link = paper.find("a", string="pdf")
        pdf_href = pdf_link.get("href", "") if pdf_link is not None else ""

        papers.append({
            "title": _text_or_empty(paper.find("p", class_="title")),
            "authors": _text_or_empty(paper.find("p", class_="authors")),
            "abstract": _text_or_empty(paper.find("p", class_="abstract")),
            "tags": tags,
            "subject": subject,
            "query": query,
            "submitted": _text_or_empty(paper.find("p", class_="is-size-7")),
            "comments": _text_or_empty(paper.find("p", class_="comments")),
            # urljoin accepts both site-relative and already-absolute hrefs;
            # plain concatenation would duplicate the host for the latter.
            # With no pdf link this resolves to base_url itself.
            "pdf_url": urllib.parse.urljoin(base_url, pdf_href),
            # The identifier is taken to be the last path segment of the href.
            "arxiv_id": pdf_href.split("/")[-1],
        })

    return papers

def savescrapeddata(subjectquery, max_pages=1):
    """Scrape arXiv search results for one "archive:query:label" spec.

    Despite its name, this function does not write anything to disk; it only
    returns the scraped records.

    Args:
        subjectquery: Colon-separated string. Field 0 is the path segment
            placed after ``/search/`` in the URL, field 1 is the search
            query, and field 2 is the label stored in each result's
            "subject" field.
        max_pages: Maximum number of result pages to request. The default of
            1 fetches only the first page.

    Returns:
        A list of paper dicts as produced by ``scrape_arxiv``.

    Raises:
        IndexError: If ``subjectquery`` has fewer than three fields.
    """
    parts = subjectquery.split(":")
    subject, query, subject_lg = parts[0], parts[1], parts[2]

    base_url = "https://arxiv.org"
    max_results = 200  # page size requested through the `size` parameter
    query_encoded = urllib.parse.quote_plus(query)
    first_page_url = (
        f"{base_url}/search/{subject}?query={query_encoded}"
        f"&searchtype=all&abstracts=show&order=-announced_date_first&size={max_results}"
    )

    allresults = []
    for page in range(max_pages):
        search_url = first_page_url
        if page > 0:
            # Later pages are addressed by result offset, in page-size steps.
            search_url += "&start=" + str(page * max_results)

        results = scrape_arxiv(base_url, subject_lg, query, search_url)
        allresults.extend(results)

        # A short (or empty) page means there is nothing further to fetch.
        if len(results) < max_results:
            break

    return allresults
    
def searchforallcateogory(searchsubjects):
    """Scrape every spec in ``searchsubjects`` and return one flat list.

    Each element of ``searchsubjects`` is passed to ``savescrapeddata`` in
    order, and the per-spec result lists are concatenated in that same order.
    """
    return [
        record
        for spec in searchsubjects
        for record in savescrapeddata(spec)
    ]
         
# Each entry is "<search path segment>:<search query>:<subject label>"; the
# label is what ends up in the "subject" column of the CSV.
searchsubjects = [
    "cs:machine learning:computer science",
    "cs:nlp:computer science",
    "cs:data science:computer science",
    "physics:space science:physics",
    "physics:gravity:physics",
    "physics:quantum physics:physics",
    "math:statistics:mathematics",
    "math:algebra:mathematics",
    "math:probability:mathematics",
    "q-bio:biology:quantitative biology",
    "q-fin:finance:quantitative finance",  # label was misspelled "quantative"
    "stat:statistics:statistics",
    "eess:electrical engineering:electrical engineering and systems science",
    "econ:economics:economics",
]
allresults = searchforallcateogory(searchsubjects)

# save results to csv
import pandas as pd
df = pd.DataFrame(allresults)
df.to_csv("arxiv_data.csv", index=False)

In [12]:
#!pip install requests beautifulsoup4
