In [1]:
headers = {
    'authority': 'curlconverter.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'if-modified-since': 'Fri, 15 Jul 2022 21:44:42 GMT',
    'if-none-match': 'W/"62d1dfca-3a58"',
    'referer': 'https://curlconverter.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Microsoft Edge";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

In [2]:
import time

import requests
from bs4 import BeautifulSoup
import csv

# Walk the arXiv "cs / pastweek" listing page by page, follow every listed paper
# to its abstract page, and collect [title, subjects, authors, abstract] rows in
# `papers_details`.

# The base URL for the arXiv website
base_url = "https://arxiv.org"
# Number of listing entries requested per page (also the step of the `skip` offset).
PAGE_SIZE = 5
# The `skip` offset stops before this value, so offsets 0 .. TOTAL_ENTRIES - PAGE_SIZE are visited.
TOTAL_ENTRIES = 100
# Upper bound, in seconds, that any single HTTP request may take before it is abandoned.
REQUEST_TIMEOUT = 200
# Pause, in seconds, between consecutive requests to avoid overwhelming the server.
CRAWL_DELAY = 1


def _extract_title(paper_soup):
    """Return the text of the title heading, or 'Title not found' if it is absent."""
    title_element = paper_soup.find('h1', class_='title mathjax')
    if title_element is None:
        return 'Title not found'
    title = title_element.get_text(separator=" ", strip=True)
    # Drop only the leading "Title:" descriptor; a later occurrence of that
    # text inside the real title must be kept.
    if title.startswith('Title:'):
        title = title[len('Title:'):]
    return title.strip()


def _extract_abstract(paper_soup):
    """Return the abstract text that follows the descriptor span.

    Returns 'Abstract not found' when the abstract <blockquote> is missing and
    'Descriptor not found' when the blockquote has no descriptor <span>.
    """
    abstract_element = paper_soup.find('blockquote', class_='abstract mathjax')
    if abstract_element is None:
        return 'Abstract not found'
    descriptor_span = abstract_element.find('span', class_='descriptor')
    if descriptor_span is None:
        return 'Descriptor not found'
    # Join *every* node after the descriptor (plain text and the text of inline
    # tags) so the abstract is not cut off at the first nested element, and so
    # a descriptor with nothing after it yields '' instead of raising.
    pieces = [
        node if isinstance(node, str) else node.get_text()
        for node in descriptor_span.next_siblings
    ]
    return ''.join(pieces).strip()


def _extract_authors(paper_soup):
    """Return the author names joined by ', ', or 'Authors not found'."""
    authors_element = paper_soup.find('div', class_='authors')
    if authors_element is None:
        return 'Authors not found'
    # Each author name is the text of one <a> tag inside the authors <div>.
    return ', '.join(link.text for link in authors_element.find_all('a'))


def _extract_subjects(paper_soup):
    """Return the primary-subject text, or 'Subjects not found'."""
    # Locate the <td> holding the subject information first, then the
    # primary-subject <span> inside it.
    subjects_td = paper_soup.find('td', class_='tablecell subjects')
    subjects_span = subjects_td.find('span', class_='primary-subject') if subjects_td else None
    if subjects_span is None:
        return 'Subjects not found'
    return subjects_span.get_text(separator=" ", strip=True)


def _fetch_paper_details(paper_link):
    """Download one abstract page and return [title, subjects, authors, abstract].

    Returns None (after printing a diagnostic) when the request fails or the
    server does not answer with status 200, so a single bad paper does not
    abort the rest of the crawl or add a row of placeholder values.
    """
    try:
        paper_response = requests.get(paper_link, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {paper_link}: {e}")
        return None
    if paper_response.status_code != 200:
        print(f"Failed to retrieve {paper_link}. Status code: {paper_response.status_code}")
        return None

    paper_soup = BeautifulSoup(paper_response.text, "html.parser")
    return [
        _extract_title(paper_soup),
        _extract_subjects(paper_soup),
        _extract_authors(paper_soup),
        _extract_abstract(paper_soup),
    ]


papers_details = []
for skip in range(0, TOTAL_ENTRIES, PAGE_SIZE):
    # Update list_url with the current skip value
    list_url = f"{base_url}/list/cs/pastweek?skip={skip}&show={PAGE_SIZE}"
    try:
        response = requests.get(list_url, headers=headers, timeout=REQUEST_TIMEOUT)
        print(f"Status Code for skip {skip}: {response.status_code}")
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Each listed paper is identified by a <span class="list-identifier">
            # whose first link has text starting with "arXiv:" and points at the
            # abstract page.
            for span in soup.find_all('span', class_='list-identifier'):
                a_tag = span.find('a')
                if a_tag and a_tag.text.startswith('arXiv:') and a_tag.get('href'):
                    details = _fetch_paper_details(f"{base_url}{a_tag['href']}")
                    if details is not None:
                        # Store the details in the list
                        papers_details.append(details)
                time.sleep(CRAWL_DELAY)
        else:
            print(f"Failed to retrieve data for skip={skip}. Status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed for skip={skip}: {e}")

    # Respectful crawling - adjust the sleep time as needed
    time.sleep(CRAWL_DELAY)



Status Code for skip 0: 200
Status Code for skip 5: 200
Status Code for skip 10: 200
Status Code for skip 15: 200
Status Code for skip 20: 200
Status Code for skip 25: 200
Status Code for skip 30: 200
Status Code for skip 35: 200
Status Code for skip 40: 200
Status Code for skip 45: 200
Status Code for skip 50: 200
Status Code for skip 55: 200
Status Code for skip 60: 200
Status Code for skip 65: 200
Status Code for skip 70: 200
Status Code for skip 75: 200
Status Code for skip 80: 200
Status Code for skip 85: 200
Status Code for skip 90: 200
Status Code for skip 95: 200


In [None]:
# Dump the collected rows to ./papers.csv: a header row first, then one row per
# entry of `papers_details`, in the order they were collected.
csv_rows = [['Title', 'Subjects', 'Authors', 'Abstract']]
csv_rows.extend(papers_details)
with open('./papers.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(csv_rows)