In [1]:
import csv
import time

import requests
from bs4 import BeautifulSoup

In [2]:
def scrape_arxiv_paper_details(paper_id, timeout=10):
    """Fetch one arXiv abstract page and print its title, authors, subject and abstract.

    Args:
        paper_id: arXiv identifier such as "2403.00762"; it is appended to
            ``https://arxiv.org/abs/`` to build the page URL.
        timeout: Seconds to wait for arxiv.org before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        None. The four fields are printed; any element that is missing from
        the page is printed as "N/A" (``["N/A"]`` for the author list).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    paper_link = f"https://arxiv.org/abs/{paper_id}"
    response = requests.get(paper_link, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Look every element up once instead of searching the tree twice per field.
    title_tag = soup.find('h1', class_='title mathjax')
    authors_tag = soup.find('div', class_='authors')
    subjects_tag = soup.find('span', class_='primary-subject')
    abstract_tag = soup.find('blockquote', class_='abstract mathjax')

    title = title_tag.text.strip() if title_tag is not None else "N/A"
    if authors_tag is not None:
        # Only the first "authors" container is used; each author is a link in it.
        authors = [link.text.strip() for link in authors_tag.find_all('a')]
    else:
        authors = ["N/A"]
    subjects = subjects_tag.text.strip() if subjects_tag is not None else "N/A"
    abstract = abstract_tag.text.strip() if abstract_tag is not None else "N/A"

    # Print the details
    print(title)
    print("Authors:", authors)
    print("Subjects:", subjects)
    print(abstract)

# Main function
def main():
    """Print the details of a single sample paper as a quick check of the scraper."""
    sample_id = "2403.00762"
    scrape_arxiv_paper_details(sample_id)


# Run the demo when the cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Title:Point Could Mamba: Point Cloud Learning via State Space Model
Authors: ['Tao Zhang', 'Xiangtai Li', 'Haobo Yuan', 'Shunping Ji', 'Shuicheng Yan']
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Abstract:In this work, for the first time, we demonstrate that Mamba-based point cloud methods can outperform point-based methods. Mamba exhibits strong global modeling capabilities and linear computational complexity, making it highly attractive for point cloud analysis. To enable more effective processing of 3-D point cloud data by Mamba, we propose a novel Consistent Traverse Serialization to convert point clouds into 1-D point sequences while ensuring that neighboring points in the sequence are also spatially adjacent. Consistent Traverse Serialization yields six variants by permuting the order of x, y, and z coordinates, and the synergistic use of these variants aids Mamba in comprehensively observing point cloud data. Furthermore, to assist Mamba in handling point sequences

In [2]:
def get_arxiv_paper_ids(max_papers=100, page_size=25, timeout=10, delay=1.0):
    """Collect paper IDs from the arXiv "recent cs" listing pages.

    Args:
        max_papers: Maximum number of IDs to return. The defaults request the
            same four pages (skip=0, 25, 50, 75) as the original hard-coded loop.
        page_size: Number of entries requested per listing page (``show=``).
        timeout: Seconds to wait for each page before giving up.
        delay: Seconds to pause between consecutive page requests; 0 disables
            the pause.

    Returns:
        A list of at most ``max_papers`` ID strings, in listing order. Pages
        without the expected listing container and entries without an
        "Abstract" link are skipped rather than raising AttributeError.

    Raises:
        ValueError: If ``page_size`` is not positive.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    if page_size <= 0:
        raise ValueError("page_size must be positive")

    base_url = "https://arxiv.org/list/cs/recent?show={}&skip={}"
    paper_ids = []

    # Iterate through the pages to collect paper IDs
    for skip in range(0, max_papers, page_size):
        if skip and delay > 0:
            # Space the requests out instead of hitting the server back to back.
            time.sleep(delay)

        url = base_url.format(page_size, skip)
        print("Fetching page:", url)  # Debugging statement
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the list of papers; an unexpected page has no such container.
        papers_list = soup.find('div', {'id': 'dlpage'})
        if papers_list is None:
            print("No paper listing found on page:", url)
            continue

        # Loop through the list of papers and extract their IDs
        for dt_tag in papers_list.find_all('dt'):
            abstract_link = dt_tag.find('a', {'title': 'Abstract'})
            href = abstract_link.get('href') if abstract_link is not None else None
            if not href:
                continue
            # The ID is the last path segment of the abstract link.
            paper_ids.append(href.split('/')[-1])

            if len(paper_ids) >= max_papers:
                return paper_ids  # Stop as soon as enough IDs are collected

    return paper_ids

def main():
    """Collect the recent cs paper IDs and print the resulting list."""
    collected_ids = get_arxiv_paper_ids()
    print(collected_ids)


# Run the collection when the cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Fetching page: https://arxiv.org/list/cs/recent?show=25&skip=0
Fetching page: https://arxiv.org/list/cs/recent?show=25&skip=25
Fetching page: https://arxiv.org/list/cs/recent?show=25&skip=50
Fetching page: https://arxiv.org/list/cs/recent?show=25&skip=75
['2403.00762', '2403.00758', '2403.00752', '2403.00745', '2403.00743', '2403.00742', '2403.00737', '2403.00729', '2403.00725', '2403.00724', '2403.00720', '2403.00717', '2403.00715', '2403.00712', '2403.00704', '2403.00696', '2403.00691', '2403.00690', '2403.00689', '2403.00686', '2403.00685', '2403.00682', '2403.00680', '2403.00675', '2403.00674', '2403.00673', '2403.00669', '2403.00668', '2403.00665', '2403.00663', '2403.00662', '2403.00646', '2403.00645', '2403.00644', '2403.00643', '2403.00642', '2403.00641', '2403.00633', '2403.00632', '2403.00631', '2403.00628', '2403.00625', '2403.00623', '2403.00622', '2403.00621', '2403.00613', '2403.00611', '2403.00607', '2403.00606', '2403.00598', '2403.00594', '2403.00592', '2403.00591', '2

In [2]:
# Hard-coded list of 100 arXiv IDs; it appears to be pasted from the output of
# get_arxiv_paper_ids() so later cells can run without re-fetching the listing.
paper_ids = [
    '2403.00762', '2403.00758', '2403.00752', '2403.00745', '2403.00743',
    '2403.00742', '2403.00737', '2403.00729', '2403.00725', '2403.00724',
    '2403.00720', '2403.00717', '2403.00715', '2403.00712', '2403.00704',
    '2403.00696', '2403.00691', '2403.00690', '2403.00689', '2403.00686',
    '2403.00685', '2403.00682', '2403.00680', '2403.00675', '2403.00674',
    '2403.00673', '2403.00669', '2403.00668', '2403.00665', '2403.00663',
    '2403.00662', '2403.00646', '2403.00645', '2403.00644', '2403.00643',
    '2403.00642', '2403.00641', '2403.00633', '2403.00632', '2403.00631',
    '2403.00628', '2403.00625', '2403.00623', '2403.00622', '2403.00621',
    '2403.00613', '2403.00611', '2403.00607', '2403.00606', '2403.00598',
    '2403.00594', '2403.00592', '2403.00591', '2403.00590', '2403.00587',
    '2403.00586', '2403.00585', '2403.00584', '2403.00582', '2403.00579',
    '2403.00578', '2403.00574', '2403.00573', '2403.00571', '2403.00570',
    '2403.00567', '2403.00566', '2403.00565', '2403.00564', '2403.00563',
    '2403.00561', '2403.00558', '2403.00556', '2403.00554', '2403.00553',
    '2403.00550', '2403.00546', '2403.00543', '2403.00542', '2403.00540',
    '2403.00539', '2403.00536', '2403.00529', '2403.00528', '2403.00527',
    '2403.00526', '2403.00522', '2403.00520', '2403.00517', '2403.00515',
    '2403.00514', '2403.00510', '2403.00509', '2403.00506', '2403.00504',
    '2403.00499', '2403.00497', '2403.00491', '2403.00489', '2403.00486',
]

In [None]:
def scrape_arxiv_paper_details(paper_id, timeout=10):
    """Fetch one arXiv abstract page and return its details as a dict.

    Args:
        paper_id: arXiv identifier such as "2403.00762"; it is appended to
            ``https://arxiv.org/abs/`` to build the page URL.
        timeout: Seconds to wait for arxiv.org before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A dict with the keys 'Title', 'Authors', 'Subjects' and 'Abstract'.
        'Authors' is a list of names; the other values are strings. Any
        element missing from the page yields "N/A" (``["N/A"]`` for authors).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    paper_link = f"https://arxiv.org/abs/{paper_id}"
    print("Scraping paper:", paper_link)  # Debugging statement
    response = requests.get(paper_link, timeout=timeout)
    print("response:", response)
    soup = BeautifulSoup(response.text, 'html.parser')

    def clean_text(tag, label=None):
        """Return the stripped text of ``tag`` without a leading ``label``, or "N/A"."""
        if tag is None:
            return "N/A"
        text = tag.text.strip()
        # The page text starts with its own "Title:" / "Abstract:" label; drop
        # it so the value does not repeat the column name.
        if label and text.startswith(label):
            text = text[len(label):].strip()
        return text

    # Look every element up once instead of searching the tree twice per field.
    title_tag = soup.find('h1', class_='title mathjax')
    authors_tag = soup.find('div', class_='authors')
    subjects_tag = soup.find('span', class_='primary-subject')
    abstract_tag = soup.find('blockquote', class_='abstract mathjax')

    if authors_tag is not None:
        # Only the first "authors" container is used; each author is a link in it.
        authors = [link.text.strip() for link in authors_tag.find_all('a')]
    else:
        authors = ["N/A"]

    return {
        'Title': clean_text(title_tag, "Title:"),
        'Authors': authors,
        'Subjects': clean_text(subjects_tag),
        'Abstract': clean_text(abstract_tag, "Abstract:"),
    }

def save_to_csv(papers, filename='arxiv_cs_papers.csv', delay=3.0):
    """Scrape each paper ID and write one CSV row per successfully scraped paper.

    Args:
        papers: Iterable of arXiv IDs passed one by one to
            ``scrape_arxiv_paper_details``.
        filename: Path of the CSV file to (over)write.
        delay: Seconds to pause between consecutive requests so the server is
            not hit back to back; 0 disables the pause.

    Returns:
        A list of the IDs that were skipped because their request failed or
        timed out (empty when every paper was scraped).
    """
    fieldnames = ['Title', 'Authors', 'Subjects', 'Abstract']
    skipped = []

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for index, paper_id in enumerate(papers):
            if index and delay > 0:
                time.sleep(delay)
            try:
                paper_details = scrape_arxiv_paper_details(paper_id)
            except requests.RequestException as exc:
                # One unreachable paper must not abort the whole batch.
                print(f"Skipping {paper_id}: {exc}")
                skipped.append(paper_id)
                continue
            writer.writerow(paper_details)
            # Flush per row so finished rows survive an interrupted run.
            csvfile.flush()

    return skipped

def main():
    """Scrape every ID in the notebook-level ``paper_ids`` list into the CSV file.

    The IDs come from the hard-coded list defined earlier in the notebook
    rather than from a fresh call to get_arxiv_paper_ids().
    """
    save_to_csv(paper_ids)
    print("Scraping and saving completed!")


# Run the batch scrape when the cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Scraping paper: https://arxiv.org/abs/2403.00762
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00758
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00752
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00745
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00743
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00742
response: <Response [200]>
Scraping paper: https://arxiv.org/abs/2403.00737


In [None]:
不要连续发送请求：修改请求头（headers），并在请求之间增加一些 sleep（过几秒再爬）。
设置一个超时机制：某篇论文请求超时后跳过它，继续爬取下一篇。