# Abstract extraction

In [145]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import time

# ArXiv API query endpoint; search parameters are sent to it as the
# query string of a GET request.
ARXIV_API_URL = "http://export.arxiv.org/api/query"

def fetch_arxiv_abstracts(num_abstracts=1):
    """
    Fetch the most recently submitted cs.* abstracts from the ArXiv API.

    Sends a single query (newest submissions first), parses the returned Atom
    feed and builds one row per entry. The request size is capped at 2000.

    Parameters:
        num_abstracts (int): The number of abstracts to fetch. Values above
            2000 are reduced to 2000; values <= 0 return an empty DataFrame
            without contacting the API.

    Returns:
        pandas.DataFrame: Columns "Title", "Abstract", "Date" (YYYY-MM-DD) and
        "Authors" (comma-separated names). The columns are present even when
        no entry is returned.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection failure or timeout.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    columns = ["Title", "Abstract", "Date", "Authors"]

    # Enforce a reasonable limit for max_results
    MAX_RESULTS_LIMIT = 2000
    num_abstracts = min(num_abstracts, MAX_RESULTS_LIMIT)
    if num_abstracts <= 0:
        # Nothing to fetch: skip the request (and the rate-limit pause).
        return pd.DataFrame(columns=columns)

    # Parameters for the search query
    params = {
        "search_query": "cat:cs.*",
        "start": 0,
        "max_results": num_abstracts,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # Perform the request; the timeout keeps a stalled connection from
    # hanging the notebook forever.
    response = requests.get(ARXIV_API_URL, params=params, timeout=30)
    time.sleep(3)  # Rate limit: one request every three seconds
    # Fail loudly on an HTTP error instead of feeding an error page to the
    # XML parser.
    response.raise_for_status()

    # Parse the XML response
    root = ET.fromstring(response.text)
    namespace = {'arxiv': 'http://www.w3.org/2005/Atom'}

    def field_text(entry, path):
        # A missing or empty element becomes "" rather than raising
        # AttributeError on None.
        return (entry.findtext(path, default="", namespaces=namespace) or "").strip()

    data = []

    # Iterate through each entry to extract relevant data
    for entry in root.findall('arxiv:entry', namespace):
        published = field_text(entry, 'arxiv:published')
        try:
            formatted_date = datetime.strptime(
                published, "%Y-%m-%dT%H:%M:%SZ"
            ).strftime("%Y-%m-%d")
        except ValueError:
            # Unexpected or missing timestamp: keep whatever date prefix
            # is available instead of aborting the whole batch.
            formatted_date = published[:10]

        authors = [
            name.text.strip()
            for name in entry.findall('arxiv:author/arxiv:name', namespace)
            if name.text
        ]

        data.append({
            "Title": field_text(entry, 'arxiv:title'),
            "Abstract": field_text(entry, 'arxiv:summary'),
            "Date": formatted_date,
            "Authors": ", ".join(authors),
        })

    # Passing the columns explicitly keeps the schema stable when data is empty.
    return pd.DataFrame(data, columns=columns)

# Example: Fetch 200 abstracts and display the DataFrame
# (the bare expression on the last line makes the notebook render it)
abstracts_df = fetch_arxiv_abstracts(num_abstracts=200)
abstracts_df

Unnamed: 0,Title,Abstract,Date,Authors
0,AniDoc: Animation Creation Made Easier,The production of 2D animation follows an indu...,2024-12-18,"Yihao Meng, Hao Ouyang, Hanlin Wang, Qiuyu Wan..."
1,Learning from Massive Human Videos for Univers...,Scalable learning of humanoid robots is crucia...,2024-12-18,"Jiageng Mao, Siheng Zhao, Siqi Song, Tianheng ..."
2,Thinking in Space: How Multimodal Large Langua...,Humans possess the visual-spatial intelligence...,2024-12-18,"Jihan Yang, Shusheng Yang, Anjali W. Gupta, Ri..."
3,Autoregressive Video Generation without Vector...,This paper presents a novel approach that enab...,2024-12-18,"Haoge Deng, Ting Pan, Haiwen Diao, Zhengxiong ..."
4,E-CAR: Efficient Continuous Autoregressive Ima...,Recent advances in autoregressive (AR) models ...,2024-12-18,"Zhihang Yuan, Yuzhang Shang, Hanling Zhang, To..."
...,...,...,...,...
495,Enhancing Internet of Things Security throughS...,With the rapid rise of the Internet of Things ...,2024-12-17,"Safa Ben Atitallah, Maha Driss, Wadii Boulila,..."
496,Equity in the Use of ChatGPT for the Classroom...,A college education historically has been seen...,2024-12-17,"Monnie McGee, Bivin Sadler"
497,Koopman Mode-Based Detection of Internal Short...,Monitoring of internal short circuit (ISC) in ...,2024-12-17,"Sanchita Ghosh, Soumyoraj Mallick, Tanushree Roy"
498,Contract-based Design and Verification of Mult...,Quantitative requirements play an important ro...,2024-12-17,"Rafael Dewes, Rayna Dimitrova"


# Iterative saving mechanism

In [26]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import time
import os

# ArXiv API query endpoint; search parameters are sent to it as the
# query string of a GET request.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
# CSV file (relative to the working directory) in which fetched abstracts
# are accumulated between calls.
DATAFRAME_FILE = "arxiv_abstracts.csv"


def fetch_arxiv_abstracts(num_abstracts=1):
    """
    Fetch the next batch of cs.SI abstracts from the ArXiv API and append it
    to the CSV file named by DATAFRAME_FILE.

    The number of rows already saved is used as the `start` offset, so each
    call continues where the previous one stopped. The new rows are merged
    with the saved ones, exact duplicate rows are dropped, and the result is
    written back to the file.

    Parameters:
        num_abstracts (int): The number of additional abstracts to fetch.
            Values above 2000 are reduced to 2000 per call; values <= 0
            return the saved data unchanged without contacting the API.

    Returns:
        pandas.DataFrame: All saved plus newly fetched rows, with columns
        "Title", "Abstract", "Date" (YYYY-MM-DD) and "Authors"
        (comma-separated names).

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection failure or timeout.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    columns = ["Title", "Abstract", "Date", "Authors"]
    MAX_RESULTS_LIMIT = 2000

    # Load the existing DataFrame if the file exists
    if os.path.exists(DATAFRAME_FILE):
        existing_df = pd.read_csv(DATAFRAME_FILE)
        start_index = len(existing_df)
    else:
        existing_df = pd.DataFrame(columns=columns)
        start_index = 0

    num_abstracts = min(num_abstracts, MAX_RESULTS_LIMIT)
    if num_abstracts <= 0:
        # Nothing to fetch: skip the request (and the rate-limit pause).
        return existing_df

    # NOTE(review): the offset assumes the saved rows are exactly the first
    # `start_index` results of this query; submissions added since the last
    # run shift the listing, so some overlap is possible (exact duplicates
    # are dropped below).
    params = {
        "search_query": "cat:cs.SI",
        "start": start_index,
        # max_results is the page size, not an end index: adding
        # start_index here would over-fetch by the size of the saved file.
        "max_results": num_abstracts,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # Perform the request; the timeout keeps a stalled connection from
    # hanging the notebook forever.
    response = requests.get(ARXIV_API_URL, params=params, timeout=30)
    time.sleep(3)  # Rate limit: one request every three seconds
    # Fail loudly on an HTTP error instead of feeding an error page to the
    # XML parser (and then overwriting the CSV).
    response.raise_for_status()

    # Parse the XML response
    root = ET.fromstring(response.text)
    namespace = {'arxiv': 'http://www.w3.org/2005/Atom'}

    def field_text(entry, path):
        # A missing or empty element becomes "" rather than raising
        # AttributeError on None.
        return (entry.findtext(path, default="", namespaces=namespace) or "").strip()

    data = []

    # Iterate through each entry to extract relevant data
    for entry in root.findall('arxiv:entry', namespace):
        published = field_text(entry, 'arxiv:published')
        try:
            formatted_date = datetime.strptime(
                published, "%Y-%m-%dT%H:%M:%SZ"
            ).strftime("%Y-%m-%d")
        except ValueError:
            # Unexpected or missing timestamp: keep whatever date prefix
            # is available instead of aborting the whole batch.
            formatted_date = published[:10]

        authors = [
            name.text.strip()
            for name in entry.findall('arxiv:author/arxiv:name', namespace)
            if name.text
        ]

        data.append({
            "Title": field_text(entry, 'arxiv:title'),
            "Abstract": field_text(entry, 'arxiv:summary'),
            "Date": formatted_date,
            "Authors": ", ".join(authors),
        })

    new_df = pd.DataFrame(data, columns=columns)

    # Concatenate only the non-empty frames: mixing in an empty frame
    # triggers a pandas FutureWarning and can disturb column dtypes.
    frames = [frame for frame in (existing_df, new_df) if not frame.empty]
    if frames:
        combined_df = pd.concat(frames, ignore_index=True)
    else:
        combined_df = existing_df
    combined_df = combined_df.drop_duplicates().reset_index(drop=True)

    # Save the updated DataFrame to file
    combined_df.to_csv(DATAFRAME_FILE, index=False)

    return combined_df

# Example: Request 2100 more abstracts, merge them into the saved CSV and
# display the combined DataFrame (the bare expression on the last line makes
# the notebook render it)
abstracts_df = fetch_arxiv_abstracts(num_abstracts=2100)
abstracts_df

Unnamed: 0,Title,Abstract,Date,Authors
0,AniDoc: Animation Creation Made Easier,The production of 2D animation follows an indu...,2024-12-18,"Yihao Meng, Hao Ouyang, Hanlin Wang, Qiuyu Wan..."
1,Learning from Massive Human Videos for Univers...,Scalable learning of humanoid robots is crucia...,2024-12-18,"Jiageng Mao, Siheng Zhao, Siqi Song, Tianheng ..."
2,Thinking in Space: How Multimodal Large Langua...,Humans possess the visual-spatial intelligence...,2024-12-18,"Jihan Yang, Shusheng Yang, Anjali W. Gupta, Ri..."
3,Autoregressive Video Generation without Vector...,This paper presents a novel approach that enab...,2024-12-18,"Haoge Deng, Ting Pan, Haiwen Diao, Zhengxiong ..."
4,E-CAR: Efficient Continuous Autoregressive Ima...,Recent advances in autoregressive (AR) models ...,2024-12-18,"Zhihang Yuan, Yuzhang Shang, Hanling Zhang, To..."
...,...,...,...,...
5583,Mutual influence between language and percepti...,Language interfaces with many other cognitive ...,2021-12-29,"Xenia Ohmer, Michael Marino, Michael Franke, P..."
5584,Bayesian Neural Hawkes Process for Event Uncer...,Event data consisting of time of occurrence of...,2021-12-29,"Manisha Dubey, Ragja Palakkadavath, P. K. Srijith"
5585,How Powerful are Interest Diffusion on Purchas...,A taocode is a kind of specially coded text-li...,2021-12-29,"Xuanwen Huang, Yang Yang, Ziqiang Cheng, Shen ..."
5586,Deep learning for location based beamforming w...,Massive MIMO systems are highly efficient but ...,2021-12-29,"Luc Le Magoarou, Taha Yassine, Stéphane Paquel..."


# Full text extraction

In [104]:
import requests
import xml.etree.ElementTree as ET
import fitz  # PyMuPDF
from io import BytesIO
import matplotlib.pyplot as plt
from PIL import Image

# ArXiv API endpoint
ARXIV_API_URL = "http://export.arxiv.org/api/query"

# Parameters for the search query: the single most recently submitted
# cs.* paper.
params = {
    "search_query": "cat:cs.*",
    "start": 0,
    "max_results": 1,
    "sortBy": "submittedDate",
    "sortOrder": "descending"
}

# Perform the request to the ArXiv API; fail on HTTP errors rather than
# handing an error page to the XML parser.
response = requests.get(ARXIV_API_URL, params=params, timeout=30)
response.raise_for_status()
response_text = response.text

# Parse the XML response
root = ET.fromstring(response_text)
namespace = {'arxiv': 'http://www.w3.org/2005/Atom'}

# Defaults so the names exist even when the feed has no usable entry
# (previously the page loop raised NameError in that case).
pdf_document = None
content = ""

# Locate the PDF link of the first entry, if there is one
entry = root.find('arxiv:entry', namespace)
pdf_link_element = None
if entry is not None:
    pdf_link_element = entry.find('arxiv:link[@title="pdf"]', namespace)

if pdf_link_element is not None:
    pdf_link = pdf_link_element.attrib['href']

    # Download the PDF content
    pdf_response = requests.get(pdf_link, timeout=60)
    pdf_response.raise_for_status()
    pdf_bytes = BytesIO(pdf_response.content)

    # Open the PDF with PyMuPDF. The document is deliberately left open so
    # that it remains usable after this cell.
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

    # Concatenate the plain text of every page in one pass
    content = "".join(page.get_text() for page in pdf_document)