# arXiv API를 사용하여 논문 metadata 수집

In [6]:
import asyncio
import functools
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from xml.etree import ElementTree as ET

import requests

## 데이터 수집
### 2016-10-01 ~ 2024-12-24 기간 동안의 Computation and Language 분야 논문 수집 

#### 데이터 형식
| **컬럼 이름**         | **설명**                                           |
|-----------------------|----------------------------------------------------|
| `entry_id`           | 논문의 고유 식별자 및 arxiv 논문 주소                                  |
| `updated`            | 마지막으로 수정된 날짜 및 시간 (끝의 'Z'를 제거하고 파싱한 naive `datetime`, UTC 기준) |
| `published`          | 게재된 날짜 및 시간 (끝의 'Z'를 제거하고 파싱한 naive `datetime`, UTC 기준)            |
| `title`              | 논문의 제목                                         |
| `authors`            | 저자 목록                                           |
| `summary` (abstract) | 항목의 요약 또는 초록                               |
| `comment`            | 추가적인 코멘트나 비고                              |
| `journal_ref`        | 논문이 게재된 저널의 참고 정보                      |
| `doi`                | 논문의 DOI(디지털 객체 식별자)                      |
| `primary_category`   | 주요 분류 카테고리                                  |
| `categories`         | 모든 분류 카테고리 목록                             |
| `links`              | arxiv 논문 주소와 논문 pdf 주소                             |


#### 2016-10-01 ~ 2023-09-20 (ascending 사용)

In [7]:
def fetch_arxiv_ascending(query, start, max_results, retries=3, delay=5, failed_urls=None, timeout=60):
    """Fetch one page of arXiv API results sorted by submission date, ascending.

    The request is repeated up to ``retries`` times for as long as the
    returned Atom feed contains no ``entry`` elements.

    Args:
        query: Value placed in the ``search_query`` URL parameter
            (inserted as-is, without URL encoding).
        start: Value placed in the ``start`` URL parameter.
        max_results: Value placed in the ``max_results`` URL parameter.
        retries: Maximum number of requests to send.
        delay: Seconds to sleep between two consecutive attempts.
        failed_urls: Optional list. The request URL is appended to it when
            no attempt returned entries. When omitted, a throwaway list is
            used, so the failure is only visible through the printed message.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.

    Returns:
        A list with one dict per feed entry (keys: ``entry_id``, ``updated``,
        ``published``, ``title``, ``authors``, ``summary``, ``comment``,
        ``journal_ref``, ``doi``, ``primary_category``, ``categories``,
        ``links``), or an empty list when every attempt came back empty.

    Raises:
        Exception: If a response has a status code other than 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    if failed_urls is None:
        failed_urls = []

    # XML namespace prefixes in ElementTree's "{uri}tag" notation.
    atom_ns = "{http://www.w3.org/2005/Atom}"
    arxiv_ns = "{http://arxiv.org/schemas/atom}"

    def optional_text(entry, tag):
        """Return the text of ``tag`` under ``entry``, or None if it is absent."""
        node = entry.find(tag)
        return node.text if node is not None else None

    base_url = "http://export.arxiv.org/api/query"
    api_url = f"{base_url}?search_query={query}&start={start}&max_results={max_results}&sortBy=submittedDate&sortOrder=ascending"
    print(api_url)

    for attempt in range(retries):
        # Without a timeout a stalled connection would block this call forever.
        response = requests.get(api_url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch data: {response.status_code}")

        # Parse the Atom feed and collect its entries.
        root = ET.fromstring(response.content)
        entries = root.findall(f"{atom_ns}entry")

        # Entries present: convert them and return immediately.
        if entries:
            results = []
            for entry in entries:
                updated = entry.find(f"{atom_ns}updated").text
                published = entry.find(f"{atom_ns}published").text
                results.append({
                    "entry_id": entry.find(f"{atom_ns}id").text,
                    # The last character is dropped before parsing (assumes a
                    # trailing "Z", which fromisoformat() rejects before
                    # Python 3.11 -- TODO confirm), so no tzinfo is attached.
                    "updated": datetime.fromisoformat(updated[:-1]),
                    "published": datetime.fromisoformat(published[:-1]),
                    "title": entry.find(f"{atom_ns}title").text,
                    "authors": [
                        author.find(f"{atom_ns}name").text
                        for author in entry.findall(f"{atom_ns}author")
                    ],
                    "summary": entry.find(f"{atom_ns}summary").text,
                    "comment": optional_text(entry, f"{arxiv_ns}comment"),
                    "journal_ref": optional_text(entry, f"{arxiv_ns}journal_ref"),
                    "doi": optional_text(entry, f"{arxiv_ns}doi"),
                    "primary_category": entry.find(f"{arxiv_ns}primary_category").attrib.get("term"),
                    "categories": [
                        cat.attrib.get("term")
                        for cat in entry.findall(f"{atom_ns}category")
                    ],
                    "links": [
                        link.attrib.get("href")
                        for link in entry.findall(f"{atom_ns}link")
                    ],
                })
            return results

        # No entries: wait before the next attempt. Sleeping after the final
        # attempt would only delay reporting the failure.
        print(f"No entries found, retrying... (attempt {attempt + 1}/{retries})")
        if attempt + 1 < retries:
            time.sleep(delay)

    # All attempts came back empty: record the URL and return an empty page
    # (no exception is raised here).
    print("Failed to fetch data: No entries found after multiple attempts.")
    failed_urls.append(api_url)

    return []

In [8]:
async def fetch_arxiv_async(query, start, end, max_results):
    """Fetch consecutive pages of ascending arXiv results on worker threads.

    One ``fetch_arxiv_ascending`` call is submitted for every offset in
    ``range(start, end, max_results)``, with a 10-second pause after each
    submission. All pages are then awaited and flattened.

    Args:
        query: Search query forwarded to ``fetch_arxiv_ascending``.
        start: First ``start`` offset to request.
        end: Exclusive upper bound for the offsets.
        max_results: Page size; also the step between offsets.

    Returns:
        A tuple ``(all_results, failed_urls)``: the records of every page in
        submission order, and the URLs that ``fetch_arxiv_ascending``
        reported as failed.
    """
    loop = asyncio.get_running_loop()
    tasks = []
    failed_urls = []

    # Run the blocking fetcher on a thread pool so it can be awaited.
    with ThreadPoolExecutor() as executor:
        for current_start in range(start, end, max_results):
            # run_in_executor() forwards positional arguments only. Passed
            # positionally, failed_urls would land in the fetcher's `retries`
            # parameter, so bind it by keyword with functools.partial.
            fetch_page = functools.partial(
                fetch_arxiv_ascending,
                query,
                current_start,
                max_results,
                failed_urls=failed_urls,
            )
            tasks.append(loop.run_in_executor(executor, fetch_page))
            await asyncio.sleep(10)  # pause 10 seconds before submitting the next page

        # Wait for every submitted page.
        results_batches = await asyncio.gather(*tasks)

    # Flatten the per-page lists into a single list.
    all_results = [paper for batch in results_batches for paper in batch]

    return all_results, failed_urls

In [4]:
# Parameters for the ascending crawl: offsets `start` (inclusive) to `end`
# (exclusive) are requested in steps of `max_results`.
query = "cat:cs.CL"
start = 5000
end = 50000
max_results = 1000  # number of papers requested per API call

# Top-level `await`: needs an environment that allows it (this file looks
# like a notebook export -- confirm before running it as a plain script).
results, error_urls = await fetch_arxiv_async(query, start, end, max_results)

http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=5000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=6000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=7000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=8000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
No entries found, retrying... (attempt 1/3)
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=9000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=10000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=11000&max_results=1000&sortBy=submittedDate&sortOrder=ascending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=12000&max

In [47]:
# Serialize `results` to a pickle file. Any exception raised while opening or
# writing is caught and printed instead of propagating.
try:
    with open("../data_files/paper_metadata/arxiv_ascending_20161001_20230920.pkl", "wb") as f:
        pickle.dump(results, f)
except Exception as e:
    print(f"Error while saving pickle file: {e}")


#### 2023-09-27 ~ 2024-12-27 (descending 사용)

In [23]:
def fetch_arxiv_descending(query, start, max_results, retries=3, delay=5, timeout=60):
    """Fetch one page of arXiv API results sorted by submission date, descending.

    The request is repeated up to ``retries`` times for as long as the
    returned Atom feed contains no ``entry`` elements.

    Args:
        query: Value placed in the ``search_query`` URL parameter
            (inserted as-is, without URL encoding).
        start: Value placed in the ``start`` URL parameter.
        max_results: Value placed in the ``max_results`` URL parameter.
        retries: Maximum number of requests to send.
        delay: Seconds to sleep between two consecutive attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.

    Returns:
        A list with one dict per feed entry (keys: ``entry_id``, ``updated``,
        ``published``, ``title``, ``authors``, ``summary``, ``comment``,
        ``journal_ref``, ``doi``, ``primary_category``, ``categories``,
        ``links``), or an empty list when every attempt came back empty.

    Raises:
        Exception: If a response has a status code other than 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # XML namespace prefixes in ElementTree's "{uri}tag" notation.
    atom_ns = "{http://www.w3.org/2005/Atom}"
    arxiv_ns = "{http://arxiv.org/schemas/atom}"

    def optional_text(entry, tag):
        """Return the text of ``tag`` under ``entry``, or None if it is absent."""
        node = entry.find(tag)
        return node.text if node is not None else None

    base_url = "http://export.arxiv.org/api/query"
    api_url = f"{base_url}?search_query={query}&start={start}&max_results={max_results}&sortBy=submittedDate&sortOrder=descending"
    print(api_url)

    for attempt in range(retries):
        # Without a timeout a stalled connection would block this call forever.
        response = requests.get(api_url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch data: {response.status_code}")

        # Parse the Atom feed and collect its entries.
        root = ET.fromstring(response.content)
        entries = root.findall(f"{atom_ns}entry")

        # Entries present: convert them and return immediately.
        if entries:
            results = []
            for entry in entries:
                updated = entry.find(f"{atom_ns}updated").text
                published = entry.find(f"{atom_ns}published").text
                results.append({
                    "entry_id": entry.find(f"{atom_ns}id").text,
                    # The last character is dropped before parsing (assumes a
                    # trailing "Z", which fromisoformat() rejects before
                    # Python 3.11 -- TODO confirm), so no tzinfo is attached.
                    "updated": datetime.fromisoformat(updated[:-1]),
                    "published": datetime.fromisoformat(published[:-1]),
                    "title": entry.find(f"{atom_ns}title").text,
                    "authors": [
                        author.find(f"{atom_ns}name").text
                        for author in entry.findall(f"{atom_ns}author")
                    ],
                    "summary": entry.find(f"{atom_ns}summary").text,
                    "comment": optional_text(entry, f"{arxiv_ns}comment"),
                    "journal_ref": optional_text(entry, f"{arxiv_ns}journal_ref"),
                    "doi": optional_text(entry, f"{arxiv_ns}doi"),
                    "primary_category": entry.find(f"{arxiv_ns}primary_category").attrib.get("term"),
                    "categories": [
                        cat.attrib.get("term")
                        for cat in entry.findall(f"{atom_ns}category")
                    ],
                    "links": [
                        link.attrib.get("href")
                        for link in entry.findall(f"{atom_ns}link")
                    ],
                })
            return results

        # No entries: wait before the next attempt. Sleeping after the final
        # attempt would only delay reporting the failure.
        print(f"No entries found, retrying... (attempt {attempt + 1}/{retries})")
        if attempt + 1 < retries:
            time.sleep(delay)

    # All attempts came back empty: report it and return an empty page
    # (no exception is raised here).
    print("Failed to fetch data: No entries found after multiple attempts.")
    return []


In [24]:
async def fetch_arxiv_async(query, start, end, max_results):
    """Fetch consecutive pages of descending arXiv results on worker threads.

    One ``fetch_arxiv_descending`` call is submitted for every offset in
    ``range(start, end, max_results)``, with a 10-second pause after each
    submission. All pages are then awaited and merged.

    Note: an earlier cell defines a function with this same name that
    returns a tuple; executing this definition replaces it.

    Args:
        query: Search query forwarded to ``fetch_arxiv_descending``.
        start: First ``start`` offset to request.
        end: Exclusive upper bound for the offsets.
        max_results: Page size; also the step between offsets.

    Returns:
        A single list holding the records of every page, in submission order.
    """
    event_loop = asyncio.get_event_loop()
    pending = []

    # The blocking fetcher runs on a thread pool so it can be awaited.
    with ThreadPoolExecutor() as pool:
        for page_offset in range(start, end, max_results):
            page_future = event_loop.run_in_executor(
                pool, fetch_arxiv_descending, query, page_offset, max_results
            )
            pending.append(page_future)
            # Pause 10 seconds before submitting the next page.
            await asyncio.sleep(10)

        # Wait until every submitted page has finished.
        page_batches = await asyncio.gather(*pending)

    # Merge the per-page lists into one flat list.
    merged = []
    for page in page_batches:
        merged.extend(page)
    return merged

In [25]:
# Parameters for the descending crawl: offsets `start` (inclusive) to `end`
# (exclusive) are requested in steps of `max_results`.
query = "cat:cs.CL"
start = 0
end = 25000
max_results = 1000  # number of papers requested per API call

# Top-level `await`: needs an environment that allows it (this file looks
# like a notebook export -- confirm before running it as a plain script).
results = await fetch_arxiv_async(query, start, end, max_results)

http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=0&max_results=1000&sortBy=submittedDate&sortOrder=descending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=1000&max_results=1000&sortBy=submittedDate&sortOrder=descending
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=2000&max_results=1000&sortBy=submittedDate&sortOrder=descending
No entries found, retrying... (attempt 1/3)
No entries found, retrying... (attempt 2/3)
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=3000&max_results=1000&sortBy=submittedDate&sortOrder=descending
No entries found, retrying... (attempt 1/3)
No entries found, retrying... (attempt 2/3)
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=4000&max_results=1000&sortBy=submittedDate&sortOrder=descending
No entries found, retrying... (attempt 1/3)
No entries found, retrying... (attempt 2/3)
http://export.arxiv.org/api/query?search_query=cat:cs.CL&start=5000&max_results=1000&sortBy=submittedDat

In [28]:
# Serialize `results` to a pickle file. Any exception raised while opening or
# writing is caught and printed instead of propagating.
# NOTE(review): the date in the file name (20160927) does not match the
# 2023-09-27 start date given in this section's heading -- confirm which is intended.
try:
    with open("../data_files/paper_metadata/arxiv_descending_2024_20160927.pkl", "wb") as f:
        pickle.dump(results, f)
except Exception as e:
    print(f"Error while saving pickle file: {e}")


#### get error urls

In [6]:
def fetch_error_urls(url, retries=3, delay=10, timeout=60):
    """Fetch and parse a ready-made arXiv API query URL, retrying on empty feeds.

    The request is repeated up to ``retries`` times for as long as the
    returned Atom feed contains no ``entry`` elements.

    Args:
        url: Complete request URL; it is used as-is.
        retries: Maximum number of requests to send.
        delay: Seconds to sleep between two consecutive attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.

    Returns:
        A list with one dict per feed entry (keys: ``entry_id``, ``updated``,
        ``published``, ``title``, ``authors``, ``summary``, ``comment``,
        ``journal_ref``, ``doi``, ``primary_category``, ``categories``,
        ``links``), or an empty list when every attempt came back empty.

    Raises:
        Exception: If a response has a status code other than 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # XML namespace prefixes in ElementTree's "{uri}tag" notation.
    atom_ns = "{http://www.w3.org/2005/Atom}"
    arxiv_ns = "{http://arxiv.org/schemas/atom}"

    def optional_text(entry, tag):
        """Return the text of ``tag`` under ``entry``, or None if it is absent."""
        node = entry.find(tag)
        return node.text if node is not None else None

    for attempt in range(retries):
        # Without a timeout a stalled connection would block this call forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch data: {response.status_code}")

        # Parse the Atom feed and collect its entries.
        root = ET.fromstring(response.content)
        entries = root.findall(f"{atom_ns}entry")

        # Entries present: convert them and return immediately.
        if entries:
            results = []
            for entry in entries:
                updated = entry.find(f"{atom_ns}updated").text
                published = entry.find(f"{atom_ns}published").text
                results.append({
                    "entry_id": entry.find(f"{atom_ns}id").text,
                    # The last character is dropped before parsing (assumes a
                    # trailing "Z", which fromisoformat() rejects before
                    # Python 3.11 -- TODO confirm), so no tzinfo is attached.
                    "updated": datetime.fromisoformat(updated[:-1]),
                    "published": datetime.fromisoformat(published[:-1]),
                    "title": entry.find(f"{atom_ns}title").text,
                    "authors": [
                        author.find(f"{atom_ns}name").text
                        for author in entry.findall(f"{atom_ns}author")
                    ],
                    "summary": entry.find(f"{atom_ns}summary").text,
                    "comment": optional_text(entry, f"{arxiv_ns}comment"),
                    "journal_ref": optional_text(entry, f"{arxiv_ns}journal_ref"),
                    "doi": optional_text(entry, f"{arxiv_ns}doi"),
                    "primary_category": entry.find(f"{arxiv_ns}primary_category").attrib.get("term"),
                    "categories": [
                        cat.attrib.get("term")
                        for cat in entry.findall(f"{atom_ns}category")
                    ],
                    "links": [
                        link.attrib.get("href")
                        for link in entry.findall(f"{atom_ns}link")
                    ],
                })
            return results

        # No entries: wait before the next attempt. Sleeping after the final
        # attempt would only delay reporting the failure.
        print(f"No entries found, retrying... (attempt {attempt + 1}/{retries})")
        if attempt + 1 < retries:
            time.sleep(delay)

    # All attempts came back empty: report it and return an empty page
    # (no exception is raised here).
    print("Failed to fetch data: No entries found after multiple attempts.")
    return []


In [64]:
# Re-request a single page of the ascending query (start=37000) through
# fetch_error_urls and print how many records came back.
error_url1 = 'https://export.arxiv.org/api/query?search_query=cat:cs.CL&start=37000&max_results=1000&sortBy=submittedDate&sortOrder=ascending'
error_result1 = fetch_error_urls(error_url1)
print(len(error_result1))

1000
