# metadata의 entry_id를 기준으로 논문의 pdf 저장 

In [8]:
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import time
import pickle
import arxiv
import itertools
import re
from tqdm import tqdm
import os

### load metadata

In [2]:
# Load the two pickled metadata dumps and merge them into one list.
# NOTE: pickle.load can execute arbitrary code while unpickling, so these
# files must come from a trusted source.
with open("../data_files/paper_metadata/arxiv_ascending_20161001_20230920.pkl", 'rb') as ascending_file, \
        open("../data_files/paper_metadata/arxiv_descending_2024_20160927.pkl", 'rb') as descending_file:
    paper_meta = pickle.load(ascending_file)
    after2023 = pickle.load(descending_file)

# Append the second dump to the first, in place.
paper_meta.extend(after2023)

In [6]:
# Report how many metadata records were collected.
print("수집된 metadata 수: ", len(paper_meta))

# Order the records by their 'published' value.
sorted_paper = sorted(paper_meta, key=lambda record: record['published'])

# Publication dates rendered as YYYY-MM-DD strings, then the earliest and
# latest of them printed as the covered period.
pub_dates = [record['published'].strftime("%Y-%m-%d") for record in sorted_paper]
ordered_dates = sorted(pub_dates)
print("기간: ", ordered_dates[0], '~', ordered_dates[-1])

수집된 metadata 수:  70000
기간:  2016-10-01 ~ 2024-12-24


In [32]:
# Map each paper's id (the last '/'-separated segment of its entry_id) to its
# title: the id is used to look the paper up, the title to name the saved file.
title_id = {
    record['entry_id'].split('/')[-1]: record['title']
    for record in paper_meta
}

### pdf 저장

In [19]:
def sanitize_title(title):
    """Normalize a paper title so it can be used as a file name.

    Leading/trailing whitespace is removed, every run of whitespace
    (including newlines) becomes a single space, and each '/' is replaced
    with '_'.

    Args:
        title: The raw title string.

    Returns:
        The cleaned-up title string.
    """
    # split() with no arguments breaks on any whitespace run and discards
    # the empty pieces at both ends, so re-joining with one space trims and
    # collapses in a single step.
    single_line = ' '.join(title.split())
    return single_line.replace('/', '_')

def fetch_and_download(key, title, dir_name, error_dict):
    """Look up one arXiv paper by ID and save its PDF under a title-based name.

    Args:
        key: arXiv identifier, passed as ``arxiv.Search(id_list=[key])``.
        title: Paper title; it is sanitized and used as the PDF file name.
        dir_name: Directory the PDF is written to.
        error_dict: Mutable mapping that collects failures. On any error
            ``error_dict[key] = title`` is recorded.

    Returns:
        True if the PDF was downloaded, False if anything went wrong
        (the failure is printed and stored in ``error_dict``).
    """
    try:
        tmp_paper = next(arxiv.Client().results(arxiv.Search(id_list=[key])), None)
        if tmp_paper is None:
            # An ID with no result would otherwise surface as a bare
            # StopIteration, whose message is empty and useless in the log.
            raise LookupError(f"no arXiv result found for ID {key}")
        title_fixed = sanitize_title(title)

        # Download the PDF.
        tmp_paper.download_pdf(dirpath=dir_name, filename=f"{title_fixed}.pdf")
        return True
    except Exception as e:
        # Best-effort batch job: record the failure and let the caller go on.
        error_dict[key] = title
        print(f"Error downloading paper with ID {key}: {e}")
        return False
    finally:
        # Keep the 2-second gap between requests after failures too, so a
        # burst of errors does not turn into rapid-fire requests.
        time.sleep(2)
    
def ensure_dir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and calling makedirs() afterwards.
    os.makedirs(directory, exist_ok=True)

In [None]:
max_workers = 4
batch_size = 5000

start_val = 0      # first index to process (inclusive)
end_val = 70000    # last index to process (exclusive)

for batch_start in range(start_val, end_val, batch_size):
    # Clamp the final batch so it never runs past end_val.
    batch_end = min(batch_start + batch_size, end_val)

    # Failures of this batch, filled in by fetch_and_download.
    batch_error_ids = {}

    # Every batch gets its own output directory, named after its end index.
    dir_name = f"./paper_pdf_{batch_end}"
    ensure_dir(dir_name)

    # The (id, title) pairs that belong to this batch.
    title_id_items = list(itertools.islice(title_id.items(), batch_start, batch_end))
    batch_ids = [pair[0] for pair in title_id_items]
    batch_titles = [pair[1] for pair in title_id_items]

    print(f"\n=== Downloading batch {batch_start} ~ {batch_end} (총 {len(title_id_items)}개) ===")

    # Run the downloads in parallel; the directory and the error dict are
    # the same for every call, so they are supplied via repeat().
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        outcomes = executor.map(
            fetch_and_download,
            batch_ids,
            batch_titles,
            itertools.repeat(dir_name),
            itertools.repeat(batch_error_ids),
        )
        results = list(tqdm(outcomes, total=len(title_id_items)))

    # Count the errors of this batch.
    batch_error_count = len(batch_error_ids)
    print(f"Batch {batch_start}~{batch_end} completed with {batch_error_count} errors.\n")

    # Persist the per-batch errors with pickle.
    with open(f'error_id_during_save_pdf_{batch_start}_{batch_end}.pkl', 'wb') as f:
        pickle.dump(batch_error_ids, f)

print("All done.")