In [1]:
import arxiv
import pandas as pd
from datetime import datetime, timedelta
import time
from tqdm import tqdm

# arXiv category codes to harvest. The order is significant only in that it
# fixes the order of the clauses in the generated query string.
tags = [
    "cs.LG", "cs.AI", "cs.CC",
    "math.ST", "math.OC", "math.PR",
    "stat.ML",
    "cs.CV", "cs.CL", "cs.NE", "cs.RO",
    "math.DS",
    "cs.HC", "cs.DM", "cs.MA", "cs.SD",
    "math.QA",
    "q-fin.ST", "q-fin.MF", "q-fin.CP",
    "stat.TH",
]

# One "cat:<code>" clause per category, OR-ed together so a paper matches when
# it carries any of the listed categories.
category_clauses = (f'cat:{code}' for code in tags)
search_query = ' OR '.join(category_clauses)
# Download id, publication date and categories of every arXiv paper matching
# `search_query`, walking the submission-date range in consecutive 30-day
# windows, and write the collected rows to CSV.
client = arxiv.Client()
# NOTE(review): both bounds are naive datetimes while the arXiv API documents
# submittedDate as GMT -- confirm whether local time is acceptable here.
start_date = datetime(2024, 12, 25) # 2024-09-22
end_date = datetime.now()

window = timedelta(days=30)
one_minute = timedelta(minutes=1)
paper_columns = ['paper_id', 'published_date', 'categories']

all_papers = []
# Kept for any later cell that reads it; it no longer drives the progress bar
# because the loop advances in 30-day windows, not calendar months.
total_months = (end_date.year - start_date.year)*12 + (end_date.month - start_date.month)
# Number of windows the loop below will visit: ceil(span / window), written
# as a negated floor division so no extra import is needed.
total_windows = max(0, -((start_date - end_date) // window))

with tqdm(total=total_windows, desc="Fetching data", unit="window") as pbar:
    while start_date < end_date:
        # Never query past the overall end of the range.
        current_end_date = min(start_date + window, end_date)
        # The range filter is inclusive on both sides, so every window except
        # the last stops one minute before the next one starts; otherwise the
        # papers at the boundary would be fetched (and stored) twice.
        if current_end_date >= end_date:
            query_end = current_end_date
        else:
            query_end = current_end_date - one_minute
        # The arXiv API manual documents the bounds as YYYYMMDDHHMM.
        start_str = start_date.strftime('%Y%m%d%H%M')
        end_str = query_end.strftime('%Y%m%d%H%M')
        date_query = f'submittedDate:[{start_str} TO {end_str}]'
        full_query = f'({search_query}) AND {date_query}'

        # Define the search for this window
        search = arxiv.Search(
            query=full_query,
            max_results=30000,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        try:
            # Fetch the results; the client yields them lazily
            for paper in client.results(search):
                all_papers.append({
                    'paper_id': paper.entry_id,
                    'published_date': paper.published.date(),
                    'categories': ','.join(paper.categories)
                })
        except Exception as e:
            # Report the window that failed and save what was collected so far
            print("Error occurred. Saving partial data.")
            print(f"Failed at date range starting: {start_date.strftime('%Y-%m-%d')}")
            print(e)
            pd.DataFrame(all_papers).to_csv('arxiv_papers_partial.csv', index=False)
            break  # Stop the loop (alternatively carry on with 'continue')

        # Advance to the next window
        start_date = current_end_date
        pbar.update(1)
        # Pause between windows to go easy on the API
        time.sleep(3)

# Finally save everything. Explicit columns keep the header (and make the
# de-duplication below valid) even when no paper was returned. Note that this
# file is also written after a failed run, in which case it holds partial data.
df = pd.DataFrame(all_papers, columns=paper_columns)
# Safety net: a paper must appear only once even if the API repeats an entry.
df = df.drop_duplicates(subset='paper_id').reset_index(drop=True)
df.to_csv('arxiv_papers_with_details_8.csv', index=False)

Fetching data: 1month [00:03,  3.72s/month]
