In [2]:
import arxiv
import pandas as pd
from datetime import datetime, timedelta
import time
from tqdm import tqdm

tags = [
    "cs.LG",
    "cs.AI",
    "cs.CC",
    "math.ST",
    "math.OC",
    "math.PR",
    "stat.ML",
    "cs.CV",
    "cs.CL",
    "cs.NE",
    "cs.RO",
    "math.DS",
    "cs.HC",
    "cs.DM",
    "cs.MA",
    "cs.SD",
    "math.QA",
    "q-fin.ST",
    "q-fin.MF",
    "q-fin.CP",
    "stat.TH"
]
search_query = ' OR '.join([f'cat:{tag}' for tag in tags])
client = arxiv.Client()
start_date = datetime(2024, 12, 1) # 2024-09-22
# end_date = datetime.now()
end_date= datetime(2025, 1, 1)

all_papers = []
total_months = (end_date.year - start_date.year)*12 + (end_date.month - start_date.month)

with tqdm(total=total_months, desc="Fetching data", unit="month") as pbar:
    while start_date < end_date:
        current_end_date = start_date + timedelta(days=30)
        start_str = start_date.strftime('%Y%m%d')
        end_str = current_end_date.strftime('%Y%m%d')
        date_query = f'submittedDate:[{start_str} TO {end_str}]'
        full_query = f'({search_query}) AND {date_query}'

        # Suche definieren
        search = arxiv.Search(
            query=full_query,
            max_results=30000,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        try:
            # Ergebnisse abrufen
            for paper in client.results(search):
                all_papers.append({
                    'paper_id': paper.entry_id,
                    'published_date': paper.published.date(),
                    'categories': ','.join(paper.categories)
                })
        except Exception as e:
            # Letzte verarbeitete Periode ausgeben und Teilergebnis speichern
            print("Error occurred. Saving partial data.")
            print(f"Failed at date range starting: {start_date.strftime('%Y-%m-%d')}")
            print(e)
            pd.DataFrame(all_papers).to_csv('arxiv_papers_partial.csv', index=False)
            break  # Abbruch der Schleife (alternativ weiter mit 'continue')

        # Zum nächsten Monatsabschnitt
        start_date = current_end_date
        pbar.update(1)
        time.sleep(3)

# Abschließend alles speichern
df = pd.DataFrame(all_papers)
df.to_csv('arxiv_papers_with_details_8.csv', index=False)

Fetching data: 2month [16:00, 480.16s/month]                    


In [11]:
import pandas as pd

# Read the CSV files
file1 = 'arxiv_papers.csv'
file2 = 'arxiv_papers_with_details_8.csv'

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Combine the dataframes and drop duplicates based on 'paper_id'
combined_df = pd.concat([df1, df2]).drop_duplicates(subset='paper_id')

# Extract the part after the last '/' in 'paper_id' and create a new column 'id'
combined_df['id'] = combined_df['paper_id'].apply(lambda x: x.split('/')[-1])

# Write the resulting dataframe to a new CSV file
output_file = 'complete_arxiv_papers.csv'
combined_df.to_csv(output_file, index=False)

print(f"The combined CSV file has been saved as {output_file}.")


The combined CSV file has been saved as complete_arxiv_papers.csv.


In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635250 entries, 0 to 635249
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   paper_id        635250 non-null  object
 1   published_date  635250 non-null  object
 2   categories      635250 non-null  object
 3   id              635250 non-null  object
dtypes: object(4)
memory usage: 19.4+ MB


In [13]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 644925 entries, 0 to 17154
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   paper_id        644925 non-null  object
 1   published_date  644925 non-null  object
 2   categories      644925 non-null  object
 3   id              644925 non-null  object
dtypes: object(4)
memory usage: 24.6+ MB


In [17]:
# Check for entries with the year 2025 in 'published_date'
entries_2025 = combined_df[combined_df['published_date'].str.contains('2025')]

if not entries_2025.empty:
    print("Entries with the year 2025 found")
    # Drop entries with the year 2025
    combined_df = combined_df[~combined_df['published_date'].str.contains('2025')]
else:
    print("No entries with the year 2025 found.")

No entries with the year 2025 found.


In [18]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 637601 entries, 0 to 17154
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   paper_id        637601 non-null  object
 1   published_date  637601 non-null  object
 2   categories      637601 non-null  object
 3   id              637601 non-null  object
dtypes: object(4)
memory usage: 24.3+ MB


In [19]:
output_file = 'complete_arxiv_papers.csv'
combined_df.to_csv(output_file, index=False)