In [19]:
import calendar

def get_date_range(year=2024, month=1):
    """
    根據給定年月取得該月份的起始和結束日期範圍
    格式：YYYYMMDDHHMM
    
    Args:
        year (int): 年份，預設 2024
        month (int): 月份 (1-12)
    
    Returns:
        tuple: (start_date, end_date)
    """
    # 檢查輸入
    if not 1 <= month <= 12:
        raise ValueError("月份必須在 1 到 12 之間")
    
    # 使用 calendar 獲取該月最後一天
    _, last_day = calendar.monthrange(year, month)
    
    # 格式化日期字串
    start_date = f"{year}{month:02d}010000"
    end_date = f"{year}{month:02d}{last_day:02d}0000"
    
    return start_date, end_date

# 測試每個月的日期範圍
if __name__ == "__main__":
    for month in range(1, 13):
        start, end = get_date_range(2024, month)
        print(f"2024年{month}月:")
        print(f"起始日期: {start}")
        print(f"結束日期: {end}")
        print()

2024年1月:
起始日期: 202401010000
結束日期: 202401310000

2024年2月:
起始日期: 202402010000
結束日期: 202402290000

2024年3月:
起始日期: 202403010000
結束日期: 202403310000

2024年4月:
起始日期: 202404010000
結束日期: 202404300000

2024年5月:
起始日期: 202405010000
結束日期: 202405310000

2024年6月:
起始日期: 202406010000
結束日期: 202406300000

2024年7月:
起始日期: 202407010000
結束日期: 202407310000

2024年8月:
起始日期: 202408010000
結束日期: 202408310000

2024年9月:
起始日期: 202409010000
結束日期: 202409300000

2024年10月:
起始日期: 202410010000
結束日期: 202410310000

2024年11月:
起始日期: 202411010000
結束日期: 202411300000

2024年12月:
起始日期: 202412010000
結束日期: 202412310000



In [None]:
##要更改 1-12 
然後重新run 以下的code 重複12 次
Change the month: to fetch that paper in that month
month=1


In [235]:
import arxiv
import pandas as pd

# 設定年份和每月的目標數量
year = 2024
papers_per_month = 2000

# 儲存結果
results = []



# 設定查詢篩選條件，包含類別和日期範圍
start_date,end_date=get_date_range(year=2024, month=month)
date_query = f"submittedDate:["+start_date+" TO "+end_date+"]"
print(date_query)

# 設定查詢條件
"""
search = arxiv.Search(
    query=f"au:del_maestro+AND+submittedDate:[202301010600+TO+202401010600]",
    max_results=papers_per_month,
    sort_by=arxiv.SortCriterion.SubmittedDate
)
"""

search = arxiv.Search(
    query=f"cat:cs.CL AND {date_query}",
    max_results=papers_per_month,
    sort_by=arxiv.SortCriterion.SubmittedDate
)



# 解析查詢結果
for result in search.results():
    results.append([
        result.title,
        ", ".join(author.name for author in result.authors),
        result.summary.replace("\n", " "),  # 清理摘要中的換行符號
        result.pdf_url,
        f"{year}-{month:02d}"  # 年月格式
    ])
    




submittedDate:[202412010000 TO 202412310000]


  for result in search.results():


In [236]:
# 將結果存成 DataFrame 並導出為 CSV
columns = ["Title", "Authors", "Abstract", "PDF Link", "Year-Month"]
df = pd.DataFrame(results, columns=columns)


In [237]:
df

Unnamed: 0,Title,Authors,Abstract,PDF Link,Year-Month
0,The Text Classification Pipeline: Starting Sha...,"Marco Siino, Ilenia Tinnirello, Marco La Cascia",Text Classification (TC) stands as a cornersto...,http://arxiv.org/pdf/2501.00174v1,2024-12
1,DeepLL: Considering Linear Logic for the Analy...,Nick Papoulias,Deep Learning experiments have critical requir...,http://arxiv.org/pdf/2501.00169v1,2024-12
2,Measuring Large Language Models Capacity to An...,"Subramaniam Vincent, Phoebe Wang, Zhan Shi, Sa...","Since the launch of ChatGPT in late 2022, the ...",http://arxiv.org/pdf/2501.00164v1,2024-12
3,Temporal reasoning for timeline summarisation ...,"Jiayu Song, Mahmud Akhter, Dana Atzil Slonim, ...",This paper explores whether enhancing temporal...,http://arxiv.org/pdf/2501.00152v1,2024-12
4,A Data-Centric Approach to Detecting and Mitig...,"Julia Ive, Paulina Bondaronek, Vishal Yadav, D...",Introduction: Healthcare AI models often inher...,http://arxiv.org/pdf/2501.00129v1,2024-12
...,...,...,...,...,...
1588,A Comparative Study of LLM-based ASR and Whisp...,"Zheshu Song, Ziyang Ma, Yifan Yang, Jianheng Z...",Large Language Models (LLMs) have showcased ex...,http://arxiv.org/pdf/2412.00721v2,2024-12
1589,Text Is Not All You Need: Multimodal Prompting...,Ashwin Baluja,While Large Language Models (LLMs) have demons...,http://arxiv.org/pdf/2412.05315v1,2024-12
1590,Multi-Agent Collaboration in Incident Response...,Zefang Liu,Incident response (IR) is a critical aspect of...,http://arxiv.org/pdf/2412.00652v2,2024-12
1591,ROSE: A Reward-Oriented Data Selection Framewo...,"Yang Wu, Huayi Zhang, Yizheng Jiao, Lin Ma, Xi...",Instruction tuning has underscored the signifi...,http://arxiv.org/pdf/2412.00631v1,2024-12


In [240]:
output_file = "data/papers_2024_"+f"{month:02d}"+".csv"
output_file 

'data/papers_2024_12.csv'

In [241]:

df.to_csv(output_file, index=False, encoding="utf-8-sig")

print(f"爬取完成，資料已儲存至 {output_file}")

爬取完成，資料已儲存至 data/papers_2024_12.csv
