In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import os
from datetime import datetime, timedelta
import time
import re

# Configuration
# Endpoint that every batch request is sent to.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
# Page size: sent as ``max_results`` and also used as the paging step.
BATCH_SIZE = 1000
REQUEST_INTERVAL = 4  # Seconds slept after each successful batch (rate limiting)
# Output locations (absolute Windows paths); their parent directories are
# created on demand by the functions that write to them.
CSV_PATH = r"C:\arxiv_project\data\raw\articles.csv"
LOG_FILE = r"C:\arxiv_project\data\logs\daily_fetch.log"

def log_message(message):
    """Print ``message`` with a timestamp prefix and append it to LOG_FILE.

    The log directory is created if it does not exist yet. Each call opens
    the log file in append mode and writes exactly one line.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{stamp}] {message}"
    print(line)

    log_dir = os.path.dirname(LOG_FILE)
    os.makedirs(log_dir, exist_ok=True)
    with open(LOG_FILE, mode='a', encoding='utf-8') as log_handle:
        log_handle.write(f"{line}\n")

def extract_arxiv_data(entry, ns):
    """Flatten one Atom ``<entry>`` element of an arXiv API feed into a dict.

    Args:
        entry: ``xml.etree.ElementTree.Element`` for a single ``<entry>``.
        ns: Prefix-to-URI mapping used for lookups; it must map ``'atom'``
            to the Atom namespace. An ``'arxiv'`` prefix for the arXiv
            extension namespace is added when the caller did not supply one.

    Returns:
        A dict with the keys ``id``, ``title``, ``author``, ``published``,
        ``updated``, ``link``, ``summary``, ``primary_category``,
        ``category``, ``doi``, ``journal_ref``, ``entry``, ``name``,
        ``comment`` (always ``None``) and ``fetch_date``; or ``None`` when a
        required element is missing or malformed (the error is logged).
    """
    try:
        # arXiv extension elements (primary_category, journal_ref, doi) live
        # in their own namespace, not in Atom's. The caller's mapping wins if
        # it already defines the 'arxiv' prefix.
        lookup_ns = {'arxiv': 'http://arxiv.org/schemas/atom', **ns}

        def find_extension(tag):
            """Find an arXiv extension element, falling back to the Atom namespace."""
            elem = entry.find(f'arxiv:{tag}', lookup_ns)
            if elem is None:
                elem = entry.find(f'atom:{tag}', lookup_ns)
            return elem

        # Required fields: a missing element raises and the entry is skipped.
        title = entry.find('atom:title', ns).text.strip()
        summary = entry.find('atom:summary', ns).text.strip()
        published = entry.find('atom:published', ns).text
        updated = entry.find('atom:updated', ns).text
        link = entry.find('atom:id', ns).text

        if not re.match(r"^\d{4}-\d{2}-\d{2}T", published):
            raise ValueError(f"Invalid published date: {published}")

        if not link:
            arxiv_id = ""
        elif '/abs/' in link:
            # Keep the archive prefix of old-style ids ("hep-th/9901001v1");
            # taking only the last path segment would make them collide.
            arxiv_id = link.split('/abs/', 1)[1]
        else:
            arxiv_id = link.split('/')[-1]

        # Optional pieces: skip incomplete authors/categories instead of
        # discarding the whole article.
        authors = []
        for author in entry.findall('atom:author', ns):
            name_elem = author.find('atom:name', ns)
            if name_elem is not None and name_elem.text:
                authors.append(name_elem.text)
        authors_str = "; ".join(authors)

        terms = (cat.get('term') for cat in entry.findall('atom:category', ns))
        categories = [term for term in terms if term]
        categories_str = "; ".join(categories)

        primary_category = ""
        primary_elem = find_extension('primary_category')
        if primary_elem is not None:
            primary_category = primary_elem.get('term', '')

        doi = None
        doi_link = entry.find('.//atom:link[@title="doi"]', ns)
        if doi_link is not None:
            doi_href = doi_link.get('href', '')
            if 'doi.org' in doi_href:
                doi = doi_href.split('doi.org/')[-1]
        if doi is None:
            # Fall back to the dedicated <arxiv:doi> element when present.
            doi_elem = find_extension('doi')
            if doi_elem is not None and doi_elem.text:
                doi = doi_elem.text.strip()

        journal_ref = None
        journal_elem = find_extension('journal_ref')
        if journal_elem is not None:
            # An empty element has text None; treat it as an empty string.
            journal_ref = (journal_elem.text or "").strip()

        first_author = authors[0] if authors else 'Unknown'
        return {
            "id": arxiv_id,
            "title": title,
            "author": authors_str,
            "published": published,
            "updated": updated,
            "link": link,
            "summary": summary,
            "primary_category": primary_category,
            "category": categories_str,
            "doi": doi,
            "journal_ref": journal_ref,
            "entry": link,
            "name": f"{first_author} et al. - {title[:50]}...",
            "comment": None,
            "fetch_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
    except Exception as e:
        # Best effort: one malformed entry must not abort the whole batch.
        log_message(f"❌ Error extracting data: {e}")
        return None

def get_last_fetch_date():
    """Return the most recent ``published`` date stored in CSV_PATH.

    Only values that start with an ISO ``YYYY-MM-DDT`` prefix are considered.

    Returns:
        datetime.date: the latest date found, or yesterday's date when the
        CSV is missing, cannot be read, or contains no usable date.
    """
    csv_found = os.path.exists(CSV_PATH)
    if csv_found:
        try:
            df = pd.read_csv(CSV_PATH, usecols=['published'])
            df = df[df['published'].astype(str).str.match(r"^\d{4}-\d{2}-\d{2}T")]
            published_dates = pd.to_datetime(df['published'], errors='coerce').dt.date.dropna()
            if not published_dates.empty:
                last_date = published_dates.max()
                log_message(f"📅 Found last date in CSV: {last_date}")
                return last_date
            # max() of an empty column is NaN, which callers cannot compare
            # with a date, so fall through to the fallback instead.
            log_message("⚠️ CSV contains no valid published dates")
        except Exception as e:
            log_message(f"⚠️ Error reading CSV for last date: {e}")

    fallback_date = (datetime.now() - timedelta(days=1)).date()
    if csv_found:
        log_message(f"📅 CSV unusable, using fallback date: {fallback_date}")
    else:
        log_message(f"📅 No CSV found, using fallback date: {fallback_date}")
    return fallback_date

def fetch_daily_data(target_date):
    """Download all arXiv entries whose submittedDate falls on ``target_date``.

    Results are requested in pages of BATCH_SIZE. Paging stops when a page is
    empty, when the feed's ``opensearch:totalResults`` count has been
    covered, or when a request or XML parse fails.

    Args:
        target_date: ``date``/``datetime`` identifying the day to fetch.

    Returns:
        list[dict]: articles produced by ``extract_arxiv_data``. The list may
        be partial if a request failed part-way; the failure is logged.
    """
    day_label = target_date.strftime('%Y-%m-%d')
    start_str = target_date.strftime("%Y%m%d0000")
    end_str = target_date.strftime("%Y%m%d2359")
    query = f"submittedDate:[{start_str} TO {end_str}]"

    log_message(f"🔎 Fetching data for {day_label}")

    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    total_tag = '{http://a9.com/-/spec/opensearch/1.1/}totalResults'
    all_articles, start = [], 0

    # The context manager closes the session even if an exception escapes.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'ArxivDailyBot/1.0'})

        while True:
            params = {
                "search_query": query,
                "start": start,
                "max_results": BATCH_SIZE,
                "sortBy": "submittedDate"
            }

            # Only the network call and the parse are expected to fail here;
            # per-entry errors are handled inside extract_arxiv_data.
            try:
                response = session.get(ARXIV_API_URL, params=params, timeout=45)
                response.raise_for_status()
                root = ET.fromstring(response.content)
            except (requests.RequestException, ET.ParseError) as e:
                log_message(f"❌ Error fetching data: {e}")
                time.sleep(5)
                break

            entries = root.findall('atom:entry', ns)
            if not entries:
                break

            log_message(f"📄 Processing {len(entries)} entries (batch starting at {start})")
            for entry in entries:
                article = extract_arxiv_data(entry, ns)
                if article:
                    all_articles.append(article)

            start += BATCH_SIZE
            # Always pause after a request so the next one (here or in the
            # caller's next day) keeps the rate-limit spacing.
            time.sleep(REQUEST_INTERVAL)

            # Skip the extra request that would only confirm an empty page.
            total_text = (root.findtext(total_tag) or "").strip()
            if total_text.isdigit() and start >= int(total_text):
                break

    log_message(f"✅ Fetched {len(all_articles)} articles for {day_label}")
    return all_articles

def append_to_csv(articles):
    """Write ``articles`` to CSV_PATH, appending when the file already exists.

    Rows whose ``published`` value does not start with ``YYYY-MM-DDT`` are
    dropped before writing. A header row is written only when the file is
    created. Nothing is written when ``articles`` is empty.

    Args:
        articles: list of article dicts; each must have a ``published`` key.
    """
    if not articles:
        log_message("ℹ️ No articles to append")
        return

    os.makedirs(os.path.dirname(CSV_PATH), exist_ok=True)

    frame = pd.DataFrame(articles)
    has_iso_date = frame['published'].astype(str).str.match(r"^\d{4}-\d{2}-\d{2}T")
    frame = frame[has_iso_date]

    already_there = os.path.exists(CSV_PATH)
    frame.to_csv(
        CSV_PATH,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=False,
        encoding='utf-8',
    )

    if already_there:
        log_message(f"📝 Appended {len(frame)} articles to existing CSV")
    else:
        log_message(f"📝 Created new CSV with {len(frame)} articles")

def remove_duplicates_from_csv():
    """Drop rows of CSV_PATH that share an ``id``, keeping the last occurrence.

    The file is rewritten only when duplicates were found. The new content is
    written to a temporary sibling file and then moved over CSV_PATH, so an
    interrupted write cannot truncate the existing dataset. Any error is
    logged and swallowed; the function never raises to the caller for
    read/write failures.
    """
    if not os.path.exists(CSV_PATH):
        return

    try:
        df = pd.read_csv(CSV_PATH, dtype=str, low_memory=False)
        initial_count = len(df)
        df = df.drop_duplicates(subset=['id'], keep='last')
        final_count = len(df)

        if initial_count == final_count:
            log_message("✅ No duplicates found")
            return

        tmp_path = CSV_PATH + ".tmp"
        try:
            df.to_csv(tmp_path, index=False, encoding='utf-8')
            # Replace in one step so CSV_PATH is never left half-written.
            os.replace(tmp_path, CSV_PATH)
        finally:
            # Only present if the write or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        log_message(f"🧹 Removed {initial_count - final_count} duplicate entries")
    except Exception as e:
        log_message(f"❌ Error removing duplicates: {e}")

def main():
    """Fetch every day between the last stored date and yesterday.

    Steps: log the configuration, make sure the CSV directory exists, find
    the last date present in the CSV, fetch and append each missing day,
    de-duplicate the CSV and log the final row count.

    When the CSV is already up to date, yesterday is fetched once more and
    the CSV is de-duplicated afterwards.

    Raises:
        Exception: any unexpected error is logged with its traceback and
        re-raised.
    """
    log_message("=== Daily ArXiv Data Fetcher Started ===")
    log_message(f"📁 CSV Path: {CSV_PATH}")
    log_message(f"📁 Log Path: {LOG_FILE}")

    try:
        csv_dir = os.path.dirname(CSV_PATH)
        log_message(f"📂 CSV Directory: {csv_dir}")
        log_message(f"📂 Directory exists: {os.path.exists(csv_dir)}")
        os.makedirs(csv_dir, exist_ok=True)

        last_fetch_date = get_last_fetch_date()
        # Read the clock once so today/yesterday cannot straddle midnight.
        today = datetime.now().date()
        yesterday = today - timedelta(days=1)
        log_message(f"📅 Today: {today}")
        log_message(f"📅 Yesterday: {yesterday}")

        if last_fetch_date >= yesterday:
            log_message(f"✅ Already up to date. Last fetch: {last_fetch_date}")
            log_message("🧪 For testing, let's try fetching yesterday's data anyway...")
            articles = fetch_daily_data(yesterday)
            if articles:
                append_to_csv(articles)
                # These rows are most likely already stored; without this
                # every rerun would leave another copy in the CSV.
                remove_duplicates_from_csv()
                log_message(f"🧪 Test fetch completed with {len(articles)} articles")
            return

        current_date = last_fetch_date + timedelta(days=1)
        total_articles = 0

        while current_date <= yesterday:
            log_message(f"🔄 Processing date: {current_date}")
            articles = fetch_daily_data(current_date)
            if articles:
                append_to_csv(articles)
                total_articles += len(articles)
            else:
                log_message(f"ℹ️ No articles found for {current_date}")
            current_date += timedelta(days=1)
            time.sleep(2)

        remove_duplicates_from_csv()
        log_message(f"🎉 Daily fetch completed! Total new articles: {total_articles}")

        if os.path.exists(CSV_PATH):
            df = pd.read_csv(CSV_PATH, dtype=str, low_memory=False)
            log_message(f"📊 Total articles in CSV: {len(df):,}")
    except Exception as e:
        log_message(f"❌ Fatal error: {e}")
        import traceback
        log_message(f"❌ Traceback: {traceback.format_exc()}")
        raise

if __name__ == "__main__":
    import sys

    # Script entry point: "test" as the first command-line argument selects
    # the test routine; anything else (or no argument) runs the daily fetch.
    # NOTE(review): test_simple_fetch is not defined in the visible part of
    # this file — confirm it exists before relying on the "test" argument.
    if sys.argv[1:2] != ["test"]:
        main()
    else:
        test_simple_fetch()


[2025-07-04 16:19:09] === Daily ArXiv Data Fetcher Started ===
[2025-07-04 16:19:09] 📁 CSV Path: C:\arxiv_project\arxiv_dataset\articles.csv
[2025-07-04 16:19:09] 📁 Log Path: C:\arxiv_project\arxiv_dataset\daily_fetch.log
[2025-07-04 16:19:09] 📂 CSV Directory: C:\arxiv_project\arxiv_dataset
[2025-07-04 16:19:09] 📂 Directory exists: True
[2025-07-04 16:19:37] 📅 Found last date in CSV: 2025-07-02
[2025-07-04 16:19:37] 📅 Today: 2025-07-04
[2025-07-04 16:19:37] 📅 Yesterday: 2025-07-03
[2025-07-04 16:19:37] 🔄 Processing date: 2025-07-03
[2025-07-04 16:19:37] 🔎 Fetching data for 2025-07-03
[2025-07-04 16:19:56] 📄 Processing 659 entries (batch starting at 0)
[2025-07-04 16:20:05] ✅ Fetched 659 articles for 2025-07-03
[2025-07-04 16:20:05] 📝 Appended 659 articles to existing CSV
[2025-07-04 16:22:08] 🧹 Removed 659 duplicate entries
[2025-07-04 16:22:09] 🎉 Daily fetch completed! Total new articles: 659


  df = pd.read_csv(CSV_PATH)


[2025-07-04 16:23:01] 📊 Total articles in CSV: 2,244,856
