In [16]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import os
from datetime import datetime, timedelta
import time
import re

# Configuration
ARXIV_API_URL = "http://export.arxiv.org/api/query"
BATCH_SIZE = 1000
REQUEST_INTERVAL = 4  # Respect rate limiting: 1 request per 3+ seconds
CSV_PATH = r"C:\arxiv_project\arxiv_dataset\articles.csv"
LOG_FILE = r"C:\arxiv_project\arxiv_dataset\daily_fetch.log"

def log_message(message):
    """Log messages to both console and file"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}"
    print(log_entry)
    
    # Ensure log directory exists
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry + '\n')

def extract_arxiv_data(entry, ns):
    """Extract all arXiv data from XML entry"""
    try:
        title = entry.find('atom:title', ns).text.strip()
        summary = entry.find('atom:summary', ns).text.strip()
        published = entry.find('atom:published', ns).text
        updated = entry.find('atom:updated', ns).text
        link = entry.find('atom:id', ns).text
        
        arxiv_id = link.split('/')[-1] if link else ""
        
        authors = [author.find('atom:name', ns).text for author in entry.findall('atom:author', ns)]
        authors_str = "; ".join(authors)  # Join authors for CSV
        
        categories = [cat.get('term') for cat in entry.findall('atom:category', ns)]
        categories_str = "; ".join(categories)  # Join categories for CSV
        
        primary_category = ""
        primary_elem = entry.find('atom:primary_category', ns)
        if primary_elem is not None:
            primary_category = primary_elem.get('term', '')
        
        doi = None
        doi_elem = entry.find('.//atom:link[@title="doi"]', ns)
        if doi_elem is not None:
            doi_href = doi_elem.get('href', '')
            if 'doi.org' in doi_href:
                doi = doi_href.split('doi.org/')[-1]
        
        journal_ref = None
        journal_elem = entry.find('atom:journal_ref', ns)
        if journal_elem is not None:
            journal_ref = journal_elem.text.strip()
        
        article = {
            "id": arxiv_id,
            "title": title,
            "author": authors_str,
            "published": published,
            "updated": updated,
            "link": link,
            "summary": summary,
            "primary_category": primary_category,
            "category": categories_str,
            "doi": doi,
            "journal_ref": journal_ref,
            "entry": link,
            "name": f"{authors[0] if authors else 'Unknown'} et al. - {title[:50]}...",
            "comment": None,
            "fetch_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        return article
    except Exception as e:
        log_message(f"❌ Error extracting data: {e}")
        return None

def get_last_fetch_date():
    """Get the last fetch date from CSV or log file"""
    if os.path.exists(CSV_PATH):
        try:
            # Read only the last few rows to get the most recent date
            df = pd.read_csv(CSV_PATH, usecols=['published'], nrows=None)
            if not df.empty:
                # Get the most recent published date
                df['published_date'] = pd.to_datetime(df['published']).dt.date
                last_date = df['published_date'].max()
                log_message(f"📅 Found last date in CSV: {last_date}")
                return last_date
        except Exception as e:
            log_message(f"⚠️ Error reading CSV for last date: {e}")
    
    # If no CSV exists or error, start from yesterday
    fallback_date = (datetime.now() - timedelta(days=1)).date()
    log_message(f"📅 No CSV found, using fallback date: {fallback_date}")
    return fallback_date

def fetch_daily_data(target_date):
    """Fetch arXiv data for a specific date"""
    start_str = target_date.strftime("%Y%m%d0000")
    end_str = target_date.strftime("%Y%m%d2359")
    query = f"submittedDate:[{start_str} TO {end_str}]"
    
    log_message(f"🔎 Fetching data for {target_date.strftime('%Y-%m-%d')}")
    
    session = requests.Session()
    session.headers.update({'User-Agent': 'ArxivDailyBot/1.0'})
    
    all_articles = []
    start = 0
    
    while True:
        params = {
            "search_query": query,
            "start": start,
            "max_results": BATCH_SIZE,
            "sortBy": "submittedDate"
        }
        
        try:
            response = session.get(ARXIV_API_URL, params=params, timeout=45)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            ns = {'atom': 'http://www.w3.org/2005/Atom'}
            entries = root.findall('atom:entry', ns)
            
            if not entries:
                break
            
            log_message(f"📄 Processing {len(entries)} entries (batch starting at {start})")
            
            for entry in entries:
                article = extract_arxiv_data(entry, ns)
                if article:
                    all_articles.append(article)
            
            start += BATCH_SIZE
            time.sleep(REQUEST_INTERVAL)
            
        except Exception as e:
            log_message(f"❌ Error fetching data: {e}")
            time.sleep(5)
            break
    
    session.close()
    log_message(f"✅ Fetched {len(all_articles)} articles for {target_date.strftime('%Y-%m-%d')}")
    return all_articles

def append_to_csv(articles):
    """Append new articles to CSV file"""
    if not articles:
        log_message("ℹ️ No articles to append")
        return
    
    # Ensure directory exists
    os.makedirs(os.path.dirname(CSV_PATH), exist_ok=True)
    
    # Convert to DataFrame
    df = pd.DataFrame(articles)
    
    # Check if CSV exists
    if os.path.exists(CSV_PATH):
        # Append to existing CSV
        df.to_csv(CSV_PATH, mode='a', header=False, index=False, encoding='utf-8')
        log_message(f"📝 Appended {len(articles)} articles to existing CSV")
    else:
        # Create new CSV with headers
        df.to_csv(CSV_PATH, mode='w', header=True, index=False, encoding='utf-8')
        log_message(f"📝 Created new CSV with {len(articles)} articles")

def remove_duplicates_from_csv():
    """Remove duplicate entries from CSV based on arXiv ID"""
    if not os.path.exists(CSV_PATH):
        return
    
    try:
        df = pd.read_csv(CSV_PATH)
        initial_count = len(df)
        
        # Remove duplicates based on 'id' column (arXiv ID)
        df = df.drop_duplicates(subset=['id'], keep='last')
        final_count = len(df)
        
        if initial_count != final_count:
            df.to_csv(CSV_PATH, index=False, encoding='utf-8')
            log_message(f"🧹 Removed {initial_count - final_count} duplicate entries")
        else:
            log_message("✅ No duplicates found")
            
    except Exception as e:
        log_message(f"❌ Error removing duplicates: {e}")

def main():
    """Main function to fetch daily data"""
    log_message("=== Daily ArXiv Data Fetcher Started ===")
    log_message(f"📁 CSV Path: {CSV_PATH}")
    log_message(f"📁 Log Path: {LOG_FILE}")
    
    try:
        # Test directory access
        csv_dir = os.path.dirname(CSV_PATH)
        log_message(f"📂 CSV Directory: {csv_dir}")
        log_message(f"📂 Directory exists: {os.path.exists(csv_dir)}")
        
        if not os.path.exists(csv_dir):
            log_message(f"🔧 Creating directory: {csv_dir}")
            os.makedirs(csv_dir, exist_ok=True)
        
        # Get the last fetch date
        last_fetch_date = get_last_fetch_date()
        log_message(f"📅 Last fetch date: {last_fetch_date}")
        
        # Calculate which dates to fetch
        yesterday = (datetime.now() - timedelta(days=1)).date()
        today = datetime.now().date()
        
        log_message(f"📅 Today: {today}")
        log_message(f"📅 Yesterday: {yesterday}")
        
        # Determine the date range to fetch
        if last_fetch_date >= yesterday:
            log_message(f"✅ Already up to date. Last fetch: {last_fetch_date}")
            log_message("🧪 For testing, let's try fetching yesterday's data anyway...")
            articles = fetch_daily_data(yesterday)
            if articles:
                append_to_csv(articles)
                log_message(f"🧪 Test fetch completed with {len(articles)} articles")
            else:
                log_message("🧪 No articles found for yesterday")
            return
        
        # Fetch data for missing dates (from day after last fetch to yesterday)
        current_date = last_fetch_date + timedelta(days=1)
        total_articles = 0
        
        log_message(f"📅 Will fetch from {current_date} to {yesterday}")
        
        # Check if the date range makes sense
        if current_date > yesterday:
            log_message(f"⚠️ No dates to fetch: current_date ({current_date}) > yesterday ({yesterday})")
            log_message("🧪 For testing, let's fetch yesterday's data...")
            articles = fetch_daily_data(yesterday)
            if articles:
                append_to_csv(articles)
                total_articles = len(articles)
                log_message(f"🧪 Test fetch completed with {len(articles)} articles")
            else:
                log_message("🧪 No articles found for yesterday")
        else:
            # Normal fetching loop
            while current_date <= yesterday:
                log_message(f"🔄 Processing date: {current_date}")
                articles = fetch_daily_data(current_date)
                if articles:
                    append_to_csv(articles)
                    total_articles += len(articles)
                else:
                    log_message(f"ℹ️ No articles found for {current_date}")
                
                current_date += timedelta(days=1)
                time.sleep(2)  # Small delay between dates
        
        # Remove duplicates
        remove_duplicates_from_csv()
        
        log_message(f"🎉 Daily fetch completed! Total new articles: {total_articles}")
        
        # Log CSV statistics
        if os.path.exists(CSV_PATH):
            df = pd.read_csv(CSV_PATH)
            log_message(f"📊 Total articles in CSV: {len(df):,}")
        else:
            log_message("❌ CSV file was not created!")
            
    except Exception as e:
        log_message(f"❌ Fatal error: {e}")
        import traceback
        log_message(f"❌ Traceback: {traceback.format_exc()}")
        raise

def test_simple_fetch():
    """Simple test function to fetch a small amount of recent data"""
    log_message("🧪 === SIMPLE TEST MODE ===")
    
    # Test with just 10 recent articles
    query = "cat:cs.AI"  # Computer Science - AI category
    
    params = {
        "search_query": query,
        "start": 0,
        "max_results": 10,
        "sortBy": "submittedDate"
    }
    
    try:
        log_message("🔄 Making test API request...")
        response = requests.get(ARXIV_API_URL, params=params, timeout=30)
        response.raise_for_status()
        
        log_message(f"✅ API Response Status: {response.status_code}")
        log_message(f"📄 Response Length: {len(response.content)} bytes")
        
        root = ET.fromstring(response.content)
        ns = {'atom': 'http://www.w3.org/2005/Atom'}
        entries = root.findall('atom:entry', ns)
        
        log_message(f"📊 Found {len(entries)} entries")
        
        if entries:
            articles = []
            for entry in entries:
                article = extract_arxiv_data(entry, ns)
                if article:
                    articles.append(article)
                    log_message(f"📄 Article: {article['title'][:50]}...")
            
            if articles:
                log_message(f"💾 Attempting to save {len(articles)} articles to CSV...")
                append_to_csv(articles)
                
                # Check if file was created
                if os.path.exists(CSV_PATH):
                    file_size = os.path.getsize(CSV_PATH)
                    log_message(f"✅ CSV created successfully! Size: {file_size} bytes")
                else:
                    log_message("❌ CSV file was not created!")
            else:
                log_message("❌ No articles extracted from API response")
        else:
            log_message("❌ No entries found in API response")
            
    except Exception as e:
        log_message(f"❌ Test failed: {e}")
        import traceback
        log_message(f"❌ Traceback: {traceback.format_exc()}")

if __name__ == "__main__":
    # Add command line argument for test mode
    import sys
    
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        test_simple_fetch()
    else:
        main()

[2025-07-03 21:51:49] === Daily ArXiv Data Fetcher Started ===
[2025-07-03 21:51:49] 📁 CSV Path: C:\arxiv_project\arxiv_dataset\articles.csv
[2025-07-03 21:51:49] 📁 Log Path: C:\arxiv_project\arxiv_dataset\daily_fetch.log
[2025-07-03 21:51:49] 📂 CSV Directory: C:\arxiv_project\arxiv_dataset
[2025-07-03 21:51:49] 📂 Directory exists: True
[2025-07-03 21:52:14] 📅 Found last date in CSV: 2025-07-02
[2025-07-03 21:52:14] 📅 Last fetch date: 2025-07-02
[2025-07-03 21:52:14] 📅 Today: 2025-07-03
[2025-07-03 21:52:14] 📅 Yesterday: 2025-07-02
[2025-07-03 21:52:14] ✅ Already up to date. Last fetch: 2025-07-02
[2025-07-03 21:52:14] 🧪 For testing, let's try fetching yesterday's data anyway...
[2025-07-03 21:52:14] 🔎 Fetching data for 2025-07-02
[2025-07-03 21:52:15] 📄 Processing 706 entries (batch starting at 0)
[2025-07-03 21:52:20] ✅ Fetched 706 articles for 2025-07-02
[2025-07-03 21:52:20] 📝 Appended 706 articles to existing CSV
[2025-07-03 21:52:20] 🧪 Test fetch completed with 706 articles
