# In [None]:
import requests
import xml.etree.ElementTree as ET
from pymongo import MongoClient, errors
from datetime import datetime, timedelta
import time
import concurrent.futures
import re

# MongoDB connection settings.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "arxiv"
COLLECTION_NAME = "articles"

# arXiv API endpoint plus paging and throttling settings.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
BATCH_SIZE = 1000  # sent as max_results on each API request
MAX_THREADS = 2    # Reduced to respect rate limiting
REQUEST_INTERVAL = 4  # Seconds slept between requests; raised to respect "1 request / 3 seconds"
# NOTE(review): the sleep is applied per worker thread, so with MAX_THREADS > 1
# the combined request rate can exceed one request per REQUEST_INTERVAL
# seconds - confirm this is acceptable.

# Module-level MongoDB client and collection handle shared by the whole script.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# The unique index on "link" makes re-inserting an already stored article fail
# as a duplicate; "published" gets a regular (non-unique) index.
collection.create_index("link", unique=True)
collection.create_index("published")

# Namespace in which arXiv publishes its Atom extension elements
# (primary_category, journal_ref, comment, doi).
_ARXIV_SCHEMA_NS = "http://arxiv.org/schemas/atom"


def _stripped_text(element):
    """Return the element's text without surrounding whitespace, or None.

    None is returned when the element is missing or has no text at all.
    """
    if element is None or element.text is None:
        return None
    return element.text.strip()


def _find_arxiv_element(entry, tag, ns):
    """Return the arXiv extension child *tag* of *entry*, or None.

    The arXiv schema namespace is tried first; the Atom-prefixed lookup used
    previously is kept as a fallback.
    """
    element = entry.find(f'{{{_ARXIV_SCHEMA_NS}}}{tag}')
    if element is None:
        element = entry.find(f'atom:{tag}', ns)
    return element


def extract_arxiv_data(entry, ns):
    """Convert one ``<entry>`` element of an arXiv Atom feed into a dict.

    Args:
        entry: ElementTree element for a single ``<entry>``.
        ns: Prefix map that binds ``atom`` to the Atom namespace.

    Returns:
        A dict with the keys ``id``, ``title``, ``author``, ``published``,
        ``updated``, ``link``, ``summary``, ``primary_category``,
        ``category``, ``doi``, ``journal_ref``, ``entry``, ``name`` and
        ``comment``; or None when a required element (title, id, published,
        updated) is missing, in which case the error is printed.
    """
    try:
        title = entry.find('atom:title', ns).text.strip()
        # An empty or missing abstract should not discard the whole article.
        summary = _stripped_text(entry.find('atom:summary', ns)) or ""
        published = entry.find('atom:published', ns).text
        updated = entry.find('atom:updated', ns).text
        link = entry.find('atom:id', ns).text

        # Keep everything after "/abs/" so that old-style identifiers retain
        # their archive prefix (e.g. "hep-th/9901001v1"); the bare number
        # alone is ambiguous across archives.
        if not link:
            arxiv_id = ""
        elif '/abs/' in link:
            arxiv_id = link.split('/abs/', 1)[1]
        else:
            arxiv_id = link.split('/')[-1]

        # Skip authors without a usable <name> instead of failing the entry.
        authors = []
        for author in entry.findall('atom:author', ns):
            name_elem = author.find('atom:name', ns)
            if name_elem is not None and name_elem.text:
                authors.append(name_elem.text)

        categories = [cat.get('term') for cat in entry.findall('atom:category', ns)]

        primary_category = ""
        primary_elem = _find_arxiv_element(entry, 'primary_category', ns)
        if primary_elem is not None:
            primary_category = primary_elem.get('term', '')

        # Prefer the resolver link; fall back to the <arxiv:doi> element.
        doi = None
        doi_elem = entry.find('.//atom:link[@title="doi"]', ns)
        if doi_elem is not None:
            doi_href = doi_elem.get('href', '')
            if 'doi.org' in doi_href:
                doi = doi_href.split('doi.org/')[-1]
        if doi is None:
            doi = _stripped_text(_find_arxiv_element(entry, 'doi', ns))

        journal_ref = _stripped_text(_find_arxiv_element(entry, 'journal_ref', ns))
        comment = _stripped_text(_find_arxiv_element(entry, 'comment', ns))

        article = {
            "id": arxiv_id,
            "title": title,
            "author": authors,
            "published": published,
            "updated": updated,
            "link": link,
            "summary": summary,
            "primary_category": primary_category,
            "category": categories,
            "doi": doi,
            "journal_ref": journal_ref,
            "entry": link,  # Same as link for arXiv
            "name": f"{authors[0] if authors else 'Unknown'} et al. - {title[:50]}...",
            "comment": comment,
        }

        return article
    except Exception as e:
        # Best effort: a malformed entry is reported and skipped by the caller.
        print(f"❌ Erreur extraction: {e}")
        return None

def fetch_and_insert_period(start_date, end_date):
    """Download all arXiv entries submitted in a date range into MongoDB.

    The range is queried page by page (BATCH_SIZE results per request) and
    every page is bulk-inserted into the module-level ``collection``.
    A failed page is retried a bounded number of times instead of abandoning
    the rest of the period; retrying is safe because the unique index on
    "link" rejects articles that were already stored.

    Args:
        start_date: First day of the period (inclusive, from 00:00).
        end_date: Last day of the period (inclusive, until 23:59).

    Returns:
        Number of documents newly inserted for this period.
    """
    start_str = start_date.strftime("%Y%m%d0000")
    end_str = end_date.strftime("%Y%m%d2359")
    query = f"submittedDate:[{start_str} TO {end_str}]"
    print(f"🔎 Thread démarré pour {start_str} ➡️ {end_str}")

    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    opensearch_ns = {'opensearch': 'http://a9.com/-/spec/opensearch/1.1/'}
    max_attempts = 3   # consecutive failures tolerated for a single page
    retry_delay = 5    # seconds to wait before retrying a failed page

    start = 0
    total_inserted = 0
    failures = 0
    limit_warned = False

    # The context manager closes the session even if something unexpected
    # propagates out of the loop.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'ArxivBot/1.0'})

        while True:
            params = {
                "search_query": query,
                "start": start,
                "max_results": BATCH_SIZE,
                "sortBy": "submittedDate"
            }
            total_results = None

            try:
                response = session.get(ARXIV_API_URL, params=params, timeout=45)
                response.raise_for_status()
                root = ET.fromstring(response.content)
                entries = root.findall('atom:entry', ns)

                total_results_elem = root.find('.//opensearch:totalResults', opensearch_ns)
                if total_results_elem is not None and total_results_elem.text:
                    total_results = int(total_results_elem.text)
                    # Warn once per period rather than on every page.
                    if total_results > 30000 and not limit_warned:
                        limit_warned = True
                        print(f"⚠️ ATTENTION: {total_results} résultats pour {start_str}, limite 30k atteinte!")
                        print("   Seuls les premiers 30k seront récupérés. Affinez la période.")

                if not entries:
                    break

                articles = []
                for entry in entries:
                    article = extract_arxiv_data(entry, ns)
                    if article:
                        articles.append(article)

                if articles:
                    try:
                        result = collection.insert_many(articles, ordered=False)
                        inserted_count = len(result.inserted_ids)
                        total_inserted += inserted_count
                        print(f"✅ {inserted_count} articles insérés pour {start_str}")
                    except errors.BulkWriteError as bwe:
                        inserted = bwe.details.get("nInserted", 0)
                        total_inserted += inserted
                        print(f"⚠️ {inserted} nouveaux, doublons ignorés ({start_str})")
                        # Code 11000 is a duplicate-key error; anything else is
                        # a real write failure and must not be reported as a
                        # duplicate.
                        other_errors = [
                            write_error
                            for write_error in bwe.details.get("writeErrors", [])
                            if write_error.get("code") != 11000
                        ]
                        if other_errors:
                            print(f"❌ {len(other_errors)} erreurs d'écriture hors doublons ({start_str})")

            except Exception as e:
                failures += 1
                print(f"❌ Erreur pour {start_str}: {e}")
                if failures >= max_attempts:
                    break
                time.sleep(retry_delay)  # Longer pause after an error, then retry the same page
                continue

            failures = 0
            start += BATCH_SIZE
            # Always pause after a request so the next request (from this
            # period or the next one handled by this thread) is spaced out.
            time.sleep(REQUEST_INTERVAL)

            # Stop as soon as the reported total has been paged through,
            # which avoids one extra request that would only return no entries.
            if total_results is not None and start >= total_results:
                break

    print(f"📊 Total inséré pour {start_str}: {total_inserted}")
    return total_inserted

def generate_week_ranges(start_date, end_date):
    """Yield consecutive ``(range_start, range_end)`` pairs covering a span.

    Periods are kept SHORT to stay below the arXiv 30k/50k result limits:
    each range spans ``period_days`` days past its start, where
    ``period_days`` is 2 from 2020 on, 5 from 2010 on and 7 before that.
    Both ends of a range are meant to be inclusive, so the next range starts
    one day after the previous range end.

    Args:
        start_date: First datetime to cover.
        end_date: Last datetime to cover (inclusive).

    Yields:
        Tuples ``(range_start, range_end)``; the final range is clipped to
        ``end_date``. Nothing is yielded when ``start_date > end_date``.
    """
    current = start_date
    # "<=" rather than "<": when the previous range ended the day before
    # end_date, the final day still needs its own single-day range.
    while current <= end_date:
        if current.year >= 2020:
            period_days = 2
        elif current.year >= 2010:
            period_days = 5
        else:
            period_days = 7

        range_end = min(current + timedelta(days=period_days), end_date)
        yield (current, range_end)
        current = range_end + timedelta(days=1)

if __name__ == "__main__":
    # Script entry point: split 1991-01-01..now into short periods, import
    # them with a small thread pool, then print a summary.
    print("=== Importation arXiv OPTIMISÉE ===")

    start_date = datetime(1991, 1, 1)
    end_date = datetime.now()

    print(f"📅 Période: {start_date.strftime('%Y-%m-%d')} ➡️ {end_date.strftime('%Y-%m-%d')}")

    ranges = list(generate_week_ranges(start_date, end_date))
    print(f"📊 {len(ranges)} périodes à traiter avec {MAX_THREADS} threads")
    print("⚠️ IMPORTANT: Périodes courtes pour éviter les limites arXiv (30k/requête)")
    print(f"🐌 Rate limiting respecté: 1 requête/{REQUEST_INTERVAL}s par thread")

    start_time = time.time()
    total_articles = 0

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            futures = [executor.submit(fetch_and_insert_period, start, end) for start, end in ranges]

            for future in concurrent.futures.as_completed(futures):
                try:
                    count = future.result()
                    total_articles += count
                except Exception as e:
                    # One failed period must not abort the whole import.
                    print(f"❌ Erreur thread: {e}")

        elapsed_time = time.time() - start_time
        elapsed_minutes = elapsed_time / 60
        # Guard against a zero elapsed time (e.g. nothing to process).
        rate = total_articles / elapsed_minutes if elapsed_minutes > 0 else 0
        print("\n🎉 IMPORTATION TERMINÉE!")
        print(f"📈 {total_articles:,} articles traités")
        print(f"⏱️ Temps: {elapsed_minutes:.1f} minutes")
        print(f"⚡ Vitesse: {rate:.0f} articles/minute")

        try:
            total_db = collection.count_documents({})
            print(f"💾 Total en base: {total_db:,} articles")
        except errors.PyMongoError as e:
            # The count is informational only, but a failure should be visible.
            print(f"⚠️ Impossible de compter les documents: {e}")
    finally:
        # Close the connection even if the import is interrupted.
        client.close()
        print("🔐 Connexion fermée")