# ICE Building Workflow - Knowledge Graph Construction

**Purpose**: Comprehensive data ingestion and knowledge graph building for investment intelligence
**Architecture**: ICE Simplified (2,508 lines) with LightRAG integration
**Input**: Financial data from multiple sources → **Output**: Searchable knowledge graph

## Workflow Overview

1. **Environment Setup** - Initialize ICE system and configure data sources
2. **Workflow Mode Selection** - Choose between initial build or incremental update
3. **Data Ingestion** - Fetch financial data from APIs and process documents
4. **Knowledge Graph Building** - Extract entities, relationships, and build LightRAG graph
5. **Storage & Validation** - Verify graph construction and monitor storage
6. **Metrics & Monitoring** - Track processing metrics and system health

---

<!-- ## 1. Environment Setup & System Initialization -->

In [None]:
# Cell 1

# üîß Environment Setup
import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Put the notebook's working directory at the front of sys.path so project
# packages are importable from it.
project_root = Path.cwd()
sys.path.insert(0, str(project_root))

# Re-export the OpenAI key. Note: when the variable is unset this stores an
# empty string instead of leaving it undefined.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', '')

# Enable Docling for professional-grade document parsing (97.9% table accuracy)
# Phase 2 implementation (2025-11-04): Docling for both email attachments AND URL PDFs
os.environ['USE_DOCLING_EMAIL'] = 'true'   # Email attachments: 42% → 97.9%
os.environ['USE_DOCLING_URLS'] = 'true'    # URL PDFs: 42% → 97.9% (PHASE 2 - NEW)

# Enable Crawl4AI for complex URL processing (JS-heavy sites, portals, paywalls)
# Set to 'true' to enable browser automation for Tier 3-5 URLs
# Set to 'false' (default) for simple HTTP only
os.environ['USE_CRAWL4AI_LINKS'] = 'true'  # Enable Crawl4AI
os.environ['CRAWL4AI_TIMEOUT'] = '80'      # 80 second timeout (the documented default is 60)
os.environ['CRAWL4AI_HEADLESS'] = 'true'   # Run browser in background

# URL Rate Limiting (added 2025-11-05 for robustness)
os.environ['URL_RATE_LIMIT_DELAY'] = '1.0'      # Seconds between requests per domain (default: 1.0)
os.environ['URL_CONCURRENT_DOWNLOADS'] = '3'     # Max concurrent downloads (default: 3)

# Echo the effective settings so the cell output records what was configured.
print("‚úÖ Environment setup complete")
print(f"üìÅ Working directory: {project_root}")
print(f"üìä Docling enabled (email): {os.environ.get('USE_DOCLING_EMAIL', 'false')}")
print(f"üìä Docling enabled (URLs): {os.environ.get('USE_DOCLING_URLS', 'false')}")
print(f"üåê Crawl4AI enabled: {os.environ.get('USE_CRAWL4AI_LINKS', 'false')}")

# Sanity check: print the raw Docling flags set above (no fallback value).
print(f"Email Docling: {os.environ.get('USE_DOCLING_EMAIL')}")  # Should show: true
print(f"URL Docling: {os.environ.get('USE_DOCLING_URLS')}")     # Should show: true

In [None]:
# Cell 2
# Initialize ICE system
from updated_architectures.implementation.ice_simplified import create_ice_system

# Bring up the ICE system. Any failure is reported and then re-raised so the
# full traceback stays visible in the notebook.
try:
    ice = create_ice_system()
    system_ready = ice.is_ready()
    lightrag_state = 'Ready' if system_ready else 'Initializing'
    print(f"‚úÖ ICE System Initialized")
    print(f"üß† LightRAG Status: {lightrag_state}")
    print(f"üìä Architecture: ICE Simplified (2,508 lines)")
    print(f"üîó Components: Core + Ingester + QueryEngine")
except Exception as init_error:
    print(f"‚ùå Initialization Error: {init_error}")
    raise  # Let errors surface for proper debugging


##############################################################################
# Attachment-processor diagnostic: reports which processor (if any) the
# ingester ended up with. The processor lives at
# ice.ingester.attachment_processor (not ice.data_sources).
print('#' * 150)
print("=" * 70)
print("ATTACHMENT PROCESSOR DIAGNOSTIC (CORRECTED)")
print("=" * 70)

if not hasattr(ice, 'ingester'):
    print("‚ùå ice.ingester attribute missing")
    print("   ‚Üí ICE system not properly initialized")
else:
    print(f"‚úÖ ice.ingester exists: {type(ice.ingester).__name__}")

    if not hasattr(ice.ingester, 'attachment_processor'):
        print("‚ùå attachment_processor attribute missing from ingester")
        print("   ‚Üí This is a code structure issue")
    elif ice.ingester.attachment_processor is None:
        print("‚ùå attachment_processor is None (initialization failed)")
        print("   ‚Üí PDFs will be downloaded but NOT processed/ingested")
        print("\nüîç Why this happens:")
        print("   - ImportError during DoclingProcessor/AttachmentProcessor import")
        print("   - Exception during processor initialization")
        print("\nüìã Check Cell 3 output for:")
        print("   - '‚ö†Ô∏è Attachment processor initialization failed: [error]'")
    else:
        processor_type = type(ice.ingester.attachment_processor).__name__
        print(f"‚úÖ attachment_processor initialized: {processor_type}")
        print("   ‚Üí PDFs WILL be processed and ingested")

        # The class name tells the two processor implementations apart
        if 'Docling' not in processor_type:
            print("   ‚Üí Using AttachmentProcessor (42% table accuracy, PyPDF2)")
        else:
            print("   ‚Üí Using DoclingProcessor (97.9% table accuracy)")

print("=" * 70)

In [None]:
# Attachment-processor diagnostic: reports which processor (if any) the
# ingester ended up with. The processor lives at
# ice.ingester.attachment_processor (not ice.data_sources).
print("=" * 70)
print("ATTACHMENT PROCESSOR DIAGNOSTIC (CORRECTED)")
print("=" * 70)

if not hasattr(ice, 'ingester'):
    print("‚ùå ice.ingester attribute missing")
    print("   ‚Üí ICE system not properly initialized")
else:
    print(f"‚úÖ ice.ingester exists: {type(ice.ingester).__name__}")

    if not hasattr(ice.ingester, 'attachment_processor'):
        print("‚ùå attachment_processor attribute missing from ingester")
        print("   ‚Üí This is a code structure issue")
    elif ice.ingester.attachment_processor is None:
        print("‚ùå attachment_processor is None (initialization failed)")
        print("   ‚Üí PDFs will be downloaded but NOT processed/ingested")
        print("\nüîç Why this happens:")
        print("   - ImportError during DoclingProcessor/AttachmentProcessor import")
        print("   - Exception during processor initialization")
        print("\nüìã Check Cell 3 output for:")
        print("   - '‚ö†Ô∏è Attachment processor initialization failed: [error]'")
    else:
        processor_type = type(ice.ingester.attachment_processor).__name__
        print(f"‚úÖ attachment_processor initialized: {processor_type}")
        print("   ‚Üí PDFs WILL be processed and ingested")

        # The class name tells the two processor implementations apart
        if 'Docling' not in processor_type:
            print("   ‚Üí Using AttachmentProcessor (42% table accuracy, PyPDF2)")
        else:
            print("   ‚Üí Using DoclingProcessor (97.9% table accuracy)")

print("=" * 70)

In [None]:
# Cell 3
# Report the state of every LightRAG storage component
print(f"üì¶ LightRAG Storage Architecture Verification")
print(f"‚îÅ" * 40)

# Nothing to verify unless the system exists and its core reports ready
if not ice or not ice.core.is_ready():
    raise RuntimeError("ICE system not ready - cannot verify storage")

# Storage statistics come from the core in a single call
storage_stats = ice.core.get_storage_stats()

print(f"LightRAG Storage Components:")
for label, info in storage_stats['components'].items():
    if info['exists']:
        state = "‚úÖ Initialized"
    else:
        state = "‚ö†Ô∏è Not created yet"
    byte_count = info['size_bytes']
    print(f"  {label}: {state}")
    print(f"    Purpose: {info['description']}")
    print(f"    File: {info['file']}")
    # Size is only worth showing for components that hold data
    if byte_count > 0:
        print(f"    Size: {byte_count / (1024 * 1024):.2f} MB")

total_mb = storage_stats['total_storage_bytes'] / (1024 * 1024)
print(f"\nüìÅ Working Directory: {storage_stats['working_dir']}")
print(f"üóÑÔ∏è Storage Backend: File-based (development mode)")
print(f"üíæ Total Storage: {total_mb:.2f} MB")

In [None]:
# Cell 4
# Show which data sources the ingester can reach
if not ice or not hasattr(ice, 'ingester'):
    raise RuntimeError("Data ingester not initialized")

available_services = ice.ingester.available_services
print(f"\nüì° Data Sources Available: {len(available_services)}")
if available_services:
    for service in available_services:
        print(f"  ‚úÖ {service}")
else:
    # No services configured: explain the fallback and how to enable real data
    print(f"  ‚ö†Ô∏è No APIs configured - will use sample data")
    print(f"  üí° Set NEWSAPI_ORG_API_KEY for real news")
    print(f"  üí° Set ALPHA_VANTAGE_API_KEY for financial data")

# LightRAG needs an OpenAI key; an empty value counts as not configured
openai_configured = bool(os.environ.get('OPENAI_API_KEY'))
openai_state = '‚úÖ Configured' if openai_configured else '‚ùå Required for full functionality'
print(f"\nüîë OpenAI API: {openai_state}")

<!-- ## 2. Workflow Mode Selection & Configuration -->

<!-- ### 🔧 Model Provider Configuration

ICE supports **OpenAI** (paid) or **Ollama** (free local) for LLM and embeddings:

#### Option 1: OpenAI (Default - No setup required)
```bash
export OPENAI_API_KEY="sk-..."
```
- **Cost**: ~$5/month for typical usage
- **Quality**: Highest accuracy for entity extraction and reasoning
- **Setup**: Just set API key

#### Option 2: Ollama (Free Local - Requires setup)
```bash
# Set provider
export LLM_PROVIDER="ollama"

# One-time setup:
ollama serve                      # Start Ollama service
ollama pull qwen3:30b-32k        # Pull LLM model (32k context required)
ollama pull nomic-embed-text      # Pull embedding model
```
- **Cost**: $0/month (completely free)
- **Quality**: Good for most investment analysis tasks
- **Setup**: Requires local Ollama installation and model download

#### Option 3: Hybrid (Recommended for cost-conscious users)
```bash
export LLM_PROVIDER="ollama"           # Use Ollama for LLM
export EMBEDDING_PROVIDER="openai"     # Use OpenAI for embeddings
export OPENAI_API_KEY="sk-..."
```
- **Cost**: ~$2/month (embeddings only)
- **Quality**: Balanced - free LLM with high-quality embeddings

**Current configuration will be logged when you run the next cell.** -->

<!-- ### 📄 Docling Document Processing (Switchable Architecture)

ICE supports **switchable document processing** for SEC filings and email attachments:

#### Docling Integration (Default - Professional-grade table extraction)
```bash
export USE_DOCLING_SEC=true          # SEC filings: 0% → 97.9% table extraction
export USE_DOCLING_EMAIL=true        # Email attachments: 42% → 97.9% accuracy
```
- **Accuracy**: 97.9% table extraction (vs 42% with PyPDF2)
- **Cost**: $0/month (local execution)
- **Setup**: Models auto-download on first use (~500MB, one-time)

#### Original Implementations (For comparison testing)
```bash
export USE_DOCLING_SEC=false         # Use metadata-only SEC extraction
export USE_DOCLING_EMAIL=false       # Use PyPDF2/openpyxl processors
```
- **Purpose**: A/B testing and backward compatibility
- **Use when**: Comparing extraction accuracy or troubleshooting

**Note**: Docling uses IBM's AI models (DocLayNet, TableFormer, Granite-Docling VLM) for professional-grade document parsing. Both implementations coexist - toggle switches instantly without code changes.

**More info**: See `md_files/DOCLING_INTEGRATION_TESTING.md` for detailed testing procedures. -->

<!-- ### 🌐 Crawl4AI URL Fetching (Switchable Architecture)

ICE supports **hybrid URL fetching** for email links with intelligent routing:

#### Smart Routing Strategy (Default - Simple HTTP only)
```bash
export USE_CRAWL4AI_LINKS=false      # Simple HTTP only (fast, free, default)
```
- **Approach**: Use simple HTTP (aiohttp) for all URLs
- **Works for**: DBS research URLs with embedded tokens, direct PDFs, SEC EDGAR
- **Cost**: $0/month (no browser automation)
- **Speed**: <2 seconds per URL average

#### Crawl4AI Hybrid Routing (Enable for complex sites)
```bash
export USE_CRAWL4AI_LINKS=true       # Enable browser automation for complex URLs
export CRAWL4AI_TIMEOUT=60           # Timeout in seconds (default: 60)
export CRAWL4AI_HEADLESS=true        # Headless mode (default: true)
```
- **Routing**: Automatically classifies URLs and uses appropriate method
  - Simple HTTP: DBS URLs, direct file downloads, SEC EDGAR, static content
  - Crawl4AI: Premium research portals (Goldman, Morgan Stanley), JS-heavy IR sites
- **Fallback**: Graceful degradation to simple HTTP if Crawl4AI fails
- **Cost**: CPU cost for browser automation, free tier usage
- **Use when**: Accessing JavaScript-heavy sites or login-required portals

**Note**: DBS research portal URLs work with simple HTTP (embedded auth tokens in `?E=...` parameter) - no browser automation needed.

**More info**: See `md_files/CRAWL4AI_INTEGRATION_PLAN.md` for detailed integration strategy and `CLAUDE.md` Pattern #6 for code examples. -->


In [None]:
# Cell 5
# ### Provider Switching - Uncomment ONE option below, then restart kernel

#############################################################################
#                              Model Selection                              #
#############################################################################

### Option 1: OpenAI ($5/mo, highest quality)
import os

os.environ['LLM_PROVIDER'] = 'openai'
print("‚úÖ Switched to OpenAI")

# ###Option 2: Hybrid ($2/mo, 60% savings, recommended)
# import os; os.environ['LLM_PROVIDER'] = 'ollama'; os.environ['EMBEDDING_PROVIDER'] = 'openai'
# print("‚úÖ Switched to Hybrid")

# ### Option 3: Full Ollama ($0/mo, faster model)
# import os; os.environ['LLM_PROVIDER'] = 'ollama'; os.environ['EMBEDDING_PROVIDER'] = 'ollama'; 
# # os.environ['LLM_MODEL'] = 'llama3.1:8b'
# os.environ['LLM_MODEL'] = 'qwen3:8b'
# # os.environ['LLM_MODEL'] = 'qwen3:14b'
# # os.environ['LLM_MODEL'] = 'qwen3:30b-32k'
# print("‚úÖ Switched to Full Ollama Model.")

<!-- ### 🗑️ Graph Management (Optional)

**When to clear the graph:**
- ✅ Switching to Full Ollama (1536-dim → 768-dim embeddings)
- ✅ Graph corrupted or very old (>30 days without updates)
- ✅ Testing fresh graph builds from scratch

**When NOT to clear:**
- ❌ Just switching LLM provider (OpenAI ↔ Hybrid use same embeddings)
- ❌ Adding new documents (incremental updates work fine)
- ❌ Changing query modes (local, hybrid, etc.)

**How to clear:**
Run the code cell below (uncomment lines to activate) -->

In [None]:
# Cell 6
##########################################################
#                    Check graph info                    #
##########################################################
from updated_architectures.implementation.ice_simplified import create_ice_system
# Build a fresh ICE system for this cell (rebinds the notebook-level `ice`).
ice = create_ice_system()

# Toggle to enable/disable logging output to file
SAVE_PROCESSING_LOG = True  # Set to False to disable logging

if SAVE_PROCESSING_LOG:
    from datetime import datetime
    from pathlib import Path
    import sys
    from io import StringIO

    # Swap stdout/stderr for in-memory buffers, keeping the originals so the
    # end of the cell can restore them and write the captured text to a log.
    # NOTE(review): nothing here wraps the cell in try/finally, so an exception
    # raised before the restore step leaves both streams redirected - confirm
    # whether that is acceptable.
    output_buffer = StringIO()
    original_stdout = sys.stdout
    sys.stdout = output_buffer
    error_buffer = StringIO()
    original_stderr = sys.stderr
    sys.stderr = error_buffer

    # Reconfigure logging handlers to use redirected stderr
    import logging
    # Iterate over a copy of the root handler list; every handler gets its
    # `stream` attribute pointed at the capture buffer.
    for handler in logging.root.handlers[:]:
        handler.stream = sys.stderr  # Now points to error_buffer


##########################################################
from pathlib import Path
import shutil

def check_storage(storage_path):
    """Print the size of each expected LightRAG storage file plus the combined total.

    Args:
        storage_path: ``pathlib.Path`` of the LightRAG working directory.

    Files that are absent are reported as "not found" and add nothing to the
    total. Nothing is returned; the inventory is written to stdout.
    """
    expected_files = (
        'vdb_entities.json',
        'vdb_relationships.json',
        'vdb_chunks.json',
        'graph_chunk_entity_relation.graphml',
    )
    combined_mb = 0
    for filename in expected_files:
        candidate = storage_path / filename
        if not candidate.exists():
            print(f"  ‚ö†Ô∏è  {filename}: not found")
            continue
        file_mb = candidate.stat().st_size / (1024 * 1024)
        print(f"  ‚úÖ {filename}: {file_mb:.2f} MB")
        combined_mb += file_mb
    print(f"  üíæ Total: {combined_mb:.2f} MB")

# Use actual config path instead of hardcoded path to avoid path mismatches
storage_path = Path(ice.config.working_dir)

# Print the per-file storage inventory for that directory.
check_storage(storage_path)

#####################################################################
# NOTE(review): the return value is discarded; presumably this is called for
# whatever it logs or prints as a side effect - confirm, or print the result.
ice.core.get_graph_stats()

##########################################################
#              Graph Health Metrics (P0)                 #
##########################################################

def check_graph_health(storage_path, tickers=None):
    """Check critical graph health metrics (P0 only).

    Reads ``vdb_entities.json`` and ``vdb_relationships.json`` from
    ``storage_path`` and summarizes ticker coverage, graph size and simple
    investment-signal counts. A missing file contributes zeros.

    Args:
        storage_path: Directory (str or Path) containing the storage files.
        tickers: Optional iterable of ticker symbols to look for. Defaults to
            the four portfolio tickers NVDA, TSMC, AMD and ASML.

    Returns:
        dict with keys ``tickers_covered`` (sorted list), ``total_entities``,
        ``total_relationships``, ``buy_signals``, ``sell_signals`` and
        ``price_targets``.

    Raises:
        json.JSONDecodeError: If an existing storage file is not valid JSON.
    """
    import json
    import re
    from pathlib import Path

    default_tickers = ('NVDA', 'TSMC', 'AMD', 'ASML')  # Known portfolio tickers
    chosen = default_tickers if tickers is None else tickers
    watchlist = {str(symbol).upper() for symbol in chosen if symbol}

    def _token_pattern(word):
        # Match `word` only when it is not embedded in a longer alphanumeric
        # run, so 'BUYBACK', 'SELLER' or 'AMDOCS' are not counted as hits.
        return re.compile(rf'(?<![A-Z0-9]){re.escape(word)}(?![A-Z0-9])')

    def _load_records(filename):
        # Return the 'data' list of a storage file, or [] when the file is
        # missing or does not have the expected dict-with-list layout.
        path = Path(storage_path) / filename
        if not path.exists():
            return []
        # JSON is UTF-8; do not rely on the platform's default encoding.
        payload = json.loads(path.read_text(encoding='utf-8'))
        records = payload.get('data', []) if isinstance(payload, dict) else []
        return records if isinstance(records, list) else []

    ticker_patterns = {symbol: _token_pattern(symbol) for symbol in watchlist}
    buy_pattern = _token_pattern('BUY')
    sell_pattern = _token_pattern('SELL')

    result = {
        'tickers_covered': [],
        'total_entities': 0,
        'total_relationships': 0,
        'buy_signals': 0,
        'sell_signals': 0,
        'price_targets': 0
    }

    # Parse entities
    entities = _load_records('vdb_entities.json')
    result['total_entities'] = len(entities)
    covered = set()

    for entity in entities:
        if not isinstance(entity, dict):
            continue
        # Matching is case-insensitive: everything is compared in upper case
        text = f"{entity.get('entity_name', '')} {entity.get('content', '')}".upper()

        # Detect tickers
        covered.update(
            symbol for symbol, pattern in ticker_patterns.items()
            if pattern.search(text)
        )

        # Detect signals
        if buy_pattern.search(text):
            result['buy_signals'] += 1
        if sell_pattern.search(text):
            result['sell_signals'] += 1
        if 'PRICE TARGET' in text or 'PRICE_TARGET' in text:
            result['price_targets'] += 1

    # Parse relationships
    result['total_relationships'] = len(_load_records('vdb_relationships.json'))

    result['tickers_covered'] = sorted(covered)
    return result

def get_extended_graph_stats(storage_path):
    """Get additional graph statistics for comprehensive analysis.

    Reads ``vdb_chunks.json`` from ``storage_path`` and classifies each chunk
    by content markers: a chunk whose content contains ``[TICKER:``,
    ``[RATING:`` or ``[PRICE_TARGET:`` is counted as an email chunk, every
    other chunk as an API/SEC chunk.

    Args:
        storage_path: Directory (str or Path) containing ``vdb_chunks.json``.

    Returns:
        dict with keys ``chunk_count``, ``email_chunks`` and
        ``api_sec_chunks``; all zero when the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    import json
    from pathlib import Path

    email_markers = ('[TICKER:', '[RATING:', '[PRICE_TARGET:')

    stats = {
        'chunk_count': 0,
        'email_chunks': 0,
        'api_sec_chunks': 0
    }

    # Parse chunks
    chunks_file = Path(storage_path) / 'vdb_chunks.json'
    if not chunks_file.exists():
        return stats

    # JSON is UTF-8; do not rely on the platform's default encoding.
    payload = json.loads(chunks_file.read_text(encoding='utf-8'))
    chunks = payload.get('data', []) if isinstance(payload, dict) else []
    if not isinstance(chunks, list):
        chunks = []
    stats['chunk_count'] = len(chunks)

    # Infer source from content markers
    for chunk in chunks:
        content = chunk.get('content') if isinstance(chunk, dict) else None
        # Chunks with missing or non-text content cannot carry the markup and
        # therefore fall into the API/SEC bucket.
        if isinstance(content, str) and any(marker in content for marker in email_markers):
            stats['email_chunks'] += 1
        else:
            stats['api_sec_chunks'] += 1

    return stats

# Collect the health metrics for the storage directory
health = check_graph_health(storage_path)

# Content coverage: which portfolio tickers appear in the graph
covered_tickers = health['tickers_covered']
ticker_label = ', '.join(covered_tickers) if covered_tickers else 'None'
print("\nüß¨ Graph Health Metrics:")
print("  üìä Content Coverage:")
print(f"    Tickers: {ticker_label} ({len(covered_tickers)}/4 portfolio holdings)")

# Graph structure: sizes and average relationships per entity
entity_total = health['total_entities']
relationship_total = health['total_relationships']
print("\n  üï∏Ô∏è Graph Structure:")
print(f"    Total entities: {entity_total:,}")
print(f"    Total relationships: {relationship_total:,}")
if entity_total > 0:
    avg_conn = relationship_total / entity_total
    print(f"    Avg connections: {avg_conn:.2f}")

# Investment signals found in entity text
print("\n  üíº Investment Signals:")
for signal_label, signal_key in (
    ("BUY signals", 'buy_signals'),
    ("SELL signals", 'sell_signals'),
    ("Price targets", 'price_targets'),
):
    print(f"    {signal_label}: {health[signal_key]}")

# Chunk-level statistics are gathered only after the health report is printed
extended_stats = get_extended_graph_stats(storage_path)

print("\n  üì¶ Graph Storage:")
for chunk_label, chunk_key in (
    ("Total chunks", 'chunk_count'),
    ("Email chunks", 'email_chunks'),
    ("API/SEC chunks", 'api_sec_chunks'),
):
    print(f"    {chunk_label}: {extended_stats[chunk_key]:,}")

# Save output to log file if enabled
if SAVE_PROCESSING_LOG:
    import logging

    # Put the real streams back first so everything below is visible again.
    sys.stdout = original_stdout
    sys.stderr = original_stderr

    # The capture step pointed the root logging handlers at error_buffer.
    # Without this they would keep writing into that buffer for the rest of
    # the session. The handlers' earlier streams were not recorded, so they
    # are pointed at the restored stderr.
    for handler in logging.root.handlers:
        if getattr(handler, 'stream', None) is error_buffer:
            handler.stream = original_stderr

    captured_out = output_buffer.getvalue()
    captured_err = error_buffer.getvalue()
    combined = captured_err + captured_out
    print(combined)  # Display in notebook

    # Save to markdown file with timestamp
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')
    log_file = Path(f'logs/graph_processing_log_{timestamp}.md')
    log_file.parent.mkdir(parents=True, exist_ok=True)
    # The captured text contains non-ASCII symbols, so write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    log_file.write_text(
        f'# Graph Processing Log\n'
        f'**Timestamp:** {now.strftime("%Y-%m-%d %H:%M:%S")}\n\n'
        f'```\n{combined}\n```\n',
        encoding='utf-8',
    )
    print(f'\n‚úÖ Log saved: {log_file}')


<!-- ## 🧪 Test: Hybrid Entity Categorization with Qwen2.5-3B

**Purpose**: Compare categorization accuracy between keyword-only and hybrid (keyword+LLM) approaches

**What this tests**:
- Baseline: Fast keyword pattern matching (~1ms per entity)
- Enhanced: Confidence scoring to identify ambiguous cases
- Hybrid: LLM fallback for low-confidence entities (~40ms per entity)

**Prerequisites**:
- ✅ Ollama installed with qwen2.5:3b model (optional - degrades gracefully)
- ✅ LightRAG graph built (previous cells completed)

**Expected runtime**: ~0.5 seconds for 12 sample entities (hybrid mode)

**Configuration**: `src/ice_lightrag/graph_categorization.py` - Change `CATEGORIZATION_MODE` to enable hybrid by default -->

In [None]:
# Cell 7
# Purpose: Test hybrid entity categorization with configurable sampling and model selection
# Location: ice_building_workflow.ipynb Cell 12 (FIXED VERSION)
# Dependencies: graph_categorization.py, entity_categories.py, LightRAG storage

import json
import time
import sys
import random
import requests
from pathlib import Path
from collections import Counter

# Force reload of graph_categorization module to pick up new functions.
# The reload runs before the from-import below, so the names imported there
# come from the freshly reloaded module.
import importlib
if 'src.ice_lightrag.graph_categorization' in sys.modules:
    importlib.reload(sys.modules['src.ice_lightrag.graph_categorization'])

# ===== CONFIGURATION: User-editable settings =====
RANDOM_SEED = 42  # Set to None for different entities each run, or keep 42 for reproducible testing
OLLAMA_MODEL_OVERRIDE = 'qwen2.5:3b'  # Change to use different model (e.g., 'llama3.1:8b', 'qwen3:8b')

# ===== SETUP: Imports with error handling =====
print("=" * 70)
print("üß™ Entity Categorization Test (4 Modes) - FIXED")
print("=" * 70)

# Ensure the project root is on sys.path so the `src.` package imports resolve
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import the four categorization entry points; a failed import prints
# troubleshooting hints and is then re-raised to stop the cell.
try:
    from src.ice_lightrag.graph_categorization import (
        categorize_entity,
        categorize_entity_with_confidence,
        categorize_entity_hybrid,
        categorize_entity_llm_only  # NEW: Pure LLM mode
    )
    from src.ice_lightrag.entity_categories import CATEGORY_DISPLAY_ORDER
    print("‚úÖ Categorization functions imported successfully")
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print("   ‚Üí Ensure previous cells completed successfully")
    print("   ‚Üí Check that src/ice_lightrag/graph_categorization.py exists\n")
    raise

# Patch module constant for Ollama model selection.
# NOTE(review): this assumes the categorization functions read OLLAMA_MODEL
# from the module's globals at call time - confirm against the module.
import src.ice_lightrag.graph_categorization as graph_cat_module
graph_cat_module.OLLAMA_MODEL = OLLAMA_MODEL_OVERRIDE
print(f"üîß Ollama model configured: {OLLAMA_MODEL_OVERRIDE}\n")

# ===== HEALTH CHECK: Ollama service availability =====
def check_ollama_service():
    """Check if Ollama service is running and configured model is available.

    Queries the local Ollama tags endpoint and looks for a model whose name
    equals ``OLLAMA_MODEL_OVERRIDE`` exactly.

    Returns:
        tuple ``(service_running, model_available)``:
        ``(False, False)`` when the request fails or the status is not 200,
        ``(True, False)`` when the service answers but the body cannot be
        interpreted or does not list the model, ``(True, True)`` otherwise.
    """
    # Only the network call is guarded here. In recent requests releases the
    # error raised by Response.json() is itself a RequestException, so parsing
    # must be handled separately or a bad body is reported as "not running".
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
    except requests.RequestException:
        return False, False

    if response.status_code != 200:
        return False, False

    try:
        models = response.json().get('models') or []
        # Use exact match to avoid accepting wrong model versions
        model_available = any(
            m.get('name', '') == OLLAMA_MODEL_OVERRIDE for m in models
        )
    except (ValueError, KeyError, AttributeError, TypeError):
        # The service answered, but not with the expected JSON structure.
        return True, False

    return True, model_available

ollama_running, model_available = check_ollama_service()

if ollama_running and model_available:
    print(f"‚úÖ Ollama service running with {OLLAMA_MODEL_OVERRIDE} model")
elif ollama_running:
    print(f"‚ö†Ô∏è  Ollama running but {OLLAMA_MODEL_OVERRIDE} not found")
    print(f"   ‚Üí Install: ollama pull {OLLAMA_MODEL_OVERRIDE}")
else:
    print("‚ö†Ô∏è  Ollama not running - TEST 3 & 4 will be skipped")
    print("   ‚Üí Start Ollama: brew services start ollama (macOS)\n")

# ===== DATA LOADING: Entities with validation =====
# Note: this rebinds `storage_path` to the entities file itself.
storage_path = Path(ice.config.working_dir) / "vdb_entities.json"
entities_data = {}  # Stays empty unless a valid payload is loaded below

if not storage_path.exists():
    print(f"‚ùå Storage file not found: {storage_path}")
    print("   ‚Üí Run previous cells to build the knowledge graph first\n")
    print("   ‚ö†Ô∏è  Categorization tests will be skipped\n")
else:
    try:
        # JSON is UTF-8; do not rely on the platform's default encoding.
        with open(storage_path, encoding='utf-8') as f:
            loaded_data = json.load(f)
    except Exception as e:
        # Best-effort load: report the problem and continue with no entities.
        print(f"‚ùå Storage error: {e}")
        print("   ‚ö†Ô∏è  Tests will be skipped\n")
    else:
        if isinstance(loaded_data, dict) and 'data' in loaded_data:
            entities_data = loaded_data
        else:
            # Keep entities_data as the empty dict so the .get() below cannot
            # fail on a list or scalar payload.
            print(f"‚ùå Invalid storage format (expected dict with 'data' key)")
            print("   ‚ö†Ô∏è  Invalid storage - tests will be skipped\n")

entities_list = entities_data.get('data', [])

if not isinstance(entities_list, list) or not entities_list:
    print(f"‚ùå No entities found in storage")
    print("   ‚ö†Ô∏è  No data - tests will be skipped\n")
    # Normalize to an empty list so the sampling and test loops below run
    # as no-ops instead of failing on an unexpected type.
    entities_list = []
else:
    print(f"‚úÖ Loaded {len(entities_list)} entities from knowledge graph")

# Configurable random sampling
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)
    sampling_mode = f"Reproducible (seed={RANDOM_SEED}, same entities each run)"
else:
    sampling_mode = "Random (different entities each run)"

# At most 12 entities are sampled for the tests
test_entities = random.sample(entities_list, min(12, len(entities_list)))
print(f"   Sampling mode: {sampling_mode}")
print(f"   Testing with {len(test_entities)} entities")
print(f"   üí° To change: Set RANDOM_SEED = None (line 17) or OLLAMA_MODEL_OVERRIDE (line 18)\n")

# ===== HELPER: Compact result display =====
def display_results(results, title, show_confidence=False, show_llm=False):
    """Print one numbered line per categorization result, then a category tally.

    Args:
        results: Sequence of ``(name, category, confidence, used_llm)`` tuples.
        title: Heading printed above the listing.
        show_confidence: When true, each line also carries a source indicator
            and the confidence score.
        show_llm: When true, rows flagged ``used_llm`` get the LLM indicator;
            every other row gets the default indicator.
    """
    print(f"\n{title}")
    print("-" * 70)

    for position, row in enumerate(results, start=1):
        entity_name, label, score, via_llm = row

        # Names longer than 40 characters are cut and marked with an ellipsis
        shown = entity_name if len(entity_name) <= 40 else entity_name[:40] + "..."
        marker = "ü§ñ" if (show_llm and via_llm) else "‚ö°"

        if not show_confidence:
            print(f"{position:2d}. {shown:45s} ‚Üí {label}")
        else:
            print(f"{position:2d}. {marker} {shown:43s} ‚Üí {label:20s} (conf: {score:.2f})")

    tally = Counter(label for _, label, _, _ in results)
    print(f"\nüìä Distribution: {dict(tally)}")

# ===== TEST 1: Keyword-Only Baseline =====
print("\n" + "=" * 70)
print("TEST 1: Keyword-Only Categorization (Baseline)")
print("=" * 70)

start_time = time.time()
keyword_results = []

# Keyword matching yields no score, so each row is recorded with a fixed
# confidence of 1.0 and used_llm=False to fit display_results' row shape.
for entity in test_entities:
    name = entity.get('entity_name', '')
    content = entity.get('content', '')
    category = categorize_entity(name, content)
    keyword_results.append((name, category, 1.0, False))

elapsed = time.time() - start_time
display_results(keyword_results, "Results (Keyword Matching):", show_confidence=False)
if test_entities:
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_entities):.1f}ms per entity)")
else:
    # Per-entity timing is undefined without entities (avoids dividing by zero)
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no entities to test)")

# ===== TEST 2: Confidence Scoring Analysis =====
print("\n" + "=" * 70)
print("TEST 2: Keyword + Confidence Scoring")
print("=" * 70)

start_time = time.time()
confidence_results = []

for entity in test_entities:
    name = entity.get('entity_name', '')
    content = entity.get('content', '')
    category, confidence = categorize_entity_with_confidence(name, content)
    confidence_results.append((name, category, confidence, False))

elapsed = time.time() - start_time
display_results(confidence_results, "Results (with confidence scores):", show_confidence=True)

# Entities scoring below 0.70 are the ones hybrid mode hands to the LLM
low_confidence = [(n, c, conf) for n, c, conf, _ in confidence_results if conf < 0.70]

if low_confidence:
    print(f"\nüîç Ambiguous entities (confidence < 0.70): {len(low_confidence)}")
    for name, cat, conf in low_confidence:
        print(f"   - {name[:50]:50s} ‚Üí {cat:20s} (conf: {conf:.2f})")
else:
    print(f"\n‚úÖ All entities have high confidence (‚â•0.70) - no LLM fallback needed")

if test_entities:
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_entities):.1f}ms per entity)")
else:
    # Per-entity timing is undefined without entities (avoids dividing by zero)
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no entities to test)")

# ===== TEST 3: Hybrid Mode (if Ollama available) =====
if ollama_running and model_available:
    print("\n" + "=" * 70)
    print("TEST 3: Hybrid Categorization (Keyword + LLM Fallback)")
    print("=" * 70)
    print("‚è±Ô∏è  Note: LLM calls may take 5-10 seconds total...\n")

    start_time = time.time()
    hybrid_results = []
    llm_call_count = 0

    for entity in test_entities:
        name = entity.get('entity_name', '')
        content = entity.get('content', '')

        keyword_cat, keyword_conf = categorize_entity_with_confidence(name, content)

        if keyword_conf >= 0.70:
            # Confident keyword match: keep it and skip the hybrid path
            category, confidence = keyword_cat, keyword_conf
            used_llm = False
        else:
            category, confidence = categorize_entity_hybrid(name, content, confidence_threshold=0.70)
            # NOTE(review): an LLM answer is inferred from a returned
            # confidence of exactly 0.90 - confirm that the hybrid function
            # really uses this value to mark LLM results.
            used_llm = (confidence == 0.90)
            if used_llm:
                llm_call_count += 1

        hybrid_results.append((name, category, confidence, used_llm))

    elapsed = time.time() - start_time
    display_results(hybrid_results, "Results (Hybrid mode - keyword + LLM):", show_confidence=True, show_llm=True)

    if test_entities:
        print(f"\nü§ñ LLM calls: {llm_call_count}/{len(test_entities)} ({100*llm_call_count/len(test_entities):.1f}%)")
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_entities):.1f}ms per entity)")
    else:
        # Percentages and per-entity timing are undefined without entities
        print(f"\nü§ñ LLM calls: 0/0 (no entities to test)")
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no entities to test)")

    # ===== COMPARISON SUMMARY =====
    print("\n" + "=" * 70)
    print("üìä COMPARISON SUMMARY (Keyword vs Hybrid)")
    print("=" * 70)

    # Both result lists follow test_entities order, so they pair up by position
    changes = sum(
        1 for kw_row, hyb_row in zip(keyword_results, hybrid_results)
        if kw_row[1] != hyb_row[1]
    )

    if test_entities:
        print(f"Entities recategorized by LLM: {changes}/{len(test_entities)} ({100*changes/len(test_entities):.1f}%)")
    else:
        print(f"No entities - skipping recategorization analysis")

    if changes > 0:
        print("\nRecategorization details:")
        for entity, kw_row, hyb_row in zip(test_entities, keyword_results, hybrid_results):
            kw_cat = kw_row[1]
            hyb_cat = hyb_row[1]
            if kw_cat != hyb_cat:
                name = entity.get('entity_name', '')[:50]
                print(f"   - {name:50s}: {kw_cat:20s} ‚Üí {hyb_cat}")

    print(f"\n‚úÖ Hybrid categorization complete!")
else:
    print("\n" + "=" * 70)
    print("‚ö†Ô∏è  TEST 3 SKIPPED: Ollama not available")
    print("=" * 70)
    print("To enable hybrid mode:")
    print("   1. Install Ollama: https://ollama.com")
    print(f"   2. Pull model: ollama pull {OLLAMA_MODEL_OVERRIDE}")
    print("   3. Start service: brew services start ollama (macOS)")
    print("   4. Re-run this cell")

# ===== TEST 4: Pure LLM Mode (NEW) =====
if ollama_running and model_available:
    print("\n" + "=" * 70)
    print("TEST 4: Pure LLM Categorization (All entities)")
    print("=" * 70)
    print("‚è±Ô∏è  Note: This will call LLM for ALL entities (may take 30-60 seconds)...\n")

    start_time = time.time()
    llm_results = []

    # Every sampled entity goes through the LLM-only categorizer
    for entity in test_entities:
        name = entity.get('entity_name', '')
        content = entity.get('content', '')
        category, confidence = categorize_entity_llm_only(name, content)
        llm_results.append((name, category, confidence, True))

    elapsed = time.time() - start_time
    display_results(llm_results, "Results (Pure LLM mode):", show_confidence=True, show_llm=True)

    print(f"\nü§ñ LLM calls: {len(test_entities)}/{len(test_entities)} (100%)")
    if test_entities:
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_entities):.1f}ms per entity)")
    else:
        # Per-entity timing is undefined without entities (avoids dividing by zero)
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no entities to test)")

    # Compare keyword vs pure LLM; both lists follow test_entities order
    llm_changes = sum(
        1 for kw_row, llm_row in zip(keyword_results, llm_results)
        if kw_row[1] != llm_row[1]
    )

    print("\n" + "=" * 70)
    print("üìä COMPARISON SUMMARY (Keyword vs Pure LLM)")
    print("=" * 70)
    if test_entities:
        print(f"Entities recategorized by pure LLM: {llm_changes}/{len(test_entities)} ({100*llm_changes/len(test_entities):.1f}%)")
    else:
        print(f"No entities - skipping recategorization analysis")

    if llm_changes > 0:
        print("\nRecategorization details:")
        for entity, kw_row, llm_row in zip(test_entities, keyword_results, llm_results):
            kw_cat = kw_row[1]
            llm_cat = llm_row[1]
            if kw_cat != llm_cat:
                name = entity.get('entity_name', '')[:50]
                print(f"   - {name:50s}: {kw_cat:20s} ‚Üí {llm_cat}")

    print(f"\n‚úÖ Pure LLM categorization complete!")
else:
    print("\n" + "=" * 70)
    print("‚ö†Ô∏è  TEST 4 SKIPPED: Ollama not available")
    print("=" * 70)
    print("To enable pure LLM mode:")
    print("   1. Install Ollama: https://ollama.com")
    print(f"   2. Pull model: ollama pull {OLLAMA_MODEL_OVERRIDE}")
    print("   3. Start service: brew services start ollama (macOS)")
    print("   4. Re-run this cell")

print("\n" + "=" * 70)
print("üéØ TESTING COMPLETE - 4 Categorization Modes Evaluated")
print("=" * 70)

In [None]:
# Cell 8
# Purpose: Test relationship categorization with configurable sampling and model selection
# Location: ice_building_workflow.ipynb (FIXED VERSION)
# Dependencies: graph_categorization.py, relationship_categories.py, LightRAG storage

import json
import time
import sys
import random
import requests
from pathlib import Path
from collections import Counter

# Reload graph_categorization when an earlier cell already imported it, so the
# imports further down bind to the current version of its functions.
import importlib
_GRAPH_CAT_MODULE_NAME = 'src.ice_lightrag.graph_categorization'
if _GRAPH_CAT_MODULE_NAME in sys.modules:
    importlib.reload(sys.modules[_GRAPH_CAT_MODULE_NAME])

# ===== CONFIGURATION: User-editable settings =====
RANDOM_SEED = 42  # Set to None for different relationships each run, or keep 42 for reproducible testing
OLLAMA_MODEL_OVERRIDE = 'qwen2.5:3b'  # Change to use different model (e.g., 'llama3.1:8b', 'qwen3:8b')

# ===== SETUP: Imports with error handling =====
_BANNER_RULE = "=" * 70
print(_BANNER_RULE)
print("üß™ Relationship Categorization Test (4 Modes) - FIXED")
print(_BANNER_RULE)

# The notebook's working directory must be importable for the `src.` package.
project_root = Path.cwd()
_project_root_str = str(project_root)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

try:
    from src.ice_lightrag.graph_categorization import (
        categorize_relationship,
        categorize_relationship_with_confidence,
        categorize_relationship_hybrid,
        categorize_relationship_llm_only
    )
    from src.ice_lightrag.relationship_categories import CATEGORY_DISPLAY_ORDER, extract_relationship_types
except ImportError as import_error:
    # Explain the likely cause, then re-raise so the cell stops here.
    print(f"‚ùå Import error: {import_error}")
    print("   ‚Üí Ensure previous cells completed successfully")
    print("   ‚Üí Check that src/ice_lightrag/graph_categorization.py exists\n")
    raise
else:
    print("‚úÖ Categorization functions imported successfully")

# Override the module-level model constant with the model chosen above.
import src.ice_lightrag.graph_categorization as graph_cat_module
graph_cat_module.OLLAMA_MODEL = OLLAMA_MODEL_OVERRIDE
print(f"üîß Ollama model configured: {OLLAMA_MODEL_OVERRIDE}\n")

# ===== HEALTH CHECK: Ollama service availability =====
def check_ollama_service():
    """Check whether Ollama is reachable and the configured model is installed.

    Queries ``http://localhost:11434/api/tags`` with a 2 second timeout and
    looks for a model whose ``name`` equals ``OLLAMA_MODEL_OVERRIDE`` exactly.

    Returns:
        tuple[bool, bool]: ``(service_running, model_available)``.
        ``(False, False)`` when the request fails or the status is not 200;
        ``(True, False)`` when the service answered but the payload could not
        be parsed or does not list the model.
    """
    # Network/transport failures only. The body is parsed in a separate step
    # because requests' JSONDecodeError subclasses RequestException: handling
    # both in one try block reports a malformed body as "service not running".
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
    except requests.RequestException:
        return False, False

    if response.status_code != 200:
        return False, False

    try:
        models = response.json().get('models', [])
        # Use exact match to avoid accepting wrong model versions
        model_available = any(m.get('name', '') == OLLAMA_MODEL_OVERRIDE for m in models)
    except (ValueError, KeyError, AttributeError, TypeError):
        # ValueError covers JSON decode errors; AttributeError/TypeError cover
        # a payload that is not a dict holding a list of dicts.
        return True, False
    return True, model_available

ollama_running, model_available = check_ollama_service()

# Report the service status; TEST 3 and TEST 4 run only when both flags are true.
if not ollama_running:
    print("‚ö†Ô∏è  Ollama not running - TEST 3 & 4 will be skipped")
    print("   ‚Üí Start Ollama: brew services start ollama (macOS)\n")
elif not model_available:
    print(f"‚ö†Ô∏è  Ollama running but {OLLAMA_MODEL_OVERRIDE} not found")
    print(f"   ‚Üí Install: ollama pull {OLLAMA_MODEL_OVERRIDE}")
else:
    print(f"‚úÖ Ollama service running with {OLLAMA_MODEL_OVERRIDE} model")

# ===== DATA LOADING: Relationships with validation =====
# Loads vdb_relationships.json from the ICE working directory and normalises
# it to a plain list, so every test below can iterate it safely even when the
# file is missing, unreadable, or has an unexpected shape.
storage_path = Path(ice.config.working_dir) / "vdb_relationships.json"
relationships_data = {}  # Initialize to prevent NameError if file missing or error occurs

if not storage_path.exists():
    print(f"‚ùå Storage file not found: {storage_path}")
    print("   ‚Üí Run previous cells to build the knowledge graph first\n")
    print("   ‚ö†Ô∏è  Categorization tests will be skipped\n")
else:
    try:
        # Explicit encoding: JSON is UTF-8 regardless of the platform default.
        with open(storage_path, encoding="utf-8") as f:
            relationships_data = json.load(f)

        if not isinstance(relationships_data, dict) or 'data' not in relationships_data:
            print(f"‚ùå Invalid storage format (expected dict with 'data' key)")
            print("   ‚ö†Ô∏è  Invalid storage - tests will be skipped\n")
    except Exception as e:
        # Best-effort load at the notebook boundary: report and continue with no data.
        print(f"‚ùå Storage error: {e}")
        print("   ‚ö†Ô∏è  Tests will be skipped\n")

# A JSON root that is not a dict, or a 'data' value that is not a list, becomes
# an empty list instead of raising AttributeError/TypeError further down.
if isinstance(relationships_data, dict):
    relationships_list = relationships_data.get('data', [])
else:
    relationships_list = []
if not isinstance(relationships_list, list):
    relationships_list = []

if relationships_list:
    print(f"‚úÖ Loaded {len(relationships_list)} relationships from knowledge graph")
else:
    # Only one of the two outcomes is reported; the success line is no longer
    # printed after a failure.
    print(f"‚ùå No relationships found in storage")
    print("   ‚ö†Ô∏è  No data - tests will be skipped\n")

# Configurable random sampling
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)
    sampling_mode = f"Reproducible (seed={RANDOM_SEED}, same relationships each run)"
else:
    sampling_mode = "Random (different relationships each run)"

# At most 12 relationships; an empty list yields an empty sample, which the
# tests below handle through their own emptiness checks.
test_relationships = random.sample(relationships_list, min(12, len(relationships_list)))
print(f"   Sampling mode: {sampling_mode}")
print(f"   Testing with {len(test_relationships)} relationships")
print(f"   üí° To change: Set RANDOM_SEED = None (line 17) or OLLAMA_MODEL_OVERRIDE (line 18)\n")

# ===== HELPER: Compact result display =====
def display_results(results, title, show_confidence=False, show_llm=False):
    """Print a numbered, one-line-per-item summary of categorization results.

    Args:
        results: Sequence of ``(name, category, confidence, used_llm)`` tuples.
        title: Heading printed above the rows.
        show_confidence: When true, each row also shows the indicator icon, a
            20-character category column and the confidence to two decimals.
        show_llm: When true, rows whose ``used_llm`` flag is set get the LLM
            icon; every other row gets the keyword icon.

    After the rows, prints a count of results per category.
    """
    print(f"\n{title}")
    print("-" * 70)

    for position, row in enumerate(results, 1):
        name, category, confidence, used_llm = row

        # Names longer than 40 characters are cut and marked with an ellipsis.
        display_name = name
        if len(name) > 40:
            display_name = name[:40] + "..."

        indicator = "ü§ñ" if (show_llm and used_llm) else "‚ö°"

        if not show_confidence:
            print(f"{position:2d}. {display_name:45s} ‚Üí {category}")
            continue
        print(f"{position:2d}. {indicator} {display_name:43s} ‚Üí {category:20s} (conf: {confidence:.2f})")

    category_counts = Counter(row[1] for row in results)
    print(f"\nüìä Distribution: {dict(category_counts)}")

# ===== TEST 1: Keyword-Only Baseline =====
# Categorizes every sampled relationship with keyword matching only. The
# resulting `keyword_results` list is the baseline the later tests compare to.
print("\n" + "=" * 70)
print("TEST 1: Keyword-Only Categorization (Baseline)")
print("=" * 70)

start_time = time.time()
keyword_results = []

for relationship in test_relationships:
    content = relationship.get('content', '')
    rel_types = extract_relationship_types(content)
    # Fall back to a 50-character content preview when no types were extracted.
    display_name = rel_types if rel_types else content[:50]
    category = categorize_relationship(content)
    # This mode produces no score: confidence is fixed at 1.0, used_llm at False.
    keyword_results.append((display_name, category, 1.0, False))

elapsed = time.time() - start_time
display_results(keyword_results, "Results (Keyword Matching):", show_confidence=False)
if test_relationships:
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_relationships):.1f}ms per relationship)")
else:
    # Fixed: "\\n" printed a literal backslash-n instead of a blank line.
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no relationships to test)")

# ===== TEST 2: Confidence Scoring Analysis =====
# Same keyword categorization as TEST 1, but also records the confidence score
# and lists every relationship scoring below 0.70.
print("\n" + "=" * 70)
print("TEST 2: Keyword + Confidence Scoring")
print("=" * 70)

start_time = time.time()
confidence_results = []

for relationship in test_relationships:
    content = relationship.get('content', '')
    rel_types = extract_relationship_types(content)
    # Fall back to a 50-character content preview when no types were extracted.
    display_name = rel_types if rel_types else content[:50]
    category, confidence = categorize_relationship_with_confidence(content)
    confidence_results.append((display_name, category, confidence, False))

elapsed = time.time() - start_time
display_results(confidence_results, "Results (with confidence scores):", show_confidence=True)

# 0.70 is the same threshold TEST 3 uses to decide on the LLM fallback.
low_confidence = [(n, c, conf) for n, c, conf, _ in confidence_results if conf < 0.70]

if low_confidence:
    print(f"\nüîç Ambiguous relationships (confidence < 0.70): {len(low_confidence)}")
    for name, cat, conf in low_confidence:
        print(f"   - {name[:50]:50s} ‚Üí {cat:20s} (conf: {conf:.2f})")
else:
    print(f"\n‚úÖ All relationships have high confidence (‚â•0.70) - no LLM fallback needed")

if test_relationships:
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_relationships):.1f}ms per relationship)")
else:
    # Fixed: "\\n" printed a literal backslash-n instead of a blank line.
    print(f"\n‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no relationships to test)")

# ===== TEST 3: Hybrid Mode (if Ollama available) =====
# Keeps the keyword result when its confidence is at least 0.70 and otherwise
# calls the hybrid categorizer, then compares the outcome with TEST 1.
if ollama_running and model_available:
    print("\n" + "=" * 70)
    print("TEST 3: Hybrid Categorization (Keyword + LLM Fallback)")
    print("=" * 70)
    print("‚è±Ô∏è  Note: LLM calls may take 5-10 seconds total...\n")

    start_time = time.time()
    hybrid_results = []
    llm_call_count = 0

    for relationship in test_relationships:
        content = relationship.get('content', '')
        rel_types = extract_relationship_types(content)
        # Fall back to a 50-character content preview when no types were extracted.
        display_name = rel_types if rel_types else content[:50]

        keyword_cat, keyword_conf = categorize_relationship_with_confidence(content)

        if keyword_conf >= 0.70:
            category, confidence = keyword_cat, keyword_conf
            used_llm = False
        else:
            category, confidence = categorize_relationship_hybrid(content, confidence_threshold=0.70)
            # NOTE(review): presumably the hybrid function reports exactly 0.90
            # for LLM-derived results - confirm against graph_categorization.py.
            used_llm = (confidence == 0.90)
            if used_llm:
                llm_call_count += 1

        hybrid_results.append((display_name, category, confidence, used_llm))

    elapsed = time.time() - start_time
    display_results(hybrid_results, "Results (Hybrid mode - keyword + LLM):", show_confidence=True, show_llm=True)

    if test_relationships:
        print(f"\nü§ñ LLM calls: {llm_call_count}/{len(test_relationships)} ({100*llm_call_count/len(test_relationships):.1f}%)")
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_relationships):.1f}ms per relationship)")
    else:
        # Fixed: both lines carried "\\n", which printed a literal backslash-n.
        # The time line matches its non-empty sibling and has no leading newline.
        print(f"\nü§ñ LLM calls: 0/0 (no relationships to test)")
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no relationships to test)")

    # ===== COMPARISON SUMMARY =====
    print("\n" + "=" * 70)
    print("üìä COMPARISON SUMMARY (Keyword vs Hybrid)")
    print("=" * 70)

    # Both lists were built from test_relationships in the same order, so the
    # rows pair up positionally; index 1 of each row is the category.
    changes = sum(
        1 for kw_row, hyb_row in zip(keyword_results, hybrid_results)
        if kw_row[1] != hyb_row[1]
    )

    if test_relationships:
        print(f"Relationships recategorized by LLM: {changes}/{len(test_relationships)} ({100*changes/len(test_relationships):.1f}%)")
    else:
        print(f"No relationships - skipping recategorization analysis")

    if changes > 0:
        print("\nRecategorization details:")
        for kw_row, hyb_row in zip(keyword_results, hybrid_results):
            kw_cat = kw_row[1]
            hyb_cat = hyb_row[1]
            if kw_cat != hyb_cat:
                name = kw_row[0][:50]
                print(f"   - {name:50s}: {kw_cat:20s} ‚Üí {hyb_cat}")

    print(f"\n‚úÖ Hybrid categorization complete!")
else:
    print("\n" + "=" * 70)
    print("‚ö†Ô∏è  TEST 3 SKIPPED: Ollama not available")
    print("=" * 70)
    print("To enable hybrid mode:")
    print("   1. Install Ollama: https://ollama.com")
    print(f"   2. Pull model: ollama pull {OLLAMA_MODEL_OVERRIDE}")
    print("   3. Start service: brew services start ollama (macOS)")
    print("   4. Re-run this cell")

# ===== TEST 4: Pure LLM Mode (NEW) =====
# Sends every sampled relationship to the LLM-only categorizer and compares
# the outcome with the keyword baseline from TEST 1.
if ollama_running and model_available:
    print("\n" + "=" * 70)
    print("TEST 4: Pure LLM Categorization (All relationships)")
    print("=" * 70)
    print("‚è±Ô∏è  Note: This will call LLM for ALL relationships (may take 30-60 seconds)...\n")

    start_time = time.time()
    llm_results = []

    for relationship in test_relationships:
        content = relationship.get('content', '')
        rel_types = extract_relationship_types(content)
        # Fall back to a 50-character content preview when no types were extracted.
        display_name = rel_types if rel_types else content[:50]
        category, confidence = categorize_relationship_llm_only(content)
        # Every row in this mode is recorded as LLM-derived.
        llm_results.append((display_name, category, confidence, True))

    elapsed = time.time() - start_time
    display_results(llm_results, "Results (Pure LLM mode):", show_confidence=True, show_llm=True)

    print(f"\nü§ñ LLM calls: {len(test_relationships)}/{len(test_relationships)} (100%)")
    if test_relationships:
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms ({elapsed*1000/len(test_relationships):.1f}ms per relationship)")
    else:
        # Fixed: "\\n" printed a literal backslash-n. This line matches its
        # non-empty sibling and has no leading newline.
        print(f"‚è±Ô∏è  Time: {elapsed*1000:.1f}ms (no relationships to test)")

    # Compare keyword vs pure LLM. Both lists follow test_relationships order,
    # so rows pair up positionally; index 1 of each row is the category.
    llm_changes = sum(
        1 for kw_row, llm_row in zip(keyword_results, llm_results)
        if kw_row[1] != llm_row[1]
    )

    print("\n" + "=" * 70)
    print("üìä COMPARISON SUMMARY (Keyword vs Pure LLM)")
    print("=" * 70)
    if test_relationships:
        print(f"Relationships recategorized by pure LLM: {llm_changes}/{len(test_relationships)} ({100*llm_changes/len(test_relationships):.1f}%)")
    else:
        print(f"No relationships - skipping recategorization analysis")

    if llm_changes > 0:
        print("\nRecategorization details:")
        for kw_row, llm_row in zip(keyword_results, llm_results):
            kw_cat = kw_row[1]
            llm_cat = llm_row[1]
            if kw_cat != llm_cat:
                name = kw_row[0][:50]
                print(f"   - {name:50s}: {kw_cat:20s} ‚Üí {llm_cat}")

    print(f"\n‚úÖ Pure LLM categorization complete!")
else:
    print("\n" + "=" * 70)
    print("‚ö†Ô∏è  TEST 4 SKIPPED: Ollama not available")
    print("=" * 70)
    print("To enable pure LLM mode:")
    print("   1. Install Ollama: https://ollama.com")
    print(f"   2. Pull model: ollama pull {OLLAMA_MODEL_OVERRIDE}")
    print("   3. Start service: brew services start ollama (macOS)")
    print("   4. Re-run this cell")

print("\n" + "=" * 70)
print("üéØ TESTING COMPLETE - 4 Categorization Modes Evaluated")
print("=" * 70)

In [1]:
# ### Cell 9
# ## Clear graph storage with clear kernel restart instructions
# ## This is the SAFE and ROBUST approach

# from pathlib import Path
# import shutil

# from updated_architectures.implementation.ice_simplified import create_ice_system
# ice = create_ice_system()

# ##########################################################
# #                  Clear Graph Storage                   #
# ##########################################################

# # FIX: Reset storage_path to directory (Cell 12 redefined it to file)
# storage_path = Path(ice.config.working_dir)

# if storage_path.exists():
#     print("üìä PRE-DELETION CHECK")

#     # Show what will be deleted
#     files = list(storage_path.glob("*"))
#     total_size = sum(f.stat().st_size for f in files if f.is_file())
#     print(f"   Files: {len(files)}")
#     print(f"   Total size: {total_size / (1024*1024):.2f} MB")

#     # Delete storage
#     shutil.rmtree(storage_path)  # Deletes directory + all contents
#     storage_path.mkdir(parents=True, exist_ok=True)  # Re-create empty directory

#     print("\n‚úÖ POST-DELETION CHECK")
#     print(f"   Storage cleared: {storage_path}")
#     print(f"   Files remaining: {len(list(storage_path.glob('*')))}")

#     # CRITICAL WARNING
#     print("\n" + "=" * 70)
#     print("‚ö†Ô∏è  CRITICAL: KERNEL RESTART REQUIRED")
#     print("=" * 70)
#     print("\nWhy restart is needed:")
#     print("  ‚Ä¢ LightRAG maintains document IDs in Python memory")
#     print("  ‚Ä¢ Clearing disk files does NOT clear memory state")
#     print("  ‚Ä¢ Without restart: Documents will be rejected as 'duplicates'")
#     print("  ‚Ä¢ Result: Empty graph even after re-running ingestion")

#     print("\nüìã Next Steps:")
#     print("  1. Jupyter Menu ‚Üí Kernel ‚Üí Restart Kernel")
#     print("  2. Re-run Cell 3 (Initialize ICE System)")
#     print("  3. Re-run Cell 27 (Data Ingestion)")
#     print("  4. Verify Cell 28 shows documents > 0")

#     print("\nüí° Alternative (if you don't want to restart):")
#     print("  ‚Ä¢ Just run Cell 27 with REBUILD_GRAPH = False")
#     print("  ‚Ä¢ This skips rebuilding and uses existing graph")
#     print("  ‚Ä¢ Use this for query testing without re-ingesting data")

#     print("\n" + "=" * 70)
#     print("‚úÖ Graph cleared - RESTART KERNEL before rebuilding")
#     print("=" * 70)
# else:
#     print("‚ö†Ô∏è  Storage path doesn't exist - nothing to clear")


INFO:ice_data_ingestion.ice_integration:ICE LightRAG system initialized successfully


‚úÖ LightRAG successfully imported!


INFO:ice_data_ingestion.ice_integration:ICE LightRAG system initialized successfully
INFO:updated_architectures.implementation.ice_simplified:ICE Core initializing with ICESystemManager orchestration
INFO:src.ice_core.ice_system_manager:ICE System Manager initialized with working_dir: ice_lightrag/storage
INFO:updated_architectures.implementation.ice_simplified:‚úÖ ICESystemManager initialized successfully
INFO:imap_email_ingestion_pipeline.entity_extractor:spaCy model loaded successfully
INFO:updated_architectures.implementation.data_ingestion:‚úÖ TickerValidator initialized (false positive filtering)
INFO:src.ice_docling.docling_processor:DoclingProcessor initialized: storage=/Users/royyeo/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/Capstone Project/data/attachments
INFO:updated_architectures.implementation.data_ingestion:‚úÖ DoclingProcessor initialized (97.9% table accuracy)
INFO:updated_architectures.implementation.data_ingestion:‚úÖ BenzingaClient initialized (rea

üìä PRE-DELETION CHECK
   Files: 0
   Total size: 0.00 MB

‚úÖ POST-DELETION CHECK
   Storage cleared: ice_lightrag/storage
   Files remaining: 0

‚ö†Ô∏è  CRITICAL: KERNEL RESTART REQUIRED

Why restart is needed:
  ‚Ä¢ LightRAG maintains document IDs in Python memory
  ‚Ä¢ Clearing disk files does NOT clear memory state
  ‚Ä¢ Without restart: Documents will be rejected as 'duplicates'
  ‚Ä¢ Result: Empty graph even after re-running ingestion

üìã Next Steps:
  1. Jupyter Menu ‚Üí Kernel ‚Üí Restart Kernel
  2. Re-run Cell 3 (Initialize ICE System)
  3. Re-run Cell 27 (Data Ingestion)
  4. Verify Cell 28 shows documents > 0

üí° Alternative (if you don't want to restart):
  ‚Ä¢ Just run Cell 27 with REBUILD_GRAPH = False
  ‚Ä¢ This skips rebuilding and uses existing graph
  ‚Ä¢ Use this for query testing without re-ingesting data

‚úÖ Graph cleared - RESTART KERNEL before rebuilding


In [None]:
# Cell 10
# Portfolio configuration
# Loads the holdings CSV, validates it has a non-empty 'ticker' column, and
# prints a short summary. `portfolio_df` and `holdings` are left in the
# notebook namespace for later cells.
import pandas as pd

# portfolio_df = pd.read_csv('portfolio_holdings.csv')
# Single source of truth for the path, so the summary below reports the file
# that was actually read.
portfolio_csv_path = 'portfolio_holdings_folder/portfolio_holdings_diversified_10.csv'
portfolio_df = pd.read_csv(portfolio_csv_path)

# Basic validation
if portfolio_df.empty:
    raise ValueError("Portfolio CSV is empty")
if 'ticker' not in portfolio_df.columns:
    raise ValueError("CSV must have 'ticker' column")

holdings = portfolio_df['ticker'].tolist()

# Only 'ticker' is validated above, so 'sector' may be absent; report 'N/A'
# instead of raising KeyError. Only the first row's sector is shown.
if 'sector' in portfolio_df.columns:
    first_sector = portfolio_df['sector'].iloc[0]
else:
    first_sector = 'N/A'

print(f"üéØ Portfolio Configuration")
print(f"‚îÅ" * 40)
# map(str, ...) keeps the join from failing on non-string cells (e.g. NaN).
print(f"Holdings: {', '.join(map(str, holdings))} ({len(holdings)} stocks)")
print(f"Sector: {first_sector}")
print(f"Data Range: 2 years historical (editable in Cell 21)")
print(f"üìÑ Source: {portfolio_csv_path}")

<!-- ## 3. Data Ingestion & Processing -->

In [None]:
# Cell 11 - Clear data/attachments for testing (OPTIONAL)
# Safe, single-line command to clear attachments folder for processor testing
# This preserves the main directory structure while removing subdirectories


# Safe preview: list the subdirectories a clear would remove, deleting nothing.
attachments_root = 'data/attachments'
if not os.path.exists(attachments_root):
    print("üìÅ data/attachments/ does not exist yet")
else:
    subdirs = [
        entry for entry in os.listdir(attachments_root)
        if os.path.isdir(os.path.join(attachments_root, entry))
    ]
    if not subdirs:
        print("‚ú® data/attachments/ is already empty")
    else:
        print(f"üìÅ Found {len(subdirs)} subdirectories in data/attachments/:")
        # Show at most five names, then summarise the remainder.
        for subdir in subdirs[:5]:
            print(f"   - {subdir}")
        if len(subdirs) > 5:
            print(f"   ... and {len(subdirs) - 5} more")
    
    
#######################################################################    
### CLEAR COMMAND (uncomment to use):
# import shutil, os; [shutil.rmtree(os.path.join('data/attachments', d)) for d in os.listdir('data/attachments') if os.path.isdir(os.path.join('data/attachments', d))] if os.path.exists('data/attachments') else None; print(f"‚úÖ Cleared {len([d for d in os.listdir('data/attachments') if os.path.isdir(os.path.join('data/attachments', d))]) if os.path.exists('data/attachments') else 0} subdirectories from data/attachments/") if os.path.exists('data/attachments') else print("üìÅ data/attachments/ does not exist")


In [None]:
# Cell 12
# Prints the current graph size and, when an ingestion result is present in
# the notebook namespace, the sources and document count it reported.
print("\nüìä Data Source Summary")
print("=" * 50)

# Show ACTUAL metrics if available (not fake percentages)
if ice and ice.core.is_ready():
    storage_stats = ice.core.get_storage_stats()
    print(f"üíæ Current Graph Size: {storage_stats['total_storage_bytes'] / (1024*1024):.2f} MB")

    # Show real source info if ingestion has run
    if 'ingestion_result' in locals() and 'metrics' in ingestion_result:
        metrics = ingestion_result['metrics']
        if 'data_sources_used' in metrics:
            print(f"‚úÖ Active sources: {', '.join(metrics['data_sources_used'])}")
        print(f"üìÑ Total documents: {ingestion_result.get('total_documents', 0)}")
    else:
        print("‚ÑπÔ∏è Data source metrics available after ingestion completes")
else:
    print("‚ö†Ô∏è Knowledge graph not ready")

# Kept as the cell's last expression so the notebook renders the returned
# stats. Guarded like the summary above: a falsy `ice` no longer raises
# AttributeError right after "Knowledge graph not ready" was printed.
ice.core.get_graph_stats() if ice else None

<!-- ## 3b. Data Source Contribution Visualization (Week 5) -->

In [None]:
# Cell 13
# Static overview of the data sources feeding the knowledge graph. The lines
# are kept in one tuple and printed in order, one print call per entry.
_overview_lines = (
    "üìä ICE Data Sources Summary (Phase 1 Integration)",
    "=" * 60,
    "\n‚ÑπÔ∏è  Phase 1 focuses on architecture and data flow patterns.",
    "Actual data ingestion depends on configured API keys.\n",
    "\n1Ô∏è‚É£ API/MCP Sources:",
    "   - NewsAPI: Real-time financial news",
    "   - SEC EDGAR: Regulatory filings (10-K, 10-Q, 8-K)",
    "   - Alpha Vantage: Market data",
    "\n2Ô∏è‚É£ Email Pipeline (Phase 1 Enhanced Documents):",
    "   - Broker research with BUY/SELL signals",
    "   - Enhanced documents: [TICKER:NVDA|confidence:0.95]",
    "   - See detailed demo: imap_email_ingestion_pipeline/investment_email_extractor_simple.ipynb",
    "\n3Ô∏è‚É£ SEC Filings:",
    "   - Management commentary and financial statements",
    "   - Integrated via SEC EDGAR connector",
    "\nüí° All sources ‚Üí Single LightRAG knowledge graph via ice_simplified.py",
)
for _overview_line in _overview_lines:
    print(_overview_line)

<!-- ## 3a. ICE Data Sources Integration (Week 5)

ICE integrates 3 heterogeneous data sources into a unified knowledge graph:

**Detailed Demonstrations Available**:
- 📧 **Email Pipeline**: See `imap_email_ingestion_pipeline/investment_email_extractor_simple.ipynb` (25 cells)
  - Entity extraction (tickers, ratings, price targets)
  - BUY/SELL signal extraction with confidence scores
  - Enhanced document creation with inline metadata
  
- 📊 **Quick Summary Below** -->

In [None]:

# Cell 14
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# TWO-LAYER DATA SOURCE CONTROL SYSTEM
# Layer 1: Source Type Switches (Boolean - Master Kill Switches)
# Layer 2: Category Limits (Integer - Granular Control per stock)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
'''
Note:
- email_limit only works when EMAIL_SELECTOR is set as 'all'; in other words, EMAIL_SELECTOR (except for 'all') will bypass email_limit.
- Check precedence: Source switch > Selector > Limit
- Remember Crawl4AI disabled = 30-40% success for Tier 3-5 URLs.
'''

#########################################################################################################

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# LAYER 1: SOURCE TYPE SWITCHES (Master Kill Switches)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
### AQ1

# Layer 1 master switches, one boolean per source type.
email_source_enabled = True      # Controls EmailConnector
api_source_enabled = False       # Controls ALL API sources (News, Financial, Market, SEC)
mcp_source_enabled = False       # Controls MCP sources (Research/Search via Exa MCP)
url_processing_enabled = True    # Controls URL processing in emails (master switch)
crawl4ai_enabled = False         # Controls Crawl4AI browser automation for URL fetching

# Export the URL-related switches as 'true'/'false' environment variables
# (requires kernel restart to take effect if changed mid-session).
import os
for _env_name, _switch_on in (
    ('ICE_PROCESS_URLS', url_processing_enabled),
    ('USE_CRAWL4AI_LINKS', crawl4ai_enabled),
):
    os.environ[_env_name] = 'true' if _switch_on else 'false'

#########################################################################################################

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# LAYER 2: CATEGORY LIMITS (Granular Control - only active when source type enabled)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# # Email category (not per-stock, portfolio-wide)
# email_limit = 25                 # Top X latest emails

# # API categories (per stock)
# news_limit = 2                   # News articles per stock (NewsAPI, Benzinga, Finnhub, MarketAux)
# financial_limit = 2              # Financial fundamentals per stock (FMP, Alpha Vantage)
# market_limit = 1                 # Market data per stock (Polygon)
# sec_limit = 2                    # SEC filings per stock (SEC EDGAR)

# # MCP categories (per stock)
# research_limit = 0               # Research documents per stock (Exa MCP, on-demand only)


# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# PORTFOLIO SIZE SELECTOR - Sets holdings + default limits
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

PORTFOLIO_SIZE = 'tiny'  # Options: 'tiny' | 'small' | 'medium' | 'full'

# ----------------------------------------------------------------------
# VALIDATION: Ensure valid configuration
# ----------------------------------------------------------------------

# Validate PORTFOLIO_SIZE value
valid_sizes = ['tiny', 'small', 'medium', 'full']
if PORTFOLIO_SIZE not in valid_sizes:
    raise ValueError(f"‚ùå Invalid PORTFOLIO_SIZE='{PORTFOLIO_SIZE}'. Choose from: {', '.join(valid_sizes)}")

# Only the 'full' portfolio needs the CSV-backed `holdings` list (Cell 16).
# Look it up without referencing the bare name: the dict literal below is
# evaluated for every portfolio size, so a direct `holdings` reference would
# raise NameError for 'tiny'/'small'/'medium' when Cell 16 has not run yet.
_csv_holdings = globals().get('holdings')

# Validate dependency for 'full' portfolio
if PORTFOLIO_SIZE == 'full' and _csv_holdings is None:
    raise RuntimeError("‚ùå PORTFOLIO_SIZE='full' requires Cell 16 to run first! (Loads holdings from CSV)")

### PORTFOLIO CONFIGURATIONS:
# Each entry provides the ticker list plus the six default category limits.
portfolios = {
    'tiny': {
        # 'holdings': ['NVDA', 'AMD'],
        'holdings': ['FICO'],
        'email_limit': 1,  # EMAIL_SELECTOR needs to be set as 'all'
        'news_limit': 2,
        'financial_limit': 1,
        'market_limit': 1,
        'sec_limit': 1,
        'research_limit': 0,
    },
    'small': {
        'holdings': ['NVDA', 'AMD'],
        'email_limit': 25,  # EMAIL_SELECTOR needs to be set as 'all'
        'news_limit': 2,
        'financial_limit': 2,
        'market_limit': 1,
        'sec_limit': 2,
        'research_limit': 0,
    },
    'medium': {
        'holdings': ['NVDA', 'AMD', 'TSMC'],
        'email_limit': 50,  # EMAIL_SELECTOR needs to be set as 'all'
        'news_limit': 2,
        'financial_limit': 2,
        'market_limit': 1,
        'sec_limit': 3,
        'research_limit': 0,
    },
    'full': {
        # ALL stocks from the CSV (Cell 16); empty placeholder until it is loaded.
        'holdings': _csv_holdings if _csv_holdings is not None else [],
        'email_limit': 71,  # EMAIL_SELECTOR needs to be set as 'all'
        'news_limit': 2,
        'financial_limit': 2,
        'market_limit': 1,
        'sec_limit': 3,
        'research_limit': 0,
    },
}

portfolio = portfolios[PORTFOLIO_SIZE]

# Set ALL variables (holdings + 6 category limits)
test_holdings = portfolio['holdings']
email_limit = portfolio['email_limit']
news_limit = portfolio['news_limit']
financial_limit = portfolio['financial_limit']
market_limit = portfolio['market_limit']
sec_limit = portfolio['sec_limit']
research_limit = portfolio['research_limit']

print(f"üìä {PORTFOLIO_SIZE.upper()} Portfolio Selected")
print(f"{'‚ïê'*70}")
print(f"Holdings: {', '.join(test_holdings)} ({len(test_holdings)} stocks)")
print(f"\nüì¶ Default Category Limits (can be overridden in Cell 26):")
print(f"  üìß Email: {email_limit} broker research emails")
print(f"  üì∞ News: {news_limit}/stock")
print(f"  üíπ Financial: {financial_limit}/stock")
print(f"  üìà Market: {market_limit}/stock")
print(f"  üìë SEC: {sec_limit}/stock")
print(f"  üî¨ Research: {research_limit}/stock")

# Upper-bound estimate: portfolio-wide emails plus per-stock categories.
estimated = (
    email_limit +
    len(test_holdings) * (news_limit + financial_limit + market_limit + sec_limit + research_limit)
)
print(f"\nüìä Estimated Docs: ~{estimated} (before source switches in Cell 26)")
print(f"{'‚ïê'*70}")

#########################################################################################################

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# SPECIAL SELECTOR: EMAIL_SELECTOR (Email Category Only)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# EMAIL_SELECTOR precedence:
# - If 'all' ‚Üí use email_limit
# - If specific selector ‚Üí bypass email_limit, use email_files_to_process

# (EMAIL_SELECTOR defined earlier in notebook, typically 'all')

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# EMAIL SELECTOR - Choose which emails to process
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# Options: 'all' | 'html_table_test' | 'crawl4ai_test' | 'docling_test' |
#          'crawl4ai_docling_test' | 'custom'   (must be a key of email_sets below)
EMAIL_SELECTOR = 'crawl4ai_test'
# EMAIL_SELECTOR = 'html_table_test'

# Named email selections. 'email_files' is None for "process everything"
# (subject to email_limit); otherwise it is the explicit list of .eml files.
email_sets = {
    'all': {
        'email_files': None,  # Process all 71 emails
        'description': 'All 71 sample emails'
    },
    'html_table_test': {
        'email_files': [
            'FW_ RHB | Singapore Morning Cuppa _ 15 August 2025 (ST Engineering, First Resources, Golden Agri-Resources, StarHub).eml'
        ],
        'description': 'Emails with HTML tables.'
    },
    'crawl4ai_test': {
        'email_files': [
            # 'CH_HK_ Nongfu Spring Co. Ltd (9633 HK)_ Leading the pack (NOT RATED).eml',
            'CH_HK_ Tencent Music Entertainment (1698 HK)_ Stronger growth with expanding revenue streams  (NOT RATED).eml',
            'DBS SALES SCOOP (14 AUG 2025)_ TENCENT | UOL.eml',
        ],
        'description': 'Emails with URLs (tests link processing)'
    },
    'docling_test': {
        'email_files': [
            # 'Yupi Indo IPO calculations.eml',
            # 'CGSI Futuristic Tour 2.0 Shenzhen & Guangzhou 14-15 April 2025.eml',
            # 'DBS Economics & Strategy_ Macro Strategy_ Fed noise; Singapore GDP; lower USD.eml',
            # 'CGS Global AI & Robotic Conference 2025 - Hangzhou_ 27 March 2025 | some key takeaways from Supermarket _ Sports retailers (Anta, Li Ning, 361, Xtep).eml',
            # 'DBS Economics & Strategy_ China_ Capacity reduction campaign weighs on activity.eml',
            'Tencent Q2 2025 Earnings.eml',
            # 'BABA Q1 2026 June Qtr Earnings.eml',
        ],
        'description': 'Emails for testing... (tests Docling PDF/Excel/image processing)'
    },
    'crawl4ai_docling_test': {
        'email_files': [
            'CGSI Futuristic Tour 2.0 Shenzhen & Guangzhou 14-15 April 2025.eml',
            # 'Yupi Indo IPO calculations.eml',
        ],
        'description': 'Emails for testing... PDF documents contained in the embedded link in the email.'
    },

    'custom': {
        'email_files': [
            # Add your custom email files here
            # Example: 'your_email.eml'
        ],
        'description': 'Custom email selection'
    }
}

# Validate EMAIL_SELECTOR against the sets that actually exist. Deriving the
# list from email_sets keeps the two in sync (the previous hand-written list
# rejected the defined 'crawl4ai_docling_test' set).
valid_email = list(email_sets)
if EMAIL_SELECTOR not in valid_email:
    raise ValueError(f"‚ùå Invalid EMAIL_SELECTOR='{EMAIL_SELECTOR}'. Choose from: {', '.join(valid_email)}")

# Get selected email set
selected_emails = email_sets[EMAIL_SELECTOR]
email_files_to_process = selected_emails['email_files']

print(f"\n{'='*70}")
print(f"EMAIL SELECTOR: {EMAIL_SELECTOR}")
print(f"{'='*70}")
print(f"Description: {selected_emails['description']}")
# None means "all emails"; an explicit (possibly empty) list means exactly
# those files, so an empty 'custom' list is reported as 0 rather than "All".
if email_files_to_process is not None:
    print(f"Emails to process: {len(email_files_to_process)}")
    for i, email_file in enumerate(email_files_to_process, 1):
        print(f"  {i}. {email_file}")
else:
    print(f"Emails to process: All (up to email_limit)")
print(f"{'='*70}\n")

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# PRECEDENCE HIERARCHY (Highest to Lowest):
# 1. Source Type Switch (email_source_enabled, api_source_enabled, mcp_source_enabled)
# 2. Category Limit (news_limit, financial_limit, market_limit, sec_limit, research_limit)
# 3. Special Selector (EMAIL_SELECTOR for emails only; any value other than 'all' bypasses email_limit)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# Layer 1 beats Layer 2: switching a source type off zeroes every limit
# that belongs to it.
if not email_source_enabled:
    # The explicit file selection is dropped together with the limit.
    email_limit, email_files_to_process = 0, None

if not api_source_enabled:
    news_limit = financial_limit = market_limit = sec_limit = 0

if not mcp_source_enabled:
    research_limit = 0

# Work out how many emails will be read and how the summary describes it.
# A non-'all' EMAIL_SELECTOR sidesteps email_limit, but never the master switch.
if not email_source_enabled:
    actual_email_count = 0
    email_display = "0 (source disabled)"
elif EMAIL_SELECTOR == 'all':
    actual_email_count = email_limit
    email_display = f"{email_limit} emails (up to limit)"
else:
    actual_email_count = len(email_files_to_process or [])
    email_display = f"{actual_email_count} specific files (EMAIL_SELECTOR ignores email_limit)"

#########################################################################################################
#########################################################################################################
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# DISPLAY: Show configuration after all precedence rules applied
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

def _flag_icon(enabled):
    """Return the 'on' glyph when *enabled* is truthy, otherwise the 'off' glyph."""
    return '‚úÖ' if enabled else '‚ùå'


print(f"\n{'='*70}")
print(f"DATA SOURCE CONFIGURATION (Two-Layer Control)")
print(f"{'='*70}\n")

# Layer 1: one line per master switch.
print(f"LAYER 1: Source Type Switches")
print(f"  {_flag_icon(email_source_enabled)} Email Source")
print(f"  {_flag_icon(api_source_enabled)} API Source (News + Financial + Market + SEC)")
print(f"  {_flag_icon(mcp_source_enabled)} MCP Source (Research/Search)")
print(f"  {_flag_icon(crawl4ai_enabled)} Crawl4AI (Browser Automation for URLs)")
if crawl4ai_enabled:
    print(f"      ‚Üí Tier 3-5 URLs use browser automation (60-80% success rate)")
else:
    print(f"      ‚Üí Tier 3-5 URLs use simple HTTP only (30-40% success rate)")


print(f"\nLAYER 2: Category Limits (Active Categories Only)\n")

# Category 1: Email
print(f"  {_flag_icon(email_source_enabled and actual_email_count > 0)} Email: {email_display}")

# Categories 2-5: API sources, as (label, per-stock limit, providers).
_api_categories = (
    ('News', news_limit, 'NewsAPI, Benzinga, Finnhub, MarketAux'),
    ('Financial', financial_limit, 'FMP, Alpha Vantage'),
    ('Market', market_limit, 'Polygon'),
    ('SEC', sec_limit, 'SEC EDGAR'),
)
for _cat_label, _cat_limit, _cat_providers in _api_categories:
    if api_source_enabled:
        print(f"  {_flag_icon(_cat_limit > 0)} {_cat_label}: {_cat_limit}/stock ({_cat_providers})")
    else:
        print(f"  {_flag_icon(False)} {_cat_label}: 0/stock (API source disabled)")

# Category 6: MCP source
if mcp_source_enabled:
    print(f"  {_flag_icon(research_limit > 0)} Research: {research_limit}/stock (Exa MCP)")
else:
    print(f"  {_flag_icon(False)} Research: 0/stock (MCP source disabled)")

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# CALCULATE: Estimated documents (after ALL overrides and precedence)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# Final estimate after every switch and selector has been applied:
# portfolio-wide emails plus (holdings x per-stock limit) for each category.
_holding_count = len(test_holdings)
_per_stock_limits = (
    ('News', news_limit),
    ('Financial', financial_limit),
    ('Market', market_limit),
    ('SEC', sec_limit),
    ('Research', research_limit),
)

estimated_docs = actual_email_count + sum(
    _holding_count * _per_stock_cap for _, _per_stock_cap in _per_stock_limits
)

print(f"\nüìä FINAL Estimated Documents: {estimated_docs}")
print(f"  - Email: {actual_email_count}")
for _doc_category, _per_stock_cap in _per_stock_limits:
    print(f"  - {_doc_category}: {_holding_count} tickers √ó {_per_stock_cap} = {_holding_count * _per_stock_cap}")
print(f"{'='*70}\n")

# Reminder that the Crawl4AI switch is read from the environment.
_crawl4ai_env = os.environ.get('USE_CRAWL4AI_LINKS', 'false')
print(f"\n‚ö†Ô∏è  NOTE: Changing crawl4ai_enabled requires KERNEL RESTART")
print(f"   ‚Ä¢ Current: USE_CRAWL4AI_LINKS={_crawl4ai_env}")
print(f"   ‚Ä¢ Environment variables are read at kernel start")
print(f"   ‚Ä¢ To apply changes: Kernel ‚Üí Restart & Run All")


In [None]:
# FIXED DIAGNOSTIC: Check PDFs in correct unified storage location
# Added 2025-11-04: Replaces legacy diagnostic cells that checked wrong directory
#
# Read-only report: counts PDFs under data/attachments, lists the ones
# modified in the last 10 minutes, and tallies metadata.json files by their
# 'source_type' field.
import json
from pathlib import Path
from datetime import datetime, timedelta

print("="*70)
print("üìÅ URL PDF STORAGE VERIFICATION (FIXED)")
print("="*70)

# Check correct storage location using hierarchical structure
storage_dir = Path("data/attachments")

if storage_dir.exists():
    # Count ALL PDFs in unified storage (email attachments + URL PDFs)
    all_pdfs = list(storage_dir.glob("*/*/original/*.pdf"))
    print(f"\n‚úÖ Storage directory exists: {storage_dir}")
    print(f"üìä Total PDFs in unified storage: {len(all_pdfs)}")

    # Show recent PDFs (last 10 minutes), judged by file modification time
    recent_cutoff = datetime.now() - timedelta(minutes=10)
    recent_pdfs = [f for f in all_pdfs if datetime.fromtimestamp(f.stat().st_mtime) > recent_cutoff]

    if recent_pdfs:
        print(f"\nüÜï Recent PDFs (last 10 min): {len(recent_pdfs)}")
        for pdf in recent_pdfs[:5]:  # Show first 5
            # The glob matches <first dir>/<second dir>/original/<file>.pdf,
            # so parts[-4] is the first directory under storage (the email UID).
            email_uid = pdf.parts[-4]
            print(f"   ‚Ä¢ {pdf.name[:50]}... ({pdf.stat().st_size/1024:.1f} KB)")
            print(f"     Email: {email_uid[:30]}...")
    else:
        print(f"\n‚ÑπÔ∏è  No PDFs downloaded in last 10 minutes")
        print(f"   (Existing PDFs may be from previous runs)")

    # Check metadata.json files to distinguish sources
    metadata_files = list(storage_dir.glob("*/*/metadata.json"))
    url_pdf_count = 0
    email_attachment_count = 0
    unreadable_metadata_count = 0

    for meta_path in metadata_files:
        # Best effort: skip only files that cannot be read or parsed.
        # (A bare `except:` here would also swallow KeyboardInterrupt.)
        try:
            with open(meta_path, encoding="utf-8") as f:
                metadata = json.load(f)
        except (OSError, ValueError):  # ValueError covers JSON and decode errors
            unreadable_metadata_count += 1
            continue

        source_type = metadata.get('source_type') if isinstance(metadata, dict) else None
        if source_type == 'url_pdf':
            url_pdf_count += 1
        elif source_type == 'email_attachment':
            email_attachment_count += 1

    # metadata.json files are counted independently of the PDF glob, so the
    # difference is clamped to avoid printing a negative "unknown" count.
    unknown_count = max(0, len(all_pdfs) - url_pdf_count - email_attachment_count)

    print(f"\nüìà Source Breakdown (from metadata.json):")
    print(f"   ‚Ä¢ URL PDFs: {url_pdf_count}")
    print(f"   ‚Ä¢ Email Attachments: {email_attachment_count}")
    print(f"   ‚Ä¢ Unknown/No metadata: {unknown_count}")
    if unreadable_metadata_count:
        print(f"   (Skipped {unreadable_metadata_count} unreadable metadata.json file(s))")

else:
    print(f"‚ùå Storage directory not found: {storage_dir}")
    print(f"   Run data ingestion first to create storage and download PDFs")

print("="*70)


In [None]:
# Cell 14.5 - Clear Link Cache (Optional)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# PURPOSE: Force fresh downloads by clearing URL cache
# WHEN TO USE:
#   - Testing crawl4ai_test emails that were previously processed
#   - URLs already cached ‚Üí won't save to email-specific folders
#   - Want to verify URL processing works end-to-end
# 
# ISSUE BACKGROUND:
#   Cached URLs return immediately without saving to data/attachments/{email_uid}/
#   This is a known caching architecture issue (see tmp/tmp_cache_diagnostic_report.md)
# 
# SAFE TO RUN: Shows statistics before deletion, only clears link cache
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

import json
import shutil
from pathlib import Path

# Deletes the whole data/link_cache directory after printing a short summary
# of what it contains.
cache_dir = Path('data/link_cache')

print("=" * 70)
print("üóëÔ∏è  LINK CACHE CLEARING UTILITY")
print("=" * 70)

if cache_dir.exists():
    # Pre-deletion statistics
    cache_files = list(cache_dir.glob('*.json'))
    index_file = cache_dir / 'link_cache_index.json'

    if index_file.exists():
        cache_files_count = len(cache_files) - 1  # Exclude index file
    else:
        cache_files_count = len(cache_files)

    total_size = sum(f.stat().st_size for f in cache_files if f.is_file())

    print(f"\nüìä PRE-DELETION STATISTICS:")
    print(f"   Cache directory: {cache_dir}")
    print(f"   Cached URLs: {cache_files_count}")
    print(f"   Total cache size: {total_size / 1024 / 1024:.2f} MB")

    # Show most recent cached URLs (top 5)
    if index_file.exists():
        # The preview is informational only. A corrupt or unexpectedly shaped
        # index must not stop the deletion below, so the whole preview is
        # guarded and a failure is reported instead of raised.
        try:
            with open(index_file, 'r', encoding='utf-8') as f:
                cache_index = json.load(f)

            if cache_index:
                print(f"\nüìã RECENT CACHED URLs (last 5):")
                sorted_entries = sorted(
                    cache_index.items(),
                    key=lambda x: x[1].get('cached_time', ''),
                    reverse=True
                )[:5]

                for i, (url_hash, entry) in enumerate(sorted_entries, 1):
                    url = entry.get('url', 'Unknown')
                    cached_time = entry.get('cached_time', 'Unknown')[:19]  # Truncate to date+time
                    size_kb = entry.get('file_size', 0) / 1024

                    # Truncate URL for display
                    url_display = url if len(url) <= 60 else url[:57] + '...'

                    print(f"   {i}. {url_display}")
                    print(f"      Cached: {cached_time} | Size: {size_kb:.1f} KB")
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            print(f"\n   Could not preview cache index ({type(exc).__name__}: {exc}); clearing cache anyway")

    # Delete cache
    print(f"\nüóëÔ∏è  DELETING CACHE...")
    shutil.rmtree(cache_dir)

    print(f"\n‚úÖ POST-DELETION CHECK:")
    print(f"   Cache directory: {cache_dir}")
    print(f"   Exists: {cache_dir.exists()}")
    print(f"   Status: Cache cleared successfully")

    print(f"\nüí° NEXT STEPS:")
    print(f"   1. Run Cell 15 to process emails")
    print(f"   2. URLs will be downloaded fresh (not from cache)")
    print(f"   3. Files will be saved to data/attachments/{{email_uid}}/")
    print(f"   4. Cache will be rebuilt as URLs are processed")

else:
    print(f"\n‚ö†Ô∏è  Cache directory does not exist: {cache_dir}")
    print(f"   Status: No cache to clear")
    print(f"   This is normal if:")
    print(f"      - First time running notebook")
    print(f"      - Cache was already cleared")
    print(f"      - No URL processing has occurred yet")

print("\n" + "=" * 70)
print("‚úÖ Cache clearing complete!")
print("=" * 70)


In [None]:
# Cell 14.6 - Clear Content Cache (HIGHLY RECOMMENDED)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# PURPOSE: Force fresh email processing by clearing content cache
# WHY: Content cache persists across runs and can lock in stale results
# WHEN: Run before REBUILD_GRAPH=True for guaranteed fresh processing
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

import shutil
from pathlib import Path

# Empties ./data/content_cache (the directory is removed and re-created) so
# nothing is served from a previous run.
content_cache_dir = Path('./data/content_cache')

print("üóëÔ∏è  CONTENT CACHE CLEARING UTILITY")
print("=" * 70)

if content_cache_dir.exists():
    cache_files = list(content_cache_dir.glob('*.cache'))
    index_file = content_cache_dir / 'cache_index.json'

    # Clear when EITHER cached payloads or the index are present. Gating on
    # the index alone left orphaned *.cache files behind while reporting the
    # cache as "already empty".
    if cache_files or index_file.exists():
        cache_count = len(cache_files)
        total_size = sum(f.stat().st_size for f in cache_files if f.is_file())

        print(f"   Cache directory: {content_cache_dir}")
        print(f"   Cached items: {cache_count}")
        print(f"   Total cache size: {total_size / 1024 / 1024:.2f} MB")
        print()
        print("üóëÔ∏è  DELETING CACHE...")

        # Remove everything, then restore an empty directory.
        shutil.rmtree(content_cache_dir)
        content_cache_dir.mkdir(parents=True, exist_ok=True)

        print("   ‚úÖ Content cache cleared successfully")
        print()
        print("üìå WHAT THIS MEANS:")
        print("   1. Email bodies will be re-processed (not from cache)")
        print("   2. Entity extraction will run fresh")
        print("   3. Cache will be rebuilt as emails are processed")
    else:
        print("   ‚ÑπÔ∏è  Content cache is already empty")
else:
    content_cache_dir.mkdir(parents=True, exist_ok=True)
    print("   ‚ÑπÔ∏è  Content cache directory created")

print()
print("=" * 70)
print("‚úÖ Cache clearing complete!")
print("=" * 70)


In [None]:
# Cell 14.7 - Configure LLM Temperature (IMPORTANT)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# PURPOSE: Control LLM randomness for reproducible results
# UPDATED 2025-11-08: Separate temperatures for entity extraction & query answering
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

import os

# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# SEPARATE TEMPERATURE CONFIGURATION (New System)
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# Temperature applied to entity extraction (document ingestion).
# Recommended <= 0.2: lower values give more reproducible knowledge graphs,
# which matters for backtesting.
ENTITY_EXTRACTION_TEMP = 0.2

# Temperature applied to query answering (query processing).
# Recommended band is 0.3-0.7 for balanced synthesis.
# NOTE(review): 0.0 is outside that recommended band - confirm it is intentional.
QUERY_ANSWERING_TEMP = 0.0

# Publish both values as strings in the environment for this session.
os.environ.update({
    'ICE_LLM_TEMPERATURE_ENTITY_EXTRACTION': str(ENTITY_EXTRACTION_TEMP),
    'ICE_LLM_TEMPERATURE_QUERY_ANSWERING': str(QUERY_ANSWERING_TEMP),
})

# Display configuration
print("üå°Ô∏è  LLM TEMPERATURE CONFIGURATION")
print("=" * 70)
print(f"   Entity Extraction: {ENTITY_EXTRACTION_TEMP}")
print(f"   Query Answering:   {QUERY_ANSWERING_TEMP}")
print()
print("üìå WHAT THIS AFFECTS:")
print("   ‚úÖ Entity extraction: Uses ENTITY_EXTRACTION_TEMP during graph building")
print("   ‚úÖ Query answering: Uses QUERY_ANSWERING_TEMP during queries")
print()
print("üéØ ENTITY EXTRACTION TEMPERATURE GUIDE:")
print("   ‚Ä¢ 0.0-0.2: Deterministic (RECOMMENDED for investment intelligence)")
print("     - Same document ‚Üí same entities (reproducible graphs)")
print("     - Required for backtesting, compliance, decision validation")
print("   ‚Ä¢ 0.3-0.5: Moderate creativity")
print("     - Richer entity extraction but may lose reproducibility")
print("   ‚Ä¢ >0.5: High randomness (NOT RECOMMENDED)")
print("     - Inconsistent graphs, breaks backtesting")
print()
print("üéØ QUERY ANSWERING TEMPERATURE GUIDE:")
print("   ‚Ä¢ 0.0-0.2: Conservative")
print("     - Factual, grounded answers, but may be too literal")
print("   ‚Ä¢ 0.3-0.7: Balanced (RECOMMENDED)")
print("     - Creative synthesis while staying grounded")
print("   ‚Ä¢ >0.7: Very creative")
print("     - Insightful but may be less consistent")
print()
print("üí° TO CHANGE: Modify ENTITY_EXTRACTION_TEMP and QUERY_ANSWERING_TEMP")
print("   values above, then re-run this cell")
print("=" * 70)
print(f"‚úÖ Entity extraction temperature set to {ENTITY_EXTRACTION_TEMP}")
print(f"‚úÖ Query answering temperature set to {QUERY_ANSWERING_TEMP}")
print("=" * 70)

In [None]:
# Cell 15
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# CONFIGURATION: Set to False to skip graph building and use existing graph
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
REBUILD_GRAPH = True
# REBUILD_GRAPH = False

# Stale-graph detection: fingerprint the extraction sources and compare with
# the fingerprint stored next to the graph; warn when they differ and no
# rebuild was requested.
import hashlib
from pathlib import Path

# Extraction pipeline files whose contents feed the fingerprint
files_to_monitor = [
    'imap_email_ingestion_pipeline/table_entity_extractor.py',
    'imap_email_ingestion_pipeline/enhanced_doc_creator.py'
]

combined_hash = hashlib.md5()
for _monitored_path in map(Path, files_to_monitor):
    if not _monitored_path.exists():
        continue  # missing files simply do not contribute
    combined_hash.update(_monitored_path.read_bytes())
current_hash = combined_hash.hexdigest()[:8]  # short 8-hex-digit fingerprint

version_file = Path('ice_lightrag/storage/.extractor_version')
_graph_is_stale = False
if version_file.exists():
    _recorded_hash = version_file.read_text().strip()
    _graph_is_stale = _recorded_hash != current_hash and not REBUILD_GRAPH

if _graph_is_stale:
    print("‚ö†Ô∏è  STALE GRAPH DETECTED")
    print(f"   Extraction code has changed since last graph build.")
    print(f"   Files monitored: table_entity_extractor.py, enhanced_doc_creator.py")
    print(f"   ‚Üí Set REBUILD_GRAPH=True to get latest extraction fixes!")
    print(f"   ‚Üí IMPORTANT: Restart kernel (Kernel ‚Üí Restart) before rebuilding!\n")

# Either run a fresh ingestion (REBUILD_GRAPH=True) or build a placeholder
# `ingestion_result` describing the graph already on disk, so downstream cells
# always find the same keys.
if REBUILD_GRAPH:
    # Execute data ingestion
    # NOTE: This operation may take several minutes. If it hangs, restart kernel.
    print(f"\nüì• Fetching Portfolio Data")
    print(f"‚îÅ" * 50)

    if not (ice and ice.is_ready()):
        raise RuntimeError("ICE system not ready for data ingestion")

    # Fetch historical data (1 year for faster processing - adjust years parameter as needed)
    print(f"üîÑ Fetching data for {len(test_holdings)} holdings...")
    ingestion_result = ice.ingest_historical_data(
        test_holdings,
        years=1,
        email_limit=email_limit,
        news_limit=news_limit,
        financial_limit=financial_limit,
        market_limit=market_limit,
        sec_limit=sec_limit,
        research_limit=research_limit,
        # An explicit file selection is only forwarded while the email source is on.
        email_files=email_files_to_process if email_source_enabled else None
    )

    # Display results
    print(f"\nüìä Ingestion Results:")
    print(f"  Status: {ingestion_result['status']}")
    print(f"  Holdings: {len(ingestion_result['holdings_processed'])}/{len(test_holdings)}")
    print(f"  Documents: {ingestion_result['total_documents']}")

    # Show successful holdings
    if ingestion_result['holdings_processed']:
        print(f"  ‚úÖ Successful: {', '.join(ingestion_result['holdings_processed'])}")

    # Show metrics
    if 'metrics' in ingestion_result:
        _ingest_metrics = ingestion_result['metrics']
        print(f"\n‚è±Ô∏è  Processing Time: {_ingest_metrics['processing_time']:.2f}s")
        if 'data_sources_used' in _ingest_metrics:
            print(f"  Data Sources: {', '.join(_ingest_metrics['data_sources_used'])}")

        if 'investment_signals' in _ingest_metrics:
            # Display investment signals from Phase 2.6.1 EntityExtractor
            signals = _ingest_metrics['investment_signals']
            print(f"\nüìß Investment Signals Captured:")
            print(f"  Broker emails: {signals['email_count']}")
            print(f"  Tickers covered: {signals['tickers_covered']}")
            print(f"  BUY ratings: {signals['buy_ratings']}")
            print(f"  SELL ratings: {signals['sell_ratings']}")
            print(f"  Avg confidence: {signals['avg_confidence']:.2f}")

            # Document Source Breakdown: everything that is not an email is
            # reported as API + SEC (total minus the email count).
            email_count = signals['email_count']
            total_docs = ingestion_result.get('total_documents', 0)
            api_sec_count = total_docs - email_count

            print(f"\nüìÇ Document Source Breakdown:")
            print(f"  üìß Email (broker research): {email_count} documents")
            print(f"  üåê API + SEC (market data): {api_sec_count} documents")
            print(f"  üìä Total documents: {total_docs}")

    # Show failures if any
    if ingestion_result.get('failed_holdings'):
        print(f"\n‚ùå Failed Holdings:")
        for failure in ingestion_result['failed_holdings']:
            print(f"  {failure['symbol']}: {failure['error']}")
else:
    # STALENESS WARNING: Alert user if selectors changed
    print("\n" + "="*70)
    print("‚ö†Ô∏è  REBUILD_GRAPH = False")
    print("‚ö†Ô∏è  Using existing graph - NOT rebuilding with current selectors!")
    print("‚ö†Ô∏è  If you changed PORTFOLIO/EMAIL/SOURCE configuration,")
    print("‚ö†Ô∏è  set REBUILD_GRAPH=True to avoid querying STALE DATA!")
    print("="*70 + "\n")

    print("Using existing graph from: ice_lightrag/storage/")
    print("To rebuild, set REBUILD_GRAPH = True above and re-run this cell")

    # Create mock ingestion_result for downstream cells
    import json
    from pathlib import Path

    # Document count = number of top-level entries in the doc-status store.
    _doc_status_path = Path('ice_lightrag/storage/kv_store_doc_status.json')
    doc_count = 0
    if _doc_status_path.exists():
        with open(_doc_status_path, encoding='utf-8') as f:
            doc_count = len(json.load(f))

    ingestion_result = {
        'status': 'skipped',
        'total_documents': doc_count,
        # Use the holdings selected for this run. The CSV-wide `holdings` list
        # may not exist yet (it comes from a later cell) and is not what the
        # real ingestion branch reports against.
        'holdings_processed': list(test_holdings),
        'failed_holdings': [],
        'metrics': {
            'processing_time': 0.0,
            'data_sources_used': [],
            'investment_signals': {
                'email_count': 0,
                'tickers_covered': 0,
                'buy_ratings': 0,
                'sell_ratings': 0,
                'avg_confidence': 0.0
            }
        }
    }

    print(f"\nüìä Existing graph contains {doc_count} documents")

# ======================================================================
# FILE STORAGE DIAGNOSTIC (Added for debugging)
#
# Read-only report on the attachment storage tree: counts PDFs,
# extracted.txt and metadata.json files, previews a few of each, and
# lists the subdirectories that were found.
# ======================================================================
print(f"\n{'='*70}")
print(f"üìÅ FILE STORAGE DIAGNOSTIC")
print(f"{'='*70}")

from pathlib import Path
import os
import json as json_module


def summarize_metadata_file(meta_path):
    """Return ``(source_type, hash_prefix)`` read from one metadata.json file.

    Args:
        meta_path: Path (str or ``Path``) of the metadata.json file to read.

    Returns:
        A 2-tuple: the value stored under ``source_type`` (``'unknown'`` when
        the key is absent) and the first 16 characters of
        ``file_info.file_hash`` (``'N/A'`` when it is absent or null).

    Raises:
        OSError: The file cannot be opened or read.
        ValueError: The content is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass) or its top level is not a JSON object.
    """
    with open(meta_path, 'r') as f:
        meta_data = json_module.load(f)
    if not isinstance(meta_data, dict):
        raise ValueError(f"expected a JSON object, got {type(meta_data).__name__}")

    source_type = meta_data.get('source_type', 'unknown')
    file_info = meta_data.get('file_info')
    file_hash = file_info.get('file_hash') if isinstance(file_info, dict) else None
    # A null/missing hash used to raise TypeError on slicing; fall back to 'N/A'.
    return source_type, str(file_hash or 'N/A')[:16]


# Determine storage path (handles both script and notebook contexts)
if '__file__' in globals():
    storage_path = Path(__file__).parent.parent.parent / 'data' / 'attachments'
else:
    storage_path = Path('data/attachments')

resolved_path = storage_path.resolve()
print(f"Expected storage path: {resolved_path}")
print(f"Path exists: {resolved_path.exists()}")

if resolved_path.exists():
    # One recursive walk; every category below is filtered from this listing.
    all_items = list(resolved_path.rglob("*"))
    all_files = [f for f in all_items if f.is_file()]
    pdfs = [f for f in all_files if f.suffix == '.pdf']
    txt_files = [f for f in all_files if f.name == 'extracted.txt']
    metadata_files = [f for f in all_files if f.name == 'metadata.json']

    print(f"\nStorage contents:")
    print(f"  Total items: {len(all_items)} (files + directories)")
    print(f"  Total files: {len(all_files)}")
    print(f"  PDF files: {len(pdfs)}")
    print(f"  extracted.txt files: {len(txt_files)}")
    print(f"  metadata.json files: {len(metadata_files)}")

    if pdfs:
        print(f"\n‚úÖ PDF files found:")
        # Preview only the first five to keep the cell output short.
        for pdf in pdfs[:5]:
            size_kb = pdf.stat().st_size / 1024
            print(f"  [{size_kb:>8.1f} KB] {pdf.name}")
            print(f"              ‚Üí {pdf}")
        if len(pdfs) > 5:
            print(f"  ... and {len(pdfs) - 5} more PDFs")
    else:
        print(f"\n‚ùå No PDF files found in storage")

    if metadata_files:
        print(f"\n‚úÖ metadata.json files found:")
        for meta in metadata_files[:3]:
            print(f"  ‚Üí {meta.parent}")
            # Read and show key fields. Best-effort: an unreadable file is
            # reported and skipped instead of being silently ignored.
            try:
                source_type, file_hash = summarize_metadata_file(meta)
            except (OSError, ValueError) as exc:
                print(f"     (metadata unreadable: {exc})")
            else:
                print(f"     Type: {source_type}, Hash: {file_hash}...")
        if len(metadata_files) > 3:
            print(f"  ... and {len(metadata_files) - 3} more metadata files")
    else:
        print(f"\n‚ùå No metadata.json files found")

    # Check if any subdirectories exist
    subdirs = [d for d in all_items if d.is_dir()]
    if subdirs:
        print(f"\nüìÇ Subdirectories: {len(subdirs)}")
        for subdir in subdirs[:5]:
            print(f"  ‚Üí {subdir.name}")
    else:
        print(f"\n‚ö†Ô∏è  No subdirectories (expected: {'{email_uid}/{file_hash}/'} structure)")
else:
    print(f"‚ùå Storage path does not exist: {resolved_path}")
    print(f"   This directory should be created by IntelligentLinkProcessor")

print(f"{'='*70}\n")


INFO:  == LLM cache == saving: default:extract:d4b884e5dd4eee6b92df1688a66233a8
INFO: Chunk 40 of 84 extracted 13 Ent + 13 Rel chunk-c490a7d9095d762b4d0899e2d023077d


In [None]:
# Cell 15.5 - ENHANCED PDF Processing & Entity Extraction Diagnostics
# Purpose: Comprehensive validation of URL PDF processing pipeline
# Checks: PDF download -> Text extraction -> Entity extraction -> LightRAG ingestion

print("="*70)
print("üîç COMPREHENSIVE PDF PROCESSING DIAGNOSTICS")
print("="*70)

from pathlib import Path
import json

# ======================================================================
# DIAGNOSTIC 1: Check EntityExtractor Initialization (IMPROVED)
#
# Sets `entity_extractor` to the first non-None object found at one of
# the candidate attribute paths on `ice`, or leaves it as None.
# ======================================================================
print("\n[1/6] üîß EntityExtractor Initialization Check")
print("-" * 70)

# Try multiple possible locations (robust against refactoring)
entity_extractor = None
ee_paths = [
    ('ice.data_ingester.entity_extractor', lambda: ice.data_ingester.entity_extractor if hasattr(ice, 'data_ingester') else None),
    ('ice.entity_extractor', lambda: ice.entity_extractor if hasattr(ice, 'entity_extractor') else None),
]

for path_name, getter in ee_paths:
    try:
        candidate = getter()
    except (NameError, AttributeError, TypeError):
        # NameError: `ice` itself was never created (initialization cell not
        # run). A diagnostic should report that, not crash on it.
        continue
    if candidate is not None:
        entity_extractor = candidate
        print(f"‚úÖ EntityExtractor found at: {path_name}")
        break

if entity_extractor:
    print(f"   NLP Model (spaCy): {'‚úÖ Loaded' if hasattr(entity_extractor, 'nlp') and entity_extractor.nlp else '‚ùå Not loaded'}")
    # The attributes below are optional; only report the ones that exist.
    if hasattr(entity_extractor, 'tickers'):
        print(f"   Known Tickers: {len(entity_extractor.tickers)} loaded")
    if hasattr(entity_extractor, 'companies'):
        print(f"   Companies: {len(entity_extractor.companies)} loaded")
else:
    print(f"‚ùå EntityExtractor NOT found in any expected location")
    print(f"   Tried paths: {[p[0] for p in ee_paths]}")
    print(f"   ‚Üí This may indicate initialization issue")
    if 'ice' not in globals():
        print("   -> 'ice' is not defined; run the ICE initialization cell first")

# ======================================================================
# DIAGNOSTIC 2: Check Downloaded PDFs
#
# Finds the most recently modified PDF under data/attachments, checks its
# magic header and previews the metadata.json stored two levels above it.
# Sets `latest_pdf` (None when no PDF was found).
# ======================================================================
print("\n[2/6] üìÅ Downloaded PDF Verification")
print("-" * 70)


def summarize_source_metadata(metadata):
    """Return ``(source_type, url_prefix)`` from a loaded metadata.json object.

    Args:
        metadata: The decoded JSON value. Anything that is not a dict is
            treated as carrying no information.

    Returns:
        A 2-tuple: the value stored under ``source_type`` (``'unknown'`` when
        absent) and the first 60 characters of
        ``source_context.original_url`` (``'unknown'`` when the context or
        the URL is absent or null).
    """
    if not isinstance(metadata, dict):
        return 'unknown', 'unknown'
    source_type = metadata.get('source_type', 'unknown')
    source_context = metadata.get('source_context')
    original_url = source_context.get('original_url') if isinstance(source_context, dict) else None
    # str() + `or` guard: a null URL used to raise TypeError when sliced.
    return source_type, str(original_url or 'unknown')[:60]


# Fixed: Changed from legacy data/downloaded_reports to unified storage
storage_dir = Path("data/attachments")
latest_pdf = None

if storage_dir.exists():
    pdfs = list(storage_dir.glob("*/*/original/*.pdf"))
    print(f"PDFs in directory: {len(pdfs)}")

    if pdfs:
        latest_pdf = max(pdfs, key=lambda p: p.stat().st_mtime)
        size_kb = latest_pdf.stat().st_size / 1024

        print(f"\nLatest PDF:")
        print(f"  File: {latest_pdf.name}")
        print(f"  Size: {size_kb:.1f} KB")
        print(f"  Path: {latest_pdf}")

        # Verify PDF header (a PDF file starts with the bytes %PDF)
        with open(latest_pdf, 'rb') as f:
            header = f.read(4)
        if header == b'%PDF':
            print(f"  ‚úÖ Valid PDF file")
        else:
            print(f"  ‚ùå Invalid PDF header: {header}")

        # Check for metadata.json
        metadata_path = latest_pdf.parent.parent / 'metadata.json'
        if metadata_path.exists():
            print(f"  ‚úÖ metadata.json found")
            # Best-effort preview: a corrupt metadata file must not abort the
            # remaining diagnostics in this cell.
            try:
                with open(metadata_path, 'r') as f:
                    metadata = json.load(f)
            except (OSError, ValueError) as exc:
                print(f"     (metadata unreadable: {exc})")
            else:
                source_type, original_url = summarize_source_metadata(metadata)
                print(f"     Source type: {source_type}")
                print(f"     Original URL: {original_url}...")
        else:
            print(f"  ‚ö†Ô∏è  metadata.json NOT found")
    else:
        print("‚ö†Ô∏è  No PDFs found")
else:
    print("‚ùå Download directory doesn't exist")

# ======================================================================
# DIAGNOSTIC 3: PDF Content Verification in LightRAG (MOVED UP)
#
# Scans the stored full documents for the '[LINKED_REPORT:' marker and
# reports on the first document that contains it. Sets `kv_path` and
# `pdf_content_found`.
# ======================================================================
print("\n[3/6] üîó PDF Content Verification in LightRAG")
print("-" * 70)

import re

# Read actual documents from LightRAG storage
kv_path = Path('ice_lightrag/storage/kv_store_full_docs.json')
pdf_content_found = False

if not kv_path.exists():
    print(f"‚ùå LightRAG storage not found at: {kv_path}")
else:
    try:
        with open(kv_path, 'r') as f:
            docs = json.load(f)

        for doc_id, doc_data in docs.items():
            content = doc_data.get('content', '')

            # Skip documents that carry no PDF marker
            if '[LINKED_REPORT:' not in content:
                continue

            pdf_content_found = True
            # Extract PDF URL(s) from the marker(s)
            pdf_urls = re.findall(r'\[LINKED_REPORT:(.+?)\]', content)

            file_path = doc_data.get('__vector_store__', {}).get('file_path', 'unknown')
            print(f"‚úÖ PDF content found in document: {file_path}")
            print(f"   PDF URLs embedded: {len(pdf_urls)}")
            for i, url in enumerate(pdf_urls[:3], 1):
                print(f"      [{i}] {url[:60]}...")

            # Estimate PDF text length: everything after each marker
            pdf_sections = content.split('[LINKED_REPORT:')
            if len(pdf_sections) > 1:
                pdf_text_len = sum(map(len, pdf_sections[1:]))
                print(f"   PDF text length: ~{pdf_text_len:,} chars")

            # Only the first matching document is shown
            break
        else:
            # Loop finished without a break: no document had the marker
            print(f"‚ùå No PDF content markers found in {len(docs)} documents")
            print(f"   Documents may contain only email body text")
    except Exception as e:
        print(f"‚ùå Error reading LightRAG documents: {e}")

# ======================================================================
# DIAGNOSTIC 4: Entity Markup Analysis
#
# Sets `entity_markup_found` to True as soon as one stored document
# contains a '[TICKER:', '[COMPANY:' or '[FINANCIAL]' marker.
# ======================================================================
print("\n[4/6] üìÑ Entity Markup Analysis")
print("-" * 70)

entity_markup_found = False

if not kv_path.exists():
    print(f"‚ùå LightRAG storage not found")
else:
    try:
        with open(kv_path, 'r') as f:
            docs = json.load(f)

        print(f"Total documents in LightRAG: {len(docs)}")

        for doc_id, doc_data in docs.items():
            content = doc_data.get('content', '')

            # Check for entity markup patterns; stop at the first hit
            if '[TICKER:' in content or '[COMPANY:' in content or '[FINANCIAL]' in content:
                entity_markup_found = True
                print("‚úÖ Entity markup found in documents")
                break
        else:
            # No document contained any of the markers
            print("‚ùå NO entity markup found (extraction may have failed)")

    except Exception as e:
        print(f"‚ùå Error reading documents: {e}")

# ======================================================================
# DIAGNOSTIC 5: LightRAG Graph Statistics
#
# Prints how many top-level entries the full-docs and text-chunks JSON
# stores hold (0 when a store file is missing).
# ======================================================================
print("\n[5/6] üìä LightRAG Knowledge Graph Stats")
print("-" * 70)

# Counts are read straight from the JSON store files on disk
try:
    if not kv_path.exists():
        print("Documents: 0")
    else:
        with open(kv_path, 'r') as f:
            docs = json.load(f)
        print(f"Documents: {len(docs)}")

    # Count chunks
    chunks_path = Path('ice_lightrag/storage/kv_store_text_chunks.json')
    if not chunks_path.exists():
        print("Chunks: 0")
    else:
        with open(chunks_path, 'r') as f:
            chunks = json.load(f)
        print(f"Chunks: {len(chunks)}")

except Exception as e:
    print(f"‚ùå Error getting stats: {e}")

# ======================================================================
# DIAGNOSTIC 6: Summary & Next Steps
#
# Turns the flags set by diagnostics 1-4 into a numbered issue list, or
# an all-clear message when every flag is truthy.
# ======================================================================
print("\n[6/6] üìã DIAGNOSTIC SUMMARY")
print("="*70)

# (flag, message shown when the flag is falsy), in reporting order
checks = (
    (entity_extractor, "EntityExtractor NOT found in any expected location"),
    (latest_pdf, "No PDFs downloaded"),
    (pdf_content_found, "PDF content NOT found in LightRAG documents"),
    (entity_markup_found, "NO entity markup (extraction may have failed)"),
)
issues = [message for passed, message in checks if not passed]

if not issues:
    print("\n‚úÖ ALL CHECKS PASSED!")
    print("   PDF processing pipeline working correctly")
else:
    print("\n‚ùå ISSUES DETECTED:")
    for i, issue in enumerate(issues, 1):
        print(f"   {i}. {issue}")

    print("\nüí° NEXT STEPS:")
    print("   1. Check logs from Cell 15 for 'üìÅ STORAGE VERIFIED:' messages")
    print("   2. Immediately run in terminal: ls -la data/attachments/*/*/original/")
    print("   3. Check for 'üóÇÔ∏è  STORAGE PATH RESOLUTION:' in Cell 15 logs")
    print("   4. If files missing, check async timing (wait 5 seconds, re-run diagnostic)")

print("="*70)


In [None]:
# Cell 16
#######################################################################################
# DIAGNOSTIC: Verify configuration is correct
#
# Prints the current EMAIL_SELECTOR / email_source_enabled values next to
# the expected ones, then lists the email files of the selected set.
# The configuration names are looked up defensively: when the
# configuration cell has not been run, this cell reports that instead of
# stopping with NameError/KeyError.
print("="*70)
print("üìã CONFIGURATION CHECK")
print("="*70)
print(f"EMAIL_SELECTOR: {globals().get('EMAIL_SELECTOR', '<not defined>')}")
print(f"email_source_enabled: {globals().get('email_source_enabled', '<not defined>')}")
print(f"\n‚úÖ Expected: EMAIL_SELECTOR = 'crawl4ai_test'")
print(f"‚úÖ Expected: email_source_enabled = True")

# Resolve email_sets[EMAIL_SELECTOR]['email_files'] without assuming that
# any of the three levels exists.
_known_email_sets = globals().get('email_sets')
_selected_email_set = (
    _known_email_sets.get(globals().get('EMAIL_SELECTOR'))
    if isinstance(_known_email_sets, dict)
    else None
)
selected_email_files = (
    list(_selected_email_set.get('email_files', []))
    if isinstance(_selected_email_set, dict)
    else []
)

print(f"\nüìß Selected email files ({len(selected_email_files)}):")
if not isinstance(_selected_email_set, dict):
    print("  (no email set resolved: EMAIL_SELECTOR / email_sets undefined or selector unknown)")
for i, fname in enumerate(selected_email_files, 1):
    # File names are truncated to 70 characters for display
    print(f"  {i}. {fname[:70]}...")
print("="*70)

#######################################################################################
# DIAGNOSTIC: Check if PDFs were downloaded
#
# Lists every PDF under data/attachments/*/*/original/ and highlights the
# ones whose modification time falls within the last 10 minutes.
from pathlib import Path
from datetime import datetime, timedelta

# Fixed: Changed from legacy data/downloaded_reports to unified storage
storage_dir = Path("data/attachments")
all_pdfs = [*storage_dir.glob("*/*/original/*.pdf")]

# Check for recent downloads (last 10 minutes)
recent_cutoff = datetime.now() - timedelta(minutes=10)
recent_pdfs = list(
    filter(
        lambda candidate: datetime.fromtimestamp(candidate.stat().st_mtime) > recent_cutoff,
        all_pdfs,
    )
)

print("="*70)
print("üìÅ FILE VERIFICATION")
print("="*70)
print(f"Total PDFs in directory: {len(all_pdfs)}")
print(f"New PDFs (last 10 min): {len(recent_pdfs)}")

if not recent_pdfs:
    print(f"\n‚ÑπÔ∏è  No new downloads (may already be cached)")
else:
    print(f"\n‚úÖ URL PROCESSING WORKED! Downloaded {len(recent_pdfs)} PDFs:")
    for pdf in recent_pdfs:
        size_kb = pdf.stat().st_size / 1024
        mtime = datetime.fromtimestamp(pdf.stat().st_mtime).strftime('%H:%M:%S')
        print(f"   ‚Ä¢ {pdf.name} ({size_kb:.1f} KB, {mtime})")
print("="*70)

#######################################################################################
# DIAGNOSTIC: Search Cell 27 output for URL processing indicators
# Note: This only works if you saved Cell 27 output to a variable
#
# Static checklist of the log lines that indicate URL processing ran.
# NOTE(review): the "Cell 27" numbering comes from the original text and
# may no longer match this notebook's cell labels - confirm.

# If you can't find the prominent boxes, look for these INFO logs:
print("\n".join((
    "="*70,
    "üìã WHAT TO LOOK FOR IN CELL 27 OUTPUT",
    "="*70,
    "\n‚úÖ SUCCESS INDICATORS (search Cell 27 output for these):\n",
    "1. IntelligentLinkProcessor initialized:",
    "   'INFO:...data_ingestion:‚úÖ IntelligentLinkProcessor initialized'",
    "\n2. URL extraction (should be > 0):",
    "   'INFO:...intelligent_link_processor:Extracted X links from email'",
    "\n3. Research reports classified (should be > 0):",
    "   'INFO:...intelligent_link_processor:Classified Y research reports'",
    "\n4. Reports downloaded (‚â• 0):",
    "   'INFO:...intelligent_link_processor:Downloaded Z reports'",
    "\n5. Prominent boxes (NEW - added for visibility):",
    "   'üîó URL PROCESSING: [email filename]'",
    "="*70,
)))

#######################################################################################
# DIAGNOSTIC: Quick pass/fail check
#
# Three checks: selector value, email source flag, and whether any PDF
# under data/attachments was modified in the last 10 minutes. Sets
# `config_ok`, `email_ok` and `downloads_ok`.
from pathlib import Path
from datetime import datetime, timedelta

# Fixed: Changed from legacy data/downloaded_reports to unified storage
storage_dir = Path("data/attachments")
recent_cutoff = datetime.now() - timedelta(minutes=10)
recent_pdfs = [
    pdf_path
    for pdf_path, modified_at in (
        (p, datetime.fromtimestamp(p.stat().st_mtime))
        for p in storage_dir.glob("*/*/original/*.pdf")
    )
    if modified_at > recent_cutoff
]

print("="*70)
print("‚úÖ QUICK SUCCESS CHECK")
print("="*70)

# Check 1: Configuration (False when the name was never defined)
config_ok = 'EMAIL_SELECTOR' in globals() and EMAIL_SELECTOR == 'crawl4ai_test'
print(f"{'‚úÖ' if config_ok else '‚ùå'} EMAIL_SELECTOR = 'crawl4ai_test'")

# Check 2: Email source enabled (False when the name was never defined)
email_ok = 'email_source_enabled' in globals() and email_source_enabled == True
print(f"{'‚úÖ' if email_ok else '‚ùå'} email_source_enabled = True")

# Check 3: New downloads
downloads_ok = bool(recent_pdfs)
print(f"{'‚úÖ' if downloads_ok else '‚ÑπÔ∏è '} New PDFs downloaded: {len(recent_pdfs)}")

# Overall verdict: configuration problems take precedence over downloads
if not (config_ok and email_ok):
    print(f"\n‚ö†Ô∏è  Configuration issue - check Cell 26 settings")
elif downloads_ok:
    print(f"\nüéâ SUCCESS: URL processing is working!")
    print(f"   Downloaded {len(recent_pdfs)} new PDFs")
else:
    print(f"\n‚ÑπÔ∏è  Configuration correct, but no new downloads")
    print(f"   Either: (1) Files cached from previous run")
    print(f"           (2) Check Cell 27 logs for 'Extracted 0 links'")
print("="*70)
#######################################################################################

In [None]:
# Cell 17
### Check if any PDFs have been downloaded from the URL links in the emails.
# Prints every PDF under data/attachments/*/*/original/ that was modified
# within the last 10 minutes, together with its size.

import os
from pathlib import Path
from datetime import datetime, timedelta

# Fixed: Changed from legacy data/downloaded_reports to unified storage
storage_dir = Path("data/attachments")
recent_cutoff = datetime.now() - timedelta(minutes=10)


def _modified_after_cutoff(pdf_path):
    """Return True when *pdf_path* was modified after `recent_cutoff`."""
    return datetime.fromtimestamp(pdf_path.stat().st_mtime) > recent_cutoff


recent_pdfs = list(filter(_modified_after_cutoff, storage_dir.glob("*/*/original/*.pdf")))

print(f"‚úÖ {len(recent_pdfs)} NEW PDFs downloaded in last 10 minutes")
for pdf in recent_pdfs:
    print(f"   ‚Ä¢ {pdf.name} ({pdf.stat().st_size/1024:.1f} KB)")

In [None]:
# Cell 18
# ======================================================================
# CRAWL4AI CONFIGURATION STATUS CHECK
# Verify browser automation settings for URL processing
#
# Reads `orchestrator.link_processor` (when available) and prints its
# Crawl4AI flags plus the handling route of each URL tier.
#
# INSTRUCTIONS: Copy this entire cell and paste as NEW CELL after Cell 30
# (the cell that checks for downloaded PDFs)
# NOTE(review): the "Cell 30" / "Cell 22" references come from the
# original text and may no longer match the cell labels - confirm.
# ======================================================================

print("\n" + "="*70)
print("üåê CRAWL4AI CONFIGURATION STATUS")
print("="*70)

# Guard first: nothing can be reported until the orchestrator exists and
# exposes a link_processor attribute.
if not ('orchestrator' in locals() and orchestrator and hasattr(orchestrator, 'link_processor')):
    print(f"\n‚ö†Ô∏è  Orchestrator not available yet")
    print(f"   ‚Ä¢ Run Cell 22 (Orchestrator Initialization) first")
    print(f"   ‚Ä¢ This cell will show Crawl4AI status after initialization")
else:
    link_processor = orchestrator.link_processor

    if not link_processor:
        print(f"\n‚ùå IntelligentLinkProcessor: NOT INITIALIZED")
        print(f"   ‚Ä¢ Link processing will not work")
        print(f"   ‚Ä¢ Check data_ingestion.py initialization")
    else:
        print(f"\n‚úÖ IntelligentLinkProcessor: INITIALIZED")
        print(f"\nüîß Configuration:")
        print(f"  Crawl4AI Enabled: {link_processor.use_crawl4ai}")
        print(f"  Timeout: {link_processor.crawl4ai_timeout}s")
        print(f"  Headless Mode: {link_processor.crawl4ai_headless}")

        # Tiers 3-5 switch route depending on the use_crawl4ai flag
        print(f"\nüìä 6-Tier URL Classification System:")
        print(f"  Tier 1: Direct downloads (.pdf, .xlsx) ‚Üí Simple HTTP")
        print(f"  Tier 2: Token-auth (DBS ?E=) ‚Üí Simple HTTP")
        print(f"  Tier 3: News sites ‚Üí {'Crawl4AI' if link_processor.use_crawl4ai else 'Simple HTTP (fallback)'}")
        print(f"  Tier 4: Research portals ‚Üí {'Crawl4AI' if link_processor.use_crawl4ai else 'Simple HTTP (fallback)'}")
        print(f"  Tier 5: Paywalls ‚Üí {'Crawl4AI' if link_processor.use_crawl4ai else 'Simple HTTP (fallback)'}")
        print(f"  Tier 6: Social/tracking ‚Üí Skip")

        if not link_processor.use_crawl4ai:
            print(f"\n‚ö†Ô∏è  Crawl4AI is DISABLED (default)")
            print(f"   ‚Ä¢ Using simple HTTP only for all URLs")
            print(f"   ‚Ä¢ Success rate: ~30-40% on Tier 3/4/5 URLs")
            print(f"   ‚Ä¢ Missing: Premium portals (Goldman, Morgan Stanley, JPM)")
            print(f"\nüí° To Enable: Set USE_CRAWL4AI_LINKS=true before starting notebook")
        else:
            print(f"\n‚úÖ Crawl4AI is ENABLED")
            print(f"   ‚Ä¢ Browser automation active for Tier 3/4/5 URLs")
            print(f"   ‚Ä¢ Expected: Higher success rate on complex sites (60-80%)")
            print(f"   ‚Ä¢ Impact: 70-90% more premium broker research captured")

print("\n" + "="*70)


In [None]:
# Cell 19
# Comprehensive 3-Tier Knowledge Graph Statistics
#
# Fetches one statistics dict from `ice.get_comprehensive_stats()` and
# prints it in three sections. The code below indexes the result with the
# keys 'tier1', 'tier2' and 'tier3'; keys read with [] are required
# (KeyError otherwise), keys read with .get() are optional.
stats = ice.get_comprehensive_stats()

print("üìä ICE Knowledge Graph Statistics")
print("=" * 70)

# TIER 1: Document Source Breakdown
print("\nüìÑ TIER 1: Document Source Breakdown")
print("-" * 70)

t1 = stats['tier1']
# 'source_diversity' is optional; every field read from it has a default
diversity = t1.get('source_diversity', {})

print(f"Total Documents: {t1['total']}")
print(f"\nüìä Source Distribution (Visual Breakdown):")
# ice._format_progress_bar(count, total) is a project helper; its output
# format is not visible here.
print(f"  üìß Email:    {ice._format_progress_bar(t1['email'], t1['total'])}")
print(f"  üåê API:      {ice._format_progress_bar(t1['api_total'], t1['total'])}")
print(f"  üìã SEC:      {ice._format_progress_bar(t1['sec_total'], t1['total'])}")

print(f"\nüìà Source Diversity Metrics:")
print(f"  ‚Ä¢ Unique sources detected: {diversity.get('unique_sources', 0)}")
print(f"  ‚Ä¢ Expected sources (Email/API/SEC): {diversity.get('expected_sources_present', 0)}/3")
print(f"  ‚Ä¢ Coverage: {diversity.get('coverage_percentage', 0.0):.1f}% ({diversity.get('documents_with_markers', 0)}/{t1['total']} docs with markers)")
print(f"  ‚Ä¢ Status: {diversity.get('status', 'unknown').upper()}")

# Warn when the status is 'incomplete' or coverage is below 80%; a missing
# coverage value counts as 0 and therefore also triggers the warning.
# NOTE(review): "Cell 22" in the message may not match the current cell
# numbering - confirm.
if diversity.get('status') == 'incomplete' or diversity.get('coverage_percentage', 0) < 80:
    print(f"\n  ‚ö†Ô∏è  Low coverage detected! Set REBUILD_GRAPH=True in Cell 22 to rebuild with correct markers")
elif diversity.get('status') == 'complete':
    print(f"\n  ‚úÖ All data sources properly tagged!")

print(f"\nüìä Detailed Breakdown:")
print(f"  üìß Email: {t1['email']} documents")
print(f"     ‚Ä¢ Portfolio-wide broker research")
print(f"  üåê API: {t1['api_total']} documents")
print(f"     ‚Ä¢ NewsAPI: {t1.get('newsapi', 0)}")
print(f"     ‚Ä¢ FMP: {t1.get('fmp', 0)}")
print(f"     ‚Ä¢ Alpha Vantage: {t1.get('alpha_vantage', 0)}")
print(f"     ‚Ä¢ Polygon: {t1.get('polygon', 0)}")
# The next three providers are only listed when they have a non-zero count
if t1.get('finnhub', 0) > 0:
    print(f"     ‚Ä¢ Finnhub: {t1.get('finnhub', 0)}")
if t1.get('marketaux', 0) > 0:
    print(f"     ‚Ä¢ MarketAux: {t1.get('marketaux', 0)}")
if t1.get('benzinga', 0) > 0:
    print(f"     ‚Ä¢ Benzinga: {t1.get('benzinga', 0)}")
print(f"  üìã SEC: {t1['sec_total']} documents")
print(f"     ‚Ä¢ SEC EDGAR filings: {t1.get('sec_edgar', 0)}")

# TIER 2: Graph Structure
print("\n\nüï∏Ô∏è  TIER 2: Knowledge Graph Structure")
print("-" * 70)

t2 = stats['tier2']
print(f"Total Entities: {t2['total_entities']:,}")
print(f"Total Relationships: {t2['total_relationships']:,}")
# The average is only printed when at least one entity exists
if t2['total_entities'] > 0:
    print(f"Avg Connections per Entity: {t2['avg_connections']:.2f}")

# TIER 3: Investment Intelligence
print("\n\nüíº TIER 3: Investment Intelligence")
print("-" * 70)

t3 = stats['tier3']
# 'tickers_covered' is joined with ', ', so it must be an iterable of str
if t3['tickers_covered']:
    print(f"Portfolio Coverage: {', '.join(t3['tickers_covered'])} ({len(t3['tickers_covered'])} tickers)")
else:
    print(f"Portfolio Coverage: No tickers detected")

print(f"\nInvestment Signals:")
print(f"  ‚Ä¢ BUY ratings: {t3['buy_signals']}")
print(f"  ‚Ä¢ SELL ratings: {t3['sell_signals']}")
print(f"  ‚Ä¢ Price targets: {t3['price_targets']}")
print(f"  ‚Ä¢ Risk mentions: {t3['risk_mentions']}")

print("\n" + "=" * 70)
print("‚úÖ Comprehensive statistics complete!")


In [None]:
# Cell 20
# Knowledge Graph Building - Already completed during ingestion
#
# Validates the build by checking that LightRAG storage is non-empty,
# records a short hash of the monitored extraction source files in
# ice_lightrag/storage/.extractor_version, and sets `building_result`.
# Raises RuntimeError when `ice` is falsy or its core is not ready.
print(f"\nüß† Knowledge Graph Building")
print(f"‚îÅ" * 60)

if not (ice and ice.core.is_ready()):
    raise RuntimeError("LightRAG not ready")

print("\n".join((
    f"‚ÑπÔ∏è  NOTE: Knowledge graph building happens automatically during data ingestion",
    f"   The ingestion method (ingest_historical_data) already added documents",
    f"   to the graph via LightRAG. This cell validates that building succeeded.\n",
)))

# Validate that building succeeded by checking storage
storage_stats = ice.core.get_storage_stats()

if not (storage_stats['total_storage_bytes'] > 0):
    # Empty storage: report likely causes and record an error result
    print("\n".join((
        f"‚ö†Ô∏è NO GRAPH DATA DETECTED",
        f"   Storage size: 0 MB",
        f"   Check ingestion results above for errors",
        f"   Possible causes:",
        f"   - No API keys configured",
        f"   - All holdings failed to fetch data",
        f"   - Network connectivity issues",
    )))

    building_result = dict(
        status='error',
        message='No graph data - check ingestion results',
    )
else:
    print(f"‚úÖ KNOWLEDGE GRAPH BUILT SUCCESSFULLY")

    # Save extraction code version (for stale-graph detection)
    # Monitor multiple extraction pipeline files
    files_to_monitor = [
        "imap_email_ingestion_pipeline/table_entity_extractor.py",
        "imap_email_ingestion_pipeline/enhanced_doc_creator.py"
    ]
    combined_hash = hashlib.md5()
    for f in files_to_monitor:
        # Files that do not exist simply contribute nothing to the hash
        if not Path(f).exists():
            continue
        combined_hash.update(Path(f).read_bytes())
    current_hash = combined_hash.hexdigest()[:8]
    version_file = Path("ice_lightrag/storage/.extractor_version")
    version_file.parent.mkdir(parents=True, exist_ok=True)
    version_file.write_text(current_hash)

    print(f"‚îÅ" * 40)
    print(f"   üìÑ Documents processed: {ingestion_result.get('total_documents', 0)}")
    print(f"   üíæ Storage size: {storage_stats['total_storage_bytes'] / (1024*1024):.2f} MB")

    # Count the storage components whose 'exists' flag is truthy
    components_ready = len([c for c in storage_stats['components'].values() if c['exists']])
    print(f"   üîó Components ready: {components_ready}/4")

    # Create success result for metrics tracking
    building_result = dict(
        status='success',
        total_documents=ingestion_result.get('total_documents', 0),
        metrics=dict(
            building_time=ingestion_result.get('metrics', {}).get('processing_time', 0.0),
            graph_initialized=True,
        ),
    )

    print("\n".join((
        f"\nüéØ Graph Building Process:",
        f"   1Ô∏è‚É£ Text Chunking: 1200 tokens (optimal for financial documents)",
        f"   2Ô∏è‚É£ Entity Extraction: Companies, metrics, risks, regulations",
        f"   3Ô∏è‚É£ Relationship Discovery: Dependencies, impacts, correlations",
        f"   4Ô∏è‚É£ Graph Construction: LightRAG optimized structure",
        f"   5Ô∏è‚É£ Storage: chunks_vdb, entities_vdb, relationships_vdb, graph",
    )))

    print(f"\nüöÄ System ready for intelligent queries!")

In [None]:
# Cell 21
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# üìã FOOTNOTE TRACEABILITY FEATURE
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# Location: ice_building_workflow.ipynb Cell 31 (v1.4.9 Migration)
# Purpose: Add footnote-style citations with knowledge graph reasoning paths
# Why: Transparent source attribution using LightRAG v1.4.9 structured data
# Relevant Files: citation_formatter.py, ice_rag_fixed.py

from typing import Any
from src.ice_core.citation_formatter import CitationFormatter

# Quality badge mapping
# Maps a source-type key to the badge text shown for it. Both dicts in
# this cell use the same six keys: email, api, entity_extraction,
# sec_filing, news, research.
QUALITY_BADGES = {
    'email': 'üî¥ Tertiary',
    'api': 'üü° Secondary',
    'entity_extraction': 'üî¥ Tertiary',
    'sec_filing': 'üü¢ Primary',
    'news': 'üü° Secondary',
    'research': 'üü¢ Primary'
}

# Smart confidence mapping
# Maps the same source-type keys to a float in the 0-1 range.
# NOTE(review): presumably a default confidence used when a citation has
# no explicit score - the consumer is outside this excerpt; confirm.
CONFIDENCE_MAP = {
    'email': 0.85,
    'api': 0.90,
    'entity_extraction': 0.75,
    'sec_filing': 0.95,
    'news': 0.88,
    'research': 0.92
}

def add_footnote_citations(query_result):
    """
    Add footnote-style citations with knowledge graph reasoning paths.
    
    v1.4.9 Migration: Uses structured data from parsed_context (no regex parsing).
    
    Features:
    - File-level citations (Layer 1) from chunks metadata
    - Graph reasoning paths (Layer 2) from entities/relationships
    - Confidence color-coding (üü¢‚â•85%, üü°70-85%, üî¥<70%)
    - Graceful degradation when data missing
    """
    import re  # Moved inside function for proper scoping
    
    # ========================================================================
    # HELPER FUNCTIONS (Nested inside parent function)
    # ========================================================================
    
    def build_confidence_cache(chunks):
        """
        Build O(1) lookup cache for entity confidence scores from markup.
        
        Scans chunks once to extract confidence from inline markup like:
        [TICKER:NVDA|confidence:0.95] or [RATING:BUY|...|confidence:0.87]
        
        Returns dict: {entity_name: max_confidence}
        """
        confidence_cache = {}
        
        # Generic pattern for any markup type with confidence
        pattern = r'\[([A-Z_]+):([^|\]]+)\|[^\]]*confidence:([\d.]+)\]'
        
        for chunk in chunks:
            content = chunk.get('content', '')
            for match in re.finditer(pattern, content):
                entity_value = match.group(2).strip()
                confidence = float(match.group(3))
                
                # Take max if entity appears multiple times
                if entity_value in confidence_cache:
                    confidence_cache[entity_value] = max(confidence_cache[entity_value], confidence)
                else:
                    confidence_cache[entity_value] = confidence
        
        return confidence_cache
    
    def get_entity_confidence(entity_name, entities, confidence_cache=None):
        """Extract confidence with 3-tier fallback: cache ‚Üí metadata ‚Üí 0.75"""
        # Tier 1: Check confidence cache (O(1) markup lookup)
        if confidence_cache and entity_name in confidence_cache:
            return confidence_cache[entity_name]
        
        # Tier 2: Check entity metadata (future-proof)
        for e in entities:
            if e.get('entity_name') == entity_name:
                conf = e.get('confidence', e.get('score', 0.75))
                return float(conf) if conf else 0.75
        
        # Tier 3: Default for LLM-extracted entities
        return 0.75
    

    def _infer_source_type(file_path):
        """Infer source_type from file_path prefix (email:, api:, sec:, etc.)"""
        if ':' not in file_path:
            return None
        
        prefix = file_path.split(':', 1)[0].lower()
        
        # Map prefixes to QUALITY_BADGES keys
        prefix_map = {
            'email': 'email',
            'api': 'api',
            'sec': 'sec_filing',
            'news': 'news',
            'research': 'research',
            'entity': 'entity_extraction'
        }
        
        return prefix_map.get(prefix, None)
    
    # ========================================================================
    # MAIN EXECUTION CODE
    # ========================================================================
    
    # Extract entities mentioned in answer (semantic case-insensitive matching)
    answer = query_result.get('answer', '')
    answer_lower = answer.lower()
    answer_entities = set[Any]()
    
    parsed_context = query_result.get('parsed_context', {})
    entities_list = parsed_context.get('entities', [])
    
    for e in entities_list:
        entity_name = e.get('entity_name', '')
        if entity_name and entity_name.lower() in answer_lower:
            answer_entities.add(entity_name)
    
    print(f"\nüîç Answer entities extracted: {len(answer_entities)} entities")
    if answer_entities:
        print(f"   Entities: {list(answer_entities)[:]}")

    # Build confidence cache BEFORE using it in relationship loop
    chunks = query_result.get('parsed_context', {}).get('chunks', [])
    confidence_cache = build_confidence_cache(chunks)

    
    # Build reasoning paths from relationships
    relationships = parsed_context.get('relationships', [])
    graph_paths = []
    path_candidates = []
    
    for rel in relationships:
        src = rel.get('src_id', '')
        tgt = rel.get('tgt_id', '')
        rel_type = rel.get('relation_type', 'related_to')
        
        # Filter to paths involving answer entities
        if src in answer_entities or tgt in answer_entities:
            # Get confidence for both entities
            src_conf = get_entity_confidence(src, entities_list, confidence_cache)
            tgt_conf = get_entity_confidence(tgt, entities_list, confidence_cache)
            avg_conf = (src_conf + tgt_conf) / 2
            
            # Format confidence with color coding
            if avg_conf >= 0.85:
                conf_str = f"üü¢ {avg_conf:.0%}"
            elif avg_conf >= 0.70:
                conf_str = f"üü° {avg_conf:.0%}"
            else:
                conf_str = f"üî¥ {avg_conf:.0%}"
            
            path = f"{src} ‚Üí {rel_type} ‚Üí {tgt} (Cof: {conf_str})"
            path_candidates.append((path, avg_conf))
    
    # Sort by confidence (highest first), limit to top 5
    path_candidates.sort(key=lambda x: x[1], reverse=True)
    graph_paths = [p[0] for p in path_candidates[:]]
    
    print(f"üîó Graph paths built: {len(graph_paths)} paths")
    
    # Build confidence cache once for O(1) lookups (performance optimization)
    
    if not chunks:
        return query_result
    
    seen_sources = set()
    enriched_sources = []
    
    for chunk in chunks:
        file_path = chunk.get('file_path', 'unknown_source')
        
        if file_path not in seen_sources:
            seen_sources.add(file_path)
            
            # Get source type and assign quality badge
            source_type = _infer_source_type(file_path) or chunk.get('source_type', 'unknown')
            quality_badge = QUALITY_BADGES.get(source_type, '‚ö™ Unknown')
            
            # Get confidence from cache or default
            confidence = CONFIDENCE_MAP.get(source_type, 0.75)
            
            enriched_sources.append({
                'file_path': file_path,
                'source_type': source_type,
                'quality_badge': quality_badge,
                'confidence': confidence
            })
    
    # Create footnotes section
    footnotes = []
    for idx, source in enumerate(enriched_sources, 1):
        footnotes.append(
            f"[{idx}] {source['quality_badge']} | {source['file_path']} "
            f"(Confidence: {source['confidence']:.0%})"
        )
    
    # Append formatted citations to answer
    citations_text = "\n\n" + "="*80 + "\n"
    citations_text += "üìö SOURCES & REASONING PATHS\n"
    citations_text += "="*80 + "\n\n"
    
    if footnotes:
        citations_text += "üìÑ Document Sources:\n"
        citations_text += "\n".join(footnotes)
    
    if graph_paths:
        citations_text += "\n\nüß† Knowledge Graph Paths:\n"
        for path in graph_paths:
            citations_text += f"   ‚Ä¢ {path}\n"
    
    query_result['citation_display'] = query_result.get('result', '') + citations_text
    
    return query_result



In [None]:
# Cell 22

def clear_llm_cache():
    """
    Delete the LLM response cache file of the initialized ICE system.

    Typical occasions:
    - Testing traceability features
    - The graph structure has changed
    - Updated KG sections should show up in query results

    The cache location is derived from the working directory of the global
    ``ice`` object rather than a hardcoded path. Any failure (including ``ice``
    not being defined yet) is reported with a printed warning, never raised.
    """
    import os

    try:
        # Working directory as reported by the live ICE instance
        storage_dir = str(ice.core._system_manager.working_dir)
        cache_file = os.path.join(storage_dir, "kv_store_llm_response_cache.json")

        if not os.path.exists(cache_file):
            print(f"‚ÑπÔ∏è  No cache file found at: {cache_file}")
            print("   (This is normal if cache was already cleared or no queries run yet)")
        else:
            os.remove(cache_file)
            print(f"‚úÖ LLM cache cleared: {cache_file}")
            print("   Next query will retrieve fresh KG sections")
    except Exception as exc:
        print(f"‚ö†Ô∏è  Could not clear cache: {exc}")
        print("   Make sure ICE system is initialized first")

# Announce the helper defined above
print(
    "üìã Cache management tool loaded",
    "   Run clear_llm_cache() to clear LLM response cache",
    "   Useful when testing traceability features or after graph rebuild",
    sep="\n",
)

# NOTE: the cache is cleared every time this cell runs.
# Comment out the call below to keep cached LLM responses.
clear_llm_cache()


In [None]:
# Cell 23
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# üß™ COMPREHENSIVE QUERY TESTING WITH GRANULAR TRACEABILITY
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# Location: ice_building_workflow.ipynb Cell 31 (FIXED)
# Purpose: Test knowledge graph with complete granular source attribution
# Why: Sentence-level attribution, per-hop tracking, beautiful display (Phases 1-5)
# Relevant Files: context_parser.py, sentence_attributor.py, graph_path_attributor.py, granular_display_formatter.py

print("=" * 70, "üß™ COMPREHENSIVE QUERY TESTING", "=" * 70, sep="\n")
print(f"üìä Portfolio: {', '.join(test_holdings)}")
print(
    "üí° Example queries:",
    "   - Historical: 'What was Tencent's Q2 2025 operating margin?'",
    "   - Current: 'What are the current headwinds for NVDA?'",
    "   - Trend: 'How has AAPL revenue been trending?'",
    "   - Multi-hop: 'How does China risk impact NVDA through TSMC?'",
    "",
    sep="\n",
)

# ---------------------------------------------------------------------------
# Question and retrieval mode. An empty reply at either prompt selects the
# default. Alternative default questions kept for quick switching:
#   "Does the email contain any URL link? What does the URL contain?"
#   "What are Tencent's international games?"
#
# EMAIL: 'CH_HK_ Tencent Music Entertainment (1698 HK)_ Stronger growth with expanding revenue streams  (NOT RATED).eml'
#   "What is Tencent Music Entertainment's 2Q 2024 paying users?"
#   "What is TME's tiered monetization strategy?"
#   "What is TME's music streaming service revenue?"
#
# EMAIL: 'FW_ RHB | Singapore Morning Cuppa _ 15 August 2025 (ST Engineering, First Resources, Golden Agri-Resources, StarHub).eml'
#   "What is the TP of DBS?"
#   "What is the dividend yield of Singtel?"
# ---------------------------------------------------------------------------
query = input("üí¨ Enter your question: ")
if not query:
    query = "What is Tencent's operating margin in Q2 2025?"

mode = input("üîç Mode (naive/local/global/hybrid/mix) [hybrid]: ")
if not mode:
    mode = "hybrid"

# ---------------------------------------------------------------------------
# STEP 1: Query
# ---------------------------------------------------------------------------
print(
    "\nüìã Query Configuration:",
    f"   Query: {query}",
    f"   Mode: {mode}",
    f"\n‚è≥ Querying graph (mode: {mode})...",
    sep="\n",
)

# One call only. Per the original notes, the query layer retrieves the context
# with SOURCE markers, generates the answer, and returns both 'context' and
# 'parsed_context' in the result dict.
result = ice.core.query(query, mode=mode)
# result = ice.query_with_router(query) # to delete later.

# ---------------------------------------------------------------------------
# FOOTNOTE CITATIONS: attach source attribution ('citation_display')
# ---------------------------------------------------------------------------
result = add_footnote_citations(result)

if result.get('status') != 'success':
    print(f"‚ùå Query failed: {result.get('error', 'Unknown error')}")
else:
    answer = result.get('result', '')

    # Context returned alongside the answer
    raw_context = result.get('context', '')  # Raw LightRAG markdown with SOURCE markers
    # `or {}` keeps the chunk metrics below working when the key is present
    # but set to None
    parsed_context = result.get('parsed_context') or {}

    # Causal paths if available (for multi-hop queries)
    causal_paths = (result.get('graph_context') or {}).get('causal_paths', [])

    # -----------------------------------------------------------------------
    # DISPLAY CITATIONS: show footnote-style source attribution
    # -----------------------------------------------------------------------
    if 'citation_display' in result:
        print('\n' + '='*80)
        print('üìö Generated Response')
        print('='*80)
        print(result['citation_display'])
        print('='*80)
    else:
        print('\n‚ö†Ô∏è  No citation_display field available')

    # -----------------------------------------------------------------------
    # CHUNK QUALITY METRICS: similarity of every chunk that carries a distance
    # -----------------------------------------------------------------------
    chunks = parsed_context.get('chunks') or []
    chunks_with_scores = [c for c in chunks if c.get('distance') is not None]
    if chunks_with_scores:
        print('\n' + '='*80)
        print('üìä CHUNK QUALITY METRICS')
        print('='*80)
        # All scored chunks are listed (an earlier version showed only the first 3)
        for idx, chunk in enumerate(chunks_with_scores, 1):
            distance = chunk['distance']  # Not None, guaranteed by the filter above
            similarity = (1 - distance) * 100
            quality = "üü¢" if distance < 0.2 else "üü°" if distance < 0.4 else "üü†"
            print(f"{quality} Chunk {idx}: {similarity:.1f}% similar (distance: {distance:.3f})")

        avg_dist = sum(c['distance'] for c in chunks_with_scores) / len(chunks_with_scores)
        avg_sim = (1 - avg_dist) * 100
        print(f"\n   Average similarity: {avg_sim:.1f}% across {len(chunks_with_scores)} chunks")
        print('='*80)


In [None]:
# Cell 24
# Inspect the chunks that carry a similarity distance.
# Tolerant access (same as the inspector cell below): a missing or None
# 'parsed_context' / 'chunks' yields an empty list instead of an exception.
chunks = [
    c
    for c in ((result.get('parsed_context') or {}).get('chunks') or [])
    if c.get('distance') is not None
]
chunks


In [None]:
# Cell 25
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# üîç DEBUG: CHUNK SIMILARITY SCORES INSPECTOR
# ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# Location: ice_building_workflow.ipynb Cell 34 (NEW)
# Purpose: Display chunk similarity scores from LightRAG distance field
# Why: Expose cosine similarity metrics for chunk relevance transparency
# Relevant Files: lightrag/utils.py:2929 (distance field modification)

print("\n" + "="*80)
print("üîç CHUNK SIMILARITY SCORES (LightRAG Distance Field)")
print("="*80)

# Tolerate a missing or None 'parsed_context' / 'chunks' entry
chunks = (result.get('parsed_context') or {}).get('chunks') or []

# Rank by ascending distance (= descending similarity) so the "top 5" listed
# below really are the most similar chunks. Chunks without a distance sort
# last; ties keep their retrieval order because sorted() is stable.
ranked_chunks = sorted(
    chunks,
    key=lambda c: (c.get('distance') is None, c.get('distance') or 0.0),
)

print(f"Total chunks retrieved: {len(chunks)}")
print(f"Showing top 5 chunks by similarity:\n")

for idx, chunk in enumerate(ranked_chunks[:5], 1):
    distance = chunk.get('distance')

    # Similarity percentage; this cell treats distance as 1 - cosine_similarity
    if distance is not None:
        similarity_pct = (1 - distance) * 100

        # Colour-code by similarity level
        if distance < 0.2:
            quality = "üü¢ HIGH"
        elif distance < 0.4:
            quality = "üü° MODERATE"
        else:
            quality = "üü† LOW"
    else:
        similarity_pct = None
        quality = "‚ö™ N/A (graph-traversal chunk)"

    print(f"Chunk #{idx}:")
    print(f"  Quality: {quality}")
    print(f"  Distance: {distance:.4f}" if distance is not None else "  Distance: N/A")
    print(f"  Similarity: {similarity_pct:.1f}%" if similarity_pct is not None else "  Similarity: N/A")
    # `or` fallbacks guard against keys that are present but None
    print(f"  File: {(chunk.get('file_path') or 'unknown')[-50:]}")
    print(f"  Content: {(chunk.get('content') or '')[:150]}...")
    print()

# Statistics over every chunk that has a distance (not only the five shown)
chunks_with_distance = [c for c in chunks if c.get('distance') is not None]
if chunks_with_distance:
    avg_distance = sum(c['distance'] for c in chunks_with_distance) / len(chunks_with_distance)
    avg_similarity = (1 - avg_distance) * 100

    print("="*80)
    print("üìä CHUNK QUALITY STATISTICS")
    print("="*80)
    print(f"Chunks with similarity scores: {len(chunks_with_distance)}/{len(chunks)}")
    print(f"Average distance: {avg_distance:.4f}")
    print(f"Average similarity: {avg_similarity:.1f}%")
    print(f"\nüí° Interpretation:")
    print(f"   ‚Ä¢ Distance 0.0-0.2 (80-100% similar): Highly relevant")
    print(f"   ‚Ä¢ Distance 0.2-0.4 (60-80% similar): Relevant context")
    print(f"   ‚Ä¢ Distance 0.4-0.6 (40-60% similar): Tangentially related")
    print(f"   ‚Ä¢ Distance >0.6 (<40% similar): Filtered out by threshold")
else:
    print("\n‚ö†Ô∏è  No chunks with distance scores (all from graph traversal)")


In [None]:
# Cell 26
# Add this as a new cell in the notebook right after Cell 33 to debug

print("=" * 80, "üîç DEBUGGING parsed_context", "=" * 80, sep="\n")

# 1) Top-level keys of the query result
print("\n1Ô∏è‚É£ Keys in result:")
for key in result:
    print(f"   - {key}")

# 2) Raw context: type, size and presence of the KG section markers
print("\n2Ô∏è‚É£ Checking 'context' field:")
if 'context' not in result:
    print("   ‚ùå 'context' field missing!")
else:
    context = result['context']
    print(f"   Type: {type(context)}")
    if isinstance(context, str):
        print(f"   Length: {len(context)}")
        print(f"   Has 'Entities(KG)': {'Entities(KG)' in context}")
        print(f"   Has 'Relationships(KG)':  {'Relationships(KG)' in context}")
        print("\n   First 500 chars of context:")
        print(context[:500])
    else:
        # Non-string context: the string checks do not apply, show it as-is
        print("   Length: N/A")
        print("   Has 'Entities(KG)': N/A")
        print("   Has 'Relationships(KG)':  N/A")
        print("\n   First 500 chars of context:")
        print(context)

# 3) Structured context: element counts per section
print("\n3Ô∏è‚É£ Checking 'parsed_context' field:")
if 'parsed_context' not in result:
    print("   ‚ùå 'parsed_context' field missing!")
else:
    parsed = result['parsed_context']
    print(f"   Type: {type(parsed)}")
    if isinstance(parsed, dict):
        print(f"   Keys: {list(parsed.keys())}")
        for section, label in (
            ('entities', 'Entities'),
            ('relationships', 'Relationships'),
            ('chunks', 'Chunks'),
        ):
            if section in parsed:
                print(f"   {label} count: {len(parsed[section])}")

# 4) Final display text produced by add_footnote_citations
print("\n4Ô∏è‚É£ Checking 'citation_display' field:")
if 'citation_display' not in result:
    print("   ‚ùå 'citation_display' field missing!")
else:
    print("   ‚úÖ citation_display exists")
    print(f"\n   Content:\n{result['citation_display']}")

print("=" * 80)


In [None]:
# Cell 27
# Deep debug - check why graph paths aren't being built
# Run this in a new notebook cell after the previous debug cell

import re
import json

print("=" * 80)
print("üîç DEEP DEBUGGING - Why no graph paths?")
print("=" * 80)

# Step 1: Extract reference entities from answer
answer = result.get('answer') or ''  # `or` guards a key that is present but None
print("\n1Ô∏è‚É£ Reference entity extraction:")
print("-" * 80)

# "- +" accepts one or more spaces after the list dash. The previous pattern
# required exactly two and so missed ordinary "- [KG] ..." bullet lines.
ref_match = re.search(r'#{0,3}\s*References?:?\s*\n(?:- +\[(?:KG|DC)\][^\n]*(?:\n|$))+', answer, re.DOTALL)
if ref_match:
    print(f"‚úÖ References matched:")
    print(ref_match.group())
    refs = re.findall(r'\[KG\]\s*([^\n\-\[]+)', ref_match.group())
    ref_entities = {r.strip() for r in refs}
    print(f"\n‚úÖ Extracted ref_entities: {ref_entities}")
else:
    print("‚ùå References NOT matched")
    ref_entities = set()

# Step 2: Check parsed_context entities
print("\n2Ô∏è‚É£ Entities from parsed_context:")
print("-" * 80)

parsed = result.get('parsed_context') or {}
entities_list = parsed.get('entities') or []
print(f"Total entities: {len(entities_list)}")
print(f"\nFirst 10 entities:")
for i, e in enumerate(entities_list[:10]):
    entity_id = e.get('id', e.get('entity_id', 'unknown'))
    # 'entity_name' is the key add_footnote_citations reads; older keys kept as fallbacks
    entity_name = e.get('entity_name', e.get('entity', e.get('name', 'unknown')))
    print(f"   {i}: id={entity_id}, name={entity_name}")

# Build entities dict: id -> display name
entities = {}
for e in entities_list:
    entity_id = e.get('id', e.get('entity_id'))
    entity_name = e.get('entity_name', e.get('entity', e.get('name', 'unknown')))
    if entity_id is not None:
        entities[entity_id] = entity_name

print(f"\nBuilt entities dict with {len(entities)} entries")

# Step 3: Check relationships
print("\n3Ô∏è‚É£ Relationships from parsed_context:")
print("-" * 80)

relationships = parsed.get('relationships') or []
print(f"Total relationships: {len(relationships)}")
print(f"\nFirst 10 relationships:")
for i, rel in enumerate(relationships[:10]):
    src_id = rel.get('src_id', rel.get('source_id'))
    tgt_id = rel.get('tgt_id', rel.get('target_id'))
    desc = rel.get('description', rel.get('type', 'RELATED'))

    # Ids without a matching entity are shown as Unknown(<id>)
    src_name = entities.get(src_id, f"Unknown({src_id})")
    tgt_name = entities.get(tgt_id, f"Unknown({tgt_id})")

    print(f"   {i}: [{src_name}] --{desc}-->  [{tgt_name}]")

# Step 4: Test matching logic
print("\n4Ô∏è‚É£ Testing matching logic (ref_entities vs  relationships):")
print("-" * 80)

if ref_entities and entities and relationships:
    print(f"‚úÖ All preconditions met")
    print(f"   ref_entities: {ref_entities}")

    matched_count = 0
    print(f"\n   Checking first 20 relationships for  matches:")
    for i, rel in enumerate(relationships[:20]):
        src_id = rel.get('src_id', rel.get('source_id'))
        tgt_id = rel.get('tgt_id', rel.get('target_id'))
        src = entities.get(src_id, f"Unknown({src_id})")
        tgt = entities.get(tgt_id, f"Unknown({tgt_id})")

        # Case-insensitive substring match of each reference against both endpoints
        matches = []
        for ref in ref_entities:
            if ref.lower() in src.lower():
                matches.append(f"'{ref}' in src '{src}'")
            if ref.lower() in tgt.lower():
                matches.append(f"'{ref}' in tgt '{tgt}'")

        if matches:
            matched_count += 1
            desc = rel.get('description', rel.get('type', 'RELATED'))
            print(f"   ‚úÖ Match {matched_count}: [{src}]  --{desc}--> [{tgt}]")
            print(f"      Reason: {', '.join(matches)}")

            # Five examples are enough for diagnosis
            if matched_count >= 5:
                break

    if matched_count == 0:
        print(f"\n   ‚ùå NO MATCHES FOUND!")
        print(f"   This is why graph paths aren't  showing!")
        print(f"\n   Let me check entity names more  carefully:")
        print(f"   ref_entities to match: {ref_entities}")
        print(f"\n   Sample entity names in graph:")
        # Materialise the names once instead of once per printed row
        for i, name in enumerate(list(entities.values())[:20]):
            print(f"      {i}:  {name}")
else:
    print(f"‚ùå Preconditions NOT met:")
    print(f"   ref_entities: {len(ref_entities)}")
    print(f"   entities: {len(entities)}")
    print(f"   relationships: {len(relationships)}")

print("\n" + "=" * 80)


In [None]:
# Cell 28
# Check the raw answer field vs citation_display
# Run this in notebook

import re

# Both fields are read up front; the report sections below only print
raw_answer = result.get('answer', '')
citation_display = result.get('citation_display', '')

# Markdown "References" heading followed by "- [KG] ..." / "- [DC] ..." bullets
pattern = r'#{0,3}\s*References?:?\s*\n(?:- \[(?:KG|DC)\][^\n]*(?:\n|$))+'

print("=" * 80, "üîç Comparing 'answer' vs 'citation_display'", "=" * 80, sep="\n")

# Section 1: the 'answer' field
print("\n1Ô∏è‚É£ Raw 'answer' field from LightRAG:", "-" * 80, sep="\n")
print(f"Length: {len(raw_answer)}")
print(f"\nContent:\n{raw_answer}")

# Section 2: the text with appended citations
print("\n" + "=" * 80)
print("\n2Ô∏è‚É£ Processed 'citation_display' field:", "-" * 80, sep="\n")
print(f"Length: {len(citation_display)}")
print(f"\nContent:\n{citation_display}")

# Section 3: does the References regex hit either field?
print("\n" + "=" * 80)
print("\n3Ô∏è‚É£ Testing regex on BOTH fields:", "-" * 80, sep="\n")

print("\nTesting on raw 'answer':")
match1 = re.search(pattern, raw_answer, re.DOTALL)
if not match1:
    print("‚ùå NOT matched in answer")
    # Distinguish "no References section" from "section present, format differs"
    if "References" not in raw_answer:
        print("   'References' keyword does NOT exist in raw answer")
    else:
        print("   But 'References' keyword EXISTS in answer")
        idx = raw_answer.find("References")
        print("   Context around References:")
        print(f"   {raw_answer[max(0,idx-50):idx+150]}")
else:
    print("‚úÖ MATCHED in answer")
    print(f"Matched text:\n{match1.group()}")

print("\nTesting on 'citation_display':")
match2 = re.search(pattern, citation_display, re.DOTALL)
if not match2:
    print("‚ùå NOT matched in citation_display")
else:
    print("‚úÖ MATCHED in citation_display")
    print(f"Matched text:\n{match2.group()}")

print("\n" + "=" * 80)


In [None]:
# Cell 29
# Add this as a new cell after Cell 33
# Quick health check of the fields the citation display depends on
print("üîç DEBUGGING INFO:")
print(f"1. result keys: {list(result.keys())}")
print(f"2. Has parsed_context: {'parsed_context' in result}")
if 'parsed_context' in result and result['parsed_context']:
    print(f"3. parsed_context keys: {list(result['parsed_context'].keys())}")
    print(f"4. Number of chunks: {len(result['parsed_context'].get('chunks', []))}")
print(f"5. Has citation_display: {'citation_display' in result}")
# A context that is present but None cannot be searched, so skip the marker
# checks instead of raising TypeError
if result.get('context') is not None:
    has_entities = '-----Entities(KG)-----' in result['context']
    has_relationships = '-----Relationships(KG)-----' in result['context']
    print(f"6. Context has Entities section: {has_entities}")
    print(f"7. Context has Relationships section: {has_relationships}")

In [None]:
# Cell 30
# Report how many top-level keys the result has, then dump the whole dict
num_keys = len(result)
print(f"Number of keys in result dict: {num_keys}")

import pprint
pprint.pprint(result)



In [None]:
# Cell 30.5: Confidence-Based Entity Filtering
# Purpose: Filter query results by entity confidence scores
# Why: Separate validated entities (Layer 1) from LightRAG automatic extraction (Layer 2)

import re
from lightrag import QueryParam

def analyze_entity_confidence(query_text: str, min_confidence: float = 0.80):
    """
    Show entity quality breakdown: validated vs automatic extraction.

    High (>= min_confidence): EntityExtractor + TickerValidator validated
    Low (< min_confidence): LightRAG automatic (verify manually)

    Runs a hybrid-mode query through the global ``rag`` object and scans the
    response text for inline markup of the form ``[TYPE:value|confidence:0.XX]``
    (additional ``|field`` segments before ``confidence`` are accepted).

    Args:
        query_text: Question passed to ``rag.query``.
        min_confidence: Threshold separating the 'high' and 'low' buckets.

    Returns:
        ``(result, entities)`` where ``result`` is whatever ``rag.query``
        returned and ``entities`` maps each markup type to
        ``{'high': [(value, conf), ...], 'low': [(value, conf), ...]}``.
        ``(None, {})`` when ``rag`` is not defined.
    """
    # Validate dependencies
    if 'rag' not in globals():
        print("‚ùå Error: 'rag' not defined. Run earlier cells first (Cell 22).")
        return None, {}

    print("="*80)
    print(f"üîç Query: {query_text}")
    print(f"üìä Threshold: {min_confidence}")
    print("="*80)

    # Run query
    result = rag.query(query_text, param=QueryParam(mode="hybrid"))

    # Only string responses can be scanned; anything else yields no entities
    answer_text = result if isinstance(result, str) else ''

    # The value group excludes ']' so a match cannot run across neighbouring
    # brackets, and "[^\]]*" allows extra fields before "confidence:".
    pattern = r'\[([A-Z_]+):([^|\]]+)\|[^\]]*confidence:([0-9.]+)\]'
    entities = {}
    seen = set()  # (type, bucket, value) triples already recorded

    for match in re.finditer(pattern, answer_text):
        etype, value, raw_conf = match.groups()
        try:
            conf = float(raw_conf)
        except ValueError:
            # [0-9.]+ also matches text like "1.2.3"; ignore such markup
            continue

        buckets = entities.setdefault(etype, {'high': [], 'low': []})
        category = 'high' if conf >= min_confidence else 'low'

        # Record each value once per bucket (first occurrence wins)
        if (etype, category, value) not in seen:
            seen.add((etype, category, value))
            buckets[category].append((value, conf))

    # Summary
    total_high = sum(len(e['high']) for e in entities.values())
    total_low = sum(len(e['low']) for e in entities.values())

    print(f"\nüìà Summary: {total_high} validated, {total_low} unvalidated")

    if entities:
        print(f"\nüìã Breakdown:")
        for etype in sorted(entities):
            high, low = entities[etype]['high'], entities[etype]['low']
            if high or low:
                print(f"\n  {etype}:")
                if high:
                    print(f"    ‚úÖ Validated: {[v for v, _ in high]}")
                if low:
                    print(f"    ‚ö†Ô∏è  Auto-extracted: {[v for v, _ in low]}")

    print(f"\nüí° Answer:")
    print("-"*80)
    print(result)

    return result, entities

# Usage hint printed once the helper above is defined
print(
    "‚úÖ Confidence filtering ready. Usage:",
    "   analyze_entity_confidence('your query here')",
    "\nTest with: analyze_entity_confidence('What is the investment rating for Tencent Music?')",
    sep="\n",
)


In [None]:
# Cell 31
# result['citation_display']

In [None]:
# Cell 32

# Only visualize if query was successful
if result.get('status') == 'success':
    import networkx as nx
    import matplotlib.pyplot as plt
    import re
    from pathlib import Path
    import warnings
    warnings.filterwarnings('ignore')

    def extract_entities_from_answer(answer_text, graph):
        """
        Find graph nodes whose names are mentioned in the answer text.

        Matching is case-insensitive and whole-token. Nodes of the priority
        entity types are searched first (names of at least 2 characters); only
        if none of them match are the remaining types searched (names of at
        least 3 characters).

        Args:
            answer_text: Text to scan; None is treated as an empty string.
            graph: Object whose ``nodes(data=True)`` yields ``(name, attrs)``
                pairs; ``attrs.get('entity_type')`` groups the nodes.

        Returns:
            List of matching node names without duplicates, in discovery
            order, so repeated runs give the same result.
        """
        text_upper = (answer_text or '').upper()

        def is_mentioned(entity):
            # Lookarounds instead of \b: a name that starts or ends with a
            # non-word character (e.g. "C++") can still match as a whole
            # token. For ordinary names the behaviour equals \b...\b.
            pattern = r'(?<!\w)' + re.escape(entity.upper()) + r'(?!\w)'
            return re.search(pattern, text_upper) is not None

        # Organize nodes by entity type
        nodes_by_type = {}
        for node, data in graph.nodes(data=True):
            nodes_by_type.setdefault(data.get('entity_type', 'Unknown'), []).append(node)

        # Priority entity types (most likely to appear in answers)
        priority_types = ['Organization', 'Person', 'Product', 'Technology']

        found_entities = []

        # Search priority types first
        for entity_type in priority_types:
            for entity in nodes_by_type.get(entity_type, []):
                if len(entity) >= 2 and is_mentioned(entity):
                    found_entities.append(entity)

        # If no priority entities found, search all other types
        if not found_entities:
            for entity_type, entities in nodes_by_type.items():
                if entity_type in priority_types:
                    continue
                for entity in entities:
                    if len(entity) >= 3 and is_mentioned(entity):
                        found_entities.append(entity)

        # Order-preserving de-duplication
        return list(dict.fromkeys(found_entities))

    def build_subgraph(graph, seed_entities, max_hops=2, max_nodes=30):
        """
        Build a k-hop neighbourhood subgraph around the seed entities.

        Args:
            graph: networkx graph to cut the neighbourhood from.
            seed_entities: Candidate node names; those not in ``graph`` are ignored.
            max_hops: Maximum number of expansion rounds.
            max_nodes: Node budget for the expansion. Seeds are always kept,
                even if they alone exceed the budget.

        Returns:
            A copy of the induced subgraph, or an empty ``nx.Graph`` when no
            seed is present in ``graph``.
        """
        # Keep only seeds that exist in the graph
        seed_nodes = {e for e in (seed_entities or ()) if e in graph}
        if not seed_nodes:
            return nx.Graph()

        # Expand to k-hop neighborhood
        subgraph_nodes = set(seed_nodes)
        current_frontier = set(seed_nodes)

        for _ in range(max_hops):
            remaining_budget = max_nodes - len(subgraph_nodes)
            if remaining_budget <= 0 or not current_frontier:
                break

            next_frontier = set()
            for node in current_frontier:
                next_frontier.update(graph.neighbors(node))

            # Sort before truncating so the nodes that fit in the budget are
            # the same on every run (set iteration order is not stable).
            new_nodes = sorted(next_frontier - subgraph_nodes, key=str)[:remaining_budget]
            subgraph_nodes.update(new_nodes)
            current_frontier = set(new_nodes)

        return graph.subgraph(subgraph_nodes).copy()

    # Load the knowledge graph written by LightRAG and draw the neighborhood of
    # the entities mentioned in the last query answer.
    # NOTE(review): `result`, `query`, `nx` and `plt` are not defined in this
    # fragment; they are presumably set up earlier in the cell/notebook.
    graph_path = Path("ice_lightrag/storage/graph_chunk_entity_relation.graphml")
    if graph_path.exists():
        try:
            print("\n" + "="*70)
            print("üé® KNOWLEDGE GRAPH VISUALIZATION")
            print("="*70)

            G = nx.read_graphml(str(graph_path))
            print(f"üìä Full graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

            # Seed entities = graph nodes whose names appear in the answer text
            # (a missing 'answer' key falls back to an empty string).
            answer_text = result.get('answer', '')
            seed_entities = extract_entities_from_answer(answer_text, G)

            if seed_entities:
                # Print at most the first 10 seed names, then a count of the rest
                print(f"üéØ Entities found in answer: {len(seed_entities)}")
                print(f"   {', '.join(seed_entities[:10])}")
                if len(seed_entities) > 10:
                    print(f"   ... and {len(seed_entities) - 10} more")

                # Build subgraph (2-hop neighborhood, max 30 nodes)
                subgraph = build_subgraph(G, seed_entities, max_hops=2, max_nodes=30)

                if subgraph.number_of_nodes() > 0:
                    print(f"üîó Subgraph: {subgraph.number_of_nodes()} nodes, {subgraph.number_of_edges()} edges")

                    # Create visualization with a single matplotlib figure
                    
                    plt.figure(figsize=(14, 10))

                    # Color code: red (#E74C3C) = entities in answer,
                    # blue (#2874A6) = neighbors pulled in by the expansion
                    colors = ['#E74C3C' if n in seed_entities else '#2874A6' for n in subgraph.nodes()]

                    # One nx.draw call renders nodes, labels and edges together.
                    # The spring layout uses a fixed seed (42) so the node
                    # positions are the same on every run for the same subgraph.
                    nx.draw(subgraph,
                            pos=nx.spring_layout(subgraph, k=2, iterations=50, seed=42),
                            node_color=colors,
                            node_size=1200,
                            with_labels=True,
                            font_size=9,
                            font_weight='bold',
                            font_color='black',
                            edge_color='#95A5A6',
                            width=2,
                            alpha=0.9,
                            arrows=True,
                            arrowsize=15)

                    # The title shows the first 60 characters of the query; the
                    # "..." is appended unconditionally, even for short queries.
                    plt.title(f"Knowledge Graph: {query[:60]}... | {len(seed_entities)} seed ‚Üí {len(subgraph.nodes())} total nodes",
                              fontsize=14, fontweight='bold', pad=20)
                    plt.axis('off')
                    plt.tight_layout()

                    # Legend: Line2D handles with a white line and a colored
                    # circle marker stand in for the two node colors above.
                    legend_elements = [
                        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#E74C3C',
                                   markersize=12, label='Entities in Answer'),
                        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#2874A6',
                                   markersize=12, label='2-hop Neighbors')
                    ]
                    plt.legend(handles=legend_elements, loc='upper left', fontsize=10, framealpha=0.9)

                    plt.show()
                else:
                    # build_subgraph returned an empty graph
                    print("‚ö†Ô∏è  No connected subgraph found for these entities")
            else:
                print("\n‚ö†Ô∏è  No entities found in answer to visualize")
                print("   Tip: Try queries mentioning specific companies, people, or products")

        except Exception as e:
            # Best-effort cell: any load/plot failure is reported, not raised
            print(f"\n‚ö†Ô∏è  Visualization error: {e}")
            print(f"   Graph file exists but visualization failed")
    else:
        print(f"\n‚ö†Ô∏è  Graph file not found: {graph_path}")
        print(f"   Make sure REBUILD_GRAPH=True was used to create the graph")
else:
    print("\n‚ö†Ô∏è  Skipping visualization (query did not succeed)")


In [None]:
# Cell 32.2: Entity Graph Analysis (FIXED - Handles Undirected Graphs)
# Location: ice_building_workflow.ipynb
# Purpose: Analyze any entity in knowledge graph - relationships, metadata, sources
# Why: Investment intelligence - understand entity connections and context
# Relevant Files: ice_rag_fixed.py, graph_chunk_entity_relation.graphml

def analyze_entity(entity_name, max_relationships=20):
    """
    Analyze an entity in the LightRAG knowledge graph and print a report.

    The entity is resolved case-insensitively in three steps: exact name
    match, then substring match (first hit wins), then difflib fuzzy match
    with a 0.6 similarity cutoff.

    Args:
        entity_name: Entity to analyze (case-insensitive, fuzzy matching).
            Must be a non-empty string.
        max_relationships: Max relationships to display per direction. Only
            limits what is printed; the returned lists are always complete.

    Returns:
        dict: On success the keys 'entity', 'type', 'description',
        'metadata', 'outgoing', 'incoming', 'outgoing_types',
        'incoming_types', 'total_connections' and 'is_directed'.
        On failure a dict with an 'error' key (plus 'action' or
        'suggestions' for some failures).
    """
    # Validate before touching storage: an empty search string is a substring
    # of every node name, so it would silently "match" an arbitrary entity.
    if not isinstance(entity_name, str) or not entity_name.strip():
        print("‚ùå Entity name must be a non-empty string")
        return {'error': 'Empty entity name'}

    from pathlib import Path
    import networkx as nx
    from difflib import get_close_matches
    from collections import Counter

    # Load graph from LightRAG storage (`ice` is the notebook-level system object)
    storage_path = Path(ice.config.working_dir)
    graph_file = storage_path / 'graph_chunk_entity_relation.graphml'

    if not graph_file.exists():
        print("‚ùå Graph not found. Run Cell 15 to build knowledge graph first.")
        return {'error': 'Graph file missing', 'action': 'Run Cell 15'}

    try:
        G = nx.read_graphml(str(graph_file))
    except Exception as e:
        print(f"‚ùå Failed to load graph: {e}")
        return {'error': str(e)}

    if len(G.nodes()) == 0:
        print("‚ùå Graph is empty. Rebuild with Cell 15.")
        return {'error': 'Empty graph'}

    # Every node is a candidate: the GraphML uses entity_type values such as
    # "organization", "content", "service", "concept" - there is no generic
    # entity_type == "entity" to filter on.
    all_entities = list(G.nodes())
    search_lower = entity_name.lower().strip()

    # Priority 1: Exact match
    exact = [e for e in all_entities if e.lower() == search_lower]
    if exact:
        entity = exact[0]
    else:
        # Priority 2: Partial match (contains)
        partial = [e for e in all_entities if search_lower in e.lower()]
        if partial:
            entity = partial[0]
            if len(partial) > 1:
                print(f"‚ÑπÔ∏è  Found {len(partial)} partial matches, using: {entity}")
        else:
            # Priority 3: Fuzzy similarity, compared in lowercase so that the
            # similarity score is not penalized by differing capitalization.
            by_lower = {}
            for e in all_entities:
                by_lower.setdefault(e.lower(), e)
            matches = get_close_matches(search_lower, list(by_lower), n=5, cutoff=0.6)
            if not matches:
                print(f"‚ùå Entity '{entity_name}' not found in graph")
                print(f"\nüí° Try one of these entities:")
                for e in all_entities[:10]:
                    print(f"   ‚Ä¢ {e}")
                return {'error': 'Entity not found', 'suggestions': all_entities[:20]}
            entity = by_lower[matches[0]]
            print(f"‚ÑπÔ∏è  Using fuzzy match: {entity}")

    # Extract entity metadata
    node_data = dict(G.nodes[entity])
    entity_type = node_data.get('entity_type', 'unknown')
    description = node_data.get('description', 'No description')

    def _edge_keywords(u, v):
        """Return the 'keywords' attribute of edge (u, v), or 'RELATES_TO'."""
        data = G.get_edge_data(u, v) or {}
        if G.is_multigraph():
            # Multigraphs map edge key -> attributes; use the first parallel edge
            data = next(iter(data.values()), {})
        return data.get('keywords', 'RELATES_TO')

    # Handle both directed and undirected graphs
    if G.is_directed():
        # Directed graph: preserve incoming/outgoing semantics
        outgoing = [(entity, _edge_keywords(entity, t), t) for t in G.successors(entity)]
        incoming = [(s, _edge_keywords(s, entity), entity) for s in G.predecessors(entity)]
    else:
        # Undirected graph: all neighbors are bidirectional connections
        outgoing = [(entity, _edge_keywords(entity, t), t) for t in G.neighbors(entity)]
        incoming = []  # No "incoming" concept for undirected graphs

    # Relationship type statistics
    outgoing_types = Counter([rel for _, rel, _ in outgoing])
    incoming_types = Counter([rel for _, rel, _ in incoming])

    display_limit = max(max_relationships, 0)

    def _print_grouped(relationships, type_counts, arrow, endpoint):
        """Print up to display_limit relationships, grouped by type (most common first)."""
        remaining = display_limit
        for rel_type, count in type_counts.most_common():
            if remaining <= 0:
                break
            print(f"\n   [{rel_type}] ({count}):")
            shown = [r for r in relationships if r[1] == rel_type][:remaining]
            for rel in shown:
                print(f"      {arrow} {rel[endpoint]}")
            remaining -= len(shown)

    # Display formatted analysis
    print(f"\n{'='*80}")
    print(f"üîç ENTITY ANALYSIS: {entity}")
    print(f"{'='*80}")

    print(f"\nüìã Overview:")
    print(f"   Type: {entity_type}")
    print(f"   Description: {description[:200]}{'...' if len(description) > 200 else ''}")

    print(f"\nüìä Connections:")
    if G.is_directed():
        print(f"   Outgoing: {len(outgoing)} relationships")
        print(f"   Incoming: {len(incoming)} relationships")
    else:
        print(f"   Total: {len(outgoing)} relationships (undirected graph)")
    print(f"   Total: {len(outgoing) + len(incoming)} connections")

    # Outgoing relationships (grouped by type)
    if outgoing:
        # Adaptive label based on graph type
        label = "Relationships" if not G.is_directed() else "Outgoing Relationships"
        print(f"\nüì§ {label} (Top {min(display_limit, len(outgoing))}):")
        _print_grouped(outgoing, outgoing_types, "‚Üí", 2)
    else:
        print(f"\nüì§ Relationships: None")

    # Incoming relationships (only for directed graphs)
    if incoming:
        print(f"\nüì• Incoming Relationships (Top {min(display_limit, len(incoming))}):")
        _print_grouped(incoming, incoming_types, "‚Üê", 0)
    elif G.is_directed():
        print(f"\nüì• Incoming Relationships: None")
    # else: skip incoming section for undirected graphs

    print(f"\n{'='*80}")

    # Return structured data for programmatic use
    return {
        'entity': entity,
        'type': entity_type,
        'description': description,
        'metadata': node_data,
        'outgoing': outgoing,
        'incoming': incoming,
        'outgoing_types': dict(outgoing_types),
        'incoming_types': dict(incoming_types),
        'total_connections': len(outgoing) + len(incoming),
        'is_directed': G.is_directed()
    }

# Example usage:
# result = analyze_entity('NVDA')
# result = analyze_entity('Tencent')
# result = analyze_entity('semiconductor', max_relationships=30)

print("‚úÖ Entity analyzer loaded (FIXED - handles both directed and undirected graphs)")
print("\nüí° Usage:")
print("   analyze_entity('NVDA')")
print("   analyze_entity('Tencent')")
print("   analyze_entity('semiconductor', max_relationships=30)")


In [None]:
# result = analyze_entity('tencent')
# Example lookup of a metric-style entity name instead of a company name.
# NOTE: this rebinds the notebook-level name `result`, which the graph
# visualization code earlier in this notebook reads via
# result.get('answer', ''); re-run the query cell before re-running it.
result = analyze_entity('operating margin')
result  # bare expression so Jupyter renders the returned dict

In [None]:
# Cell 33
# Entity & Relationship Inspection
import networkx as nx

# Path of the GraphML file to inspect
graph_file = "./ice_lightrag/storage/graph_chunk_entity_relation.graphml"

# Maximum number of entities / relationships to list.
# None lists everything; set to e.g. 15 for a short sample.
SAMPLE_LIMIT = None

try:
    G = nx.read_graphml(graph_file)

    print(f"\nüîç Graph Content Inspection")
    print(f"üìä Entities: {len(G.nodes):,} | üîó Relationships: {len(G.edges):,}")
    print("=" * 70)

    # Label the listings with what is actually printed (previously the
    # headers always said "first 15" while every item was listed).
    scope = "all" if SAMPLE_LIMIT is None else f"first {SAMPLE_LIMIT}"

    # Sample entities (a slice with None as the stop value keeps every item)
    print(f"\nSample Entities ({scope}):")
    for i, node in enumerate(list(G.nodes())[:SAMPLE_LIMIT], 1):
        print(f"  {i:2d}. {node}")

    # Sample relationships
    print(f"\nSample Relationships ({scope}):")
    for i, (src, tgt) in enumerate(list(G.edges())[:SAMPLE_LIMIT], 1):
        print(f"  {i:2d}. {src} ‚Üí {tgt}")

    print(f"\n‚úÖ Inspection complete\n")

except FileNotFoundError:
    print("\n‚ùå Graph not found. Run Cell 28 (data ingestion) first.\n")
except Exception as e:
    # Best-effort inspection cell: report any other failure instead of raising
    print(f"\n‚ùå Error: {e}\n")

<!-- ## 5. Storage Architecture Validation & Monitoring -->

In [None]:
# Cell 34
# Comprehensive storage validation and metrics.
# Prints the LightRAG storage component status, a storage summary, the
# knowledge graph status, and a pass/fail score over four validation checks.
# Requires the notebook-level `ice` system object to be initialized.
print(f"\nüîç Storage Architecture Validation")
print(f"‚îÅ" * 40)

if not (ice and ice.core.is_ready()):
    raise RuntimeError("Cannot validate storage without initialized system")

# Get detailed storage statistics
storage_stats = ice.core.get_storage_stats()
graph_stats = ice.core.get_graph_stats()

print(f"üì¶ LightRAG Storage Components Status:")
for component_name, component_info in storage_stats['components'].items():
    status_icon = "‚úÖ" if component_info['exists'] else "‚ö†Ô∏è"
    # Non-positive sizes are treated as "not created yet"
    size_mb = component_info['size_bytes'] / (1024 * 1024) if component_info['size_bytes'] > 0 else 0

    print(f"  {status_icon} {component_name}:")
    print(f"    File: {component_info['file']}")
    print(f"    Purpose: {component_info['description']}")
    print(f"    Size: {size_mb:.2f} MB" if size_mb > 0 else "    Size: Not created yet")

print(f"\nüìä Storage Summary:")
print(f"  Working Directory: {storage_stats['working_dir']}")
print(f"  Total Storage: {storage_stats['total_storage_bytes'] / (1024 * 1024):.2f} MB")
print(f"  System Initialized: {storage_stats['is_initialized']}")

print(f"\nüï∏Ô∏è Knowledge Graph Status:")
print(f"  Graph Ready: {graph_stats['is_ready']}")
if graph_stats.get('storage_indicators'):
    indicators = graph_stats['storage_indicators']
    print(f"  All Components Present: {indicators['all_components_present']}")
    print(f"  Chunks Storage: {indicators['chunks_file_size']:.2f} MB")
    print(f"  Entity Storage: {indicators['entities_file_size']:.2f} MB")
    print(f"  Relationship Storage: {indicators['relationships_file_size']:.2f} MB")
    print(f"  Graph Structure: {indicators['graph_file_size']:.2f} MB")

# Validation checks
print(f"\n‚úÖ Validation Checks:")

# Report the real component total instead of a hard-coded "/4"
components_total = len(storage_stats['components'])
components_exist = sum(1 for c in storage_stats['components'].values() if c['exists'])

# Each check: (label, passed?, suffix printed on PASS, suffix printed on FAIL)
validation_checks = [
    ("System initialization", bool(storage_stats['is_initialized']), "", ""),
    ("Storage directory", bool(storage_stats['storage_exists']), "", ""),
    ("Storage components", components_exist > 0,
     f" ({components_exist}/{components_total} created)", " (no components created)"),
    ("Storage content", storage_stats['total_storage_bytes'] > 0, "", " (no data stored)"),
]

# Derived from the table so the denominator cannot drift from the checks
max_score = len(validation_checks)
validation_score = 0
for check_label, check_passed, pass_detail, fail_detail in validation_checks:
    if check_passed:
        print(f"  ‚úÖ {check_label}: PASSED{pass_detail}")
        validation_score += 1
    else:
        print(f"  ‚ùå {check_label}: FAILED{fail_detail}")

print(f"\nüìä Validation Score: {validation_score}/{max_score} ({(validation_score/max_score)*100:.0f}%)")

if validation_score == max_score:
    print(f"üéâ All validations passed! Knowledge graph is ready for queries.")
elif validation_score >= max_score * 0.75:
    print(f"‚úÖ Most validations passed. System is functional.")
else:
    print(f"‚ö†Ô∏è Some validations failed. Check configuration and retry building.")

<!-- ## 6. Building Metrics & Performance Analysis -->

In [None]:
# Cell 35
# Comprehensive building session metrics.
# Summarizes the ingestion/building results left behind by earlier cells.
# Every upstream variable is optional: a missing one falls back to a default
# so that this cell can also be run on its own.
print(f"\nüìä Building Session Metrics & Performance")
print(f"‚îÅ" * 50)

session_metrics = {
    # `holdings` is guarded like the other upstream names below; previously a
    # missing `holdings` raised NameError before anything was reported.
    'holdings_count': len(holdings) if 'holdings' in locals() and holdings is not None else 0,
    'total_processing_time': 0.0,
    'documents_processed': 0,
    'building_successful': False
}

# Collect metrics from ingestion and building
if 'ingestion_result' in locals() and ingestion_result:
    if 'metrics' in ingestion_result:
        session_metrics['ingestion_time'] = ingestion_result['metrics'].get('processing_time', 0.0)
    session_metrics['documents_processed'] = ingestion_result.get('total_documents', 0)

if 'building_result' in locals() and building_result:
    if building_result.get('status') == 'success':
        session_metrics['building_successful'] = True
    if 'metrics' in building_result:
        # 'building_time' is preferred; 'update_time' is the fallback key
        building_time = building_result['metrics'].get('building_time', building_result['metrics'].get('update_time', 0.0))
        session_metrics['building_time'] = building_time

# Calculate total time (stored in session_metrics; not printed below)
if 'pipeline_stats' in locals():
    session_metrics['total_processing_time'] = pipeline_stats.get('processing_time', 0.0)

print(f"üéØ Session Overview:")
print(f"  Holdings Processed: {session_metrics['holdings_count']}")
print(f"  Documents Processed: {session_metrics['documents_processed']}")
print(f"  Building Successful: {session_metrics['building_successful']}")

# Timing sections are shown only when a positive ingestion time was recorded,
# which also keeps the rate divisions below safe from division by zero.
if session_metrics.get('ingestion_time', 0) > 0:
    print(f"\n‚è±Ô∏è Performance Metrics:")
    print(f"  Data Ingestion Time: {session_metrics['ingestion_time']:.2f}s")
    if session_metrics.get('building_time', 0) > 0:
        print(f"  Graph Building Time: {session_metrics['building_time']:.2f}s")
        print(f"  Total Processing Time: {session_metrics['ingestion_time'] + session_metrics['building_time']:.2f}s")

    print(f"\nüìà Efficiency Analysis:")
    if session_metrics['documents_processed'] > 0:
        docs_per_second = session_metrics['documents_processed'] / session_metrics['ingestion_time']
        print(f"  Processing Rate: {docs_per_second:.2f} documents/second")

    holdings_per_second = session_metrics['holdings_count'] / session_metrics['ingestion_time']
    print(f"  Holdings Rate: {holdings_per_second:.2f} holdings/second")

# Architecture efficiency comparison (static figures, not measured here)
print(f"\nüèóÔ∏è Architecture Efficiency:")
print(f"  ICE Simplified: 2,508 lines of code")
print(f"  Code Reduction: 83% (vs 15,000 line original)")
print(f"  Files Count: 5 core modules")
print(f"  Dependencies: Direct LightRAG wrapper")
print(f"  Token Efficiency: 4,000x better than GraphRAG")

# Success summary
print(f"\n‚úÖ Building Session Summary:")
if session_metrics['building_successful']:
    print(f"  üéâ Knowledge graph building completed successfully")
    print(f"  üìä {session_metrics['documents_processed']} documents processed")
    print(f"  üöÄ System ready for intelligent investment queries")
    print(f"  üí° Proceed to ice_query_workflow.ipynb for analysis")
else:
    print(f"  ‚ö†Ô∏è Building completed with warnings or in demo mode")
    print(f"  üìã Review configuration and API settings")
    print(f"  üîß Consider running with fresh data if issues persist")

print(f"\nüîó Next Steps:")
print(f"  1. Review building metrics and validate storage")
print(f"  2. Run ice_query_workflow.ipynb for portfolio analysis")
print(f"  3. Test different LightRAG query modes")
print(f"  4. Monitor system performance and optimize as needed")

---

In [None]:
from src.ice_lightrag.model_provider import get_llm_provider, get_extraction_temperature, get_query_temperature

# Read the provider configuration and the two temperature settings, then
# print them side by side. get_llm_provider() returns a 4-tuple.
llm_func, embed_func, config, base_kwargs_template = get_llm_provider()

extraction_temp = get_extraction_temperature()
query_temp = get_query_temperature()

# Temperature stored in the provider config, or "NOT SET" when it is absent
model_kwargs = config.get("llm_model_kwargs", {})
config_temp = model_kwargs.get("temperature", "NOT SET")

# Assemble the report first and emit it with a single print call
separator = "=" * 70
report_lines = [
    "üå°Ô∏è  Temperature Configuration Check:",
    separator,
    f"Entity Extraction Temperature: {extraction_temp}",
    f"Query Answering Temperature:   {query_temp}",
    f"Initial Config Temperature:    {config_temp} (should match extraction)",
    separator,
    "",
    "‚úÖ Temperature system is using separate values for:",
    "   ‚Ä¢ Entity extraction during document ingestion",
    "   ‚Ä¢ Query answering during query processing",
]
print("\n".join(report_lines))

In [None]:
# Cell to verify Docling URL configuration.
# Reports (1) the document-processing environment flags, (2) the PDF
# attachment storage directory, and (3) whether the ICE system object and
# its link processor are available in this kernel.
import os
from pathlib import Path

print("="*70)
print("üîß CONFIGURATION VERIFICATION")
print("="*70)

# 1. Check environment variables
print("\n1Ô∏è‚É£ Environment Variables:")
for env_name in ('USE_DOCLING_EMAIL', 'USE_DOCLING_URLS', 'USE_CRAWL4AI_LINKS'):
    print(f"   {env_name}: {os.environ.get(env_name, 'NOT SET')}")

# 2. Check storage directory
storage = Path('data/attachments')
print(f"\n2Ô∏è‚É£ PDF Storage:")
# is_dir() rather than exists(): iterating a plain file would raise
if storage.is_dir():
    # Glob once and reuse the result for both the count and the recent list
    stored_pdfs = list(storage.glob('*/*/original/*.pdf'))
    pdf_count = len(stored_pdfs)
    email_folders = len([d for d in storage.iterdir() if d.is_dir() and not d.name.startswith('.')])
    print(f"   ‚úÖ Directory exists: {storage}")
    print(f"   üìä Email folders: {email_folders}")
    print(f"   üìÑ Total PDFs stored: {pdf_count}")

    # Show recent PDFs (newest modification time first)
    pdfs = sorted(stored_pdfs, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
    if pdfs:
        print(f"\n   üì• Recent PDFs (last 5):")
        for i, pdf in enumerate(pdfs, 1):
            size_mb = pdf.stat().st_size / (1024 * 1024)
            # Add the ellipsis only when the name was actually cut
            shown_name = pdf.name if len(pdf.name) <= 50 else pdf.name[:50] + "..."
            print(f"   [{i}] {shown_name} ({size_mb:.1f}MB)")
else:
    print(f"   ‚ùå Directory does not exist: {storage}")
    print(f"   (Will be created when first URL is processed)")

# 3. Check if ICE system is initialized
print(f"\n3Ô∏è‚É£ ICE System:")
try:
    if 'ice' in globals():
        print(f"   ‚úÖ ICE system initialized")
        # A missing data_ingester is reported as "NOT available" too,
        # instead of printing nothing at all.
        if hasattr(getattr(ice, 'data_ingester', None), 'link_processor'):
            print(f"   ‚úÖ IntelligentLinkProcessor available")
        else:
            print(f"   ‚ùå IntelligentLinkProcessor NOT available")
    else:
        print(f"   ‚ö†Ô∏è ICE system not initialized yet (run earlier cells)")
except Exception as exc:
    # Narrowed from a bare except so Ctrl-C / SystemExit are not swallowed;
    # the cause is printed to make the failure diagnosable.
    print(f"   ‚ö†Ô∏è Cannot check ICE system (run earlier cells first)")
    print(f"      {type(exc).__name__}: {exc}")

print("\n" + "="*70)

## 6. Graph Validation Smoke Test

**Quick Quality Check**: Run 3-5 test queries to validate the built knowledge graph works correctly.

### Purpose
- Verify graph can answer queries
- Check entity extraction worked
- Validate relationship building
- Ensure system is ready for full query workflow

---

In [None]:
# # Run smoke test queries to validate graph
# print("üß™ Running Smoke Test Queries")
# print("=" * 60)

# smoke_test_queries = [
#     "What are the key risks for NVDA?",
#     "Which companies have positive analyst ratings?",
#     "Summarize the latest market trends",
#     "What are TSMC's competitive advantages?",
#     "Give me an overview of semiconductor supply chain risks"
# ]

# print(f"üìä Testing {len(smoke_test_queries)} queries to validate knowledge graph\n")

# smoke_results = []
# for i, query in enumerate(smoke_test_queries, 1):
#     print(f"{i}. Testing: '{query[:50]}...'")
#     try:
#         result = ice.core.query(query, mode='hybrid')
        
#         if result.get('status') == 'success' and result.get('answer'):
#             answer_len = len(result['answer'])
#             print(f"   ‚úÖ Success: {answer_len} chars")
#             smoke_results.append({
#                 'query': query,
#                 'status': 'SUCCESS',
#                 'answer_length': answer_len
#             })
#         else:
#             print(f"   ‚ùå Failed: {result.get('message', 'No answer')}")
#             smoke_results.append({
#                 'query': query,
#                 'status': 'FAILED',
#                 'error': result.get('message', 'No answer')
#             })
#     except Exception as e:
#         print(f"   ‚ùå Error: {str(e)[:80]}")
#         smoke_results.append({
#             'query': query,
#             'status': 'ERROR',
#             'error': str(e)[:100]
#         })

# # Summary
# print(f"\nüìä Smoke Test Summary")
# print("=" * 60)
# successful = sum(1 for r in smoke_results if r['status'] == 'SUCCESS')
# print(f"‚úÖ Passed: {successful}/{len(smoke_results)} ({successful/len(smoke_results)*100:.0f}%)")

# if successful == len(smoke_results):
#     print("\nüéâ All smoke tests passed! Knowledge graph is ready for queries.")
#     print("   Proceed to ice_query_workflow.ipynb for full analysis")
# elif successful >= len(smoke_results) * 0.6:
#     print("\n‚ö†Ô∏è Most smoke tests passed, but some issues detected.")
#     print("   Graph is functional but review failed queries")
# else:
#     print("\n‚ùå Many smoke tests failed. Check:")
#     print("   - LightRAG storage exists and is not corrupted")
#     print("   - API keys are configured correctly")
#     print("   - Data ingestion completed successfully")

## 7. Query Temperature Effects Validation

**Purpose**: Empirically test how query answering temperature affects response creativity and consistency.

### What This Tests
- **Query Answering Temperature**: Controls creativity in synthesizing answers from knowledge graph
  - **Low (0.0-0.2)**: Deterministic, factual, consistent phrasing
  - **Medium (0.3-0.5)**: Balanced creativity with factual grounding (RECOMMENDED)
  - **High (0.6-1.0)**: Creative synthesis, varied phrasing

### Test Approach
1. Use same query across multiple temperature settings
2. Compare answer consistency and creativity
3. Validate that facts remain accurate across all temperatures

### Expected Outcomes
- **Temp 0.0**: Identical or near-identical answers (reproducible)
- **Temp 0.5**: Varied phrasing but consistent facts (creative synthesis)
- **Temp 1.0**: Maximum creativity, most varied responses

### Note
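The entity extraction temperature is applied during the graph-building phase, so it is already fixed by the time this test runs. This test varies only the query-answering temperature, which affects how the system synthesizes answers from the already-built knowledge graph.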

In [None]:
# # Cell: Query Temperature Effects Test
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Run same query multiple times per temperature to demonstrate effects
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# import os
# from src.ice_lightrag.ice_rag_fixed import JupyterICERAG  # ‚Üê MOVED OUTSIDE LOOP

# # Test configuration
# TEST_QUERY = "What was tencent's operating margin in Q2 2025?"
# TEMPERATURES_TO_TEST = [0.0, 0.5, 1.0]
# RUNS_PER_TEMPERATURE = 2
# TEST_MODE = "hybrid"

# print("="*80)
# print("üå°Ô∏è  QUERY TEMPERATURE EFFECTS TEST")
# print("="*80)
# print(f"üìù Query: {TEST_QUERY}")
# print(f"üéØ Mode: {TEST_MODE}")
# print(f"üß™ Testing: {TEMPERATURES_TO_TEST} ({RUNS_PER_TEMPERATURE} runs each)")
# print()

# results = {}

# for temp in TEMPERATURES_TO_TEST:
#     print(f"\n{'‚îÄ'*80}")
#     print(f"üå°Ô∏è  Temperature: {temp} ({RUNS_PER_TEMPERATURE} runs)")
#     print(f"{'‚îÄ'*80}")
    
#     # Set temperature for THIS iteration only
#     os.environ['ICE_LLM_TEMPERATURE_QUERY_ANSWERING'] = str(temp)
    
#     # Create fresh instance (import already done above)
#     temp_ice = JupyterICERAG()
    
#     if not await temp_ice._ensure_initialized():
#         print(f"‚ùå Failed to initialize ICE")
#         results[temp] = None
#         continue
    
#     # Disable cache on this instance
#     try:
#         if hasattr(temp_ice._rag, 'llm_response_cache'):
#             temp_ice._rag.llm_response_cache.global_config["enable_llm_cache"] = False
#             print(f"‚úÖ ICE initialized with query temp = {temp} (cache disabled)")
#         else:
#             print(f"‚úÖ ICE initialized with query temp = {temp}")
#     except Exception as e:
#         print(f"‚ö†Ô∏è  Cache disable failed: {e}, continuing anyway")
#         print(f"‚úÖ ICE initialized with query temp = {temp}")
    
#     # Run multiple times
#     run_results = []
#     for run_num in range(1, RUNS_PER_TEMPERATURE + 1):
#         print(f"\n  üìù Run {run_num}/{RUNS_PER_TEMPERATURE}...")
        
#         # Run query
#         try:
#             result = await temp_ice.query(question=TEST_QUERY, mode=TEST_MODE)
            
#             if result.get('status') == 'success':
#                 answer = result.get('answer', result.get('result', ''))
#                 run_results.append(answer)
#                 print(f"  ‚úÖ Got answer ({len(answer)} chars)")
#             else:
#                 print(f"  ‚ùå Failed: {result.get('message', 'Unknown error')}")
#                 run_results.append(None)
#         except Exception as e:
#             print(f"  ‚ùå Error: {e}")
#             run_results.append(None)
    
#     results[temp] = run_results

# # Display comparison
# print("\n" + "="*80)
# print("üìä TEMPERATURE EFFECTS COMPARISON")
# print("="*80)

# for temp in TEMPERATURES_TO_TEST:
#     run_results = results.get(temp)
#     if not run_results or None in run_results:
#         print(f"\nüå°Ô∏è  Temp {temp}: ‚ùå Failed")
#         continue
    
#     print(f"\nüå°Ô∏è  Temperature: {temp}")
#     print("‚îÄ" * 70)
    
#     for i, answer in enumerate(run_results, 1):
#         preview = answer[:120] + "..." if len(answer) > 120 else answer
#         print(f"  Run {i}: {preview}")
    
#     if len(run_results) == 2:
#         identical = run_results[0] == run_results[1]
#         symbol = "‚úÖ IDENTICAL" if identical else "üîÑ DIFFERENT"
#         print(f"\n  {symbol}: {identical}")
        
#         if not identical:
#             chars_diff = sum(1 for a, b in zip(run_results[0], run_results[1]) if a != b)
#             total_chars = min(len(run_results[0]), len(run_results[1]))
#             pct = (chars_diff / total_chars * 100) if total_chars > 0 else 0
#             print(f"  üìè Difference: {chars_diff}/{total_chars} chars ({pct:.1f}%)")

# print("\n" + "="*80)
# print("üí° KEY OBSERVATIONS")
# print("="*80)
# print("‚úÖ Temp 0.0: Runs should be IDENTICAL (deterministic)")
# print("üîÑ Temp 0.5: Runs may vary slightly (balanced)")
# print("üåà Temp 1.0: Runs should DIFFER noticeably (creative)")
# print()
# print("‚ö†Ô∏è  All temperatures provide same FACTS, only phrasing varies")
# print("="*80)


In [None]:
# # Cell: Module Reload Utility - Troubleshooting Stale Code
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Purpose: Reload ICE modules after editing .py files (fixes kernel caching)
# # Usage: Run this cell if you edited ice_rag_fixed.py, model_provider.py, etc.
# #        and want changes to take effect WITHOUT restarting kernel
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# import sys
# import importlib

# def reload_ice_modules():
#     """Reload ICE core modules in dependency order"""
#     modules_to_reload = [
#         'src.ice_lightrag.model_provider',      # No dependencies
#         'src.ice_lightrag.ice_rag_fixed',       # Depends on model_provider
#         'src.ice_lightrag.context_parser',      # Traceability parser
#     ]
    
#     print("üîÑ Reloading ICE modules...")
#     for module_name in modules_to_reload:
#         if module_name in sys.modules:
#             importlib.reload(sys.modules[module_name])
#             print(f"  ‚úÖ Reloaded: {module_name}")
#         else:
#             print(f"  ‚è≠Ô∏è  Skipped (not imported): {module_name}")
#     print("‚úÖ Reload complete! Kernel now using latest code.\n")

# # Uncomment to enable auto-reload for ALL future code changes:
# # %load_ext autoreload
# # %autoreload 2

# # Run reload now:
# reload_ice_modules()


In [None]:
# # Cell: Entity Extraction Temperature Effects Test
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Test how extraction temperature affects entity/relationship extraction
# # Creates isolated graphs for each temperature to enable side-by-side comparison
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# import os
# import email
# import pandas as pd
# import networkx as nx
# from pathlib import Path
# from bs4 import BeautifulSoup
# from src.ice_lightrag.ice_rag_fixed import JupyterICERAG  # Import OUTSIDE loop

# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Email Text Extraction Helper (Production-Aligned)
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# def extract_email_text(email_path: str) -> str:
#     """
#     Extract full text content from email file.
#     Handles HTML-only, plaintext-only, and multipart emails.

#     Strategy:
#     1. Prefer text/plain parts (if available)
#     2. Fall back to HTML → text conversion (for HTML-only emails)
#     3. Combine multiple parts for multipart emails

#     Args:
#         email_path: Path to .eml file

#     Returns:
#         Full email text with subject line
#     """
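#     # NOTE(review): text mode with strict UTF-8 raises UnicodeDecodeError on .eml files
#     # stored in any other encoding — confirm all sample emails are UTF-8.
#     with open(email_path, 'r', encoding='utf-8') as f: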
#         msg = email.message_from_file(f)

#     subject = msg.get('Subject', 'No Subject')
#     text_parts = []
#     html_parts = []

#     # Collect all text and HTML parts
#     if msg.is_multipart():
#         for part in msg.walk():
#             content_type = part.get_content_type()
#             payload = part.get_payload(decode=True)
#             if not payload:
#                 continue

#             if content_type == "text/plain":
#                 text_parts.append(payload.decode('utf-8', errors='replace'))
#             elif content_type == "text/html":
#                 html_parts.append(payload.decode('utf-8', errors='replace'))
#     else:
#         payload = msg.get_payload(decode=True)
#         if payload:
#             content_type = msg.get_content_type()
#             if content_type == "text/plain":
#                 text_parts.append(payload.decode('utf-8', errors='replace'))
#             elif content_type == "text/html":
#                 html_parts.append(payload.decode('utf-8', errors='replace'))

#     # Prefer plaintext, fall back to HTML → text
#     if text_parts:
#         body = "\n\n".join(text_parts)
#     elif html_parts:
#         # Convert HTML to text using BeautifulSoup (production standard)
#         body_parts = []
#         for html in html_parts:
#             try:
#                 soup = BeautifulSoup(html, 'html.parser')
#                 text = soup.get_text(separator='\n', strip=True)
#                 body_parts.append(text)
#             except Exception as e:
#                 # Fall back to raw HTML if BeautifulSoup fails
#                 body_parts.append(html)
#         body = "\n\n".join(body_parts)
#     else:
#         body = ""

#     return f"Subject: {subject}\n\n{body}"

# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Test Configuration
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

# TEMPERATURES = [0.0, 0.5, 1.0]
# TEST_EMAIL = "data/emails_samples/Tencent Q2 2025 Earnings.eml"

# print("="*80)
# print("🧬 ENTITY EXTRACTION TEMPERATURE EFFECTS TEST")
# print("="*80)
# print(f"📝 Test Document: {TEST_EMAIL}")
# print(f"🌡️  Temperatures: {TEMPERATURES}")
# print(f"🔧 Isolation: WORKSPACE per temperature (prevents document deduplication conflicts)")
# print()

# results = {}

# for temp in TEMPERATURES:
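#     print(f"\n{'─'*80}")
#     print(f"🌡️  Processing Temperature: {temp}")
#     print(f"{'─'*80}")

#     # reload_ice_modules() is defined in the "Module Reload Utility" cell above; run that cell first.
#     # NOTE(review): importlib.reload does not rebind the JupyterICERAG name imported at the top of
#     # this cell, so instances below are built from the pre-reload class object — confirm this is intended.
#     reload_ice_modules()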

#     # Set extraction temperature
#     os.environ['ICE_LLM_TEMPERATURE_ENTITY_EXTRACTION'] = str(temp)

#     # 🔧 FIX: Isolate document status tracking per temperature iteration
#     # This prevents "document already exists" errors across temp tests
#     os.environ['WORKSPACE'] = f"temp_{temp}"

#     # Create isolated ICE instance with separate working directory
#     working_dir = f"extraction_temp_test/temp_{temp}"
#     print(f"  📁 Working directory: {working_dir}")
#     print(f"  🔧 Workspace: {os.environ['WORKSPACE']}")

#     ice_temp = JupyterICERAG(working_dir=working_dir)

#     if not await ice_temp._ensure_initialized():
#         print(f"  ❌ Failed to initialize ICE at temp={temp}")
#         results[temp] = None
#         continue

#     print(f"  ✅ ICE initialized with extraction temp = {temp}")

#     # Add test document to isolated graph
#     try:
#         print(f"  📨 Adding document: {TEST_EMAIL}")

#         # Extract email text using production-aligned helper
#         full_text = extract_email_text(TEST_EMAIL)
#         print(f"  📊 Extracted {len(full_text)} characters ({len(full_text.split())} words)")

#         # Call add_document with correct parameters (text is required)
#         result = await ice_temp.add_document(
#             text=full_text,
#             doc_type="email",
#             file_path=TEST_EMAIL  # For traceability
#         )

#         if result.get('status') != 'success':
#             print(f"  ❌ Document addition failed: {result.get('message')}")
#             results[temp] = None
#             continue

#         print(f"  ✅ Document added successfully")

#     except Exception as e:
#         print(f"  ❌ Error adding document: {e}")
#         results[temp] = None
#         continue

#     # Read graph from GraphML file
#     # Note: When WORKSPACE is set, LightRAG creates subdirectory: working_dir/workspace/graph_*.graphml
#     try:
#         workspace = f"temp_{temp}"
#         graph_path = Path(working_dir) / workspace / "graph_chunk_entity_relation.graphml"

#         if not graph_path.exists():
#             print(f"  ⚠️  Graph file not found: {graph_path}")
#             results[temp] = None
#             continue

#         graph = nx.read_graphml(graph_path)
#         print(f"  📊 Graph loaded from {graph_path}")

#         # Extract entity information
#         all_nodes = list(graph.nodes())
#         all_edges = list(graph.edges())

#         # Get entity names (node names in LightRAG graph)
#         entity_names = sorted(set(all_nodes))

#         # Capture comprehensive metrics
#         results[temp] = {
#             'total_entities': len(entity_names),
#             'total_relationships': len(all_edges),
#             'entity_names': entity_names,
#             'sample_entities': entity_names[:15]  # First 15 for preview
#         }

#         print(f"  📈 Extracted {len(entity_names)} entities")
#         print(f"  📈 Extracted {len(all_edges)} relationships")
#         print(f"  📝 Sample entities: {', '.join(entity_names[:5])}...")

#     except Exception as e:
#         print(f"  ❌ Error reading graph: {e}")
#         results[temp] = None
#         continue

# # Filter out failed temperatures
# valid_temps = [t for t in TEMPERATURES if results.get(t) is not None]

# if not valid_temps:
#     print("\n❌ All temperature tests failed")
# else:
#     # Display quantitative comparison
#     print("\n" + "="*80)
#     print("📊 QUANTITATIVE COMPARISON")
#     print("="*80)

#     comparison_df = pd.DataFrame({
#         'Temperature': valid_temps,
#         'Entities': [results[t]['total_entities'] for t in valid_temps],
#         'Relationships': [results[t]['total_relationships'] for t in valid_temps]
#     })

#     print(comparison_df.to_string(index=False))
#     print()

#     # Display entity presence matrix (which temperatures extracted each entity)
#     print("="*80)
#     print("📊 ENTITY PRESENCE MATRIX - Extraction by Temperature")
#     print("="*80)
#     print("Entities sorted by: Frequency (how many temps extracted it), then alphabetically")
#     print()

#     # Collect all unique entities
#     all_entities = set()
#     for temp in valid_temps:
#         all_entities.update(results[temp]['entity_names'])

#     # Build entity data with frequency and presence
#     entity_data = []
#     for entity in all_entities:
#         presence = {temp: entity in results[temp]['entity_names'] for temp in valid_temps}
#         frequency = sum(presence.values())
#         entity_data.append({
#             'name': entity,
#             'frequency': frequency,
#             'presence': presence
#         })

#     # Sort by frequency (descending), then alphabetically
#     entity_data.sort(key=lambda x: (-x['frequency'], x['name']))

#     # Build and display table
#     print(f"{'Entity':<40} " + " ".join([f"Temp {t:>3.1f}" for t in valid_temps]))
#     print("─" * 80)

#     for data in entity_data:
#         entity_display = data['name'][:37] + '...' if len(data['name']) > 40 else data['name']
#         marks = " ".join(["    ✅    " if data['presence'][t] else "    ❌    " for t in valid_temps])
#         print(f"{entity_display:<40} {marks}")

#     print(f"\n📊 Total unique entities: {len(all_entities)}")
#     print()

#     # Qualitative comparison: entities unique to each temperature
#     print("\n" + "="*80)
#     print("🔍 QUALITATIVE COMPARISON - Unique Entities per Temperature")
#     print("="*80)

#     for temp in valid_temps:
#         # Find entities unique to this temperature
#         unique_entities = set(results[temp]['entity_names'])
#         for other_temp in valid_temps:
#             if other_temp != temp:
#                 unique_entities -= set(results[other_temp]['entity_names'])

#         print(f"\n🌡️  Temperature {temp}:")
#         print(f"   Unique entities: {len(unique_entities)}")
#         if unique_entities:
#             print(f"   Examples: {', '.join(sorted(unique_entities)[:10])}")

#     # Common entities across all temperatures
#     print(f"\n{'─'*80}")
#     common_entities = set(results[valid_temps[0]]['entity_names'])
#     for temp in valid_temps[1:]:
#         common_entities &= set(results[temp]['entity_names'])

#     print(f"🤝 Common entities across all temperatures: {len(common_entities)}")
#     if common_entities:
#         print(f"   Examples: {', '.join(sorted(common_entities)[:10])}")

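#     # NOTE(review): leftover "Entity Presence Matrix" header — that matrix is already printed
#     # earlier in this cell; this print only emits an extra separator before the next section.
#     print("\n" + "="*80)
#     # Unique Entities Matrix (Temperature-Specific)
#     print("\n" + "="*80)
#     print("📊 UNIQUE ENTITIES MATRIX - Temperature-Specific Extractions")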
#     print("="*80)
#     print("Entities sorted by: Frequency (how many temps extracted it), then alphabetically")
#     print("Shows only entities NOT common across all temperatures")
#     print()

#     # Filter out common entities (those present in ALL temps)
#     unique_entity_data = [
#         data for data in entity_data 
#         if data['frequency'] < len(valid_temps)  # Not in all temps
#     ]

#     if not unique_entity_data:
#         print("ℹ️  All entities are common across all temperatures")
#     else:
#         # Build and display table (same format as full matrix)
#         print(f"{'Entity':<40} " + " ".join([f"Temp {t:>3.1f}" for t in valid_temps]))
#         print("─" * 80)

#         for data in unique_entity_data:
#             entity_display = data['name'][:37] + '...' if len(data['name']) > 40 else data['name']
#             marks = " ".join(["    ✅    " if data['presence'][t] else "    ❌    " for t in valid_temps])
#             print(f"{entity_display:<40} {marks}")

#         print(f"\n📊 Total unique entities: {len(unique_entity_data)}")
#         print(f"   (Excludes {len(entity_data) - len(unique_entity_data)} common entities present in all temps)")
#         print()

#     # NOTE(review): the labels below (0.0 / 0.3 / 0.5) do not match TEMPERATURES = [0.0, 0.5, 1.0]
#     # tested in this cell — confirm which values these insights should describe.
#     print("💡 KEY INSIGHTS")
#     print("="*80)
#     print("✅ Temp 0.0: Deterministic extraction (most reproducible)")
#     print("🔄 Temp 0.3: Balanced extraction (current default)")
#     print("🌈 Temp 0.5: Creative extraction (more implied entities)")
#     print()
#     print("⚠️  Higher temperatures → more entities but less reproducibility")
#     print("⚠️  For backtesting & compliance: Use temp ≤0.2")
#     print("="*80)

# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
# # Cleanup: Restore production environment
# # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
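# # NOTE(review): ICE_LLM_TEMPERATURE_ENTITY_EXTRACTION, set inside the loop above, is left at the
# # last tested value — reset it here too if production defaults must be restored.
# os.environ.pop('WORKSPACE', None)  # Remove workspace isolation
# print("\n✅ Testing complete - workspace isolation removed")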

