In [7]:
"""
Enhanced Multi-Knowledge Graph RAG System with Text Chunking
Handles large texts by processing them in chunks to avoid token limits
RAG FUNCTIONALITY DEACTIVATED
"""

import os
import re
import time
import json
import hashlib
from typing import List, Dict, Any, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import tiktoken

from dotenv import load_dotenv
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Namespace, Literal
from rdflib.namespace import XSD, SKOS

# Configuration
INPUT_TEXT_FILE = "grover3.txt"
ONTOLOGY_PATH = "wiki.owl"
LOCATION_ONTOLOGY_PATH = "locations.owl"
OUTPUT_RAG_TTL = 'extracted_events_norag_with_multi_kg_G.ttl'
OUTPUT_RAG_OWL = 'extracted_events_norag_with_multi_kg_G.owl'
KG_CACHE_FILE = 'kg_cache.json'
LOCATION_CACHE_FILE = 'location_cache.json'
KG_ANALYSIS_REPORT = 'multi_kg_analysis_report.txt'

# Token limits
MAX_TOKENS_PER_REQUEST = 100000  # Conservative limit for GPT-4
CHUNK_OVERLAP = 200  # Characters to overlap between chunks

# RAG DEACTIVATION FLAG
RAG_ENABLED = False  # Set to False to deactivate RAG

# Namespaces
EX = Namespace("http://example.org/")
STE = Namespace("http://www.example.org/ste#")
DBP = Namespace("http://dbpedia.org/ontology/")
LAC = Namespace("http://ontologia.fr/OTB/lac#")
WD = Namespace("http://www.wikidata.org/entity/")
YAGO = Namespace("http://yago-knowledge.org/resource/")
CN = Namespace("http://conceptnet.io/c/en/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
DBPR = Namespace("http://dbpedia.org/resource/")

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Imports
try:
    if RAG_ENABLED:
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS
    from langchain_openai import ChatOpenAI
    from langchain.schema import HumanMessage
except ImportError as e:
    print(f"ImportError: {e}")
    print("pip install rdflib python-dotenv langchain langchain-openai langchain-community faiss-cpu sentence-transformers tiktoken requests")
    exit(1)

@dataclass
class LocationInfo:
    """Location information with coordinates"""
    name: str
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    country: Optional[str] = None
    region: Optional[str] = None
    source: str = "extracted"
    confidence: float = 1.0
    uri: Optional[str] = None

@dataclass
class EnhancedKnowledgeFact:
    """Enhanced knowledge fact with metadata"""
    subject: str
    predicate: str
    object: str
    source: str
    confidence: float = 1.0
    context: Optional[str] = None
    temporal: Optional[str] = None
    spatial: Optional[str] = None
    evidence_score: float = 1.0
    source_uri: Optional[str] = None

class LocationExtractor:
    """Extracts and enriches location information"""
    
    def __init__(self, ontology_path: str = LOCATION_ONTOLOGY_PATH):
        self.ontology_path = ontology_path
        self.location_graph = None
        self.location_cache = self._load_location_cache()
        self.load_location_ontology()
        
    def _load_location_cache(self) -> Dict:
        """Load location cache"""
        if os.path.exists(LOCATION_CACHE_FILE):
            try:
                with open(LOCATION_CACHE_FILE, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Could not load location cache: {e}")
        return {}
    
    def _save_location_cache(self):
        """Save location cache"""
        try:
            with open(LOCATION_CACHE_FILE, 'w', encoding='utf-8') as f:
                json.dump(self.location_cache, f, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.warning(f"Could not save location cache: {e}")
    
    def load_location_ontology(self):
        """Load locations.owl ontology"""
        try:
            if os.path.exists(self.ontology_path):
                self.location_graph = Graph()
                self.location_graph.parse(self.ontology_path, format="xml")
                logger.info(f"Loaded location ontology from {self.ontology_path}")
            else:
                logger.warning(f"Location ontology not found at {self.ontology_path}")
                self.location_graph = None
        except Exception as e:
            logger.error(f"Error loading location ontology: {e}")
            self.location_graph = None
    
    def extract_locations_from_text(self, text: str) -> List[str]:
        """Extract potential location names from text"""
        location_patterns = [
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:City|County|State|Province|Country|Region|Island|Bay|Sea|Ocean|River|Mountain|Valley|Desert))\b',
            r'\b(?:Mount|Lake|River|Cape|Fort|Port|Saint|St\.)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b',
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?=\s+(?:in|near|at|from|to))\b',
            r'\b[A-Z][a-zA-Z]{2,}(?:\s+[A-Z][a-zA-Z]{2,})*\b'
        ]
        
        locations = []
        for pattern in location_patterns:
            matches = re.findall(pattern, text)
            locations.extend(matches)
        
        location_stopwords = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If', 
            'When', 'Where', 'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many', 'Most',
            'First', 'Second', 'Third', 'Last', 'Next', 'Before', 'After', 'During',
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 
            'September', 'October', 'November', 'December', 'Monday', 'Tuesday', 
            'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
        }
        
        filtered_locations = []
        for loc in locations:
            loc = loc.strip()
            if (loc not in location_stopwords and len(loc) > 2 and 
                not loc.isdigit() and not re.match(r'^\d+', loc)):
                filtered_locations.append(loc)
        
        return list(set(filtered_locations))
    
    def get_location_from_ontology(self, location_name: str) -> Optional[LocationInfo]:
        """Get location info from local ontology"""
        if not self.location_graph:
            return None
            
        try:
            query = f"""
            SELECT DISTINCT ?location ?lat ?long ?country ?region WHERE {{
                ?location rdfs:label ?label .
                FILTER(regex(?label, "{location_name}", "i"))
                OPTIONAL {{ ?location geo:lat ?lat }}
                OPTIONAL {{ ?location geo:long ?long }}
                OPTIONAL {{ ?location dbp:country ?country }}
                OPTIONAL {{ ?location dbp:region ?region }}
            }}
            """
            
            results = self.location_graph.query(query)
            for row in results:
                return LocationInfo(
                    name=location_name,
                    latitude=float(row.lat) if row.lat else None,
                    longitude=float(row.long) if row.long else None,
                    country=str(row.country) if row.country else None,
                    region=str(row.region) if row.region else None,
                    source="local_ontology",
                    uri=str(row.location) if row.location else None
                )
        except Exception as e:
            logger.debug(f"Ontology query failed for {location_name}: {e}")
        
        return None
    
    def get_location_from_dbpedia(self, location_name: str) -> Optional[LocationInfo]:
        """Get location coordinates from DBpedia"""
        try:
            time.sleep(0.5)
            entity_uri = f"http://dbpedia.org/resource/{location_name.replace(' ', '_')}"
            
            sparql_query = f"""
            SELECT DISTINCT ?lat ?long ?country ?region WHERE {{
                <{entity_uri}> geo:lat ?lat ;
                               geo:long ?long .
                OPTIONAL {{ <{entity_uri}> dbo:country ?country }}
                OPTIONAL {{ <{entity_uri}> dbo:region ?region }}
            }}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get("https://dbpedia.org/sparql", params=params, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                bindings = data.get('results', {}).get('bindings', [])
                
                if bindings:
                    binding = bindings[0]
                    return LocationInfo(
                        name=location_name,
                        latitude=float(binding.get('lat', {}).get('value', 0)),
                        longitude=float(binding.get('long', {}).get('value', 0)),
                        country=binding.get('country', {}).get('value', ''),
                        region=binding.get('region', {}).get('value', ''),
                        source="dbpedia",
                        uri=entity_uri
                    )
                    
        except Exception as e:
            logger.debug(f"DBpedia location query failed for {location_name}: {e}")
        
        return None
    
    def get_location_from_wikidata(self, location_name: str) -> Optional[LocationInfo]:
        """Get location coordinates from Wikidata with disambiguation"""
        try:
            time.sleep(0.5)
            
            # Try multiple query strategies to get the right location
            queries = [
                # Try exact label match first
                f"""
                SELECT DISTINCT ?item ?itemLabel ?coord ?country ?countryLabel WHERE {{
                  ?item rdfs:label "{location_name}"@en .
                  ?item wdt:P625 ?coord .
                  OPTIONAL {{ ?item wdt:P17 ?country }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
                }}
                LIMIT 5
                """,
                # Try with additional filters for places/locations
                f"""
                SELECT DISTINCT ?item ?itemLabel ?coord ?country ?countryLabel WHERE {{
                  ?item rdfs:label "{location_name}"@en .
                  ?item wdt:P625 ?coord .
                  ?item wdt:P31/wdt:P279* wd:Q486972 .  # human settlement
                  OPTIONAL {{ ?item wdt:P17 ?country }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
                }}
                LIMIT 5
                """
            ]
            
            for query in queries:
                params = {'query': query, 'format': 'json'}
                response = requests.get("https://query.wikidata.org/sparql", params=params, timeout=10)
                
                if response.status_code == 200:
                    data = response.json()
                    bindings = data.get('results', {}).get('bindings', [])
                    
                    if bindings:
                        # Prefer results with country information
                        best_binding = None
                        for binding in bindings:
                            if binding.get('country'):
                                best_binding = binding
                                break
                        
                        if not best_binding:
                            best_binding = bindings[0]
                        
                        coord_str = best_binding.get('coord', {}).get('value', '')
                        
                        coord_match = re.search(r'Point\(([+-]?\d*\.?\d+)\s+([+-]?\d*\.?\d+)\)', coord_str)
                        if coord_match:
                            longitude = float(coord_match.group(1))
                            latitude = float(coord_match.group(2))
                            
                            return LocationInfo(
                                name=location_name,
                                latitude=latitude,
                                longitude=longitude,
                                country=best_binding.get('countryLabel', {}).get('value', ''),
                                source="wikidata",
                                uri=best_binding.get('item', {}).get('value', '')
                            )
                        
        except Exception as e:
            logger.debug(f"Wikidata location query failed for {location_name}: {e}")
        
        return None
    
    def validate_coordinates(self, location_info: LocationInfo) -> bool:
        """Validate that coordinates make sense for the location"""
        if not location_info.latitude or not location_info.longitude:
            return True
        
        lat, lon = location_info.latitude, location_info.longitude
        
        # Basic coordinate range validation
        if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
            logger.warning(f"Invalid coordinates for {location_info.name}: {lat}, {lon}")
            return False
        
        # Generic geographic validation - flag obviously wrong coordinates
        # If coordinates suggest North America but no clear indication it should be there
        if (-130 < lon < -60) and (25 < lat < 50):  # North America range
            logger.warning(f"Coordinates for '{location_info.name}' appear to be in North America ({lat}, {lon}). "
                         f"Please verify if this is correct for your historical context.")
            # Don't auto-correct, just warn - let the user/context decide
        
        # If coordinates suggest Australia/Oceania for what might be European/Mediterranean names
        elif (110 < lon < 180) and (-45 < lat < -10):  # Australia/Oceania range
            logger.warning(f"Coordinates for '{location_info.name}' appear to be in Australia/Oceania ({lat}, {lon}). "
                         f"Please verify if this is correct for your historical context.")
        
        return True
    
    def enrich_location(self, location_name: str) -> Optional[LocationInfo]:
        """Get enriched location information with coordinates"""
        if location_name in self.location_cache:
            cached = self.location_cache[location_name]
            return LocationInfo(**cached) if cached else None
        
        location_info = None
        
        location_info = self.get_location_from_ontology(location_name)
        
        if not location_info:
            location_info = self.get_location_from_wikidata(location_name)
        
        if not location_info:
            location_info = self.get_location_from_dbpedia(location_name)
        
        if location_info:
            self.location_cache[location_name] = {
                'name': location_info.name,
                'latitude': location_info.latitude,
                'longitude': location_info.longitude,
                'country': location_info.country,
                'region': location_info.region,
                'source': location_info.source,
                'confidence': location_info.confidence,
                'uri': location_info.uri
            }
        else:
            self.location_cache[location_name] = None
        
        self._save_location_cache()
        
        if location_info:
            self.validate_coordinates(location_info)
        
        return location_info

class TextChunker:
    """Handles text chunking to manage token limits"""
    
    def __init__(self, model_name: str = "gpt-4o"):
        self.tokenizer = tiktoken.encoding_for_model(model_name)
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.tokenizer.encode(text))
    
    def chunk_text_by_sentences(self, text: str, max_tokens: int = 15000) -> List[str]:
        """Chunk text by sentences to maintain coherence"""
        sentences = re.split(r'[.!?]+', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
                
            test_chunk = current_chunk + " " + sentence if current_chunk else sentence
            
            if self.count_tokens(test_chunk) > max_tokens and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk = test_chunk
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks

class BaseKGConnector:
    """Base class for knowledge graph connectors"""
    
    def __init__(self, name: str, base_url: str, rate_limit: float = 1.0):
        self.name = name
        self.base_url = base_url
        self.rate_limit = rate_limit
        self.last_request_time = 0
        self.request_count = 0
        self.success_count = 0
        
    def _rate_limit_wait(self):
        """Enforce rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        self.last_request_time = time.time()
        self.request_count += 1
    
    def get_stats(self) -> Dict[str, Any]:
        """Get connector statistics"""
        return {
            'name': self.name,
            'requests': self.request_count,
            'successes': self.success_count,
            'success_rate': self.success_count / max(1, self.request_count)
        }
    
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Abstract method to retrieve facts"""
        raise NotImplementedError

class EnhancedWikidataConnector(BaseKGConnector):
    """Wikidata connector"""
    
    def __init__(self):
        super().__init__("Wikidata", "https://query.wikidata.org/sparql", 1.0)
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from Wikidata with timeout protection"""
        try:
            self._rate_limit_wait()
            
            sparql_query = f"""
            SELECT DISTINCT ?subject ?subjectLabel ?predicate ?predicateLabel ?object ?objectLabel WHERE {{
              {{
                ?subject ?label "{entity}"@en .
              }} UNION {{
                ?subject rdfs:label "{entity}"@en .
              }}
              
              ?subject ?predicate ?object .
              FILTER(?predicate != wdt:P31 && ?predicate != wdt:P279)
              
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}
            LIMIT {limit}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get(self.base_url, params=params, timeout=12)  # Reduced timeout
            
            if response.status_code == 200:
                data = response.json()
                facts = []
                
                for binding in data.get('results', {}).get('bindings', []):
                    fact = EnhancedKnowledgeFact(
                        subject=binding.get('subjectLabel', {}).get('value', entity),
                        predicate=binding.get('predicateLabel', {}).get('value', 'related_to'),
                        object=binding.get('objectLabel', {}).get('value', ''),
                        source=self.name,
                        confidence=0.9,
                        source_uri=binding.get('subject', {}).get('value')
                    )
                    facts.append(fact)
                
                self.success_count += 1
                logger.info(f"Retrieved {len(facts)} facts from Wikidata for '{entity}'")
                return facts
            else:
                logger.warning(f"Wikidata returned status {response.status_code} for {entity}")
                
        except requests.Timeout:
            logger.warning(f"Wikidata query timeout for '{entity}'")
        except Exception as e:
            logger.warning(f"Wikidata query failed for '{entity}': {e}")
        
        return []

class EnhancedDBpediaConnector(BaseKGConnector):
    """DBpedia connector"""
    
    def __init__(self):
        super().__init__("DBpedia", "https://dbpedia.org/sparql", 1.0)
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from DBpedia with timeout protection"""
        try:
            self._rate_limit_wait()
            
            entity_uri = f"http://dbpedia.org/resource/{entity.replace(' ', '_')}"
            
            sparql_query = f"""
            SELECT DISTINCT ?predicate ?object WHERE {{
              <{entity_uri}> ?predicate ?object .
              FILTER(LANG(?object) = "en" || !isLiteral(?object))
              FILTER(!isBlank(?object))
            }}
            LIMIT {limit}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get(self.base_url, params=params, timeout=12)  # Reduced timeout
            
            if response.status_code == 200:
                data = response.json()
                facts = []
                
                for binding in data.get('results', {}).get('bindings', []):
                    predicate = binding.get('predicate', {}).get('value', '')
                    obj = binding.get('object', {}).get('value', '')
                    
                    predicate_name = predicate.split('/')[-1].replace('_', ' ')
                    
                    fact = EnhancedKnowledgeFact(
                        subject=entity,
                        predicate=predicate_name,
                        object=obj,
                        source=self.name,
                        confidence=0.85,
                        source_uri=entity_uri
                    )
                    facts.append(fact)
                
                self.success_count += 1
                logger.info(f"Retrieved {len(facts)} facts from DBpedia for '{entity}'")
                return facts
            else:
                logger.warning(f"DBpedia returned status {response.status_code} for {entity}")
                
        except requests.Timeout:
            logger.warning(f"DBpedia query timeout for '{entity}'")
        except Exception as e:
            logger.warning(f"DBpedia query failed for '{entity}': {e}")
        
        return []

class EnhancedConceptNetConnector(BaseKGConnector):
    """ConceptNet connector with dynamic concept discovery"""
    
    def __init__(self):
        super().__init__("ConceptNet", "http://api.conceptnet.io", 0.5)
        
    def search_related_concepts(self, entity: str) -> List[str]:
        """Search for related concepts using ConceptNet's search API"""
        try:
            # Try search API first
            search_url = f"{self.base_url}/search?text={entity.replace(' ', '%20')}&limit=10"
            response = requests.get(search_url, timeout=10)
            
            related_concepts = []
            if response.status_code == 200:
                data = response.json()
                for edge in data.get('edges', []):
                    start = edge.get('start', {}).get('label', '')
                    end = edge.get('end', {}).get('label', '')
                    
                    # Extract concept paths and clean them
                    for concept_path in [start, end]:
                        if concept_path and '/c/en/' in concept_path:
                            concept = concept_path.replace('/c/en/', '').replace('_', ' ')
                            if concept.lower() != entity.lower() and len(concept) > 2:
                                related_concepts.append(concept)
            
            return list(set(related_concepts))[:5]  # Return top 5 unique concepts
            
        except Exception as e:
            logger.debug(f"ConceptNet search failed for {entity}: {e}")
            return []
    
    def query_concept_directly(self, concept: str, limit: int = 20) -> List[dict]:
        """Query a specific concept and return raw edges"""
        try:
            concept_path = f"/c/en/{concept.lower().replace(' ', '_')}"
            url = f"{self.base_url}{concept_path}?limit={limit}"
            
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                return data.get('edges', [])
            
        except Exception as e:
            logger.debug(f"ConceptNet direct query failed for {concept}: {e}")
        
        return []
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from ConceptNet through dynamic discovery"""
        try:
            self._rate_limit_wait()
            all_facts = []
            
            # Strategy 1: Try direct query first
            direct_edges = self.query_concept_directly(entity, limit//2)
            
            # Strategy 2: Search for related concepts and query them
            related_concepts = self.search_related_concepts(entity)
            
            # Process direct edges
            for edge in direct_edges:
                fact = self._edge_to_fact(edge, entity, "direct")
                if fact:
                    all_facts.append(fact)
            
            # Process related concept edges
            for concept in related_concepts:
                concept_edges = self.query_concept_directly(concept, 5)
                for edge in concept_edges:
                    fact = self._edge_to_fact(edge, entity, f"via_{concept}")
                    if fact:
                        all_facts.append(fact)
            
            if all_facts:
                self.success_count += 1
                logger.info(f"Retrieved {len(all_facts)} facts from ConceptNet for '{entity}'")
                if related_concepts:
                    logger.info(f"  - Found related concepts: {related_concepts}")
            
            return all_facts[:limit]
                
        except Exception as e:
            logger.error(f"ConceptNet query failed for '{entity}': {e}")
        
        return []
    
    def _edge_to_fact(self, edge: dict, original_entity: str, discovery_method: str) -> Optional[EnhancedKnowledgeFact]:
        """Convert ConceptNet edge to EnhancedKnowledgeFact"""
        try:
            start = edge.get('start', {})
            end = edge.get('end', {})
            relation = edge.get('rel', {})
            weight = edge.get('weight', 1.0)
            
            start_label = start.get('label', '').replace('/c/en/', '').replace('_', ' ')
            end_label = end.get('label', '').replace('/c/en/', '').replace('_', ' ')
            rel_label = relation.get('label', 'related_to')
            
            # Skip if labels are empty or too short
            if not start_label or not end_label or len(start_label) < 2 or len(end_label) < 2:
                return None
            
            # Determine confidence based on discovery method
            confidence_multiplier = 1.0 if discovery_method == "direct" else 0.6
            
            return EnhancedKnowledgeFact(
                subject=original_entity,
                predicate=rel_label,
                object=end_label if start_label.lower() in original_entity.lower() else start_label,
                source=self.name,
                confidence=min(weight * confidence_multiplier, 1.0),
                context=f"Discovered {discovery_method}"
            )
            
        except Exception as e:
            logger.debug(f"Error converting edge to fact: {e}")
            return None

class MultiKGCache:
    """Caching system for knowledge graph facts"""
    
    def __init__(self, cache_file: str = KG_CACHE_FILE):
        self.cache_file = cache_file
        self.cache = self._load_cache()
        
    def _load_cache(self) -> Dict:
        """Load cache from file"""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Could not load cache: {e}")
        return {}
    
    def _save_cache(self):
        """Save cache to file"""
        try:
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.warning(f"Could not save cache: {e}")
    
    def get_cache_key(self, source: str, entity: str) -> str:
        """Generate cache key"""
        return f"{source}:{hashlib.md5(entity.encode()).hexdigest()}"
    
    def get(self, source: str, entity: str) -> Optional[List[Dict]]:
        """Get cached facts"""
        key = self.get_cache_key(source, entity)
        return self.cache.get(key)
    
    def set(self, source: str, entity: str, facts: List[EnhancedKnowledgeFact]):
        """Cache facts"""
        key = self.get_cache_key(source, entity)
        serializable_facts = []
        for fact in facts:
            serializable_facts.append({
                'subject': fact.subject,
                'predicate': fact.predicate,
                'object': fact.object,
                'source': fact.source,
                'confidence': fact.confidence,
                'context': fact.context,
                'temporal': fact.temporal,
                'spatial': fact.spatial,
                'evidence_score': fact.evidence_score,
                'source_uri': fact.source_uri
            })
        self.cache[key] = serializable_facts
        self._save_cache()

class EnhancedMultiKGRAGSystem:
    """Multi-Knowledge Graph system with RAG functionality DEACTIVATED"""
    
    def __init__(self):
        self.connectors = {
            'wikidata': EnhancedWikidataConnector(),
            'dbpedia': EnhancedDBpediaConnector(),
            'conceptnet': EnhancedConceptNetConnector()
        }
        self.cache = MultiKGCache()
        self.chunker = TextChunker()
        self.location_extractor = LocationExtractor()
        self.global_locations = {}
        # RAG components deactivated
        self.vectorstore = None  
        self.document_chunks = []  
        self.stats = {
            'queries_processed': 0,
            'entities_extracted': 0,
            'facts_retrieved': 0,
            'cache_hits': 0,
            'chunks_processed': 0,
            'locations_found': 0,
            'locations_with_coordinates': 0,
            'location_duplicates_avoided': 0,
            'rag_queries': 0
        }
        
    def extract_entities_advanced(self, text: str) -> List[str]:
        """Extract entities from text"""
        entities = []
        
        pattern = r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b'
        matches = re.findall(pattern, text)
        entities.extend(matches)
        
        stop_words = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If', 'When', 'Where',
            'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many', 'Few', 'Most', 'Each', 'Every',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous', 'Before', 'After', 'During'
        }
        
        filtered_entities = []
        for entity in entities:
            entity = entity.strip()
            if (entity not in stop_words and len(entity) > 2 and not entity.isdigit()):
                filtered_entities.append(entity)
        
        seen = set()
        unique_entities = []
        for entity in filtered_entities:
            if entity.lower() not in seen:
                seen.add(entity.lower())
                unique_entities.append(entity)
        
        return unique_entities[:15]
    
    def retrieve_kg_facts_enhanced(self, entities: List[str]) -> Dict[str, List[EnhancedKnowledgeFact]]:
        """Retrieve facts from knowledge graphs with improved timeout handling"""
        all_facts = {}
        cache_hits = 0
        
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {}
            
            for entity in entities:
                for source_name, connector in self.connectors.items():
                    # Check cache first
                    cached_facts = self.cache.get(source_name, entity)
                    if cached_facts:
                        cache_hits += 1
                        if entity not in all_facts:
                            all_facts[entity] = []
                        for fact_data in cached_facts:
                            fact = EnhancedKnowledgeFact(**fact_data)
                            all_facts[entity].append(fact)
                    else:
                        future = executor.submit(connector.retrieve_facts, entity, 3)
                        futures[future] = (entity, source_name)
            
            # Collect results with better timeout handling
            completed = 0
            total_futures = len(futures)
            
            try:
                for future in as_completed(futures, timeout=45):  # Increased timeout
                    entity, source_name = futures[future]
                    completed += 1
                    
                    try:
                        facts = future.result(timeout=5)  # Individual future timeout
                        if facts:
                            self.cache.set(source_name, entity, facts)
                            
                            if entity not in all_facts:
                                all_facts[entity] = []
                            all_facts[entity].extend(facts)
                            
                            self.stats['facts_retrieved'] += len(facts)
                        
                        logger.debug(f"✅ {source_name} completed for {entity} ({completed}/{total_futures})")
                        
                    except Exception as e:
                        logger.warning(f"❌ {source_name} failed for {entity}: {e}")
                        continue
                        
            except TimeoutError:
                pending_count = total_futures - completed
                logger.warning(f"⏰ Timeout: {pending_count}/{total_futures} KG queries still pending, continuing with available results")
                
                # Cancel remaining futures
                for future in futures:
                    if not future.done():
                        future.cancel()
        
        self.stats['cache_hits'] += cache_hits
        logger.info(f"KG retrieval completed: {completed}/{total_futures} successful, {cache_hits} cache hits")
        return all_facts
    
    def format_kg_context_enhanced(self, kg_facts: Dict[str, List[EnhancedKnowledgeFact]]) -> str:
        """Format KG facts into context string"""
        context_parts = []
        
        for entity, facts in kg_facts.items():
            if facts:
                sorted_facts = sorted(facts, key=lambda f: f.confidence, reverse=True)
                
                context_parts.append(f"\n=== Knowledge about {entity} ===")
                
                by_source = {}
                for fact in sorted_facts[:3]:
                    if fact.source not in by_source:
                        by_source[fact.source] = []
                    by_source[fact.source].append(fact)
                
                for source, source_facts in by_source.items():
                    context_parts.append(f"\nFrom {source}:")
                    for fact in source_facts[:2]:
                        fact_str = f"- {fact.subject} {fact.predicate} {fact.object}"
                        if fact.confidence < 0.8:
                            fact_str += f" (confidence: {fact.confidence:.2f})"
                        context_parts.append(fact_str)
        
        return "\n".join(context_parts)
    
    def register_global_location(self, location_info: LocationInfo) -> str:
        """Register location globally and return unique identifier"""
        location_key = location_info.name.lower().strip()
        
        if location_key in self.global_locations:
            existing = self.global_locations[location_key]
            if (location_info.latitude and location_info.longitude and 
                (not existing.latitude or not existing.longitude)):
                self.global_locations[location_key] = location_info
                logger.info(f"Updated coordinates for {location_info.name}")
            else:
                self.stats['location_duplicates_avoided'] += 1
                logger.debug(f"Location {location_info.name} already registered")
        else:
            self.global_locations[location_key] = location_info
            logger.info(f"Registered new location: {location_info.name}")
        
        clean_name = re.sub(r'[^a-zA-Z0-9]', '', location_info.name)
        return f"ste:Location_{clean_name}"
    
    def process_chunk(self, chunk: str, chunk_num: int, llm) -> str:
        """Process a single chunk of text WITHOUT RAG (RAG DEACTIVATED)"""
        logger.info(f"Processing chunk {chunk_num} ({len(chunk)} chars) - RAG DISABLED")
        
        # RAG retrieval DEACTIVATED - skip this step
        # relevant_context = ""
        
        # Extract entities and locations (this remains the same)
        entities = self.extract_entities_advanced(chunk)
        locations = self.location_extractor.extract_locations_from_text(chunk)
        logger.info(f"Found potential locations in chunk {chunk_num}: {locations}")
        
        # Enrich locations with coordinates
        enriched_locations = {}
        for location_name in locations[:10]:
            location_info = self.location_extractor.enrich_location(location_name)
            if location_info:
                self.register_global_location(location_info)
                enriched_locations[location_name] = location_info
                self.stats['locations_found'] += 1
                if location_info.latitude and location_info.longitude:
                    self.stats['locations_with_coordinates'] += 1
        
        if not entities and not enriched_locations:
            logger.info(f"No entities or locations found in chunk {chunk_num}")
            return ""
        
        logger.info(f"Found entities in chunk {chunk_num}: {entities[:5]}...")
        logger.info(f"Enriched {len(enriched_locations)} locations with coordinates")
        
        # Get KG facts for entities (this remains the same)
        kg_facts = self.retrieve_kg_facts_enhanced(entities)
        kg_context = self.format_kg_context_enhanced(kg_facts)
        location_context = self.format_location_context(enriched_locations)
        
        # SIMPLIFIED PROMPT WITHOUT RAG
        simplified_prompt = f"""You are extracting historical events from text chunks. Use knowledge graph facts and location coordinates to enhance your extraction.

CURRENT TEXT CHUNK {chunk_num} TO ANALYZE:
{chunk}

KNOWLEDGE GRAPH FACTS FOR ENTITIES IN THIS CHUNK:
{kg_context}

LOCATION INFORMATION WITH COORDINATES:
{location_context}

TASK: Extract ONLY the events that are actually mentioned in the current text chunk.

Requirements:
1. Extract ONLY events mentioned in the CURRENT text chunk
2. Use KG facts to enhance entity information
3. Use location coordinates to provide precise geographical data
4. Include ALL these properties for each event:
   - ste:hasType (description of event)
   - ste:hasAgent (who caused/led the event)
   - ste:hasTime (when it happened)
   - ste:hasLocation (location name from text)
   - ste:hasLatitude (latitude coordinate if available)
   - ste:hasLongitude (longitude coordinate if available)
   - ste:hasCountry (country if available)
   - ste:hasRegion (region if available)
   - ste:hasLocationSource (source of coordinates: wikidata/dbpedia/local_ontology)
   - ste:hasResult (outcome/consequence)

Output format (do not include prefixes, they will be added later):
```turtle
ste:Event{chunk_num}_1 a ste:Event, dbp:SpecificEventType ;
    ste:hasType "specific description from current chunk" ;
    ste:hasAgent "specific person from current chunk" ;
    ste:hasTime "specific date from current chunk" ;
    ste:hasLocation "specific location from current chunk" ;
    ste:hasLatitude "37.1234"^^xsd:double ;
    ste:hasLongitude "15.5678"^^xsd:double ;
    ste:hasCountry "Italy" ;
    ste:hasRegion "Sicily" ;
    ste:hasLocationSource "wikidata" ;
    ste:hasResult "specific outcome from current chunk" .
```

IMPORTANT: 
- Extract events ONLY from the CURRENT text chunk
- Use KG facts to enrich entity details
- Include precise coordinates from location sources
- Only extract events explicitly mentioned in the current chunk
- If no clear events are found in current chunk, return empty
"""
        
        try:
            response = llm.invoke([HumanMessage(content=simplified_prompt)])
            turtle_output = self.clean_turtle(response.content)
            self.stats['chunks_processed'] += 1
            logger.info(f"Generated RDF for chunk {chunk_num} (without RAG)")
            return turtle_output
        except Exception as e:
            logger.error(f"Error processing chunk {chunk_num}: {e}")
            return ""
    
    def format_location_context(self, enriched_locations: Dict[str, LocationInfo]) -> str:
        """Format location information into context string"""
        if not enriched_locations:
            return "No location coordinates available."
        
        context_parts = ["\n=== Location Information ==="]
        
        for location_name, location_info in enriched_locations.items():
            context_parts.append(f"\n{location_name}:")
            context_parts.append(f"  - Source: {location_info.source}")
            
            if location_info.latitude and location_info.longitude:
                context_parts.append(f"  - Coordinates: {location_info.latitude}, {location_info.longitude}")
                if location_info.source == "corrected":
                    context_parts.append(f"  - NOTE: Coordinates were corrected for historical accuracy")
            else:
                context_parts.append("  - Coordinates: Not available")
            
            if location_info.country:
                context_parts.append(f"  - Country: {location_info.country}")
            
            if location_info.region:
                context_parts.append(f"  - Region: {location_info.region}")
            
            if location_info.uri:
                context_parts.append(f"  - URI: {location_info.uri}")
        
        return "\n".join(context_parts)
    
    def generate_global_location_rdf(self) -> str:
        """Generate RDF for all unique locations found across all chunks"""
        if not self.global_locations:
            return ""
        
        location_rdf_parts = []
        
        for location_key, location_info in self.global_locations.items():
            clean_name = re.sub(r'[^a-zA-Z0-9]', '', location_info.name)
            location_id = f"ste:Location_{clean_name}"
            
            rdf_lines = [f'{location_id} a ste:Location ;']
            rdf_lines.append(f'    rdfs:label "{location_info.name}" ;')
            
            if location_info.latitude and location_info.longitude:
                rdf_lines.append(f'    geo:lat "{location_info.latitude}"^^xsd:double ;')
                rdf_lines.append(f'    geo:long "{location_info.longitude}"^^xsd:double ;')
            
            if location_info.country:
                rdf_lines.append(f'    ste:hasCountry "{location_info.country}" ;')
            
            if location_info.region:
                rdf_lines.append(f'    ste:hasRegion "{location_info.region}" ;')
            
            if location_info.source:
                rdf_lines.append(f'    ste:hasSource "{location_info.source}" ;')
            
            if location_info.uri:
                rdf_lines.append(f'    ste:hasURI <{location_info.uri}> ;')
            
            if rdf_lines[-1].endswith(' ;'):
                rdf_lines[-1] = rdf_lines[-1][:-2] + ' .'
            
            location_rdf_parts.append('\n'.join(rdf_lines))
        
        return '\n\n'.join(location_rdf_parts)
    
    def clean_turtle(self, raw_output: str) -> str:
        """Clean turtle output"""
        m = re.search(r"```(?:turtle)?\s*(.*?)```", raw_output, re.DOTALL | re.IGNORECASE)
        if m:
            return m.group(1).strip()
        
        lines = raw_output.strip().split('\n')
        turtle_lines = []
        for line in lines:
            stripped = line.strip()
            if (stripped.startswith('@') or stripped.startswith('<') or 
                stripped.startswith(':') or stripped.startswith('_') or 
                stripped.startswith('a ') or ':' in stripped or stripped == ''):
                turtle_lines.append(line)
        
        return '\n'.join(turtle_lines)

    # RAG METHODS DEACTIVATED
    def prepare_vectorstore(self, text_chunks: List[str]):
        """RAG DEACTIVATED: Vector store preparation disabled"""
        logger.info("RAG functionality is DEACTIVATED - vectorstore not created")
        return False
    
    def rag_query(self, query: str, llm, k: int = 20) -> Dict[str, Any]:
        """RAG DEACTIVATED: RAG queries disabled"""
        return {"error": "RAG functionality is DEACTIVATED. Set RAG_ENABLED=True to enable RAG features."}
    
    def interactive_rag_session(self, llm):
        """RAG DEACTIVATED: Interactive RAG session disabled"""
        print("\n❌ RAG functionality is DEACTIVATED")
        print("To enable RAG, set RAG_ENABLED=True at the top of the script")

# Utility functions
def load_api_key():
    """Load OpenAI API key"""
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not found")
        return None
    print("OpenAI API Key loaded successfully.")
    return api_key

def load_text_from_file(filepath: str) -> str:
    """Load text from file"""
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return ""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        print(f"Loaded text from {filepath}")
        return text
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return ""

def initialize_llm(api_key: str):
    """Initialize LLM"""
    if not api_key:
        return None
    try:
        llm = ChatOpenAI(model_name="gpt-4o", temperature=0, openai_api_key=api_key)
        print("LLM initialized successfully.")
        return llm
    except Exception as e:
        print(f"Error initializing LLM: {e}")
        return None

def prepare_vectorstore_from_text(text: str, multi_kg_system):
    """RAG DEACTIVATED: Vector store creation disabled"""
    if not RAG_ENABLED:
        logger.info("RAG functionality is DEACTIVATED - vectorstore not created")
        return None
    
    # Original code would go here if RAG_ENABLED was True
    return None

def main():
    """Main function with chunking support (RAG DEACTIVATED)"""
    print("🚀 Starting Multi-Knowledge Graph System with Chunking (RAG DEACTIVATED)")
    
    api_key = load_api_key()
    if not api_key:
        return
    
    domain_text = load_text_from_file(INPUT_TEXT_FILE)
    if not domain_text:
        print("⚠️  No input file found, using sample text")
        domain_text = """The Battle of Salamis was a decisive naval battle in 480 BC. 
        Themistocles led the Greek fleet to victory over the Persians commanded by Xerxes. 
        This victory established Greek naval supremacy in the Aegean Sea."""
    else:
        print(f"📄 Using YOUR text from {INPUT_TEXT_FILE}")
        print(f"📝 Text length: {len(domain_text)} characters")
    
    multi_kg_system = EnhancedMultiKGRAGSystem()
    llm = initialize_llm(api_key)
    
    if not llm:
        return
    
    # Vector store preparation SKIPPED (RAG deactivated)
    print("\n❌ RAG vector store setup SKIPPED (RAG is DEACTIVATED)")
    
    token_count = multi_kg_system.chunker.count_tokens(domain_text)
    print(f"🔢 Total tokens in text: {token_count:,}")
    
    if token_count > 15000:
        print("📊 Text is large, chunking into smaller pieces...")
        chunks = multi_kg_system.chunker.chunk_text_by_sentences(domain_text, max_tokens=15000)
        print(f"📄 Created {len(chunks)} chunks")
    else:
        print("📄 Text is small enough to process as single chunk")
        chunks = [domain_text]
    
    # Extract events and create RDF (without RAG)
    all_turtle_outputs = []
    all_entities = set()
    
    print("\n🔄 Processing chunks for event extraction (without RAG)...")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n🔄 Processing chunk {i}/{len(chunks)}...")
        
        turtle_output = multi_kg_system.process_chunk(chunk, i, llm)
        if turtle_output:
            all_turtle_outputs.append(turtle_output)
            
        chunk_entities = multi_kg_system.extract_entities_advanced(chunk)
        all_entities.update(chunk_entities)
        
        if i < len(chunks):
            time.sleep(1)
    
    # Save RDF output
    if all_turtle_outputs:
        prefixes = """@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix dbp: <http://dbpedia.org/ontology/> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix dbpr: <http://dbpedia.org/resource/> .

"""
        
        final_output = prefixes + "# Historical Events with Knowledge Graph Enhanced Location Data (RAG DEACTIVATED)\n" + "\n\n".join(all_turtle_outputs)
        
        with open(OUTPUT_RAG_TTL, 'w', encoding='utf-8') as f:
            f.write(final_output)
        
        print(f"\n✅ Saved RDF to {OUTPUT_RAG_TTL}")
        print(f"📊 Processing Statistics:")
        print(f"   - Total chunks processed: {len(chunks)}")
        print(f"   - Successful chunks: {len(all_turtle_outputs)}")
        print(f"   - Unique entities found: {len(all_entities)}")
        print(f"   - Total KG facts retrieved: {multi_kg_system.stats['facts_retrieved']}")
        print(f"   - Cache hits: {multi_kg_system.stats['cache_hits']}")
        print(f"   - Locations found: {multi_kg_system.stats['locations_found']}")
        print(f"   - Locations with coordinates: {multi_kg_system.stats['locations_with_coordinates']}")
        print(f"   - Location duplicates avoided: {multi_kg_system.stats['location_duplicates_avoided']}")
        print(f"   - Unique global locations: {len(multi_kg_system.global_locations)}")
        print(f"   - RAG status: DEACTIVATED")
        
        print(f"\n🔗 Knowledge Graph Connector Statistics:")
        for name, connector in multi_kg_system.connectors.items():
            stats = connector.get_stats()
            print(f"   - {stats['name']}: {stats['successes']}/{stats['requests']} requests ({stats['success_rate']:.1%} success)")
        
        if multi_kg_system.location_extractor.location_cache:
            successful_locations = sum(1 for v in multi_kg_system.location_extractor.location_cache.values() if v is not None)
            total_locations = len(multi_kg_system.location_extractor.location_cache)
            print(f"   - Location enrichment: {successful_locations}/{total_locations} locations enriched ({successful_locations/total_locations:.1%} success)")
        
        print(f"\n📝 Sample of generated RDF:")
        print("="*60)
        print(final_output[:1000] + "..." if len(final_output) > 1000 else final_output)
        print("="*60)
        
    else:
        print("❌ No events were extracted from any chunks")
    
    # RAG SESSION DEACTIVATED
    print(f"\n❌ RAG System is DEACTIVATED")
    print(f"💡 To enable RAG functionality:")
    print(f"   1. Set RAG_ENABLED = True at the top of the script")
    print(f"   2. Ensure langchain dependencies are installed")
    print(f"   3. Re-run the script")
    
    print(f"\n🎉 Process complete! Check {OUTPUT_RAG_TTL} for RDF results.")
    print(f"📊 System ran in NON-RAG mode - only Knowledge Graph and Location enrichment was used.")

if __name__ == '__main__':
    main()

2025-05-27 22:59:09,972 - INFO - Loaded location ontology from locations.owl


🚀 Starting Multi-Knowledge Graph System with Chunking (RAG DEACTIVATED)
OpenAI API Key loaded successfully.
Loaded text from grover3.txt
📄 Using YOUR text from grover3.txt
📝 Text length: 57806 characters
LLM initialized successfully.

❌ RAG vector store setup SKIPPED (RAG is DEACTIVATED)
🔢 Total tokens in text: 22,227
📊 Text is large, chunking into smaller pieces...


2025-05-27 22:59:10,577 - INFO - Processing chunk 1 (35923 chars) - RAG DISABLED
2025-05-27 22:59:10,579 - INFO - Found potential locations in chunk 1: ['Adidas', 'Giorgo Agamben', 'Exhibition', 'Pictures', 'Mussorgsky', 'Lutrinae']


📄 Created 2 chunks

🔄 Processing chunks for event extraction (without RAG)...

🔄 Processing chunk 1/2...


2025-05-27 22:59:17,094 - INFO - Registered new location: Exhibition
2025-05-27 22:59:24,291 - INFO - Found entities in chunk 1: ['Giorgo Agamben', 'Pictures', 'Exhibition', 'Mussorgsky', 'Lutrinae']...
2025-05-27 22:59:24,292 - INFO - Enriched 1 locations with coordinates
2025-05-27 22:59:24,526 - INFO - Retrieved 0 facts from DBpedia for 'Giorgo Agamben'
2025-05-27 22:59:24,588 - INFO - Retrieved 0 facts from Wikidata for 'Giorgo Agamben'
2025-05-27 22:59:34,747 - INFO - KG retrieval completed: 3/3 successful, 15 cache hits
2025-05-27 23:00:01,854 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-27 23:00:01,872 - INFO - Generated RDF for chunk 1 (without RAG)
2025-05-27 23:00:02,878 - INFO - Processing chunk 2 (21542 chars) - RAG DISABLED
2025-05-27 23:00:02,885 - INFO - Found potential locations in chunk 2: ['Seven', 'Tsiolis']



🔄 Processing chunk 2/2...


2025-05-27 23:00:03,961 - INFO - Registered new location: Seven
2025-05-27 23:00:05,801 - INFO - Found entities in chunk 2: ['Seven', 'Tsiolis']...
2025-05-27 23:00:05,803 - INFO - Enriched 1 locations with coordinates
2025-05-27 23:00:06,031 - INFO - Retrieved 0 facts from DBpedia for 'Tsiolis'
2025-05-27 23:00:06,093 - INFO - Retrieved 0 facts from Wikidata for 'Tsiolis'
2025-05-27 23:00:15,784 - INFO - KG retrieval completed: 3/3 successful, 3 cache hits
2025-05-27 23:00:33,529 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-27 23:00:33,566 - INFO - Generated RDF for chunk 2 (without RAG)



✅ Saved RDF to extracted_events_norag_with_multi_kg_G.ttl
📊 Processing Statistics:
   - Total chunks processed: 2
   - Successful chunks: 2
   - Unique entities found: 8
   - Total KG facts retrieved: 0
   - Cache hits: 18
   - Locations found: 2
   - Locations with coordinates: 2
   - Location duplicates avoided: 0
   - Unique global locations: 2
   - RAG status: DEACTIVATED

🔗 Knowledge Graph Connector Statistics:
   - Wikidata: 2/2 requests (100.0% success)
   - DBpedia: 2/2 requests (100.0% success)
   - ConceptNet: 0/2 requests (0.0% success)
   - Location enrichment: 59/225 locations enriched (26.2% success)

📝 Sample of generated RDF:
@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix dbp: <http://dbpedia.org/ontology/> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix dbpr: <http://dbpedia.or

In [2]:
"""
Enhanced Multi-Knowledge Graph RAG System with Text Chunking - CLAUDE 4 VERSION
Handles large texts by processing them in chunks to avoid token limits
RAG FUNCTIONALITY DEACTIVATED
"""

import os
import re
import time
import json
import hashlib
from typing import List, Dict, Any, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import tiktoken

from dotenv import load_dotenv
from rdflib import Graph, RDFS, RDF, OWL, URIRef, Namespace, Literal
from rdflib.namespace import XSD, SKOS

# Configuration
INPUT_TEXT_FILE = "part_aa"
ONTOLOGY_PATH = "wiki.owl"
LOCATION_ONTOLOGY_PATH = "locations.owl"
OUTPUT_RAG_TTL = 'extracted_events_norag_with_multi_kg_Claude.ttl'
OUTPUT_RAG_OWL = 'extracted_events_norag_with_multi_kg_Claude.owl'
KG_CACHE_FILE = 'kg_cache.json'
LOCATION_CACHE_FILE = 'location_cache.json'
KG_ANALYSIS_REPORT = 'multi_kg_analysis_report.txt'

# Token limits - UPDATED FOR CLAUDE 4
MAX_TOKENS_PER_REQUEST = 150000  # Claude 4 has higher token limits
CHUNK_OVERLAP = 200  # Characters to overlap between chunks

# RAG DEACTIVATION FLAG
RAG_ENABLED = False  # Set to False to deactivate RAG

# Namespaces
EX = Namespace("http://example.org/")
STE = Namespace("http://www.example.org/ste#")
DBP = Namespace("http://dbpedia.org/ontology/")
LAC = Namespace("http://ontologia.fr/OTB/lac#")
WD = Namespace("http://www.wikidata.org/entity/")
YAGO = Namespace("http://yago-knowledge.org/resource/")
CN = Namespace("http://conceptnet.io/c/en/")
GEO = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
DBPR = Namespace("http://dbpedia.org/resource/")

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Imports - UPDATED FOR CLAUDE
try:
    if RAG_ENABLED:
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS
    # CHANGED: Import Anthropic instead of OpenAI
    import anthropic
except ImportError as e:
    print(f"ImportError: {e}")
    print("pip install rdflib python-dotenv anthropic langchain langchain-community faiss-cpu sentence-transformers tiktoken requests")
    exit(1)

@dataclass
class LocationInfo:
    """Location information with coordinates"""
    name: str
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    country: Optional[str] = None
    region: Optional[str] = None
    source: str = "extracted"
    confidence: float = 1.0
    uri: Optional[str] = None

@dataclass
class EnhancedKnowledgeFact:
    """Enhanced knowledge fact with metadata"""
    subject: str
    predicate: str
    object: str
    source: str
    confidence: float = 1.0
    context: Optional[str] = None
    temporal: Optional[str] = None
    spatial: Optional[str] = None
    evidence_score: float = 1.0
    source_uri: Optional[str] = None

class LocationExtractor:
    """Extracts and enriches location information"""
    
    def __init__(self, ontology_path: str = LOCATION_ONTOLOGY_PATH):
        self.ontology_path = ontology_path
        self.location_graph = None
        self.location_cache = self._load_location_cache()
        self.load_location_ontology()
        
    def _load_location_cache(self) -> Dict:
        """Load location cache"""
        if os.path.exists(LOCATION_CACHE_FILE):
            try:
                with open(LOCATION_CACHE_FILE, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Could not load location cache: {e}")
        return {}
    
    def _save_location_cache(self):
        """Save location cache"""
        try:
            with open(LOCATION_CACHE_FILE, 'w', encoding='utf-8') as f:
                json.dump(self.location_cache, f, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.warning(f"Could not save location cache: {e}")
    
    def load_location_ontology(self):
        """Load locations.owl ontology"""
        try:
            if os.path.exists(self.ontology_path):
                self.location_graph = Graph()
                self.location_graph.parse(self.ontology_path, format="xml")
                logger.info(f"Loaded location ontology from {self.ontology_path}")
            else:
                logger.warning(f"Location ontology not found at {self.ontology_path}")
                self.location_graph = None
        except Exception as e:
            logger.error(f"Error loading location ontology: {e}")
            self.location_graph = None
    
    def extract_locations_from_text(self, text: str) -> List[str]:
        """Extract potential location names from text"""
        location_patterns = [
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:City|County|State|Province|Country|Region|Island|Bay|Sea|Ocean|River|Mountain|Valley|Desert))\b',
            r'\b(?:Mount|Lake|River|Cape|Fort|Port|Saint|St\.)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b',
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?=\s+(?:in|near|at|from|to))\b',
            r'\b[A-Z][a-zA-Z]{2,}(?:\s+[A-Z][a-zA-Z]{2,})*\b'
        ]
        
        locations = []
        for pattern in location_patterns:
            matches = re.findall(pattern, text)
            locations.extend(matches)
        
        location_stopwords = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If', 
            'When', 'Where', 'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many', 'Most',
            'First', 'Second', 'Third', 'Last', 'Next', 'Before', 'After', 'During',
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 
            'September', 'October', 'November', 'December', 'Monday', 'Tuesday', 
            'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
        }
        
        filtered_locations = []
        for loc in locations:
            loc = loc.strip()
            if (loc not in location_stopwords and len(loc) > 2 and 
                not loc.isdigit() and not re.match(r'^\d+', loc)):
                filtered_locations.append(loc)
        
        return list(set(filtered_locations))
    
    def get_location_from_ontology(self, location_name: str) -> Optional[LocationInfo]:
        """Get location info from local ontology"""
        if not self.location_graph:
            return None
            
        try:
            query = f"""
            SELECT DISTINCT ?location ?lat ?long ?country ?region WHERE {{
                ?location rdfs:label ?label .
                FILTER(regex(?label, "{location_name}", "i"))
                OPTIONAL {{ ?location geo:lat ?lat }}
                OPTIONAL {{ ?location geo:long ?long }}
                OPTIONAL {{ ?location dbp:country ?country }}
                OPTIONAL {{ ?location dbp:region ?region }}
            }}
            """
            
            results = self.location_graph.query(query)
            for row in results:
                return LocationInfo(
                    name=location_name,
                    latitude=float(row.lat) if row.lat else None,
                    longitude=float(row.long) if row.long else None,
                    country=str(row.country) if row.country else None,
                    region=str(row.region) if row.region else None,
                    source="local_ontology",
                    uri=str(row.location) if row.location else None
                )
        except Exception as e:
            logger.debug(f"Ontology query failed for {location_name}: {e}")
        
        return None
    
    def get_location_from_dbpedia(self, location_name: str) -> Optional[LocationInfo]:
        """Get location coordinates from DBpedia"""
        try:
            time.sleep(0.5)
            entity_uri = f"http://dbpedia.org/resource/{location_name.replace(' ', '_')}"
            
            sparql_query = f"""
            SELECT DISTINCT ?lat ?long ?country ?region WHERE {{
                <{entity_uri}> geo:lat ?lat ;
                               geo:long ?long .
                OPTIONAL {{ <{entity_uri}> dbo:country ?country }}
                OPTIONAL {{ <{entity_uri}> dbo:region ?region }}
            }}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get("https://dbpedia.org/sparql", params=params, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                bindings = data.get('results', {}).get('bindings', [])
                
                if bindings:
                    binding = bindings[0]
                    return LocationInfo(
                        name=location_name,
                        latitude=float(binding.get('lat', {}).get('value', 0)),
                        longitude=float(binding.get('long', {}).get('value', 0)),
                        country=binding.get('country', {}).get('value', ''),
                        region=binding.get('region', {}).get('value', ''),
                        source="dbpedia",
                        uri=entity_uri
                    )
                    
        except Exception as e:
            logger.debug(f"DBpedia location query failed for {location_name}: {e}")
        
        return None
    
    def get_location_from_wikidata(self, location_name: str) -> Optional[LocationInfo]:
        """Get location coordinates from Wikidata with disambiguation"""
        try:
            time.sleep(0.5)
            
            # Try multiple query strategies to get the right location
            queries = [
                # Try exact label match first
                f"""
                SELECT DISTINCT ?item ?itemLabel ?coord ?country ?countryLabel WHERE {{
                  ?item rdfs:label "{location_name}"@en .
                  ?item wdt:P625 ?coord .
                  OPTIONAL {{ ?item wdt:P17 ?country }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
                }}
                LIMIT 5
                """,
                # Try with additional filters for places/locations
                f"""
                SELECT DISTINCT ?item ?itemLabel ?coord ?country ?countryLabel WHERE {{
                  ?item rdfs:label "{location_name}"@en .
                  ?item wdt:P625 ?coord .
                  ?item wdt:P31/wdt:P279* wd:Q486972 .  # human settlement
                  OPTIONAL {{ ?item wdt:P17 ?country }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
                }}
                LIMIT 5
                """
            ]
            
            for query in queries:
                params = {'query': query, 'format': 'json'}
                response = requests.get("https://query.wikidata.org/sparql", params=params, timeout=10)
                
                if response.status_code == 200:
                    data = response.json()
                    bindings = data.get('results', {}).get('bindings', [])
                    
                    if bindings:
                        # Prefer results with country information
                        best_binding = None
                        for binding in bindings:
                            if binding.get('country'):
                                best_binding = binding
                                break
                        
                        if not best_binding:
                            best_binding = bindings[0]
                        
                        coord_str = best_binding.get('coord', {}).get('value', '')
                        
                        coord_match = re.search(r'Point\(([+-]?\d*\.?\d+)\s+([+-]?\d*\.?\d+)\)', coord_str)
                        if coord_match:
                            longitude = float(coord_match.group(1))
                            latitude = float(coord_match.group(2))
                            
                            return LocationInfo(
                                name=location_name,
                                latitude=latitude,
                                longitude=longitude,
                                country=best_binding.get('countryLabel', {}).get('value', ''),
                                source="wikidata",
                                uri=best_binding.get('item', {}).get('value', '')
                            )
                        
        except Exception as e:
            logger.debug(f"Wikidata location query failed for {location_name}: {e}")
        
        return None
    
    def validate_coordinates(self, location_info: LocationInfo) -> bool:
        """Validate that coordinates make sense for the location"""
        if not location_info.latitude or not location_info.longitude:
            return True
        
        lat, lon = location_info.latitude, location_info.longitude
        
        # Basic coordinate range validation
        if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
            logger.warning(f"Invalid coordinates for {location_info.name}: {lat}, {lon}")
            return False
        
        # Generic geographic validation - flag obviously wrong coordinates
        # If coordinates suggest North America but no clear indication it should be there
        if (-130 < lon < -60) and (25 < lat < 50):  # North America range
            logger.warning(f"Coordinates for '{location_info.name}' appear to be in North America ({lat}, {lon}). "
                         f"Please verify if this is correct for your historical context.")
            # Don't auto-correct, just warn - let the user/context decide
        
        # If coordinates suggest Australia/Oceania for what might be European/Mediterranean names
        elif (110 < lon < 180) and (-45 < lat < -10):  # Australia/Oceania range
            logger.warning(f"Coordinates for '{location_info.name}' appear to be in Australia/Oceania ({lat}, {lon}). "
                         f"Please verify if this is correct for your historical context.")
        
        return True
    
    def enrich_location(self, location_name: str) -> Optional[LocationInfo]:
        """Get enriched location information with coordinates"""
        if location_name in self.location_cache:
            cached = self.location_cache[location_name]
            return LocationInfo(**cached) if cached else None
        
        location_info = None
        
        location_info = self.get_location_from_ontology(location_name)
        
        if not location_info:
            location_info = self.get_location_from_wikidata(location_name)
        
        if not location_info:
            location_info = self.get_location_from_dbpedia(location_name)
        
        if location_info:
            self.location_cache[location_name] = {
                'name': location_info.name,
                'latitude': location_info.latitude,
                'longitude': location_info.longitude,
                'country': location_info.country,
                'region': location_info.region,
                'source': location_info.source,
                'confidence': location_info.confidence,
                'uri': location_info.uri
            }
        else:
            self.location_cache[location_name] = None
        
        self._save_location_cache()
        
        if location_info:
            self.validate_coordinates(location_info)
        
        return location_info

class TextChunker:
    """Handles text chunking to manage token limits"""
    
    def __init__(self, model_name: str = "claude-3-5-sonnet-20241022"):  # CHANGED: Default to Claude model
        # Use GPT tokenizer as approximation for Claude tokens
        self.tokenizer = tiktoken.encoding_for_model("gpt-4")
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in text (approximation for Claude)"""
        return len(self.tokenizer.encode(text))
    
    def chunk_text_by_sentences(self, text: str, max_tokens: int = 25000) -> List[str]:  # CHANGED: Increased for Claude
        """Chunk text by sentences to maintain coherence"""
        sentences = re.split(r'[.!?]+', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
                
            test_chunk = current_chunk + " " + sentence if current_chunk else sentence
            
            if self.count_tokens(test_chunk) > max_tokens and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk = test_chunk
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks

class BaseKGConnector:
    """Base class for knowledge graph connectors"""
    
    def __init__(self, name: str, base_url: str, rate_limit: float = 1.0):
        self.name = name
        self.base_url = base_url
        self.rate_limit = rate_limit
        self.last_request_time = 0
        self.request_count = 0
        self.success_count = 0
        
    def _rate_limit_wait(self):
        """Enforce rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        self.last_request_time = time.time()
        self.request_count += 1
    
    def get_stats(self) -> Dict[str, Any]:
        """Get connector statistics"""
        return {
            'name': self.name,
            'requests': self.request_count,
            'successes': self.success_count,
            'success_rate': self.success_count / max(1, self.request_count)
        }
    
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Abstract method to retrieve facts"""
        raise NotImplementedError

class EnhancedWikidataConnector(BaseKGConnector):
    """Wikidata connector"""
    
    def __init__(self):
        super().__init__("Wikidata", "https://query.wikidata.org/sparql", 1.0)
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from Wikidata with timeout protection"""
        try:
            self._rate_limit_wait()
            
            sparql_query = f"""
            SELECT DISTINCT ?subject ?subjectLabel ?predicate ?predicateLabel ?object ?objectLabel WHERE {{
              {{
                ?subject ?label "{entity}"@en .
              }} UNION {{
                ?subject rdfs:label "{entity}"@en .
              }}
              
              ?subject ?predicate ?object .
              FILTER(?predicate != wdt:P31 && ?predicate != wdt:P279)
              
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}
            LIMIT {limit}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get(self.base_url, params=params, timeout=12)  # Reduced timeout
            
            if response.status_code == 200:
                data = response.json()
                facts = []
                
                for binding in data.get('results', {}).get('bindings', []):
                    fact = EnhancedKnowledgeFact(
                        subject=binding.get('subjectLabel', {}).get('value', entity),
                        predicate=binding.get('predicateLabel', {}).get('value', 'related_to'),
                        object=binding.get('objectLabel', {}).get('value', ''),
                        source=self.name,
                        confidence=0.9,
                        source_uri=binding.get('subject', {}).get('value')
                    )
                    facts.append(fact)
                
                self.success_count += 1
                logger.info(f"Retrieved {len(facts)} facts from Wikidata for '{entity}'")
                return facts
            else:
                logger.warning(f"Wikidata returned status {response.status_code} for {entity}")
                
        except requests.Timeout:
            logger.warning(f"Wikidata query timeout for '{entity}'")
        except Exception as e:
            logger.warning(f"Wikidata query failed for '{entity}': {e}")
        
        return []

class EnhancedDBpediaConnector(BaseKGConnector):
    """DBpedia connector"""
    
    def __init__(self):
        super().__init__("DBpedia", "https://dbpedia.org/sparql", 1.0)
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from DBpedia with timeout protection"""
        try:
            self._rate_limit_wait()
            
            entity_uri = f"http://dbpedia.org/resource/{entity.replace(' ', '_')}"
            
            sparql_query = f"""
            SELECT DISTINCT ?predicate ?object WHERE {{
              <{entity_uri}> ?predicate ?object .
              FILTER(LANG(?object) = "en" || !isLiteral(?object))
              FILTER(!isBlank(?object))
            }}
            LIMIT {limit}
            """
            
            params = {'query': sparql_query, 'format': 'json'}
            response = requests.get(self.base_url, params=params, timeout=12)  # Reduced timeout
            
            if response.status_code == 200:
                data = response.json()
                facts = []
                
                for binding in data.get('results', {}).get('bindings', []):
                    predicate = binding.get('predicate', {}).get('value', '')
                    obj = binding.get('object', {}).get('value', '')
                    
                    predicate_name = predicate.split('/')[-1].replace('_', ' ')
                    
                    fact = EnhancedKnowledgeFact(
                        subject=entity,
                        predicate=predicate_name,
                        object=obj,
                        source=self.name,
                        confidence=0.85,
                        source_uri=entity_uri
                    )
                    facts.append(fact)
                
                self.success_count += 1
                logger.info(f"Retrieved {len(facts)} facts from DBpedia for '{entity}'")
                return facts
            else:
                logger.warning(f"DBpedia returned status {response.status_code} for {entity}")
                
        except requests.Timeout:
            logger.warning(f"DBpedia query timeout for '{entity}'")
        except Exception as e:
            logger.warning(f"DBpedia query failed for '{entity}': {e}")
        
        return []

class EnhancedConceptNetConnector(BaseKGConnector):
    """ConceptNet connector with dynamic concept discovery"""
    
    def __init__(self):
        super().__init__("ConceptNet", "http://api.conceptnet.io", 0.5)
        
    def search_related_concepts(self, entity: str) -> List[str]:
        """Search for related concepts using ConceptNet's search API"""
        try:
            # Try search API first
            search_url = f"{self.base_url}/search?text={entity.replace(' ', '%20')}&limit=10"
            response = requests.get(search_url, timeout=10)
            
            related_concepts = []
            if response.status_code == 200:
                data = response.json()
                for edge in data.get('edges', []):
                    start = edge.get('start', {}).get('label', '')
                    end = edge.get('end', {}).get('label', '')
                    
                    # Extract concept paths and clean them
                    for concept_path in [start, end]:
                        if concept_path and '/c/en/' in concept_path:
                            concept = concept_path.replace('/c/en/', '').replace('_', ' ')
                            if concept.lower() != entity.lower() and len(concept) > 2:
                                related_concepts.append(concept)
            
            return list(set(related_concepts))[:5]  # Return top 5 unique concepts
            
        except Exception as e:
            logger.debug(f"ConceptNet search failed for {entity}: {e}")
            return []
    
    def query_concept_directly(self, concept: str, limit: int = 20) -> List[dict]:
        """Query a specific concept and return raw edges"""
        try:
            concept_path = f"/c/en/{concept.lower().replace(' ', '_')}"
            url = f"{self.base_url}{concept_path}?limit={limit}"
            
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                return data.get('edges', [])
            
        except Exception as e:
            logger.debug(f"ConceptNet direct query failed for {concept}: {e}")
        
        return []
        
    def retrieve_facts(self, entity: str, limit: int = 100) -> List[EnhancedKnowledgeFact]:
        """Retrieve facts from ConceptNet through dynamic discovery"""
        try:
            self._rate_limit_wait()
            all_facts = []
            
            # Strategy 1: Try direct query first
            direct_edges = self.query_concept_directly(entity, limit//2)
            
            # Strategy 2: Search for related concepts and query them
            related_concepts = self.search_related_concepts(entity)
            
            # Process direct edges
            for edge in direct_edges:
                fact = self._edge_to_fact(edge, entity, "direct")
                if fact:
                    all_facts.append(fact)
            
            # Process related concept edges
            for concept in related_concepts:
                concept_edges = self.query_concept_directly(concept, 5)
                for edge in concept_edges:
                    fact = self._edge_to_fact(edge, entity, f"via_{concept}")
                    if fact:
                        all_facts.append(fact)
            
            if all_facts:
                self.success_count += 1
                logger.info(f"Retrieved {len(all_facts)} facts from ConceptNet for '{entity}'")
                if related_concepts:
                    logger.info(f"  - Found related concepts: {related_concepts}")
            
            return all_facts[:limit]
                
        except Exception as e:
            logger.error(f"ConceptNet query failed for '{entity}': {e}")
        
        return []
    
    def _edge_to_fact(self, edge: dict, original_entity: str, discovery_method: str) -> Optional[EnhancedKnowledgeFact]:
        """Convert ConceptNet edge to EnhancedKnowledgeFact"""
        try:
            start = edge.get('start', {})
            end = edge.get('end', {})
            relation = edge.get('rel', {})
            weight = edge.get('weight', 1.0)
            
            start_label = start.get('label', '').replace('/c/en/', '').replace('_', ' ')
            end_label = end.get('label', '').replace('/c/en/', '').replace('_', ' ')
            rel_label = relation.get('label', 'related_to')
            
            # Skip if labels are empty or too short
            if not start_label or not end_label or len(start_label) < 2 or len(end_label) < 2:
                return None
            
            # Determine confidence based on discovery method
            confidence_multiplier = 1.0 if discovery_method == "direct" else 0.6
            
            return EnhancedKnowledgeFact(
                subject=original_entity,
                predicate=rel_label,
                object=end_label if start_label.lower() in original_entity.lower() else start_label,
                source=self.name,
                confidence=min(weight * confidence_multiplier, 1.0),
                context=f"Discovered {discovery_method}"
            )
            
        except Exception as e:
            logger.debug(f"Error converting edge to fact: {e}")
            return None

class MultiKGCache:
    """Caching system for knowledge graph facts"""
    
    def __init__(self, cache_file: str = KG_CACHE_FILE):
        self.cache_file = cache_file
        self.cache = self._load_cache()
        
    def _load_cache(self) -> Dict:
        """Load cache from file"""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Could not load cache: {e}")
        return {}
    
    def _save_cache(self):
        """Save cache to file"""
        try:
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.warning(f"Could not save cache: {e}")
    
    def get_cache_key(self, source: str, entity: str) -> str:
        """Generate cache key"""
        return f"{source}:{hashlib.md5(entity.encode()).hexdigest()}"
    
    def get(self, source: str, entity: str) -> Optional[List[Dict]]:
        """Get cached facts"""
        key = self.get_cache_key(source, entity)
        return self.cache.get(key)
    
    def set(self, source: str, entity: str, facts: List[EnhancedKnowledgeFact]):
        """Cache facts"""
        key = self.get_cache_key(source, entity)
        serializable_facts = []
        for fact in facts:
            serializable_facts.append({
                'subject': fact.subject,
                'predicate': fact.predicate,
                'object': fact.object,
                'source': fact.source,
                'confidence': fact.confidence,
                'context': fact.context,
                'temporal': fact.temporal,
                'spatial': fact.spatial,
                'evidence_score': fact.evidence_score,
                'source_uri': fact.source_uri
            })
        self.cache[key] = serializable_facts
        self._save_cache()

class EnhancedMultiKGRAGSystem:
    """Multi-Knowledge Graph system with RAG functionality DEACTIVATED - CLAUDE 4 VERSION"""
    
    def __init__(self):
        self.connectors = {
            'wikidata': EnhancedWikidataConnector(),
            'dbpedia': EnhancedDBpediaConnector(),
            'conceptnet': EnhancedConceptNetConnector()
        }
        self.cache = MultiKGCache()
        self.chunker = TextChunker()
        self.location_extractor = LocationExtractor()
        self.global_locations = {}
        # RAG components deactivated
        self.vectorstore = None  
        self.document_chunks = []  
        self.stats = {
            'queries_processed': 0,
            'entities_extracted': 0,
            'facts_retrieved': 0,
            'cache_hits': 0,
            'chunks_processed': 0,
            'locations_found': 0,
            'locations_with_coordinates': 0,
            'location_duplicates_avoided': 0,
            'rag_queries': 0
        }
        
    def extract_entities_advanced(self, text: str) -> List[str]:
        """Extract entities from text"""
        entities = []
        
        pattern = r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b'
        matches = re.findall(pattern, text)
        entities.extend(matches)
        
        stop_words = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If', 'When', 'Where',
            'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many', 'Few', 'Most', 'Each', 'Every',
            'First', 'Second', 'Third', 'Last', 'Next', 'Previous', 'Before', 'After', 'During'
        }
        
        filtered_entities = []
        for entity in entities:
            entity = entity.strip()
            if (entity not in stop_words and len(entity) > 2 and not entity.isdigit()):
                filtered_entities.append(entity)
        
        seen = set()
        unique_entities = []
        for entity in filtered_entities:
            if entity.lower() not in seen:
                seen.add(entity.lower())
                unique_entities.append(entity)
        
        return unique_entities[:15]
    
    def retrieve_kg_facts_enhanced(self, entities: List[str]) -> Dict[str, List[EnhancedKnowledgeFact]]:
        """Retrieve facts from knowledge graphs with improved timeout handling"""
        all_facts = {}
        cache_hits = 0
        
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {}
            
            for entity in entities:
                for source_name, connector in self.connectors.items():
                    # Check cache first
                    cached_facts = self.cache.get(source_name, entity)
                    if cached_facts:
                        cache_hits += 1
                        if entity not in all_facts:
                            all_facts[entity] = []
                        for fact_data in cached_facts:
                            fact = EnhancedKnowledgeFact(**fact_data)
                            all_facts[entity].append(fact)
                    else:
                        future = executor.submit(connector.retrieve_facts, entity, 3)
                        futures[future] = (entity, source_name)
            
            # Collect results with better timeout handling
            completed = 0
            total_futures = len(futures)
            
            try:
                for future in as_completed(futures, timeout=45):  # Increased timeout
                    entity, source_name = futures[future]
                    completed += 1
                    
                    try:
                        facts = future.result(timeout=5)  # Individual future timeout
                        if facts:
                            self.cache.set(source_name, entity, facts)
                            
                            if entity not in all_facts:
                                all_facts[entity] = []
                            all_facts[entity].extend(facts)
                            
                            self.stats['facts_retrieved'] += len(facts)
                        
                        logger.debug(f"✅ {source_name} completed for {entity} ({completed}/{total_futures})")
                        
                    except Exception as e:
                        logger.warning(f"❌ {source_name} failed for {entity}: {e}")
                        continue
                        
            except TimeoutError:
                pending_count = total_futures - completed
                logger.warning(f"⏰ Timeout: {pending_count}/{total_futures} KG queries still pending, continuing with available results")
                
                # Cancel remaining futures
                for future in futures:
                    if not future.done():
                        future.cancel()
        
        self.stats['cache_hits'] += cache_hits
        logger.info(f"KG retrieval completed: {completed}/{total_futures} successful, {cache_hits} cache hits")
        return all_facts
    
    def format_kg_context_enhanced(self, kg_facts: Dict[str, List[EnhancedKnowledgeFact]]) -> str:
        """Format KG facts into context string"""
        context_parts = []
        
        for entity, facts in kg_facts.items():
            if facts:
                sorted_facts = sorted(facts, key=lambda f: f.confidence, reverse=True)
                
                context_parts.append(f"\n=== Knowledge about {entity} ===")
                
                by_source = {}
                for fact in sorted_facts[:3]:
                    if fact.source not in by_source:
                        by_source[fact.source] = []
                    by_source[fact.source].append(fact)
                
                for source, source_facts in by_source.items():
                    context_parts.append(f"\nFrom {source}:")
                    for fact in source_facts[:2]:
                        fact_str = f"- {fact.subject} {fact.predicate} {fact.object}"
                        if fact.confidence < 0.8:
                            fact_str += f" (confidence: {fact.confidence:.2f})"
                        context_parts.append(fact_str)
        
        return "\n".join(context_parts)
    
    def register_global_location(self, location_info: LocationInfo) -> str:
        """Register location globally and return unique identifier"""
        location_key = location_info.name.lower().strip()
        
        if location_key in self.global_locations:
            existing = self.global_locations[location_key]
            if (location_info.latitude and location_info.longitude and 
                (not existing.latitude or not existing.longitude)):
                self.global_locations[location_key] = location_info
                logger.info(f"Updated coordinates for {location_info.name}")
            else:
                self.stats['location_duplicates_avoided'] += 1
                logger.debug(f"Location {location_info.name} already registered")
        else:
            self.global_locations[location_key] = location_info
            logger.info(f"Registered new location: {location_info.name}")
        
        clean_name = re.sub(r'[^a-zA-Z0-9]', '', location_info.name)
        return f"ste:Location_{clean_name}"
    
    def process_chunk(self, chunk: str, chunk_num: int, claude_client) -> str:  # CHANGED: Parameter name
        """Process a single chunk of text WITHOUT RAG (RAG DEACTIVATED) - CLAUDE 4 VERSION"""
        logger.info(f"Processing chunk {chunk_num} ({len(chunk)} chars) - RAG DISABLED - USING CLAUDE 4")
        
        # RAG retrieval DEACTIVATED - skip this step
        # relevant_context = ""
        
        # Extract entities and locations (this remains the same)
        entities = self.extract_entities_advanced(chunk)
        locations = self.location_extractor.extract_locations_from_text(chunk)
        logger.info(f"Found potential locations in chunk {chunk_num}: {locations}")
        
        # Enrich locations with coordinates
        enriched_locations = {}
        for location_name in locations[:10]:
            location_info = self.location_extractor.enrich_location(location_name)
            if location_info:
                self.register_global_location(location_info)
                enriched_locations[location_name] = location_info
                self.stats['locations_found'] += 1
                if location_info.latitude and location_info.longitude:
                    self.stats['locations_with_coordinates'] += 1
        
        if not entities and not enriched_locations:
            logger.info(f"No entities or locations found in chunk {chunk_num}")
            return ""
        
        logger.info(f"Found entities in chunk {chunk_num}: {entities[:5]}...")
        logger.info(f"Enriched {len(enriched_locations)} locations with coordinates")
        
        # Get KG facts for entities (this remains the same)
        kg_facts = self.retrieve_kg_facts_enhanced(entities)
        kg_context = self.format_kg_context_enhanced(kg_facts)
        location_context = self.format_location_context(enriched_locations)
        
        # SIMPLIFIED PROMPT WITHOUT RAG - OPTIMIZED FOR CLAUDE 4
        simplified_prompt = f"""You are extracting historical events from text chunks. Use knowledge graph facts and location coordinates to enhance your extraction.

CURRENT TEXT CHUNK {chunk_num} TO ANALYZE:
{chunk}

KNOWLEDGE GRAPH FACTS FOR ENTITIES IN THIS CHUNK:
{kg_context}

LOCATION INFORMATION WITH COORDINATES:
{location_context}

TASK: Extract ONLY the events that are actually mentioned in the current text chunk.

Requirements:
1. Extract ONLY events mentioned in the CURRENT text chunk
2. Use KG facts to enhance entity information
3. Use location coordinates to provide precise geographical data
4. Include ALL these properties for each event:
   - ste:hasType (description of event)
   - ste:hasAgent (who caused/led the event)
   - ste:hasTime (when it happened)
   - ste:hasLocation (location name from text)
   - ste:hasLatitude (latitude coordinate if available)
   - ste:hasLongitude (longitude coordinate if available)
   - ste:hasCountry (country if available)
   - ste:hasRegion (region if available)
   - ste:hasLocationSource (source of coordinates: wikidata/dbpedia/local_ontology)
   - ste:hasResult (outcome/consequence)

Output format (do not include prefixes, they will be added later):
```turtle
ste:Event{chunk_num}_1 a ste:Event, dbp:SpecificEventType ;
    ste:hasType "specific description from current chunk" ;
    ste:hasAgent "specific person from current chunk" ;
    ste:hasTime "specific date from current chunk" ;
    ste:hasLocation "specific location from current chunk" ;
    ste:hasLatitude "37.1234"^^xsd:double ;
    ste:hasLongitude "15.5678"^^xsd:double ;
    ste:hasCountry "Italy" ;
    ste:hasRegion "Sicily" ;
    ste:hasLocationSource "wikidata" ;
    ste:hasResult "specific outcome from current chunk" .
```

IMPORTANT: 
- Extract events ONLY from the CURRENT text chunk
- Use KG facts to enrich entity details
- Include precise coordinates from location sources
- Only extract events explicitly mentioned in the current chunk
- If no clear events are found in current chunk, return empty
"""
        
        try:
            # CHANGED: Claude API call instead of OpenAI
            response = claude_client.messages.create(
                model="claude-3-5-sonnet-20241022",  # Latest Claude model
                max_tokens=4000,
                temperature=0,
                messages=[{"role": "user", "content": simplified_prompt}]
            )
            
            turtle_output = self.clean_turtle(response.content[0].text)
            self.stats['chunks_processed'] += 1
            logger.info(f"Generated RDF for chunk {chunk_num} (Claude 4, without RAG)")
            return turtle_output
        except Exception as e:
            logger.error(f"Error processing chunk {chunk_num} with Claude: {e}")
            return ""
    
    def format_location_context(self, enriched_locations: Dict[str, LocationInfo]) -> str:
        """Format location information into context string"""
        if not enriched_locations:
            return "No location coordinates available."
        
        context_parts = ["\n=== Location Information ==="]
        
        for location_name, location_info in enriched_locations.items():
            context_parts.append(f"\n{location_name}:")
            context_parts.append(f"  - Source: {location_info.source}")
            
            if location_info.latitude and location_info.longitude:
                context_parts.append(f"  - Coordinates: {location_info.latitude}, {location_info.longitude}")
                if location_info.source == "corrected":
                    context_parts.append(f"  - NOTE: Coordinates were corrected for historical accuracy")
            else:
                context_parts.append("  - Coordinates: Not available")
            
            if location_info.country:
                context_parts.append(f"  - Country: {location_info.country}")
            
            if location_info.region:
                context_parts.append(f"  - Region: {location_info.region}")
            
            if location_info.uri:
                context_parts.append(f"  - URI: {location_info.uri}")
        
        return "\n".join(context_parts)
    
    def generate_global_location_rdf(self) -> str:
        """Generate RDF for all unique locations found across all chunks"""
        if not self.global_locations:
            return ""
        
        location_rdf_parts = []
        
        for location_key, location_info in self.global_locations.items():
            clean_name = re.sub(r'[^a-zA-Z0-9]', '', location_info.name)
            location_id = f"ste:Location_{clean_name}"
            
            rdf_lines = [f'{location_id} a ste:Location ;']
            rdf_lines.append(f'    rdfs:label "{location_info.name}" ;')
            
            if location_info.latitude and location_info.longitude:
                rdf_lines.append(f'    geo:lat "{location_info.latitude}"^^xsd:double ;')
                rdf_lines.append(f'    geo:long "{location_info.longitude}"^^xsd:double ;')
            
            if location_info.country:
                rdf_lines.append(f'    ste:hasCountry "{location_info.country}" ;')
            
            if location_info.region:
                rdf_lines.append(f'    ste:hasRegion "{location_info.region}" ;')
            
            if location_info.source:
                rdf_lines.append(f'    ste:hasSource "{location_info.source}" ;')
            
            if location_info.uri:
                rdf_lines.append(f'    ste:hasURI <{location_info.uri}> ;')
            
            if rdf_lines[-1].endswith(' ;'):
                rdf_lines[-1] = rdf_lines[-1][:-2] + ' .'
            
            location_rdf_parts.append('\n'.join(rdf_lines))
        
        return '\n\n'.join(location_rdf_parts)
    
    def clean_turtle(self, raw_output: str) -> str:
        """Clean turtle output"""
        m = re.search(r"```(?:turtle)?\s*(.*?)```", raw_output, re.DOTALL | re.IGNORECASE)
        if m:
            return m.group(1).strip()
        
        lines = raw_output.strip().split('\n')
        turtle_lines = []
        for line in lines:
            stripped = line.strip()
            if (stripped.startswith('@') or stripped.startswith('<') or 
                stripped.startswith(':') or stripped.startswith('_') or 
                stripped.startswith('a ') or ':' in stripped or stripped == ''):
                turtle_lines.append(line)
        
        return '\n'.join(turtle_lines)

    # RAG METHODS DEACTIVATED
    def prepare_vectorstore(self, text_chunks: List[str]):
        """RAG DEACTIVATED: Vector store preparation disabled"""
        logger.info("RAG functionality is DEACTIVATED - vectorstore not created")
        return False
    
    def rag_query(self, query: str, claude_client, k: int = 20) -> Dict[str, Any]:  # CHANGED: Parameter name
        """RAG DEACTIVATED: RAG queries disabled"""
        return {"error": "RAG functionality is DEACTIVATED. Set RAG_ENABLED=True to enable RAG features."}
    
    def interactive_rag_session(self, claude_client):  # CHANGED: Parameter name
        """RAG DEACTIVATED: Interactive RAG session disabled"""
        print("\n❌ RAG functionality is DEACTIVATED")
        print("To enable RAG, set RAG_ENABLED=True at the top of the script")

# Utility functions
def load_api_key():
    """Load Anthropic API key"""  # CHANGED: Comment
    load_dotenv()
    api_key = os.getenv("ANTHROPIC_API_KEY")  # CHANGED: Environment variable name
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")  # CHANGED: Error message
        print("Please set your Anthropic API key:")
        print("export ANTHROPIC_API_KEY='your-api-key-here'")
        return None
    print("Anthropic API Key loaded successfully.")  # CHANGED: Success message
    return api_key

def load_text_from_file(filepath: str) -> str:
    """Load text from file"""
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return ""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        print(f"Loaded text from {filepath}")
        return text
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return ""

def initialize_claude_client(api_key: str):  # CHANGED: Function name and logic
    """Initialize Claude client"""
    if not api_key:
        return None
    try:
        client = anthropic.Anthropic(api_key=api_key)
        print("Claude 4 client initialized successfully.")
        return client
    except Exception as e:
        print(f"Error initializing Claude client: {e}")
        return None

def prepare_vectorstore_from_text(text: str, multi_kg_system):
    """RAG DEACTIVATED: Vector store creation disabled"""
    if not RAG_ENABLED:
        logger.info("RAG functionality is DEACTIVATED - vectorstore not created")
        return None
    
    # Original code would go here if RAG_ENABLED was True
    return None

def main():
    """Main function with chunking support (RAG DEACTIVATED) - CLAUDE 4 VERSION"""
    print("🚀 Starting Multi-Knowledge Graph System with Chunking (RAG DEACTIVATED) - CLAUDE 4 VERSION")
    
    api_key = load_api_key()
    if not api_key:
        return
    
    domain_text = load_text_from_file(INPUT_TEXT_FILE)
    if not domain_text:
        print("⚠️  No input file found, using sample text")
        domain_text = """The Battle of Salamis was a decisive naval battle in 480 BC. 
        Themistocles led the Greek fleet to victory over the Persians commanded by Xerxes. 
        This victory established Greek naval supremacy in the Aegean Sea."""
    else:
        print(f"📄 Using YOUR text from {INPUT_TEXT_FILE}")
        print(f"📝 Text length: {len(domain_text)} characters")
    
    multi_kg_system = EnhancedMultiKGRAGSystem()
    claude_client = initialize_claude_client(api_key)  # CHANGED: Function call
    
    if not claude_client:  # CHANGED: Variable name
        return
    
    # Vector store preparation SKIPPED (RAG deactivated)
    print("\n❌ RAG vector store setup SKIPPED (RAG is DEACTIVATED)")
    
    token_count = multi_kg_system.chunker.count_tokens(domain_text)
    print(f"🔢 Total tokens in text: {token_count:,}")
    
    if token_count > 25000:  # CHANGED: Increased threshold for Claude
        print("📊 Text is large, chunking into smaller pieces...")
        chunks = multi_kg_system.chunker.chunk_text_by_sentences(domain_text, max_tokens=25000)  # CHANGED: Increased
        print(f"📄 Created {len(chunks)} chunks")
    else:
        print("📄 Text is small enough to process as single chunk")
        chunks = [domain_text]
    
    # Extract events and create RDF (without RAG)
    all_turtle_outputs = []
    all_entities = set()
    
    print("\n🔄 Processing chunks for event extraction (without RAG, using Claude 4)...")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n🔄 Processing chunk {i}/{len(chunks)} with Claude 4...")
        
        turtle_output = multi_kg_system.process_chunk(chunk, i, claude_client)  # CHANGED: Parameter
        if turtle_output:
            all_turtle_outputs.append(turtle_output)
            
        chunk_entities = multi_kg_system.extract_entities_advanced(chunk)
        all_entities.update(chunk_entities)
        
        if i < len(chunks):
            time.sleep(1)  # Rate limiting
    
    # Save RDF output
    if all_turtle_outputs:
        prefixes = """@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix dbp: <http://dbpedia.org/ontology/> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix dbpr: <http://dbpedia.org/resource/> .

"""
        
        final_output = prefixes + "# Historical Events with Knowledge Graph Enhanced Location Data (RAG DEACTIVATED, CLAUDE 4)\n" + "\n\n".join(all_turtle_outputs)
        
        with open(OUTPUT_RAG_TTL, 'w', encoding='utf-8') as f:
            f.write(final_output)
        
        print(f"\n✅ Saved RDF to {OUTPUT_RAG_TTL}")
        print(f"📊 Processing Statistics (Claude 4):")
        print(f"   - Total chunks processed: {len(chunks)}")
        print(f"   - Successful chunks: {len(all_turtle_outputs)}")
        print(f"   - Unique entities found: {len(all_entities)}")
        print(f"   - Total KG facts retrieved: {multi_kg_system.stats['facts_retrieved']}")
        print(f"   - Cache hits: {multi_kg_system.stats['cache_hits']}")
        print(f"   - Locations found: {multi_kg_system.stats['locations_found']}")
        print(f"   - Locations with coordinates: {multi_kg_system.stats['locations_with_coordinates']}")
        print(f"   - Location duplicates avoided: {multi_kg_system.stats['location_duplicates_avoided']}")
        print(f"   - Unique global locations: {len(multi_kg_system.global_locations)}")
        print(f"   - LLM used: Claude 4 (Anthropic)")  # CHANGED: Added LLM info
        print(f"   - RAG status: DEACTIVATED")
        
        print(f"\n🔗 Knowledge Graph Connector Statistics:")
        for name, connector in multi_kg_system.connectors.items():
            stats = connector.get_stats()
            print(f"   - {stats['name']}: {stats['successes']}/{stats['requests']} requests ({stats['success_rate']:.1%} success)")
        
        if multi_kg_system.location_extractor.location_cache:
            successful_locations = sum(1 for v in multi_kg_system.location_extractor.location_cache.values() if v is not None)
            total_locations = len(multi_kg_system.location_extractor.location_cache)
            print(f"   - Location enrichment: {successful_locations}/{total_locations} locations enriched ({successful_locations/total_locations:.1%} success)")
        
        print(f"\n📝 Sample of generated RDF:")
        print("="*60)
        print(final_output[:1000] + "..." if len(final_output) > 1000 else final_output)
        print("="*60)
        
    else:
        print("❌ No events were extracted from any chunks")
    
    # RAG SESSION DEACTIVATED
    print(f"\n❌ RAG System is DEACTIVATED")
    print(f"💡 To enable RAG functionality:")
    print(f"   1. Set RAG_ENABLED = True at the top of the script")
    print(f"   2. Ensure langchain dependencies are installed")
    print(f"   3. Re-run the script")
    
    print(f"\n🎉 Process complete! Check {OUTPUT_RAG_TTL} for RDF results.")
    print(f"📊 System ran in NON-RAG mode with Claude 4 - only Knowledge Graph and Location enrichment was used.")

if __name__ == '__main__':
    main()

2025-05-29 07:51:02,083 - INFO - Loaded location ontology from locations.owl


🚀 Starting Multi-Knowledge Graph System with Chunking (RAG DEACTIVATED) - CLAUDE 4 VERSION
Anthropic API Key loaded successfully.
Loaded text from part_aa
📄 Using YOUR text from part_aa
📝 Text length: 398568 characters
Claude 4 client initialized successfully.

❌ RAG vector store setup SKIPPED (RAG is DEACTIVATED)
🔢 Total tokens in text: 86,945
📊 Text is large, chunking into smaller pieces...


2025-05-29 07:51:07,312 - INFO - Processing chunk 1 (117220 chars) - RAG DISABLED - USING CLAUDE 4
2025-05-29 07:51:07,321 - INFO - Found potential locations in chunk 1: ['Corinthians', 'Ambraciots', 'Chimerium', 'Ephyre', 'King Cyrus', 'BOOK', 'Immediately', 'Cambyses', 'Pydna Accordingly', 'Strepsa', 'The Lacedaemonians', 'Pisistratus', 'Lacedaemonius', 'Seeing', 'Thyamis', 'Phocaeans', 'Atreus', 'Hence', 'Various', 'Achaeans', 'The Translator', 'Ambracia', 'Megara', 'Lacedaemon\n\nThe Athenians', 'Ozolian Locrians', 'Thrace', 'Bottiaeans', 'Old', 'Sybota Thus', 'Peloponnesian War', 'Troy', 'Actium', 'Zacynthus', 'Could', 'Corcyraeans', 'Athenians Thus', 'Trojan', 'Hellenes', 'Alive', 'Corinth', 'Euboea', 'Chrysippus', 'Assuredly', 'Sicily With', 'Meikiades', 'Epidamnus', 'Eurybatus', 'Corinth But', 'Corinth For', 'Eurystheus', 'THE PELOPONNESIAN WAR', 'Thesprotis', 'They', 'START', 'Danaans', 'Asopius Arrived', 'Glaucon', 'CHAPTER', 'Pale', 'Aware', 'Subsequently', 'Thessalians', 'P

📄 Created 4 chunks

🔄 Processing chunks for event extraction (without RAG, using Claude 4)...

🔄 Processing chunk 1/4 with Claude 4...


2025-05-29 07:51:19,599 - INFO - Registered new location: Cambyses
2025-05-29 07:51:22,321 - INFO - Registered new location: Strepsa
2025-05-29 07:51:22,322 - INFO - Found entities in chunk 1: ['The Project Gutenberg', 'The History', 'Peloponnesian War', 'Thucydides\nThis', 'United States']...
2025-05-29 07:51:22,323 - INFO - Enriched 2 locations with coordinates
2025-05-29 07:51:22,589 - INFO - Retrieved 0 facts from Wikidata for 'The Project Gutenberg'
This
This
2025-05-29 07:51:24,597 - INFO - Retrieved 0 facts from Wikidata for 'Project Gutenberg License'
2025-05-29 07:51:24,794 - INFO - Retrieved 0 facts from DBpedia for 'Project Gutenberg License'
Author
Author
Translator
Translator
Release Date
Release Date
Character
Character
2025-05-29 07:51:59,875 - INFO - KG retrieval completed: 21/21 successful, 24 cache hits
2025-05-29 07:52:13,858 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-05-29 07:52:13,864 - INFO - Generated RDF for chunk 1 


🔄 Processing chunk 2/4 with Claude 4...


2025-05-29 07:52:21,049 - INFO - Registered new location: Cylon
2025-05-29 07:52:21,050 - INFO - Found entities in chunk 2: ['Themistocles', 'Piraeus', 'Athenians', 'For', 'Between']...
2025-05-29 07:52:21,051 - INFO - Enriched 1 locations with coordinates
2025-05-29 07:52:21,303 - INFO - Retrieved 3 facts from DBpedia for 'About'
2025-05-29 07:52:21,511 - INFO - Retrieved 3 facts from Wikidata for 'About'
2025-05-29 07:52:22,277 - INFO - Retrieved 3 facts from DBpedia for 'His'
2025-05-29 07:52:22,352 - INFO - Retrieved 3 facts from Wikidata for 'His'
2025-05-29 07:52:23,339 - INFO - Retrieved 3 facts from Wikidata for 'Thus'
2025-05-29 07:52:23,565 - INFO - Retrieved 3 facts from DBpedia for 'Thus'
2025-05-29 07:52:30,445 - INFO - Retrieved 1 facts from ConceptNet for 'About'
2025-05-29 07:52:30,708 - INFO - Retrieved 0 facts from Wikidata for 'Mede Meanwhile Pausanias'
2025-05-29 07:52:30,927 - INFO - Retrieved 0 facts from DBpedia for 'Mede Meanwhile Pausanias'
2025-05-29 07:52:34,


🔄 Processing chunk 3/4 with Claude 4...


2025-05-29 07:53:30,027 - INFO - Registered new location: Cydonia
2025-05-29 07:53:35,449 - INFO - Found entities in chunk 3: ['There', 'Against', 'Further', 'Athenian', 'Athens']...
2025-05-29 07:53:35,451 - INFO - Enriched 1 locations with coordinates
2025-05-29 07:53:35,677 - INFO - Retrieved 3 facts from DBpedia for 'Our'
2025-05-29 07:53:35,778 - INFO - Retrieved 3 facts from Wikidata for 'Our'
2025-05-29 07:53:36,721 - INFO - Retrieved 3 facts from DBpedia for 'Nor'
2025-05-29 07:53:36,787 - INFO - Retrieved 3 facts from Wikidata for 'Nor'
2025-05-29 07:53:37,788 - INFO - Retrieved 3 facts from Wikidata for 'Again'
2025-05-29 07:53:38,013 - INFO - Retrieved 3 facts from DBpedia for 'Again'
2025-05-29 07:53:44,384 - INFO - Retrieved 1 facts from ConceptNet for 'Our'
2025-05-29 07:53:44,656 - INFO - Retrieved 0 facts from Wikidata for 'Athenian And'
2025-05-29 07:53:44,875 - INFO - Retrieved 0 facts from DBpedia for 'Athenian And'
2025-05-29 07:53:52,575 - INFO - Retrieved 1 facts 


🔄 Processing chunk 4/4 with Claude 4...


2025-05-29 07:54:35,690 - INFO - Registered new location: Plataea
2025-05-29 07:54:36,477 - INFO - Registered new location: Cyme
2025-05-29 07:54:36,478 - INFO - Registered new location: Salamis
2025-05-29 07:54:45,537 - INFO - Found entities in chunk 4: ['Antissa', 'Pyrrha', 'Eresus', 'Methymnians', 'Antissians']...
2025-05-29 07:54:45,538 - INFO - Enriched 3 locations with coordinates
2025-05-29 07:54:45,765 - INFO - Retrieved 0 facts from DBpedia for 'Methymnians'
2025-05-29 07:54:45,809 - INFO - Retrieved 0 facts from Wikidata for 'Methymnians'
2025-05-29 07:54:46,815 - INFO - Retrieved 0 facts from Wikidata for 'Antissians'
2025-05-29 07:54:47,079 - INFO - Retrieved 0 facts from DBpedia for 'Antissians'
2025-05-29 07:54:47,806 - INFO - Retrieved 0 facts from Wikidata for 'Mitylenians'
2025-05-29 07:54:48,036 - INFO - Retrieved 0 facts from DBpedia for 'Mitylenians'
2025-05-29 07:55:05,644 - INFO - Retrieved 0 facts from Wikidata for 'The Athenians'
2025-05-29 07:55:05,862 - INFO -


✅ Saved RDF to extracted_events_norag_with_multi_kg_Claude.ttl
📊 Processing Statistics (Claude 4):
   - Total chunks processed: 4
   - Successful chunks: 4
   - Unique entities found: 56
   - Total KG facts retrieved: 53
   - Cache hits: 106
   - Locations found: 7
   - Locations with coordinates: 7
   - Location duplicates avoided: 0
   - Unique global locations: 7
   - LLM used: Claude 4 (Anthropic)
   - RAG status: DEACTIVATED

🔗 Knowledge Graph Connector Statistics:
   - Wikidata: 18/23 requests (78.3% success)
   - DBpedia: 17/22 requests (77.3% success)
   - ConceptNet: 6/29 requests (20.7% success)
   - Location enrichment: 40/157 locations enriched (25.5% success)

📝 Sample of generated RDF:
@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix dbp: <http://dbpedia.org/ontology/> .
@prefix geo: <http://www.w3.org