In [None]:
# Cell 1: Mount Drive and Install Dependencies
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers torch accelerate

"""
TRUE BASE GENERATION SYSTEM
No external knowledge sources - pure LLM generation from text only
All external APIs, Knowledge Graphs, and Location enrichment DISABLED
"""

import os
import re
import time
import logging
from typing import List, Dict, Any
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration
# Input text and output Turtle file; both paths live under the Google Drive
# mount point used by this notebook (/content/drive).
INPUT_TEXT_FILE = "/content/drive/MyDrive/part_aa"
OUTPUT_BASE_TTL = '/content/drive/MyDrive/extracted_events_base_generation_G.ttl'

# TRUE BASE GENERATION FLAGS - All external knowledge DISABLED
# NOTE(review): these flags are declarative only; nothing in the visible part
# of this cell reads them.
RAG_ENABLED = False
KNOWLEDGE_GRAPHS_ENABLED = False
LOCATION_ENRICHMENT_ENABLED = False
EXTERNAL_APIS_ENABLED = False

# Token limits
# NOTE(review): neither constant is referenced in the visible code; main()
# passes its own chunk size and no overlap is applied by TextChunker.
MAX_TOKENS_PER_REQUEST = 100000
CHUNK_OVERLAP = 200

# Logging
# Root logger at INFO with timestamped records; `logger` is the module-level
# logger used by the classes below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TextChunker:
    """Split long text into sentence-aligned chunks under an approximate token budget.

    Token counts are estimated from character length; no tokenizer is loaded.
    """

    def __init__(self, model_name: str = "microsoft/DialoGPT-medium"):
        # Stored for reference only; the heuristic counter does not use it.
        self.model_name = model_name

    def count_tokens(self, text: str) -> int:
        """Return an approximate token count (roughly 4 characters per token)."""
        return len(text) // 4

    def chunk_text_by_sentences(self, text: str, max_tokens: int = 1000) -> List[str]:
        """Group whole sentences into chunks of at most ``max_tokens`` approximate tokens.

        Sentences are delimited by runs of ``.``, ``!`` or ``?``. The terminator
        is kept attached to its sentence so that the resulting chunks still read
        as punctuated prose (previously the terminators were discarded).

        Args:
            text: The text to split.
            max_tokens: Approximate per-chunk budget, measured by ``count_tokens``.

        Returns:
            The chunks in document order. A single sentence that exceeds the
            budget on its own is emitted as its own (oversized) chunk.
        """
        chunks: List[str] = []
        current_chunk = ""

        # Group 1 is the sentence body, group 2 its (possibly empty) terminator run.
        for match in re.finditer(r'([^.!?]+)([.!?]*)', text):
            body = match.group(1).strip()
            if not body:
                # Whitespace between terminators carries no content.
                continue
            sentence = body + match.group(2)

            test_chunk = current_chunk + " " + sentence if current_chunk else sentence

            if self.count_tokens(test_chunk) > max_tokens and current_chunk:
                # Adding this sentence would overflow: close the current chunk
                # and start a new one with the sentence.
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk = test_chunk

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

class TrueBaseGenerationSystem:
    """Extract events from text with a local causal LM and no external knowledge.

    Candidate entities and locations are found with regular expressions only;
    the model is then prompted per chunk and Turtle-looking lines are kept
    from its reply. Running totals are kept in ``self.stats``.
    """

    def __init__(self):
        self.chunker = TextChunker()
        self.stats = {
            'chunks_processed': 0,
            'events_extracted': 0,
            'external_api_calls': 0,  # This will stay 0
            'knowledge_sources_used': 0,  # This will stay 0
        }
        logger.info("TRUE BASE GENERATION SYSTEM INITIALIZED")
        logger.info("❌ Knowledge Graphs: DISABLED")
        logger.info("❌ Location Enrichment: DISABLED")
        logger.info("❌ External APIs: DISABLED")
        logger.info("❌ RAG Text Retrieval: DISABLED")
        logger.info("✅ Pure LLM Generation: ENABLED")

    def extract_basic_entities_from_text_only(self, text: str) -> List[str]:
        """Return up to five capitalised word runs from ``text``.

        Matches are filtered against a small stop-word list, de-duplicated
        case-insensitively, and returned in order of first appearance.
        """
        pattern = r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b'

        stop_words = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If',
            'When', 'Where', 'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many',
            'First', 'Second', 'Third', 'Last', 'Next', 'Before', 'After', 'During'
        }

        seen = set()
        unique_entities = []
        for match in re.findall(pattern, text):
            entity = match.strip()
            if entity in stop_words or len(entity) <= 2 or entity.isdigit():
                continue
            key = entity.lower()
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        return unique_entities[:5]  # Reduced for smaller model

    def extract_basic_locations_from_text_only(self, text: str) -> List[str]:
        """Return up to three location-like phrases from ``text``.

        Two patterns are used: capitalised names followed by a geographic
        suffix ("Aegean Sea") and names introduced by a geographic prefix
        ("Mount Olympus"). Results are de-duplicated in first-seen order so
        the selection is reproducible between runs (a plain ``set`` does not
        guarantee which three survive the slice).
        """
        location_patterns = [
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:City|County|State|Province|Country|Region|Island|Bay|Sea|Ocean|River|Mountain|Valley|Desert))\b',
            r'\b(?:Mount|Lake|River|Cape|Fort|Port|Saint|St\.)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b',
        ]

        location_stopwords = {
            'The', 'This', 'That', 'And', 'But', 'Or', 'So', 'If', 'When', 'Where',
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
            'September', 'October', 'November', 'December'
        }

        filtered_locations = []
        for pattern in location_patterns:
            for match in re.findall(pattern, text):
                loc = match.strip()
                if loc not in location_stopwords and len(loc) > 2 and not loc.isdigit():
                    filtered_locations.append(loc)

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(filtered_locations))[:3]  # Reduced for smaller model

    def _build_base_prompt(self, chunk: str, chunk_num: int,
                           entities: List[str], locations: List[str]) -> str:
        """Render the extraction prompt for one chunk."""
        base_prompt = f"""You are extracting historical events from text using ONLY the information provided in the text chunk. Do not use external knowledge sources, but you CAN and SHOULD make reasonable inferences from the text.

TEXT CHUNK {chunk_num} TO ANALYZE:
{chunk}

ENTITIES FOUND IN TEXT: {', '.join(entities) if entities else 'None'}
LOCATIONS FOUND IN TEXT: {', '.join(locations) if locations else 'None'}

TASK: Extract historical events mentioned in this text chunk using the text information and making REASONABLE INFERENCES.

REQUIREMENTS:
1. Extract ONLY events explicitly mentioned in the text chunk
2. Use information directly stated in the text
3. MAKE REASONABLE INFERENCES from context clues in the text
4. If you can reasonably infer coordinates, countries, regions from textual context, DO IT
5. Include ALL these properties for each event:
   - ste:hasType (description of event, enhanced with context)
   - ste:hasAgent (who caused/led the event, with inferred roles)
   - ste:hasTime (when it happened, with inferred specificity)
   - ste:hasLocation (location name from text)
   - ste:hasLatitude (infer approximate coordinates if you can from text context)
   - ste:hasLongitude (infer approximate coordinates if you can from text context)
   - ste:hasCountry (infer country from textual geographic context)
   - ste:hasRegion (infer region from textual geographic context)
   - ste:hasLocationSource "inferred" (if you made geographic inferences)
   - ste:hasResult (outcome, enhanced with contextual inference)

INFERENCE GUIDELINES:
- If text mentions "Athens", infer it's in Greece, approximate coordinates
- If text mentions "Sicily", infer it's in Italy, Mediterranean coordinates
- If text mentions "Sparta/Lacedaemon", infer Peloponnese, Greece
- If you know from context clues what geographic region events occurred in, infer coordinates
- If someone is called "King", infer royal title
- If text implies timeframes, infer more specific dates
- If outcomes are implied, infer likely results

Output format (do not include prefixes):
```turtle
ste:Event{chunk_num}_1 a ste:Event ;
    ste:hasType "specific event type inferred from context" ;
    ste:hasAgent "person/group with inferred roles" ;
    ste:hasTime "time period with inferred specificity" ;
    ste:hasLocation "location name from text" ;
    ste:hasLatitude "37.9838" ;
    ste:hasLongitude "23.7275" ;
    ste:hasCountry "Greece" ;
    ste:hasRegion "Attica" ;
    ste:hasLocationSource "inferred" ;
    ste:hasResult "outcome inferred from context" .
```

CRITICAL:
- Use the SAME output format as enhanced systems for fair comparison
- INFER coordinates, countries, regions if you can reasonably deduce them from text
- Make the events as detailed and specific as possible through inference
- If you truly cannot infer something, then use empty string ""
- The goal is to extract maximum information through text analysis and inference

If no clear historical events are mentioned in the text, return empty.
"""
        return base_prompt

    @staticmethod
    def _count_events(turtle_output: str) -> int:
        """Count event declarations (``... a ste:Event``) in cleaned Turtle.

        Counting the bare substring ``ste:Event`` would count each event twice:
        once for its subject IRI and once for its class.
        """
        return len(re.findall(r'\ba\s+ste:Event\b', turtle_output))

    def process_chunk_true_base(self, chunk: str, chunk_num: int, model_data) -> str:
        """Prompt the model for one chunk and return the cleaned Turtle text.

        Args:
            chunk: Text to analyse.
            chunk_num: 1-based chunk index, used in logs and event identifiers.
            model_data: ``(tokenizer, model)`` pair as returned by ``initialize_llm``.

        Returns:
            The Turtle-looking lines of the model's reply, or ``""`` when no
            entities/locations were found, nothing usable was generated, or
            generation raised (the error is logged, not propagated).
        """
        logger.info("Processing chunk %s (%s chars) - TRUE BASE GENERATION", chunk_num, len(chunk))

        tokenizer, model = model_data

        entities = self.extract_basic_entities_from_text_only(chunk)
        locations = self.extract_basic_locations_from_text_only(chunk)

        logger.info("Found entities (text-only): %s...", entities[:3])
        logger.info("Found locations (text-only): %s...", locations[:3])

        if not entities and not locations:
            logger.info("No entities or locations found in chunk %s", chunk_num)
            return ""

        base_prompt = self._build_base_prompt(chunk, chunk_num, entities, locations)

        try:
            # NOTE(review): the prompt is truncated to 512 tokens; tokenizers
            # truncate on the right by default, so for long chunks the
            # instructions that follow the chunk text are presumably cut off.
            # Confirm and size chunks accordingly.
            inputs = tokenizer.encode(base_prompt, return_tensors="pt", max_length=512, truncation=True)
            prompt_token_count = inputs.shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_new_tokens=200,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Slice by token position, not by character count: the prompt may
            # have been truncated, so the decoded text is not guaranteed to
            # start with the full prompt string. Slicing characters with
            # len(base_prompt) produced an empty result in that case.
            generated_ids = outputs[0][prompt_token_count:]
            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            # DEBUG: Print what the model actually generated
            print(f"🔍 Raw model output for chunk {chunk_num}:")
            print(f"'{generated_text[:200]}...'")

            turtle_output = self.clean_turtle(generated_text)

            # DEBUG: Print cleaned output
            print(f"🔍 Cleaned turtle output:")
            print(f"'{turtle_output[:200]}...'")

            if turtle_output:
                self.stats['chunks_processed'] += 1
                event_count = self._count_events(turtle_output)
                self.stats['events_extracted'] += event_count
                logger.info("Generated %s events from chunk %s", event_count, chunk_num)
            else:
                print(f"⚠️ No turtle output after cleaning for chunk {chunk_num}")

            return turtle_output

        except Exception as e:
            # Best effort by design: one failing chunk must not abort the run.
            logger.error(f"Error processing chunk {chunk_num}: {e}")
            return ""

    def clean_turtle(self, raw_output: str) -> str:
        """Keep only the lines of ``raw_output`` that look like Turtle statements.

        A line is kept (with its original indentation) when, after stripping,
        it starts with ``ste:`` or ``a ``, or contains a colon together with
        one of ``hasType`` / ``hasAgent`` / ``hasTime``.
        """
        turtle_lines = []
        for line in raw_output.strip().split('\n'):
            stripped = line.strip()
            mentions_property = ':' in stripped and any(
                marker in stripped for marker in ('hasType', 'hasAgent', 'hasTime')
            )
            if stripped.startswith(('ste:', 'a ')) or mentions_property:
                turtle_lines.append(line)

        return '\n'.join(turtle_lines)

# Utility functions
def load_api_key():
    """Return a placeholder key; the locally loaded model needs no API key.

    Prints a confirmation message and returns the truthy sentinel "local" so
    the caller's "missing key" check passes.
    """
    placeholder_key = "local"
    print("✅ Using Hugging Face model - no API key needed.")
    return placeholder_key

def load_text_from_file(filepath: str) -> str:
    """Read a UTF-8 text file and return its contents with outer whitespace removed.

    Args:
        filepath: Path of the file to read.

    Returns:
        The stripped file contents, or "" when the path is not a regular file
        or reading it raises (a message is printed in both cases).
    """
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                contents = handle.read().strip()
            print(f"✅ Loaded text from {filepath}")
            return contents
        except Exception as err:
            print(f"❌ Error reading file {filepath}: {err}")
            return ""

    print(f"❌ File not found: {filepath}")
    return ""

def initialize_llm(api_key: str):
    """Load the tokenizer and causal language model used for generation.

    Args:
        api_key: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``None`` if loading raised (the
        error is printed rather than propagated).
    """
    try:
        print("🔄 Loading Hugging Face model (this may take a few minutes)...")

        # Candidate checkpoints; only the first entry is ever selected.
        # NOTE(review): google/flan-t5-large is an encoder-decoder model and
        # presumably would not load through AutoModelForCausalLM - confirm
        # before switching to it.
        candidate_models = (
            "microsoft/DialoGPT-medium",
            "google/flan-t5-large",
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        )
        chosen_model = candidate_models[0]

        tok = AutoTokenizer.from_pretrained(chosen_model)
        lm = AutoModelForCausalLM.from_pretrained(chosen_model)

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        print("✅ Hugging Face model loaded successfully.")
        return (tok, lm)
    except Exception as err:
        print(f"❌ Error loading model: {err}")
        print("💡 Try using a different model or check your internet connection")
        return None

def main():
    """Run the full pipeline: load text, chunk it, generate events, save Turtle.

    Falls back to a built-in sample paragraph when the input file is missing,
    empty or unreadable. Returns early (with no output) when the model cannot
    be loaded. The output file is written, and its path reported, only when at
    least one chunk produced Turtle output.
    """
    print("🚀 Starting TRUE BASE GENERATION SYSTEM with Hugging Face")
    print("="*60)
    print("🤗 LLM: Hugging Face Transformers (Free)")
    print("❌ Knowledge Graphs: DISABLED")
    print("❌ Location Coordinate Lookup: DISABLED")
    print("❌ External APIs: DISABLED")
    print("❌ RAG Text Retrieval: DISABLED")
    print("❌ All External Knowledge Sources: DISABLED")
    print("✅ Pure LLM Generation from Text Only: ENABLED")
    print("="*60)

    api_key = load_api_key()
    if not api_key:
        return

    domain_text = load_text_from_file(INPUT_TEXT_FILE)
    if not domain_text:
        print("⚠️  No input file found, using sample text")
        domain_text = """The Battle of Salamis was a decisive naval battle in 480 BC.
        Themistocles led the Greek fleet to victory over the Persians commanded by Xerxes.
        This victory established Greek naval supremacy in the Aegean Sea."""
    else:
        print(f"📄 Using text from {INPUT_TEXT_FILE}")
        print(f"📝 Text length: {len(domain_text)} characters")

    base_system = TrueBaseGenerationSystem()
    model_data = initialize_llm(api_key)

    if not model_data:
        return

    token_count = base_system.chunker.count_tokens(domain_text)
    print(f"🔢 Total tokens in text: {token_count:,}")

    # Use 10000 token chunks for Colab
    # NOTE(review): process_chunk_true_base truncates each prompt to 512
    # tokens, so most of a 10000-token chunk presumably never reaches the
    # model - confirm and reduce the chunk size to fit the model's context.
    if token_count > 10000:
        print("📊 Text is large, chunking into smaller pieces...")
        chunks = base_system.chunker.chunk_text_by_sentences(domain_text, max_tokens=10000)
        print(f"📄 Created {len(chunks)} chunks")
    else:
        print("📄 Text is small enough to process as single chunk")
        chunks = [domain_text]

    # Process chunks with TRUE base generation
    all_turtle_outputs = []

    print("\n🔄 Processing chunks with TRUE BASE GENERATION...")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n🔄 Processing chunk {i}/{len(chunks)}...")

        turtle_output = base_system.process_chunk_true_base(chunk, i, model_data)
        if turtle_output:
            all_turtle_outputs.append(turtle_output)

        if i < len(chunks):
            time.sleep(1)  # Pause between chunks

    # Save RDF output
    if all_turtle_outputs:
        prefixes = """@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

"""

        final_output = prefixes + "# TRUE BASE GENERATION - No External Knowledge Sources (Hugging Face)\n" + "\n\n".join(all_turtle_outputs)

        with open(OUTPUT_BASE_TTL, 'w', encoding='utf-8') as f:
            f.write(final_output)

        print(f"\n✅ Saved TRUE BASE GENERATION RDF to {OUTPUT_BASE_TTL}")
        print("📊 TRUE BASE GENERATION Statistics:")
        print("   - LLM: Hugging Face Transformers (Free)")
        print("   - Generation Mode: PURE BASE (No External Knowledge)")
        print(f"   - Total chunks processed: {len(chunks)}")
        print(f"   - Successful chunks: {len(all_turtle_outputs)}")
        print(f"   - Events extracted: {base_system.stats['events_extracted']}")

        print("\n📝 Sample of TRUE BASE GENERATION RDF:")
        print("="*60)
        print(final_output[:400] + "..." if len(final_output) > 400 else final_output)
        print("="*60)

    else:
        print("❌ No events were extracted from any chunks")

    print("\n🎉 TRUE BASE GENERATION complete!")
    if all_turtle_outputs:
        # Only point at the output file when this run actually wrote it.
        print(f"📄 Output file: {OUTPUT_BASE_TTL}")

# Run the pipeline when this cell/script is executed directly (not on import).
if __name__ == '__main__':
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🚀 Starting TRUE BASE GENERATION SYSTEM with Hugging Face
🤗 LLM: Hugging Face Transformers (Free)
❌ Knowledge Graphs: DISABLED
❌ Location Coordinate Lookup: DISABLED
❌ External APIs: DISABLED
❌ RAG Text Retrieval: DISABLED
❌ All External Knowledge Sources: DISABLED
✅ Pure LLM Generation from Text Only: ENABLED
✅ Using Hugging Face model - no API key needed.
✅ Loaded text from /content/drive/MyDrive/part_aa
📄 Using text from /content/drive/MyDrive/part_aa
📝 Text length: 398568 characters
🔄 Loading Hugging Face model (this may take a few minutes)...


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

✅ Hugging Face model loaded successfully.
🔢 Total tokens in text: 99,642
📊 Text is large, chunking into smaller pieces...
📄 Created 10 chunks

🔄 Processing chunks with TRUE BASE GENERATION...

🔄 Processing chunk 1/10...
🔍 Raw model output for chunk 1:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 1

🔄 Processing chunk 2/10...
🔍 Raw model output for chunk 2:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 2

🔄 Processing chunk 3/10...
🔍 Raw model output for chunk 3:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 3

🔄 Processing chunk 4/10...
🔍 Raw model output for chunk 4:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 4

🔄 Processing chunk 5/10...
🔍 Raw model output for chunk 5:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 5

🔄 Processing chunk 6/10...
🔍 Raw model output for chunk 6:
'...'
🔍 Cleaned turtle outpu

In [None]:
# Cell 1: Mount Drive and Install Dependencies
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers torch accelerate

# AUTHENTICATION FOR LLAMA (required)
from huggingface_hub import login
from getpass import getpass

# Use your HF token for Llama access
hf_token = getpass("Enter your HuggingFace token: ")
login(token=hf_token)

"""
TRUE BASE GENERATION SYSTEM
No external knowledge sources - pure LLM generation from text only
All external APIs, Knowledge Graphs, and Location enrichment DISABLED
"""

import os
import re
import time
import logging
from typing import List, Dict, Any
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration
# Input text and output Turtle file; both paths live under the Google Drive
# mount point used by this notebook (/content/drive).
INPUT_TEXT_FILE = "/content/drive/MyDrive/part_aa"
OUTPUT_BASE_TTL = '/content/drive/MyDrive/extracted_events_base_generation_G.ttl'

# TRUE BASE GENERATION FLAGS - All external knowledge DISABLED
# NOTE(review): these flags are declarative only; nothing in the visible part
# of this cell reads them.
RAG_ENABLED = False
KNOWLEDGE_GRAPHS_ENABLED = False
LOCATION_ENRICHMENT_ENABLED = False
EXTERNAL_APIS_ENABLED = False

# Token limits
# NOTE(review): neither constant is referenced in the visible part of this
# cell; main() passes its own chunk size and TextChunker applies no overlap.
MAX_TOKENS_PER_REQUEST = 100000
CHUNK_OVERLAP = 200

# Logging
# Root logger at INFO with timestamped records; `logger` is the module-level
# logger used by the classes below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TextChunker:
    """Split long text into sentence-aligned chunks under an approximate token budget.

    Token counts are estimated from character length; no tokenizer is loaded.
    """

    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        # Stored for reference only; the heuristic counter does not use it.
        self.model_name = model_name

    def count_tokens(self, text: str) -> int:
        """Return an approximate token count (roughly 4 characters per token)."""
        return len(text) // 4

    def chunk_text_by_sentences(self, text: str, max_tokens: int = 1000) -> List[str]:
        """Group whole sentences into chunks of at most ``max_tokens`` approximate tokens.

        Sentences are delimited by runs of ``.``, ``!`` or ``?``. The terminator
        is kept attached to its sentence so that the resulting chunks still read
        as punctuated prose (previously the terminators were discarded).

        Args:
            text: The text to split.
            max_tokens: Approximate per-chunk budget, measured by ``count_tokens``.

        Returns:
            The chunks in document order. A single sentence that exceeds the
            budget on its own is emitted as its own (oversized) chunk.
        """
        chunks: List[str] = []
        current_chunk = ""

        # Group 1 is the sentence body, group 2 its (possibly empty) terminator run.
        for match in re.finditer(r'([^.!?]+)([.!?]*)', text):
            body = match.group(1).strip()
            if not body:
                # Whitespace between terminators carries no content.
                continue
            sentence = body + match.group(2)

            test_chunk = current_chunk + " " + sentence if current_chunk else sentence

            if self.count_tokens(test_chunk) > max_tokens and current_chunk:
                # Adding this sentence would overflow: close the current chunk
                # and start a new one with the sentence.
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk = test_chunk

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

class TrueBaseGenerationSystem:
    """Extract events from text with a local causal LM and no external knowledge.

    Candidate entities and locations are found with regular expressions only;
    the model is then prompted per chunk (Llama 3.2 chat layout) and
    Turtle-looking lines are kept from its reply. Running totals are kept in
    ``self.stats``.
    """

    def __init__(self):
        self.chunker = TextChunker()
        self.stats = {
            'chunks_processed': 0,
            'events_extracted': 0,
            'external_api_calls': 0,  # This will stay 0
            'knowledge_sources_used': 0,  # This will stay 0
        }
        logger.info("TRUE BASE GENERATION SYSTEM INITIALIZED")
        logger.info("❌ Knowledge Graphs: DISABLED")
        logger.info("❌ Location Enrichment: DISABLED")
        logger.info("❌ External APIs: DISABLED")
        logger.info("❌ RAG Text Retrieval: DISABLED")
        logger.info("✅ Pure LLM Generation: ENABLED")

    def extract_basic_entities_from_text_only(self, text: str) -> List[str]:
        """Return up to five capitalised word runs from ``text``.

        Matches are filtered against a small stop-word list, de-duplicated
        case-insensitively, and returned in order of first appearance.
        """
        pattern = r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b'

        stop_words = {
            'The', 'This', 'That', 'These', 'Those', 'And', 'But', 'Or', 'So', 'If',
            'When', 'Where', 'Who', 'What', 'How', 'Why', 'All', 'Some', 'Many',
            'First', 'Second', 'Third', 'Last', 'Next', 'Before', 'After', 'During'
        }

        seen = set()
        unique_entities = []
        for match in re.findall(pattern, text):
            entity = match.strip()
            if entity in stop_words or len(entity) <= 2 or entity.isdigit():
                continue
            key = entity.lower()
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        return unique_entities[:5]  # Reduced for smaller model

    def extract_basic_locations_from_text_only(self, text: str) -> List[str]:
        """Return up to three location-like phrases from ``text``.

        Two patterns are used: capitalised names followed by a geographic
        suffix ("Aegean Sea") and names introduced by a geographic prefix
        ("Mount Olympus"). Results are de-duplicated in first-seen order so
        the selection is reproducible between runs (a plain ``set`` does not
        guarantee which three survive the slice).
        """
        location_patterns = [
            r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:City|County|State|Province|Country|Region|Island|Bay|Sea|Ocean|River|Mountain|Valley|Desert))\b',
            r'\b(?:Mount|Lake|River|Cape|Fort|Port|Saint|St\.)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b',
        ]

        location_stopwords = {
            'The', 'This', 'That', 'And', 'But', 'Or', 'So', 'If', 'When', 'Where',
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
            'September', 'October', 'November', 'December'
        }

        filtered_locations = []
        for pattern in location_patterns:
            for match in re.findall(pattern, text):
                loc = match.strip()
                if loc not in location_stopwords and len(loc) > 2 and not loc.isdigit():
                    filtered_locations.append(loc)

        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(filtered_locations))[:3]  # Reduced for smaller model

    def _build_base_prompt(self, chunk: str, chunk_num: int,
                           entities: List[str], locations: List[str]) -> str:
        """Render the extraction prompt for one chunk."""
        base_prompt = f"""You are extracting historical events from text using ONLY the information provided in the text chunk. Do not use external knowledge sources, but you CAN and SHOULD make reasonable inferences from the text.

TEXT CHUNK {chunk_num} TO ANALYZE:
{chunk}

ENTITIES FOUND IN TEXT: {', '.join(entities) if entities else 'None'}
LOCATIONS FOUND IN TEXT: {', '.join(locations) if locations else 'None'}

TASK: Extract historical events mentioned in this text chunk using the text information and making REASONABLE INFERENCES.

REQUIREMENTS:
1. Extract ONLY events explicitly mentioned in the text chunk
2. Use information directly stated in the text
3. MAKE REASONABLE INFERENCES from context clues in the text
4. If you can reasonably infer coordinates, countries, regions from textual context, DO IT
5. Include ALL these properties for each event:
   - ste:hasType (description of event, enhanced with context)
   - ste:hasAgent (who caused/led the event, with inferred roles)
   - ste:hasTime (when it happened, with inferred specificity)
   - ste:hasLocation (location name from text)
   - ste:hasLatitude (infer approximate coordinates if you can from text context)
   - ste:hasLongitude (infer approximate coordinates if you can from text context)
   - ste:hasCountry (infer country from textual geographic context)
   - ste:hasRegion (infer region from textual geographic context)
   - ste:hasLocationSource "inferred" (if you made geographic inferences)
   - ste:hasResult (outcome, enhanced with contextual inference)

INFERENCE GUIDELINES:
- If text mentions "Athens", infer it's in Greece, approximate coordinates
- If text mentions "Sicily", infer it's in Italy, Mediterranean coordinates
- If text mentions "Sparta/Lacedaemon", infer Peloponnese, Greece
- If you know from context clues what geographic region events occurred in, infer coordinates
- If someone is called "King", infer royal title
- If text implies timeframes, infer more specific dates
- If outcomes are implied, infer likely results

Output format (do not include prefixes):
```turtle
ste:Event{chunk_num}_1 a ste:Event ;
    ste:hasType "specific event type inferred from context" ;
    ste:hasAgent "person/group with inferred roles" ;
    ste:hasTime "time period with inferred specificity" ;
    ste:hasLocation "location name from text" ;
    ste:hasLatitude "37.9838" ;
    ste:hasLongitude "23.7275" ;
    ste:hasCountry "Greece" ;
    ste:hasRegion "Attica" ;
    ste:hasLocationSource "inferred" ;
    ste:hasResult "outcome inferred from context" .
```

CRITICAL:
- Use the SAME output format as enhanced systems for fair comparison
- INFER coordinates, countries, regions if you can reasonably deduce them from text
- Make the events as detailed and specific as possible through inference
- If you truly cannot infer something, then use empty string ""
- The goal is to extract maximum information through text analysis and inference

If no clear historical events are mentioned in the text, return empty.
"""
        return base_prompt

    @staticmethod
    def _count_events(turtle_output: str) -> int:
        """Count event declarations (``... a ste:Event``) in cleaned Turtle.

        Counting the bare substring ``ste:Event`` would count each event twice:
        once for its subject IRI and once for its class.
        """
        return len(re.findall(r'\ba\s+ste:Event\b', turtle_output))

    def process_chunk_true_base(self, chunk: str, chunk_num: int, model_data) -> str:
        """Prompt the model for one chunk and return the cleaned Turtle text.

        Args:
            chunk: Text to analyse.
            chunk_num: 1-based chunk index, used in logs and event identifiers.
            model_data: ``(tokenizer, model)`` pair as returned by ``initialize_llm``.

        Returns:
            The Turtle-looking lines of the model's reply, or ``""`` when no
            entities/locations were found, nothing usable was generated, or
            generation raised (the error is logged, not propagated).
        """
        logger.info("Processing chunk %s (%s chars) - TRUE BASE GENERATION", chunk_num, len(chunk))

        tokenizer, model = model_data

        entities = self.extract_basic_entities_from_text_only(chunk)
        locations = self.extract_basic_locations_from_text_only(chunk)

        logger.info("Found entities (text-only): %s...", entities[:3])
        logger.info("Found locations (text-only): %s...", locations[:3])

        if not entities and not locations:
            logger.info("No entities or locations found in chunk %s", chunk_num)
            return ""

        base_prompt = self._build_base_prompt(chunk, chunk_num, entities, locations)

        # Format for Llama 3.2
        formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert at extracting historical events from text. Follow instructions precisely.<|eot_id|><|start_header_id|>user<|end_header_id|>

{base_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

        try:
            # The template already starts with <|begin_of_text|>, so do not let
            # the tokenizer prepend a second BOS token.
            # NOTE(review): the prompt is truncated to 2048 tokens; tokenizers
            # truncate on the right by default, so for long chunks the
            # instructions and the assistant header that follow the chunk text
            # are presumably cut off. Confirm and size chunks accordingly.
            inputs = tokenizer.encode(
                formatted_prompt,
                return_tensors="pt",
                max_length=2048,
                truncation=True,
                add_special_tokens=False,
            )
            prompt_token_count = inputs.shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    inputs,
                    max_new_tokens=200,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Slice by token position, not by character count: decoding with
            # skip_special_tokens=True drops the chat markers (and the prompt
            # may have been truncated), so len(formatted_prompt) never lines up
            # with the start of the model's reply in the decoded text.
            generated_ids = outputs[0][prompt_token_count:]
            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            # DEBUG: Print what the model actually generated
            print(f"🔍 Raw model output for chunk {chunk_num}:")
            print(f"'{generated_text[:200]}...'")

            turtle_output = self.clean_turtle(generated_text)

            # DEBUG: Print cleaned output
            print(f"🔍 Cleaned turtle output:")
            print(f"'{turtle_output[:200]}...'")

            if turtle_output:
                self.stats['chunks_processed'] += 1
                event_count = self._count_events(turtle_output)
                self.stats['events_extracted'] += event_count
                logger.info("Generated %s events from chunk %s", event_count, chunk_num)
            else:
                print(f"⚠️ No turtle output after cleaning for chunk {chunk_num}")

            return turtle_output

        except Exception as e:
            # Best effort by design: one failing chunk must not abort the run.
            logger.error(f"Error processing chunk {chunk_num}: {e}")
            return ""

    def clean_turtle(self, raw_output: str) -> str:
        """Keep only the lines of ``raw_output`` that look like Turtle statements.

        A line is kept (with its original indentation) when, after stripping,
        it starts with ``ste:`` or ``a ``, or contains a colon together with
        one of ``hasType`` / ``hasAgent`` / ``hasTime``.
        """
        turtle_lines = []
        for line in raw_output.strip().split('\n'):
            stripped = line.strip()
            mentions_property = ':' in stripped and any(
                marker in stripped for marker in ('hasType', 'hasAgent', 'hasTime')
            )
            if stripped.startswith(('ste:', 'a ')) or mentions_property:
                turtle_lines.append(line)

        return '\n'.join(turtle_lines)

# Utility functions
def load_api_key():
    """Return a placeholder key; no API key is passed to the locally loaded model.

    Prints a confirmation message and returns the truthy sentinel "local" so
    the caller's "missing key" check passes.
    """
    placeholder_key = "local"
    print("✅ Using Hugging Face model - no API key needed.")
    return placeholder_key

def load_text_from_file(filepath: str) -> str:
    """Read a UTF-8 text file and return its contents with outer whitespace removed.

    Args:
        filepath: Path of the file to read.

    Returns:
        The stripped file contents, or "" when the path is not a regular file
        or reading it raises (a message is printed in both cases).
    """
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                contents = handle.read().strip()
            print(f"✅ Loaded text from {filepath}")
            return contents
        except Exception as err:
            print(f"❌ Error reading file {filepath}: {err}")
            return ""

    print(f"❌ File not found: {filepath}")
    return ""

def initialize_llm(api_key: str):
    """Load the Llama 3.2 3B Instruct tokenizer and causal language model.

    Args:
        api_key: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``None`` if loading raised (the
        error is printed rather than propagated).
    """
    try:
        print("🔄 Loading Hugging Face model (this may take a few minutes)...")

        # Using Llama 3.2 3B Instruct (requires HF token)
        checkpoint = "meta-llama/Llama-3.2-3B-Instruct"

        tok = AutoTokenizer.from_pretrained(checkpoint)
        lm = AutoModelForCausalLM.from_pretrained(checkpoint)

        # Fall back to the EOS token when the tokenizer defines no pad token.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        print("✅ Hugging Face model loaded successfully.")
        return (tok, lm)
    except Exception as err:
        print(f"❌ Error loading model: {err}")
        print("💡 Try using a different model or check your internet connection")
        return None

def main():
    """Run the base-generation pipeline from input text to a Turtle file.

    Steps, in order:
      1. Print a banner listing which knowledge sources are disabled.
      2. Load the text from ``INPUT_TEXT_FILE``; when that yields nothing
         (missing, unreadable or empty file) fall back to a short built-in
         sample text.
      3. Create a ``TrueBaseGenerationSystem`` and load the model with
         ``initialize_llm``; return early if the model could not be loaded.
      4. If the approximate token count of the text exceeds the chunk limit,
         split it into sentence-based chunks; otherwise use a single chunk.
      5. Call ``process_chunk_true_base`` for every chunk and keep each
         non-empty result.
      6. If at least one chunk produced output, prepend the ``@prefix``
         header, write everything to ``OUTPUT_BASE_TTL`` and print run
         statistics plus a preview of the first 400 characters.

    The output-file location is only announced when a file was actually
    written in this run.

    Returns:
        None. Results are written to ``OUTPUT_BASE_TTL`` and progress is
        reported with ``print``.
    """
    print("🚀 Starting TRUE BASE GENERATION SYSTEM with Hugging Face")
    print("="*60)
    print("🤗 LLM: Hugging Face Transformers (Free)")
    print("❌ Knowledge Graphs: DISABLED")
    print("❌ Location Coordinate Lookup: DISABLED")
    print("❌ External APIs: DISABLED")
    print("❌ RAG Text Retrieval: DISABLED")
    print("❌ All External Knowledge Sources: DISABLED")
    print("✅ Pure LLM Generation from Text Only: ENABLED")
    print("="*60)

    api_key = load_api_key()
    if not api_key:
        return

    domain_text = load_text_from_file(INPUT_TEXT_FILE)
    if not domain_text:
        print("⚠️  No input file found, using sample text")
        domain_text = """The Battle of Salamis was a decisive naval battle in 480 BC.
        Themistocles led the Greek fleet to victory over the Persians commanded by Xerxes.
        This victory established Greek naval supremacy in the Aegean Sea."""
    else:
        print(f"📄 Using text from {INPUT_TEXT_FILE}")
        print(f"📝 Text length: {len(domain_text)} characters")

    base_system = TrueBaseGenerationSystem()
    model_data = initialize_llm(api_key)

    if not model_data:
        return

    token_count = base_system.chunker.count_tokens(domain_text)
    print(f"🔢 Total tokens in text: {token_count:,}")

    # Use 10000 token chunks for Colab. The same limit decides whether to
    # chunk at all and how large each chunk may be, so it is named once.
    chunk_token_limit = 10000
    if token_count > chunk_token_limit:
        print("📊 Text is large, chunking into smaller pieces...")
        chunks = base_system.chunker.chunk_text_by_sentences(
            domain_text, max_tokens=chunk_token_limit
        )
        print(f"📄 Created {len(chunks)} chunks")
    else:
        print("📄 Text is small enough to process as single chunk")
        chunks = [domain_text]

    # Process chunks with TRUE base generation
    all_turtle_outputs = []

    print("\n🔄 Processing chunks with TRUE BASE GENERATION...")
    for i, chunk in enumerate(chunks, 1):
        print(f"\n🔄 Processing chunk {i}/{len(chunks)}...")

        turtle_output = base_system.process_chunk_true_base(chunk, i, model_data)
        if turtle_output:
            all_turtle_outputs.append(turtle_output)

        if i < len(chunks):
            # Pause between chunks.
            # NOTE(review): the model runs locally, so this pause looks like a
            # leftover from a rate-limited API - confirm before removing.
            time.sleep(1)

    # Save RDF output
    if all_turtle_outputs:
        prefixes = """@prefix ste: <http://www.example.org/ste#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

"""

        final_output = prefixes + "# TRUE BASE GENERATION - No External Knowledge Sources (Hugging Face)\n" + "\n\n".join(all_turtle_outputs)

        with open(OUTPUT_BASE_TTL, 'w', encoding='utf-8') as f:
            f.write(final_output)

        print(f"\n✅ Saved TRUE BASE GENERATION RDF to {OUTPUT_BASE_TTL}")
        print("📊 TRUE BASE GENERATION Statistics:")
        print("   - LLM: Hugging Face Transformers (Free)")
        print("   - Generation Mode: PURE BASE (No External Knowledge)")
        print(f"   - Total chunks processed: {len(chunks)}")
        print(f"   - Successful chunks: {len(all_turtle_outputs)}")
        print(f"   - Events extracted: {base_system.stats['events_extracted']}")

        print("\n📝 Sample of TRUE BASE GENERATION RDF:")
        print("="*60)
        # Show at most the first 400 characters, marking truncation with "...".
        preview = (final_output[:400] + "...") if len(final_output) > 400 else final_output
        print(preview)
        print("="*60)

    else:
        print("❌ No events were extracted from any chunks")

    print("\n🎉 TRUE BASE GENERATION complete!")
    # Only point at the output file when this run actually wrote it; otherwise
    # the message would refer to a missing file or one left by an earlier run.
    if all_turtle_outputs:
        print(f"📄 Output file: {OUTPUT_BASE_TTL}")

# Start the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter your HuggingFace token: ··········
🚀 Starting TRUE BASE GENERATION SYSTEM with Hugging Face
🤗 LLM: Hugging Face Transformers (Free)
❌ Knowledge Graphs: DISABLED
❌ Location Coordinate Lookup: DISABLED
❌ External APIs: DISABLED
❌ RAG Text Retrieval: DISABLED
❌ All External Knowledge Sources: DISABLED
✅ Pure LLM Generation from Text Only: ENABLED
✅ Using Hugging Face model - no API key needed.
✅ Loaded text from /content/drive/MyDrive/part_aa
📄 Using text from /content/drive/MyDrive/part_aa
📝 Text length: 398568 characters
🔄 Loading Hugging Face model (this may take a few minutes)...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

✅ Hugging Face model loaded successfully.
🔢 Total tokens in text: 99,642
📊 Text is large, chunking into smaller pieces...
📄 Created 10 chunks

🔄 Processing chunks with TRUE BASE GENERATION...

🔄 Processing chunk 1/10...
🔍 Raw model output for chunk 1:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 1

🔄 Processing chunk 2/10...
🔍 Raw model output for chunk 2:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 2

🔄 Processing chunk 3/10...
🔍 Raw model output for chunk 3:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 3

🔄 Processing chunk 4/10...
🔍 Raw model output for chunk 4:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 4

🔄 Processing chunk 5/10...
🔍 Raw model output for chunk 5:
'...'
🔍 Cleaned turtle output:
'...'
⚠️ No turtle output after cleaning for chunk 5

🔄 Processing chunk 6/10...
🔍 Raw model output for chunk 6:
'...'
🔍 Cleaned turtle outpu