In [1]:
import re
import json
import time
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field, ValidationError
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import os
from pathlib import Path

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
class Entity(BaseModel):
    """Represents an entity in the Knowledge Graph."""
    # Every field below is declared without a default, so all seven are
    # required when an Entity is validated. validate_and_fix_kg() back-fills
    # `label`, `original_text`, `properties` and `metadata` on raw LLM output
    # before validation; `id`, `type` and `description` must come from the model.
    #
    # The Field descriptions are runtime metadata attached to the model schema;
    # they are not used as prompt text anywhere in this notebook.
    id: str = Field(description="Unique identifier for the entity in Vietnamese.")
    label: List[str] = Field(description="Array of entity names/aliases in Vietnamese.")
    type: str = Field(description="Entity type in Vietnamese.")
    description: str = Field(description="Brief description of the entity in Vietnamese.")
    # Accumulates across chunks: merge_entity_data() appends new quotes with a
    # blank-line separator.
    original_text: str = Field(description="Complete verbatim quote from source text related to this entity.")
    # Free-form dictionaries; their keys are not constrained by the model.
    properties: Dict[str, Any] = Field(description="Object containing important structured information.")
    metadata: Dict[str, Any] = Field(description="Object containing all additional available information.")


class Triplet(BaseModel):
    """Represents a relationship (triplet) in the Knowledge Graph."""
    # subject_id / object_id are plain strings: the model itself does not check
    # that they refer to an existing Entity.id.
    subject_id: str = Field(description="Vietnamese ID of the subject entity.")
    predicate: str = Field(description="Relationship name, a concise phrase/verb in Vietnamese.")
    object_id: str = Field(description="Vietnamese ID of the object entity.")
    # Unlike Entity, these two are optional and default to None.
    properties: Optional[Dict[str, Any]] = Field(default=None, description="Structured information about the relationship.")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional information for the relationship.")


class KnowledgeGraph(BaseModel):
    """Complete structure of Knowledge Graph containing entities and relationships."""
    # Both lists are required (no default). validate_and_fix_kg() builds this
    # model from the parsed LLM response and returns its model_dump().
    entities: List[Entity] = Field(description="List of all extracted entities.")
    triplets: List[Triplet] = Field(description="List of all extracted relationships.")

In [3]:
# Primary extraction prompt.
#
# It is turned into a template with ChatPromptTemplate.from_template() in
# process_chunk_with_retry(), which fills two placeholders:
#   {existing_entities_summary} - output of format_existing_entities_summary()
#   {document_content}          - the text chunk being processed
# All other braces are doubled ({{ }}) so that the JSON example is kept as
# literal text rather than being read as template variables.
# The "\\n\\n" sequences are escaped so the prompt shows a literal "\n\n"
# instead of containing real line breaks.
SCHEMA_PROMPT = """You are an expert at extracting Knowledge Graphs from Vietnamese historical texts. Your task is to extract ALL entities and relationships.

## OUTPUT FORMAT:
Return ONLY a valid JSON object (no markdown, no explanations):

{{
  "entities": [
    {{
      "id": "entity_id_in_vietnamese",
      "label": ["array of Vietnamese names/aliases"],
      "type": "Nhân vật|Tổ chức|Sự kiện|Hiệp ước|Địa điểm|Quốc gia|Khái niệm|Khác",
      "description": "Brief Vietnamese description",
      "original_text": "COMPLETE verbatim quote from source - this is CRITICAL for quiz answering",
      "properties": {{"structured_data": "value"}},
      "metadata": {{"additional_info": "value"}}
    }}
  ],
  "triplets": [
    {{
      "subject_id": "must_match_entity_id",
      "predicate": "vietnamese_verb_phrase",
      "object_id": "must_match_entity_id",
      "properties": {{"optional_structured_data": "value"}},
      "metadata": {{"optional_additional_info": "value"}}
    }}
  ]
}}

## CRITICAL RULES:

### 1. ENTITY EXTRACTION:
- Extract ALL significant entities (people, organizations, events, places, documents, concepts)
- **original_text is MANDATORY**: Copy the COMPLETE, EXACT text from source that describes this entity
- Include ALL relevant sentences about the entity, not just one sentence
- Multiple mentions → combine all text segments with "\\n\\n" separator
- Properties: dates (YYYY-MM-DD format), locations, key facts
- Metadata: roles, additional context
- TYPE MUST BE ONE OF: Nhân vật, Tổ chức, Sự kiện, Hiệp ước, Địa điểm, Quốc gia, Khái niệm, Khác

### 2. RELATIONSHIP EXTRACTION (CRITICAL - YOU MUST EXTRACT MORE):
- **Extract EVERY relationship between entities mentioned in the text**
- If two entities are mentioned together or related, CREATE a triplet
- Common predicates: "là", "thuộc_về", "tham_gia", "ký_kết", "lãnh_đạo", "thành_lập", "có_ảnh_hưởng_đến", "liên_quan_đến", "hợp_tác_với", "phát_triển_tại"
- Temporal relationships: "diễn_ra_trước", "dẫn_đến", "kết_quả_của"
- DO NOT create isolated entities - ensure each entity has at least one relationship

### 3. CONTINUITY WITH EXISTING ENTITIES:
**Previously extracted entities:**
{existing_entities_summary}

**When you encounter these entities again:**
- Reference by EXACT ID
- Add new labels if found
- APPEND to original_text (with \\n\\n separator)
- Create NEW relationships with other entities in current chunk

### 4. RELATIONSHIP COMPLETENESS CHECK:
Before returning, verify:
- Every entity has AT LEAST one triplet (as subject OR object)
- Related entities mentioned together have explicit relationships
- Temporal sequences are captured (event A → event B)
- Organizational hierarchies are clear (entity X belongs_to entity Y)

### 5. QUALITY REQUIREMENTS:
- All IDs in Vietnamese (lowercase, underscores for spaces)
- All values in Vietnamese
- Dates in YYYY-MM-DD, YYYY-MM-00, or YYYY-00-00 format
- NO abstract concepts without concrete relationships
- Focus on factual, verifiable relationships

## PROCESSING STRATEGY:
1. Identify all entities in the chunk
2. For EACH entity pair that appears together or is contextually related, create a triplet
3. Check against existing entities - update if duplicate, create relationship if new
4. Extract original_text carefully - include ALL relevant sentences
5. Verify completeness before returning

Now process the following text:

{document_content}"""

# Simplified prompt for retry attempts.
#
# process_chunk_with_retry() switches to this template once the SCHEMA_PROMPT
# strategies are exhausted. It has a single placeholder, {document_content};
# it carries no summary of previously extracted entities, and its triplet
# example omits the optional properties/metadata keys. Doubled braces are
# literal JSON braces, as in SCHEMA_PROMPT.
SIMPLE_EXTRACTION_PROMPT = """Extract entities and relationships from this Vietnamese text. Return ONLY valid JSON:

{{
  "entities": [
    {{
      "id": "vietnamese_id",
      "label": ["names"],
      "type": "Nhân vật|Tổ chức|Sự kiện|Hiệp ước|Địa điểm|Quốc gia|Khái niệm|Khác",
      "description": "description",
      "original_text": "exact text from source",
      "properties": {{}},
      "metadata": {{}}
    }}
  ],
  "triplets": [
    {{
      "subject_id": "id1",
      "predicate": "relationship",
      "object_id": "id2"
    }}
  ]
}}

Text: {document_content}

Extract at least 3-5 main entities and their relationships."""

In [4]:
def split_into_chunks(text: str, max_chunk_size: int = 1200) -> List[str]:
    """Split text into chunks of roughly ``max_chunk_size`` characters.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A paragraph that is itself longer than ``max_chunk_size`` is broken at
    sentence boundaries instead.

    Sentence boundaries are detected as whitespace that follows ``.``, ``!``,
    ``?`` or ``;``. The punctuation stays attached to its sentence, so the
    chunk text remains a faithful copy of the source (only the whitespace
    between sentences is normalised to a single space). This matters because
    the extraction prompt asks the model for verbatim quotes. Tokens such as
    ``2.9.1945`` are not split because no whitespace follows the dots.

    Args:
        text: Full document text.
        max_chunk_size: Soft upper bound on chunk length in characters. A
            single sentence longer than this is emitted as one oversized
            chunk rather than being cut mid-sentence.

    Returns:
        The list of non-empty, stripped chunks in document order.
    """
    chunks: List[str] = []
    current_chunk = ""

    def _append(piece: str, separator: str) -> None:
        """Add ``piece`` to the open chunk, starting a new chunk if it would overflow."""
        nonlocal current_chunk
        if current_chunk and len(current_chunk) + len(piece) + len(separator) > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += piece + separator

    for paragraph in text.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        if len(paragraph) > max_chunk_size:
            # The lookbehind keeps the terminating punctuation with its sentence.
            for sentence in re.split(r'(?<=[.!?;])\s+', paragraph):
                sentence = sentence.strip()
                if sentence:
                    _append(sentence, " ")
            # Restore the paragraph break so the next paragraph does not run
            # straight on from the last sentence.
            if current_chunk:
                current_chunk = current_chunk.rstrip() + "\n\n"
        else:
            _append(paragraph, "\n\n")

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks


def format_existing_entities_summary(entities: List[Dict]) -> str:
    """Render already-known entities as a bullet list for the prompt.

    Only the last 100 entries of ``entities`` are listed, one per line, as
    ``- <id> (<type>): <up to three labels, comma separated>``. When there is
    nothing to list, the placeholder ``"None (first chunk)"`` is returned.
    """
    if entities:
        recent = entities[-100:]  # cap the context handed to the model
        return "\n".join(
            f"- {item['id']} ({item['type']}): {', '.join(item['label'][:3])}"
            for item in recent
        )

    return "None (first chunk)"


def merge_entity_data(existing: Dict, new: Dict) -> Dict:
    """Merge a newly extracted entity into an already known one.

    ``existing`` is updated in place and also returned.

    Merge rules:
      * ``label``: union of both lists with duplicates removed. Order is kept
        (existing labels first, then unseen new ones) so the result is
        deterministic and the primary name stays in front.
      * ``original_text``: the new quote is appended after a blank line,
        unless it is empty or already contained in the existing text.
      * ``properties`` / ``metadata``: dictionaries are combined, with values
        from ``new`` winning on key clashes. Missing or ``None`` values on
        either side are treated as empty.
      * ``description``: the longer of the two is kept.

    Args:
        existing: Entity dict already in the graph (mutated).
        new: Entity dict with the same id from a later chunk (not mutated).

    Returns:
        The updated ``existing`` dict.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    combined_labels = [*(existing.get('label') or []), *(new.get('label') or [])]
    existing['label'] = list(dict.fromkeys(combined_labels))

    # Append original_text with separator
    new_text = (new.get('original_text') or '').strip()
    existing_text = (existing.get('original_text') or '').strip()

    if new_text and new_text not in existing_text:
        if existing_text:
            existing['original_text'] = existing_text + "\n\n" + new_text
        else:
            existing['original_text'] = new_text

    # Entities loaded from an older file may lack these keys or carry null.
    for field in ('properties', 'metadata'):
        combined = dict(existing.get(field) or {})
        combined.update(new.get(field) or {})
        existing[field] = combined

    # Use longer description
    new_description = new.get('description') or ''
    if len(new_description) > len(existing.get('description') or ''):
        existing['description'] = new_description

    return existing


def merge_knowledge_graphs(kg_list: List[Dict]) -> Dict:
    """Combine several graph dicts into one, de-duplicating as it goes.

    Entities are keyed by ``id``: the first occurrence is stored and every
    later occurrence is folded into it with merge_entity_data(). Triplets are
    considered equal when their (subject_id, predicate, object_id) match; only
    the first such triplet is kept. Input order is preserved for both lists.
    """
    entity_index = {}
    unique_triplets = []
    seen_signatures = set()

    for graph in kg_list:
        for candidate in graph.get('entities', []):
            key = candidate['id']
            if key not in entity_index:
                entity_index[key] = candidate
            else:
                entity_index[key] = merge_entity_data(entity_index[key], candidate)

        for relation in graph.get('triplets', []):
            signature = (relation['subject_id'], relation['predicate'], relation['object_id'])
            if signature in seen_signatures:
                continue  # same relationship already recorded
            seen_signatures.add(signature)
            unique_triplets.append(relation)

    return {
        'entities': list(entity_index.values()),
        'triplets': unique_triplets
    }


def fix_common_json_errors(text: str) -> str:
    """Repair the JSON syntax slips that LLM output most often contains.

    The text is scanned once with awareness of string literals, so nothing
    inside a quoted value is ever altered or counted. Three repairs are made:

      * a trailing comma directly before ``}`` or ``]`` is removed;
      * a string left open at the end of the text is closed;
      * containers left open at the end of the text are closed in the
        correct nesting order (innermost first), e.g. ``[{"a": 1`` becomes
        ``[{"a": 1}]``.

    Text that is already valid JSON is returned unchanged. The function does
    not attempt to fix unescaped quotes or a dangling ``"key":`` with no value.

    Args:
        text: Candidate JSON text.

    Returns:
        The repaired text (not guaranteed to be parseable).
    """
    out: List[str] = []
    pending_closers: List[str] = []  # closers still owed, innermost last
    in_string = False
    escaped = False

    for ch in text:
        if in_string:
            out.append(ch)
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == '{':
            pending_closers.append('}')
        elif ch == '[':
            pending_closers.append(']')
        elif ch in '}]':
            # Drop a trailing comma (and the whitespace after it) before a closer.
            idx = len(out) - 1
            while idx >= 0 and out[idx].isspace():
                idx -= 1
            if idx >= 0 and out[idx] == ',':
                del out[idx:]
            if pending_closers and pending_closers[-1] == ch:
                pending_closers.pop()
        out.append(ch)

    repaired = ''.join(out)

    if in_string:
        # Truncated mid-string: drop a dangling escape, then close the quote.
        if escaped:
            repaired = repaired[:-1]
        repaired += '"'

    if pending_closers:
        if not in_string:
            stripped = repaired.rstrip()
            if stripped.endswith(','):
                repaired = stripped[:-1]
        repaired += ''.join(reversed(pending_closers))

    return repaired


In [5]:
def _salvage_json_objects(text: str, key: str) -> List[Dict]:
    """Decode as many complete objects as possible from the array under ``key``.

    Decoding starts right after ``"<key>": [`` and stops at the first element
    that is not a well-formed JSON object, so everything before a truncation
    or syntax error is recovered.
    """
    match = re.search(r'"%s"\s*:\s*\[' % re.escape(key), text)
    if not match:
        return []

    decoder = json.JSONDecoder()
    recovered = []
    pos = match.end()
    length = len(text)

    while pos < length:
        # Skip separators between array elements.
        while pos < length and text[pos] in ' \t\r\n,':
            pos += 1
        if pos >= length or text[pos] != '{':
            break
        try:
            item, pos = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            break
        recovered.append(item)

    return recovered


def parse_json_response(response_text: str, fix_errors: bool = True) -> Dict:
    """Parse the JSON knowledge graph out of an LLM response.

    Steps:
      1. Strip markdown code fences.
      2. Keep the text between the first ``{`` and the last ``}``.
      3. Optionally run fix_common_json_errors() on it.
      4. Try a normal ``json.loads``.
      5. If that fails, recover the complete objects at the start of the
         ``"entities"`` and ``"triplets"`` arrays one by one. Nested arrays
         such as ``label`` are handled because each element is decoded with
         a real JSON decoder rather than a regular expression.

    Args:
        response_text: Raw text returned by the model.
        fix_errors: Whether to apply fix_common_json_errors() before parsing.

    Returns:
        The parsed object when the text is valid JSON. Otherwise a dict with
        the recovered ``entities`` and ``triplets`` lists, provided at least
        one entity was recovered. In every other case
        ``{"entities": [], "triplets": []}``.
    """
    if not response_text or not response_text.strip():
        return {"entities": [], "triplets": []}

    text = response_text.strip()

    # Remove markdown
    text = re.sub(r'^```json\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^```\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```$', '', text, flags=re.MULTILINE)
    text = text.strip()

    # Extract JSON object
    json_start = text.find('{')
    json_end = text.rfind('}')

    if json_start == -1 or json_end == -1:
        return {"entities": [], "triplets": []}

    text = text[json_start:json_end+1]

    if fix_errors:
        text = fix_common_json_errors(text)

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass  # fall through to partial recovery

    partial = {
        "entities": _salvage_json_objects(text, "entities"),
        "triplets": _salvage_json_objects(text, "triplets"),
    }

    # Triplets without any entity are of no use to the caller.
    if partial["entities"]:
        return partial

    return {"entities": [], "triplets": []}


def _apply_entity_defaults(entity: Dict) -> None:
    """Fill in, in place, the Entity fields that can be derived or defaulted."""
    # Entity requires real dicts here, so an explicit null is replaced too.
    for field in ('properties', 'metadata'):
        if not isinstance(entity.get(field), dict):
            entity[field] = {}
    if not isinstance(entity.get('label'), list):
        # `or` so that an explicit null falls back to the id instead of "None".
        entity['label'] = [str(entity.get('label') or entity.get('id', 'unknown'))]
    if not entity.get('original_text'):
        entity['original_text'] = entity.get('description', '')


def _apply_triplet_defaults(triplet: Dict) -> None:
    """Fill in, in place, the optional Triplet dictionaries when absent."""
    triplet.setdefault('properties', {})
    triplet.setdefault('metadata', {})


def validate_and_fix_kg(kg_dict: Dict) -> Optional[Dict]:
    """Validate a parsed graph against the pydantic models, repairing what it can.

    First, defaults are filled into every entity and triplet dict (the input
    dicts are modified in place). The whole graph is then validated as a
    KnowledgeGraph.

    If that fails, each entity and triplet is validated on its own and only
    the valid ones are kept, so a single malformed item no longer discards an
    otherwise good chunk. In this salvage path an entity must have non-empty
    ``id``, ``type`` and ``description``, and a triplet must have non-empty
    ``subject_id``, ``predicate`` and ``object_id``.

    Args:
        kg_dict: Parsed LLM output, expected to hold ``entities`` and
            ``triplets`` lists of dicts.

    Returns:
        The validated graph as a plain dict (``model_dump()``), or ``None``
        when no valid entity remains.
    """
    entities = [item for item in (kg_dict.get('entities') or []) if isinstance(item, dict)]
    triplets = [item for item in (kg_dict.get('triplets') or []) if isinstance(item, dict)]

    for entity in entities:
        _apply_entity_defaults(entity)
    for triplet in triplets:
        _apply_triplet_defaults(triplet)

    try:
        return KnowledgeGraph(**kg_dict).model_dump()
    except ValidationError as e:
        print(f"⚠ Validation error: {str(e)[:100]}")

    # Salvage: keep every item that validates individually.
    valid_entities = []
    for entity in entities:
        if not (entity.get('id') and entity.get('type') and entity.get('description')):
            continue
        try:
            valid_entities.append(Entity(**entity))
        except (ValidationError, TypeError):
            continue

    valid_triplets = []
    for triplet in triplets:
        if not (triplet.get('subject_id') and triplet.get('predicate') and triplet.get('object_id')):
            continue
        try:
            valid_triplets.append(Triplet(**triplet))
        except (ValidationError, TypeError):
            continue

    if not valid_entities:
        return None

    return KnowledgeGraph(entities=valid_entities, triplets=valid_triplets).model_dump()


def load_existing_kg(file_path: str) -> Dict:
    """Load a previously saved knowledge graph from a JSON file.

    The result always has list-valued ``entities`` and ``triplets`` keys, so
    callers can index them without checking. An empty graph is returned when
    the file does not exist, cannot be read or parsed, or does not contain a
    JSON object. A missing or non-list ``entities`` / ``triplets`` entry is
    replaced with an empty list.

    Args:
        file_path: Path of the JSON file written by save_kg().

    Returns:
        A dict with at least ``entities`` and ``triplets`` lists.
    """
    if not os.path.exists(file_path):
        return {"entities": [], "triplets": []}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:  # best effort: an unreadable file means "start fresh"
        print(f"⚠ Warning: Could not load existing KG: {e}")
        return {"entities": [], "triplets": []}

    if not isinstance(data, dict):
        print(f"⚠ Warning: Could not load existing KG: expected a JSON object, got {type(data).__name__}")
        return {"entities": [], "triplets": []}

    for key in ("entities", "triplets"):
        if not isinstance(data.get(key), list):
            data[key] = []

    return data


def save_kg(kg_dict: Dict, output_path: str):
    """Save the knowledge graph to a JSON file without risking the old one.

    The graph is first written to ``<output_path>.tmp`` and then moved over
    ``output_path`` with ``os.replace``. If serialisation fails or the process
    is interrupted mid-write, the previous checkpoint is left untouched
    instead of being truncated.

    Args:
        kg_dict: Graph to serialise (UTF-8, non-ASCII kept as-is, indent 2).
        output_path: Destination file path.

    Raises:
        TypeError, ValueError: If ``kg_dict`` is not JSON-serialisable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{output_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(kg_dict, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, output_path)
    finally:
        # Only present if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_chunk_with_retry(
    chunk: str,
    existing_summary: str,
    llm,
    max_retries: int = 5,
    delay: float = 2.0
) -> Optional[Dict]:
    """Extract a knowledge graph from one chunk, escalating through fallbacks.

    Strategies are tried in order; each gets up to ``max_retries`` LLM calls:
    the full prompt without and then with JSON repair, followed by the
    simplified prompt without and then with JSON repair.

    An attempt succeeds when the response parses, contains at least one
    entity and passes validate_and_fix_kg(). After any unsuccessful attempt
    the function sleeps ``delay`` seconds. Errors that look like rate
    limiting ("429" or "quota" in the message) sleep for the advertised
    "retry in N" time plus one second, or 10 seconds when none is given.
    Such an error still uses up one attempt.

    A short ``[<strategy>→`` marker is printed each time a strategy is
    exhausted. If every strategy fails, the last exception seen (if any) is
    printed so the failure is not silent.

    Args:
        chunk: Text to process.
        existing_summary: Known-entity summary for the full prompt.
        llm: LangChain chat model (anything usable as ``prompt | llm``).
        max_retries: Attempts per strategy.
        delay: Seconds to wait between attempts.

    Returns:
        The validated graph dict, or ``None`` if all strategies failed.
    """
    # NOTE(review): "minimal" is currently identical to "simple_fixed"; it
    # only adds another round of retries. Kept to preserve retry behaviour.
    strategies = [
        ("full", SCHEMA_PROMPT, False),
        ("full_fixed", SCHEMA_PROMPT, True),
        ("simple", SIMPLE_EXTRACTION_PROMPT, False),
        ("simple_fixed", SIMPLE_EXTRACTION_PROMPT, True),
        ("minimal", SIMPLE_EXTRACTION_PROMPT, True)
    ]

    last_error: Optional[str] = None

    for strategy_name, prompt_template, fix_json in strategies:
        # The chain and its inputs do not change between attempts.
        chain = ChatPromptTemplate.from_template(prompt_template) | llm
        inputs = {
            "document_content": chunk,
            "existing_entities_summary": existing_summary if "existing_entities_summary" in prompt_template else ""
        }

        for attempt in range(max_retries):
            try:
                response = chain.invoke(dict(inputs))
                content = getattr(response, 'content', None)

                if content:
                    kg_dict = parse_json_response(content, fix_errors=fix_json)

                    if kg_dict.get('entities'):
                        kg_validated = validate_and_fix_kg(kg_dict)

                        if kg_validated and kg_validated.get('entities'):
                            return kg_validated

                # Empty, unparseable or invalid response: wait, then retry.
                time.sleep(delay)

            except Exception as e:
                error_msg = str(e)
                last_error = f"{type(e).__name__}: {error_msg[:200]}"

                # Handle rate limit
                if "429" in error_msg or "quota" in error_msg.lower():
                    wait_time = 10.0
                    wait_match = re.search(r'retry in (\d+\.?\d*)', error_msg.lower())
                    if wait_match:
                        wait_time = float(wait_match.group(1)) + 1

                    print(f"⏳{wait_time:.0f}s ", end="", flush=True)
                    time.sleep(wait_time)
                    continue

                # Other errors: pause unless this was the strategy's last attempt.
                if attempt < max_retries - 1:
                    time.sleep(delay)

        # If this strategy failed, try next one
        print(f"[{strategy_name}→", end="")

    if last_error:
        print(f" last error: {last_error} ", end="", flush=True)

    return None


def process_single_file_incremental(
    file_path: str,
    existing_kg: Dict,
    llm,
    max_retries: int = 5,
    delay_between_chunks: float = 15.0,
    max_chunk_size: int = 1200
) -> Dict:
    """Extract a graph from one text file and merge it into ``existing_kg``.

    The file is split into chunks and each chunk is sent through
    process_chunk_with_retry(). A running context of known entities is passed
    to every call. That context is a private copy, so the ``entities`` list of
    ``existing_kg`` is no longer extended behind the caller's back. Each
    entity id appears in the context once, and an entity seen again moves to
    the end. This keeps recently mentioned entities inside the window used by
    format_existing_entities_summary() without wasting slots on duplicates.

    Progress is printed per chunk. After every chunk the function sleeps
    ``delay_between_chunks`` seconds (also after the last one, so the pause
    carries over to the next file).

    Args:
        file_path: UTF-8 text file to process.
        existing_kg: Graph accumulated so far.
        llm: LangChain chat model.
        max_retries: Attempts per retry strategy for each chunk.
        delay_between_chunks: Pause after each chunk; also used as the retry
            delay inside process_chunk_with_retry().
        max_chunk_size: Chunk size passed to split_into_chunks().

    Returns:
        The merged graph. If the file cannot be read, ``existing_kg`` itself
        is returned unchanged.
    """
    print(f"\n{'='*70}")
    print(f"Processing file: {file_path}")
    print(f"{'='*70}")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        print(f"✗ Error reading file: {e}")
        return existing_kg

    print(f"✓ Loaded {len(content)} characters")

    chunks = split_into_chunks(content, max_chunk_size=max_chunk_size)
    print(f"✓ Split into {len(chunks)} chunks\n")

    # Insertion-ordered id -> entity map; never aliases existing_kg['entities'].
    context_entities = {}
    for entity in existing_kg.get('entities', []):
        context_entities[entity.get('id')] = entity

    chunk_kg_list = []
    failed_chunks = []

    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk {i}/{len(chunks)}: ", end="", flush=True)

        existing_summary = format_existing_entities_summary(list(context_entities.values()))

        kg_result = process_chunk_with_retry(
            chunk,
            existing_summary,
            llm,
            max_retries=max_retries,
            delay=delay_between_chunks
        )

        if kg_result and kg_result.get('entities'):
            chunk_kg_list.append(kg_result)

            for entity in kg_result['entities']:
                # Re-insert so a re-mentioned entity counts as most recent.
                context_entities.pop(entity['id'], None)
                context_entities[entity['id']] = entity

            entity_count = len(kg_result['entities'])
            triplet_count = len(kg_result['triplets'])
            ratio = triplet_count / entity_count if entity_count > 0 else 0

            print(f"] ✓ {entity_count}E/{triplet_count}T (R:{ratio:.1f})")
        else:
            failed_chunks.append(i)
            print(f"] ✗ FAILED")

        time.sleep(delay_between_chunks)

    if failed_chunks:
        print(f"\n⚠ Failed chunks: {failed_chunks} (Total: {len(failed_chunks)}/{len(chunks)})")

    # Merge new chunks with existing KG
    print(f"\n→ Merging {len(chunk_kg_list)} successful chunks with existing KG...")
    all_kg_list = [existing_kg] + chunk_kg_list if existing_kg.get('entities') else chunk_kg_list
    merged_kg = merge_knowledge_graphs(all_kg_list)

    return merged_kg


def create_knowledge_graph_incremental(
    file_paths: List[str],
    output_json_path: str,
    output_ttl_path: str = None,
    api_key_1: str = None,
    max_retries: int = 5,
    delay_between_chunks: float = 2.0,
    save_after_each_file: bool = True
):
    """Build one knowledge graph from several files, one file at a time.

    Workflow: resolve the API key (argument, else the GOOGLE_API_KEY_1
    environment variable), create the Gemini chat model, load any graph
    already stored at ``output_json_path``, feed each file through
    process_single_file_incremental(), optionally checkpoint after every
    file, print summary statistics, then write the final JSON and, when
    ``output_ttl_path`` is given, the TTL export.

    Returns the final graph dict, or ``None`` when no API key is available.
    """
    rule = "=" * 70

    print(rule)
    print("INCREMENTAL KNOWLEDGE GRAPH EXTRACTION SYSTEM (WITH ERROR RECOVERY)")
    print(rule + "\n")

    # Fall back to the environment only when no key was passed in.
    if api_key_1 is None:
        api_key_1 = os.environ.get("GOOGLE_API_KEY_1")
        if not api_key_1:
            print("ERROR: Google API key not found!")
            return

    print("[1] Initializing LLM...")
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=api_key_1,
        temperature=0.0,
        max_tokens=8000,
    )
    print("✓ LLM initialized\n")

    # Resume from an earlier run when its output file is present.
    print("[2] Checking for existing knowledge graph...")
    existing_kg = load_existing_kg(output_json_path)
    if not existing_kg.get('entities'):
        print("✓ Starting fresh knowledge graph\n")
    else:
        print(f"✓ Loaded existing KG: {len(existing_kg['entities'])} entities, {len(existing_kg['triplets'])} triplets\n")

    # Each file's result becomes the starting graph for the next file.
    current_kg = existing_kg
    total_files = len(file_paths)

    for file_idx, file_path in enumerate(file_paths, 1):
        print(f"\n[3.{file_idx}] Processing file {file_idx}/{total_files}")

        current_kg = process_single_file_incremental(
            file_path,
            current_kg,
            llm,
            max_retries=max_retries,
            delay_between_chunks=delay_between_chunks
        )

        if save_after_each_file:
            save_kg(current_kg, output_json_path)
            print(f"✓ Progress saved: {len(current_kg['entities'])} entities, {len(current_kg['triplets'])} triplets")

    # Summary of the finished graph.
    print(f"\n{rule}")
    print("FINAL STATISTICS")
    print(f"{rule}")

    final_entities = current_kg['entities']
    print(f"Total Entities: {len(final_entities)}")
    final_triplets = current_kg['triplets']
    print(f"Total Relationships: {len(final_triplets)}")

    if final_entities:
        ratio = len(final_triplets) / len(final_entities)
    else:
        ratio = 0
    print(f"Relationship/Entity Ratio: {ratio:.2f}")

    if ratio < 1.0:
        print("⚠ WARNING: Low relationship ratio - entities may be isolated")

    # Count entities per type, most frequent first.
    counts_by_type = {}
    for item in final_entities:
        kind = item.get('type', 'Unknown')
        if kind in counts_by_type:
            counts_by_type[kind] += 1
        else:
            counts_by_type[kind] = 1

    print("\nEntity Type Distribution:")
    ranked = sorted(counts_by_type.items(), key=lambda pair: pair[1], reverse=True)
    for kind, count in ranked:
        print(f"  {kind}: {count}")

    # Write the final outputs.
    print("\n[4] Saving final outputs...")
    save_kg(current_kg, output_json_path)
    print(f"✓ JSON saved: {output_json_path}")

    if output_ttl_path:
        export_to_ttl(current_kg, output_ttl_path)

    print(f"\n{rule}")
    print("EXTRACTION COMPLETE!")
    print(f"{rule}")

    return current_kg


def _ttl_local_name(value: str) -> str:
    """Turn a free-text id, type or predicate into a Turtle local name.

    Whitespace becomes ``_`` and the reserved punctuation that Turtle allows
    in local names only as a backslash escape is escaped. Other characters
    are passed through unchanged.
    """
    name = re.sub(r'\s', '_', str(value))
    return re.sub(r"([~.!$&'()*+,;=/?#@%])", r"\\\1", name)


def _ttl_string(value: str) -> str:
    """Escape a value for use inside a double-quoted Turtle string literal."""
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )


def export_to_ttl(kg_dict: Dict, output_path: str):
    """Export the knowledge graph to a Turtle (TTL) file.

    Every entity is written as ``:<id> a :<type>`` with a ``:label`` (the
    entity id as text) and a ``:description``, both tagged ``@vi``. Every
    triplet is written as ``:<subject> :<predicate> :<object> .``.

    Names are passed through _ttl_local_name(), so entity types containing
    spaces (for example "Nhân vật") produce a valid local name, and string
    values are escaped so quotes or line breaks in a description cannot break
    the file.

    Args:
        kg_dict: Graph with ``entities`` and ``triplets`` lists.
        output_path: Destination file path (written as UTF-8).
    """
    parts = ["@prefix : <http://example.org/kg#> .\n\n"]

    for entity in kg_dict['entities']:
        subject = _ttl_local_name(entity['id'])
        type_name = _ttl_local_name(entity['type'])
        label_text = _ttl_string(entity['id'])
        description_text = _ttl_string(entity.get('description', ''))
        parts.append(f":{subject} a :{type_name} ;\n")
        parts.append(f'  :label "{label_text}"@vi ;\n')
        parts.append(f'  :description "{description_text}"@vi .\n\n')

    for triplet in kg_dict['triplets']:
        subj = _ttl_local_name(triplet['subject_id'])
        pred = _ttl_local_name(triplet['predicate'])
        obj = _ttl_local_name(triplet['object_id'])
        parts.append(f":{subj} :{pred} :{obj} .\n")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"✓ TTL saved: {output_path}")

In [6]:
if __name__ == "__main__":
    # Source texts, processed in this order; each file builds on the graph
    # produced by the ones before it.
    INPUT_FILES = [
        r"data/SGK/1.txt",
        r"data/SGK/2.txt",
        r"data/SGK/3.txt",
        r"data/SGK/4.txt",
        r"data/SGK/5.txt",
        r"data/SGK/6.txt",
    ]

    # The JSON file doubles as the resume checkpoint between runs.
    OUTPUT_JSON = "graph_documents_v3.json"
    OUTPUT_TTL = "graph_documents_v3.ttl"

    create_knowledge_graph_incremental(
        file_paths=INPUT_FILES,
        output_json_path=OUTPUT_JSON,
        output_ttl_path=OUTPUT_TTL,
        delay_between_chunks=2.0,
        save_after_each_file=True,
    )

INCREMENTAL KNOWLEDGE GRAPH EXTRACTION SYSTEM (WITH ERROR RECOVERY)

[1] Initializing LLM...
✓ LLM initialized

[2] Checking for existing knowledge graph...
✓ Starting fresh knowledge graph


[3.1] Processing file 1/6

Processing file: data/SGK/1.txt
✓ Loaded 19369 characters
✓ Split into 18 chunks

Chunk 1/18: ] ✓ 12E/16T (R:1.3)
Chunk 2/18: ] ✓ 15E/28T (R:1.9)
Chunk 3/18: ] ✓ 19E/19T (R:1.0)
Chunk 4/18: ] ✓ 14E/18T (R:1.3)
Chunk 5/18: ] ✓ 14E/18T (R:1.3)
Chunk 6/18: ] ✓ 18E/18T (R:1.0)
Chunk 7/18: ] ✓ 15E/26T (R:1.7)
Chunk 8/18: ] ✓ 34E/34T (R:1.0)
Chunk 9/18: ] ✓ 12E/13T (R:1.1)
Chunk 10/18: ] ✓ 20E/21T (R:1.1)
Chunk 11/18: ] ✓ 15E/23T (R:1.5)
Chunk 12/18: ] ✓ 15E/21T (R:1.4)
Chunk 13/18: ] ✓ 14E/14T (R:1.0)
Chunk 14/18: ] ✓ 19E/18T (R:0.9)
Chunk 15/18: ] ✓ 17E/17T (R:1.0)
Chunk 16/18: ] ✓ 22E/21T (R:1.0)
Chunk 17/18: ] ✓ 12E/31T (R:2.6)
Chunk 18/18: ] ✓ 20E/18T (R:0.9)

→ Merging 18 successful chunks with existing KG...
✓ Progress saved: 205 entities, 369 triplets

[3.2] Processing