In [1]:
import json
import os
from typing import List
from pydantic import BaseModel, Field
from itext2kg import itext2kg_star
from itext2kg.documents_distiller import DocumentsDistiller, Article
import asyncio
from itext2kg import iText2KG_Star
from itext2kg.logging_config import setup_logging, get_logger
from itext2kg import itext2kg_star
from langchain_ollama import ChatOllama, OllamaEmbeddings
import time
from datetime import datetime

  warn(


In [None]:
# Create models for destill

In [24]:
class ContentSource(BaseModel):
    name: str = Field(description="Name of the author/speaker/presenter")
    role: Optional[str] = Field(description="Role or position")
    affiliation: Optional[str] = Field(description="Organization or company affiliation")


class Content(BaseModel):
    title: str = Field(description="Title of the content (article/video)")
    sources: List[ContentSource] = Field(description="Authors/speakers involved")
    summary: str = Field(description="Brief summary of the content")
    key_concepts: List[str] = Field(description="Main concepts or topics covered")
    insights: str = Field(description="Key insights and findings")
    challenges: str = Field(description="Challenges or limitations discussed")
    solutions: str = Field(description="Proposed solutions or approaches")
    practical_applications: str = Field(description="Practical applications or implementations mentioned")
    # Additional fields observed in your JSON files
    methodology: Optional[str] = Field(default=None, description="Research methodology")
    conclusions: Optional[str] = Field(default=None, description="Conclusions")

In [None]:
# Create function for creating semantic blocks from batches (output from .py script)

In [25]:
import os
import json
from typing import List, Any, Dict
from datetime import datetime, date

def load_and_create_semantic_blocks(results_dir="distilled_results"):
    """Load JSONs and create semantic blocks based on Content schema"""

    all_semantic_blocks = []

    # Get all JSON files
    json_files = [f for f in os.listdir(results_dir) if f.endswith('.json')]
    json_files.sort()  # Sort alphabetically

    print(f"Found {len(json_files)} JSON files")

    for filename in json_files:
        file_path = os.path.join(results_dir, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Create Content object
            content_obj = Content(**json_data)

            # Convert to semantic blocks
            for key, value in content_obj.model_dump().items():
                if value and value != [] and value != "":
                    if key == "sources" and isinstance(value, list):
                        sources_text = ", ".join(
                            [f"{s['name']} ({s.get('role','')})" for s in value if s.get('name')]
                        )
                        if sources_text:
                            block = f"{key} - {sources_text}".replace("{", "[").replace("}", "]")
                            all_semantic_blocks.append(block)
                    elif key == "key_concepts" and isinstance(value, list):
                        concepts_text = ", ".join(value)
                        block = f"{key} - {concepts_text}".replace("{", "[").replace("}", "]")
                        all_semantic_blocks.append(block)
                    elif isinstance(value, str):
                        block = f"{key} - {value}".replace("{", "[").replace("}", "]")
                        all_semantic_blocks.append(block)

            print(f"✓ Processed {filename}")

        except Exception as e:
            print(f"✗ Error with {filename}: {e}")

    print(f"\nTotal semantic blocks created: {len(all_semantic_blocks)}")
    return all_semantic_blocks

In [None]:
# Create semantic blocks

In [26]:
semantic_blocks = load_and_create_semantic_blocks("vnp_itext2kg/distilled_results")

Found 55 JSON files
✓ Processed batch_1.json
✓ Processed batch_10.json
✓ Processed batch_11.json
✓ Processed batch_12.json
✓ Processed batch_13.json
✓ Processed batch_14.json
✓ Processed batch_15.json
✓ Processed batch_16.json
✓ Processed batch_17.json
✓ Processed batch_18.json
✓ Processed batch_19.json
✓ Processed batch_2.json
✓ Processed batch_21.json
✓ Processed batch_22.json
✓ Processed batch_23.json
✓ Processed batch_24.json
✓ Processed batch_25.json
✓ Processed batch_26.json
✓ Processed batch_27.json
✓ Processed batch_28.json
✓ Processed batch_29.json
✓ Processed batch_3.json
✓ Processed batch_30.json
✓ Processed batch_31.json
✓ Processed batch_32.json
✓ Processed batch_34.json
✓ Processed batch_35.json
✓ Processed batch_36.json
✓ Processed batch_38.json
✓ Processed batch_39.json
✓ Processed batch_4.json
✓ Processed batch_40.json
✓ Processed batch_41.json
✓ Processed batch_42.json
✓ Processed batch_45.json
✓ Processed batch_47.json
✓ Processed batch_48.json
✓ Processed batch_5.js

In [27]:
semantic_blocks

['title - Extracted Information for The Standard for Program Management Fifth Edition Extracted Key Concepts and Findings from the Standard for Program Management Fifth Edition Research Methodology and Findings The Standard for Program Management / Project Management Institute Extracted Key Concepts, Methodologies, Findings, and Insights from the Provided Document',
 'sources - PDF document (Researcher), PMIstandards+ (Digital Platform), Blagoja Jankoski (Licensee), Methodological Approach (Authoritative Source), Data Collection Methods (Expert Source), LCCN (Identifier), ISBN (Identifier), Subjects (Classification), DDC (Classification), LC record (Identifier), ISBN (Identifier), Published by (Publisher), PMI.org (Publisher), Copyright (Copyright notice), Project Management Institute, Inc. (Publisher), PDF document containing research, technical, or academic content (Document Source), Act like an experienced research analyst. (Research Analyst)',
 "summary - - Act like an experienced 

In [None]:
# Set up connection to Ollama using langchain

In [33]:
llm_text = ChatOllama(
    model="gemma2:2b",
    temperature=0,
)
embeddings = OllamaEmbeddings(
    model="nomic-embed-text:latest",
)

In [29]:
itext2kg_star = iText2KG_Star(llm_model=llm_text, embeddings_model=embeddings)

In [30]:
sb = semantic_blocks[0:40]

In [None]:
# Create graph

In [61]:
kg = await itext2kg_star.build_graph(
    sections=sb,
    ent_threshold=0.8,
    rel_threshold=0.7,
)

[2025-09-07 20:53:27] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 1
[2025-09-07 20:53:42] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 2
[2025-09-07 20:54:19] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [project management institute, inc.:Publisher] --merged--> [institute:Project_Management_Institute]
[2025-09-07 20:54:19] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [methodological approach:Methodological_Approach] --merged--> [methodologies:Methodologies]
[2025-09-07 20:54:19] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 3
[2025-09-07 20:54:48] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [standard for program management:Standard_for_Program_Management] --merged--> [standard:Standard_for_Program_

In [None]:
# Ran into blocking here, had to remove several blocks to continue

In [59]:
sb.remove(sb[25])

In [60]:
len(sb)

25

In [None]:
# Storing in Neo4j

In [62]:
from itext2kg.graph_integration import Neo4jStorage

URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "admin123"

# Note: Graph visualization remains synchronous
graph_integrator = Neo4jStorage(uri=URI, username=USERNAME, password=PASSWORD)
graph_integrator.visualize_graph(knowledge_graph=kg)