In [None]:
from itext2kg import itext2kg_star
from langchain_ollama import ChatOllama, OllamaEmbeddings
import time
from datetime import datetime

  warn(


In [None]:
from itext2kg.documents_distiller import DocumentsDistiller, Article
import asyncio
from itext2kg import iText2KG_Star
from itext2kg.logging_config import setup_logging, get_logger

In [None]:
# Set up the model connections through LangChain:
# - llm: used by the document distiller
# - llm_text and embeddings: used for knowledge-graph extraction

In [None]:
# Chat model that is passed to DocumentsDistiller further down (distillation step).
llm = ChatOllama(
    model="llama3.2:1b",
    temperature=0,
)

In [None]:
# Chat model and embedding model that are passed to iText2KG_Star further down
# (entity/relation extraction step).
llm_text = ChatOllama(
    model="gemma2:2b",
    temperature=0,
)
embeddings = OllamaEmbeddings(
    model="nomic-embed-text:latest",
)

In [None]:
# Load initial data

In [None]:
from langchain.document_loaders import PyPDFLoader
# Load the seed PDF and split it into documents. The path is a plain string:
# the original used an f-string with no placeholders, which adds nothing.
loader = PyPDFLoader("./data/input/pmi-project-performance-domains.pdf")
pages = loader.load_and_split()

In [None]:
# Load data for iteration

In [None]:
# Initialize components
document_distiller = DocumentsDistiller(llm_model=llm)
# NOTE: this rebinds the name `itext2kg_star`, which the first cell imported
# from the itext2kg package; from here on it refers to the iText2KG_Star instance.
itext2kg_star = iText2KG_Star(llm_model=llm_text, embeddings_model=embeddings)

# PDFs that are distilled in a later cell and then merged into the existing
# knowledge graph, in this order.
file_paths = [
    "./data/input/pmi-models-methods-artifacts.pdf",
    "./data/input/12-project-management-principles.pdf",
]

# Instruction prompt passed to the distiller as `IE_query`.
# The document description must match the input PDFs (project-management
# material): the previous wording claimed the document was about Scrum, and the
# recorded outputs show that phrase being echoed back as titles and entities.
IE_query = '''
# DIRECTIVES :
- Act like an experienced information extractor.
- You have a document about project management.
- If you do not find the right information, keep its place empty.
'''

loader = PyPDFLoader(f"./data/input/pmi-project-performance-domains.pdf")
pages = loader.load_and_split()

# Process first document to initialize the KG
facts_0 = await document_distiller.distill(documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages], IE_query=IE_query, output_data_structure=Article)
semantic_blocks = [f"{key} - {value}".replace("{", "[").replace("}", "]")
                     for key, value in facts_0.model_dump().items()
                     if value != [] and value != "" and value is not None]

In [None]:
# Build initial knowledge graph from the seed document's semantic blocks.
# `kg` is reassigned by later cells as further documents are merged in.
kg = await itext2kg_star.build_graph(
    sections=semantic_blocks,
    ent_threshold=0.8,      # Higher threshold for more distinct entities
    rel_threshold=0.7,      # Threshold for relationship merging
)

print(f"Dynamic KG completed! Entities: {len(kg.entities)}, Relationships: {len(kg.relationships)}")

[2025-08-20 11:35:28] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 1
[2025-08-20 11:35:35] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 2
[2025-08-20 11:36:22] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 3
[2025-08-20 11:37:05] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum software development process:Guide] --merged--> [scrum software development process:title]
[2025-08-20 11:37:05] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum software development process:Document] --merged--> [scrum software development process:title]
[2025-08-20 11:37:05] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum software development process:Practices] --merged--> [scrum software development process:title]


In [None]:
# Convert remaining data to semantic blocks

In [None]:
semantic_blocks_list = []
for file in file_paths:
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()

    facts_0 = await document_distiller.distill(documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages], IE_query=IE_query, output_data_structure=Article)
    semantic_blocks = [f"{key} - {value}".replace("{", "[").replace("}", "]")
                         for key, value in facts_0.model_dump().items()
                         if value != [] and value != "" and value is not None]
    semantic_blocks_list.append(semantic_blocks)

In [None]:
# Display the distilled semantic blocks: one inner list per entry of file_paths.
semantic_blocks_list

[['title - Scrum Software Development Process Scrum Software Development Process Models Scrum Software Development Process Information Extraction Scrum Software Development Process Information Extraction',
  "authors - [['name': 'Agile Manifesto', 'affiliation': 'Project Management Institute, Inc.'], ['name': 'Scrum Framework', 'affiliation': 'Scrum Alliance'], ['name': 'Scrum Team', 'affiliation': 'Scrum.org'], ['name': 'Experienced Information Extractor', 'affiliation': 'Project Management Institute, Inc.'], ['name': 'Experienced Information Extractor', 'affiliation': 'Project Management Institute, Inc.']]",
  'abstract - The Scrum software development process is a framework for managing and completing complex projects. It emphasizes teamwork, accountability, and iterative progress toward well-defined goals. This document describes the Scrum software development process. It includes models for enabling outcome models, methods, and artifacts. The Scrum software development process is 

In [None]:
# Incrementally update with subsequent documents
# NOTE: the range starts at 1, so semantic_blocks_list[0] / file_paths[0] is
# NOT merged by this loop; a later cell in this notebook is meant to process
# that file.
for i in range(1, len(semantic_blocks_list)):
    print(f"Processing document {i+1} from {file_paths[i]}")

    # Update the knowledge graph incrementally
    kg = await itext2kg_star.build_graph(
        sections=semantic_blocks_list[i],
        existing_knowledge_graph=kg.model_copy(),  # Pass existing KG for incremental updates
        ent_threshold=0.8,
        rel_threshold=0.7
    )
    print(f"Dynamic KG completed! Entities: {len(kg.entities)}, Relationships: {len(kg.relationships)}")

Processing document 2 from ./data/input/12-project-management-principles.pdf
[2025-08-20 11:45:38] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 1
[2025-08-20 11:45:51] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 2
[2025-08-20 11:47:04] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 3
[2025-08-20 11:47:26] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [process:Scrum_software_development_process] --merged--> [scrum software development process:title]
[2025-08-20 11:47:26] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 4
[2025-08-20 11:47:41] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum master:Team_roles] --merged--> [scrum team:authors]
[2025-08-20 11:47:41] [    INF

In [None]:
# The for loop above started at index 1 and skipped the first file, so process the missed file here

In [None]:
print(f"Processing document {0+1} from {file_paths[0]}")

# Update the knowledge graph incrementally
kg = await itext2kg_star.build_graph(
    sections=semantic_blocks_list[i],
    existing_knowledge_graph=kg.model_copy(),  # Pass existing KG for incremental updates
    ent_threshold=0.8,
    rel_threshold=0.7
)
print(f"Dynamic KG completed! Entities: {len(kg.entities)}, Relationships: {len(kg.relationships)}")

Processing document 1 from ./data/input/pmi-models-methods-artifacts.pdf
[2025-08-20 11:51:06] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 1
[2025-08-20 11:51:12] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 2
[2025-08-20 11:52:26] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 3
[2025-08-20 11:52:48] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [process:Scrum_software_development_process] --merged--> [scrum software development process:title]
[2025-08-20 11:52:48] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 4
[2025-08-20 11:53:03] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum master:Team_roles] --merged--> [scrum team:authors]
[2025-08-20 11:53:03] [    INFO] [

In [None]:
# Preprocessing function (Neo4j does not handle entity labels starting with a number)
def sanitize_node_labels(kg, verbose=False):
    """Prefix entity labels that begin with a digit with ``"N_"``.

    The entities are modified in place and the same graph object is returned,
    so the caller's ``kg`` is changed as well.

    Args:
        kg: Object exposing an ``entities`` iterable whose items carry a
            ``label`` attribute.
        verbose: When True, print every entity (followed by a blank line)
            after it has been processed. Off by default because printing a
            whole entity also prints its embedding array.

    Returns:
        The ``kg`` object that was passed in.
    """
    for node in kg.entities:
        label = node.label
        # Empty or missing labels are left untouched; indexing them would raise.
        if label and label[0].isdigit():
            node.label = "N_" + label
        if verbose:
            print(node)
            print("\n")
    return kg

# Apply sanitization before visualization
# sanitize_node_labels returns the graph object it was given, so kg_sanitized
# and kg refer to the same (modified) graph.
kg_sanitized = sanitize_node_labels(kg)
# NOTE(review): graph_integrator is not defined in any cell visible here --
# presumably it was created in a cell that is not shown or was removed;
# confirm it exists on a fresh kernel before this call.
graph_integrator.visualize_graph(knowledge_graph=kg_sanitized)

label='Practices' name='software development process' properties=EntityProperties(embeddings=array([ 4.95918490e-02,  1.11922169e-02, -1.53193786e-01, -3.39484370e-03,
        9.79448398e-03, -5.28637358e-03,  4.12039252e-02, -1.67882460e-02,
        1.39033740e-03, -2.14188650e-02,  5.81347500e-04, -1.72530440e-02,
        9.62194112e-02,  5.67354416e-02,  1.57632411e-02, -4.52384880e-02,
        1.11621380e-02, -7.16464110e-02, -6.29394098e-02, -2.84070664e-03,
       -2.28548025e-02, -4.13712880e-02,  1.64644272e-02, -2.69872180e-02,
        1.09422416e-01,  2.90710756e-02,  5.45684132e-03,  5.99485850e-03,
       -4.35076732e-03, -9.57092450e-03,  2.96242920e-02,  1.18611934e-02,
        2.46960336e-02, -1.48328121e-02, -1.80277764e-02, -7.71256698e-02,
        2.91126764e-02, -2.31912688e-02,  1.35112444e-02, -4.62455100e-02,
        9.81081000e-03, -2.11021676e-02, -1.09604000e-03, -3.02955352e-02,
        5.13599064e-02, -1.62443561e-02,  2.13419940e-02, -2.13426386e-02,
       

In [None]:
"""
Saved in TestStar_v.0.1
"""

'\nSaved in TestStar_v.0.1\n'