In [None]:
from itext2kg.documents_distiller import DocumentsDistiller, Article
import asyncio
from itext2kg import iText2KG_Star
from itext2kg.logging_config import setup_logging, get_logger
from itext2kg import itext2kg_star
from langchain_ollama import ChatOllama, OllamaEmbeddings
import time
from datetime import datetime

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build a loader over ./data/unstructured/ that matches *.txt and opens each
# match with TextLoader (encoding='utf-8' is forwarded via loader_kwargs).
loader = DirectoryLoader("./data/unstructured/", glob="*.txt", loader_cls=TextLoader, loader_kwargs=dict(encoding="utf-8"))
documents = loader.load()

# Break the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pages = text_splitter.split_documents(documents)

In [None]:
# Create models for distillation

In [None]:
from typing import List, Optional
from pydantic import BaseModel, Field, field_validator
class ContentSource(BaseModel):
    # One person credited for a piece of content (author, speaker or presenter).
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model does not gain a class-level description.
    name: str = Field(description="Name of the author/speaker/presenter")
    # In Pydantic v2, `Optional[str]` only allows None as a value; the field is
    # still required unless it has a default. `default=None` makes these two
    # fields genuinely optional so that missing information can be left out.
    role: Optional[str] = Field(default=None, description="Role or position")
    affiliation: Optional[str] = Field(default=None, description="Organization or company affiliation")
class Content(BaseModel):
    # Structured description of one piece of content (article/video).
    # Every field below is declared without a default value.
    # NOTE(review): the distill calls visible in this notebook pass `Article`
    # as output_data_structure, not this model — confirm whether it is still needed.
    title: str = Field(description="Title of the content (article/video)")
    # Each entry is a ContentSource (name plus optional role/affiliation).
    sources: List[ContentSource] = Field(description="Authors/speakers involved")
    summary: str = Field(description="Brief summary of the content")
    key_concepts: List[str] = Field(description="Main concepts or topics covered")
    insights: str = Field(description="Key insights and findings")
    challenges: str = Field(description="Challenges or limitations discussed")
    solutions: str = Field(description="Proposed solutions or approaches")
    practical_applications: str = Field(description="Practical applications or implementations mentioned")

In [None]:
# Set up connections using langchain

In [None]:
# Chat model served through Ollama.
llm = ChatOllama(
    model="llama3.2:1b",
    temperature=0,
)

# The distillation cells call `document_distiller.distill(...)`, but no cell in
# the notebook created that object, so a fresh kernel stopped with a NameError.
# NOTE(review): assumes `llm` is the model meant for distillation (nothing else
# in the notebook uses it) — confirm.
document_distiller = DocumentsDistiller(llm_model=llm)

In [None]:
# Chat model that is later handed to iText2KG_Star as `llm_model`.
llm_text = ChatOllama(model="gemma2:2b", temperature=0)

# Embedding model that is later handed to iText2KG_Star as `embeddings_model`.
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

In [None]:
# Extraction instructions passed as `IE_query` to the distiller.
# The statement previously ended with a trailing comma after the closing
# quotes, which turned IE_query into a one-element tuple instead of a string.
IE_query = '''# DIRECTIVES :
            - Act like an experienced information extractor.
            - You have a YouTube transcript about Waterfall (preplanned) software development.
            - Extract key concepts, methodologies, best practices, and insights discussed.
            - If you do not find the right information, keep its place empty.
            - Focus on practical advice and real-world applications.'''

In [None]:
# Number of chunks produced by text_splitter.split_documents(documents).
len(pages)

417

In [None]:
# Start from an empty list; the distillation cell rebinds this name with its results.
semantic_blocks = list()

In [None]:
# Split into blocks

In [None]:
pages_section=pages[0:9]
facts_0 = await document_distiller.distill(documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages_section], IE_query=IE_query, output_data_structure=Article)
semantic_blocks = [f"{key} - {value}".replace("{", "[").replace("}", "]")
                     for key, value in facts_0.model_dump().items()
                     if value != [] and value != "" and value is not None]

In [None]:
# Display the "<field> - <value>" strings built from the distilled facts.
semantic_blocks

['title - Business Case Initiation Tools Project Initiation in Scrum/Agile Software Development # DIRECTIVES : Act like an experienced information extractor. Extract key concepts, methodologies, best practices, and insights discussed. Extracted Key Concepts and Methodologies from the Project Charter Organizational Breakdown Structure (OBS) and Resource Breakdown Structure (RBS): Key Concepts and Methodologies # DIRECTIVES : Act like an experienced information extractor. # DIRECTIVES : \n            - Act like an experienced information extractor.\n            - You have a YouTube transcript about Scrum/Agile software development.\n            - Extract key concepts, methodologies, best practices, and insights discussed.\n            - If you do not find the right information, keep its place empty.\n            - Focus on practical advice and real-world applications. Stakeholder Matrix/Map Insights Salience Chart for Stakeholders in Scrum/Agile Software Development',
 "authors - [['name

In [None]:
# Test whether pasting the printed output back in as a literal produces a usable object

In [None]:
# Hand-pasted list of "<field> - <value>" strings, one per field; it is passed
# as `sections` to build_graph later in the notebook.
sb = ['title - Business Case Initiation Tools Project Initiation in Scrum/Agile Software Development # DIRECTIVES : Act like an experienced information extractor. Extract key concepts, methodologies, best practices, and insights discussed. Extracted Key Concepts and Methodologies from the Project Charter Organizational Breakdown Structure (OBS) and Resource Breakdown Structure (RBS): Key Concepts and Methodologies # DIRECTIVES : Act like an experienced information extractor. # DIRECTIVES : \n            - Act like an experienced information extractor.\n            - You have a YouTube transcript about Scrum/Agile software development.\n            - Extract key concepts, methodologies, best practices, and insights discussed.\n            - If you do not find the right information, keep its place empty.\n            - Focus on practical advice and real-world applications. Stakeholder Matrix/Map Insights Salience Chart for Stakeholders in Scrum/Agile Software Development',
 "authors - [['name': 'Experienced Information Extractor', 'affiliation': 'YouTube Transcript'], ['name': 'Samantha', 'affiliation': ''], ['name': 'Samantha', 'affiliation': 'Project Manager'], ['name': 'Pet Sitter Business Plan', 'affiliation': 'Unknown'], ['name': 'Information Extractor', 'affiliation': 'Experienced Information Extractor'], ['name': 'Information Extractor', 'affiliation': 'AI Assistant'], ['name': 'YouTube transcript', 'affiliation': 'Unknown'], ['name': 'John Doe', 'affiliation': 'Project Manager'], ['name': 'John Doe', 'affiliation': 'Information Extractor']]",
 "abstract - In this context, we will be using the following project management tools to initiate our project. The first two tools are: 1. Business Case Initiation Tool In this context, the project initiation tools are being discussed as part of a feasibility study for a new business case. The goal is to determine if starting the project is worthwhile and whether the proposed solution addresses the current issue or problem. A summary of the key points will be provided, followed by solutions and a recommendation. The project charter outlines the high-level goals, scope, and deliverables of the pet buddy mobile app. It serves as a foundation for the development team to work towards and ensures that all stakeholders are aligned with the project's objectives. In this YouTube transcript, the authors discuss the importance of using OBS and RBS in Scrum/Agile software development. They highlight key concepts such as stakeholder management, organizational charts, and resource allocation. In this context, we're evaluating our project's impact on Samantha, the business owner and project sponsor. We rate their influence as high (10) and impact as high (10). However, her level of influence is extremely high due to her control over resources and funding. This means that if we want to collaborate with Samantha, we need to prioritize communication and ensure she's informed about our project's progress. A salience chart is a visual tool used to identify and prioritize stakeholders' needs, power, legitimacy, and urgency. It's particularly useful for Agile software development projects.",
 'key_findings - The business case initiation tool is used to define and document the project\'s objectives, scope, and stakeholders. It helps to identify the key issues and opportunities that need to be addressed during the project. [...] 1. The project initiation tools are being considered as part of a feasibility study for a new business case. 2. The goal is to determine if starting the project is worthwhile and whether the proposed solution addresses the current issue or problem. 3. A summary of the key points will be provided, followed by solutions and a recommendation. Scrum/Agile software development principles, methodologies, best practices, and insights discussed in the transcript. project charter [...] ["Scrum/Agile software development", "Resource Breakdown Structure (RBS)"] [Impact: High, Influence: Extremely High]\n\nSamantha (Business Owner/Project Sponsor):\n- High Impact\n- High Influence ["Stakeholders", "Scrum/Agile software development", "Salience chart"]',
 "limitation_of_sota - Limited to defining and documenting the project's objectives, scope, and stakeholders The limitation of Sota (Scrum/Agile Software Development) is that it requires a clear understanding of the project's goals, scope, and stakeholders to initiate the project effectively. No specific limitations mentioned in the transcript. The project charter is an abstract document that outlines the high-level goals and scope of the pet buddy mobile app. It provides a foundation for the development team to work towards and ensures that all stakeholders are aligned with the project's objectives. None None The salience chart can be used to monitor stakeholders' needs over time, but it may not provide a comprehensive view of their power and legitimacy.",
 "proposed_solution - The business case initiation tool provides a structured approach to identifying and prioritizing the project's requirements. It helps to ensure that all stakeholders are aligned and that the project is focused on delivering value to customers. A proposed solution for initiating the project would be to create a project charter with the business owner and project manager. This would provide a clear understanding of the project's objectives and ensure that everyone is aligned on the project's direction. The project charter is created with key concepts, methodologies, best practices, and insights discussed. The project manager and sponsor sign off on the project, funding, and people needed to initiate the project. The project charter proposes a solution to develop a pet buddy mobile app, which is a key aspect of the pet sitter business plan. It outlines the high-level goals, scope, and deliverables of the project, ensuring that all stakeholders are aligned with the objectives. Use OBS to identify stakeholders and form a project team, then use RBS for resource allocation. 1. Create a Resource Breakdown Structure (RBS) for the pet buddy mobile app project. Create a stakeholder register for PMP exam preparation and update it regularly to track new stakeholders, their roles, responsibilities, influence, and impact. This will enable easy identification of stakeholders during the exam and provide valuable insights into project management best practices. Maintain communication with Samantha to ensure she's informed about our project's progress. If necessary, collaborate on a high-level stakeholder map or matrix to visualize the impact and influence of various stakeholders. To create a salience chart for stakeholders in Scrum/Agile software development, we need to identify key concepts, methodologies, best practices, and insights discussed in the YouTube transcript. We can use a 3D chart or a Ven diagram to visualize this information.",
 'paper_limitations - Limited to defining and documenting the project\'s objectives, scope, and stakeholders. [...] 1. The paper limitation is that it does not provide any specific information about the project initiation tools or methodologies used in Scrum/Agile software development. No specific paper limitations mentioned in the transcript. The paper limitations section states that the project charter is an abstract document that provides a foundation for the development team to work towards. It ensures that all stakeholders are aligned with the project\'s objectives and provides a clear understanding of the project scope. [...] ["Capturing all people involved in the project", "Organizing stakeholders and team members"], None [Directives: Act like an experienced information extractor.\n- You have a YouTube transcript about Scrum/Agile software development.\n- Extract key concepts, methodologies, best practices, and insights discussed.\n- If you do not find the right information, keep its place empty.\n- Focus on practical advice and real-world applications.] The transcript may not provide explicit information about stakeholder engagement, power, legitimacy, and urgency. Therefore, we need to focus on practical advice and real-world applications to extract relevant insights.']

In [None]:
# Display the pasted list to inspect the strings it holds.
sb

['title - Business Case Initiation Tools Project Initiation in Scrum/Agile Software Development # DIRECTIVES : Act like an experienced information extractor. Extract key concepts, methodologies, best practices, and insights discussed. Extracted Key Concepts and Methodologies from the Project Charter Organizational Breakdown Structure (OBS) and Resource Breakdown Structure (RBS): Key Concepts and Methodologies # DIRECTIVES : Act like an experienced information extractor. # DIRECTIVES : \n            - Act like an experienced information extractor.\n            - You have a YouTube transcript about Scrum/Agile software development.\n            - Extract key concepts, methodologies, best practices, and insights discussed.\n            - If you do not find the right information, keep its place empty.\n            - Focus on practical advice and real-world applications. Stakeholder Matrix/Map Insights Salience Chart for Stakeholders in Scrum/Agile Software Development',
 "authors - [['name

In [None]:
# Create the iText2KG_Star instance from the chat model and the embedding model.
# NOTE(review): this rebinds `itext2kg_star`, which the import cell also brings
# in from the `itext2kg` package; after this line the name refers to the instance.
itext2kg_star = iText2KG_Star(
    llm_model=llm_text,
    embeddings_model=embeddings,
)

In [None]:
# Construct graph

In [None]:
kg = await itext2kg_star.build_graph(
    sections=sb,
    ent_threshold=0.8,      # Higher threshold for more distinct entities
    rel_threshold=0.7,      # Threshold for relationship merging
)

[2025-08-22 14:08:19] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 1
[2025-08-22 14:08:59] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 2
[2025-08-22 14:09:38] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [youtube transcript:authors] --merged--> [youtube transcript:YouTube_Transcript]
[2025-08-22 14:09:38] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [youtube transcript:affiliation] --merged--> [youtube transcript:YouTube_Transcript]
[2025-08-22 14:09:38] [    INFO] [itext2kg.itext2kg.itext2kg_star] ------- Extracting Relations and Deriving Entities from Document 3
[2025-08-22 14:10:14] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [samantha:Project_Impact_Evaluation] --merged--> [samantha:authors]
[2025-08-22 14:10:14] [    INFO] [itext2kg.itext2kg.graph_matching.

In [None]:
pages_section=pages[0:99]
facts_0 = await document_distiller.distill(documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages_section], IE_query=IE_query, output_data_structure=Article)
semantic_blocks = [f"{key} - {value}".replace("{", "[").replace("}", "]")
                     for key, value in facts_0.model_dump().items()
                     if value != [] and value != "" and value is not None]

CancelledError: 