In [None]:
# Set up connection using langchain

In [None]:
from langchain_ollama import ChatOllama, OllamaEmbeddings
# Model names live in one place so the chat and embedding models can be
# swapped independently; both currently point at the same Ollama model.
CHAT_MODEL = "llama3"
EMBEDDING_MODEL = "llama3"

# Chat model used by the distiller and the graph builder in later cells.
# temperature=0 is set to keep the outputs as repeatable as possible.
llm = ChatOllama(
    model=CHAT_MODEL,
    temperature=0,
)

# Embedding model handed to the graph builder in a later cell.
embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
)

In [None]:
# Input data

In [None]:
from langchain.document_loaders import PyPDFLoader
# Input PDF, relative to the notebook's working directory.
PDF_PATH = "./data/input/test.pdf"

loader = PyPDFLoader(PDF_PATH)

# `pages` is a sequence of documents; later cells read `page_content` from each.
# NOTE(review): load_and_split() may also run the loader's default text
# splitter, so an item can be a chunk rather than a whole PDF page — confirm
# if exact page boundaries matter.
pages = loader.load_and_split()

In [None]:
from itext2kg.documents_distiller import DocumentsDistiller, CV, Article
# Build the distiller around the chat model configured earlier in the notebook.
document_distiller = DocumentsDistiller(llm_model=llm)

In [None]:
# Test to see the output when distilling pages separately

In [None]:
from tqdm.asyncio import tqdm
import asyncio

document_distiller = DocumentsDistiller(llm_model=llm)
IE_query = '''
# DIRECTIVES :
- Act like an experienced information extractor.
- You have a document describing the Scrum software development process.
- If you do not find the right information, keep its place empty.
'''

all_distilled_results = []
total_pages = len(pages)

print(f"Processing {total_pages} pages individually...")

for i, page in enumerate(tqdm(pages, desc="Distilling pages")):
    try:
        # Process one page at a time
        processed_content = page.page_content.replace("{", '[').replace("}", "]")

        distilled_result = await document_distiller.distill(
            documents=[processed_content],
            IE_query=IE_query,
            output_data_structure=Article
        )

        all_distilled_results.append(distilled_result)

        # Optional: log every N pages
        if (i + 1) % 10 == 0:
            print(f"✓ Completed {i + 1}/{total_pages} pages")

    except Exception as e:
        print(f"Error processing page {i + 1}: {e}")
        all_distilled_results.append(None)  # or handle error as needed

print(f"✓ All {total_pages} pages processed!")

Processing 15 pages individually...


Distilling pages:  67%|██████████████████████████████████████████▋                     | 10/15 [04:38<02:45, 33.11s/it]

✓ Completed 10/15 pages


Distilling pages: 100%|████████████████████████████████████████████████████████████████| 15/15 [06:48<00:00, 27.24s/it]

✓ All 15 pages processed!





In [None]:
## Took about 7 minutes for 15 pages, approx. 27 sec/page

In [None]:
# Inspect the first distilled result as (field, value) pairs.
all_distilled_results[0].model_dump().items()

dict_items([('title', 'DIRECTIVES'), ('authors', [{'name': 'Ken Schwaber & Jeff Sutherland', 'affiliation': 'The Scrum Guide'}]), ('abstract', ''), ('key_findings', ''), ('limitation_of_sota', ''), ('proposed_solution', ''), ('paper_limitations', '')])

In [None]:
# Separate into semantic blocks

In [None]:
# Flatten every distilled result into "field - value" strings, dropping
# fields that are empty ([], "" or None).
semantic_blocks = []
for item in all_distilled_results:
    # Pages whose distillation failed are stored as None; skip them instead of
    # crashing on `None.model_dump()`.
    if item is None:
        continue
    # Curly braces are replaced with square brackets in the rendered string.
    semantic_blocks.extend(
        f"{key} - {value}".replace("{", "[").replace("}", "]")
        for key, value in item.model_dump().items()
        if value != [] and value != "" and value is not None
    )

In [None]:
# Display the flattened "field - value" strings.
semantic_blocks

['title - DIRECTIVES',
 "authors - [['name': 'Ken Schwaber & Jeff Sutherland', 'affiliation': 'The Scrum Guide']]",
 'title - Purpose of the Scrum Guide',
 "authors - [['name': 'Ken Schwaber & Jeff Sutherland', 'affiliation': ''], ['name': '', 'affiliation': '']]",
 'abstract - We developed Scrum in the early 1990s. We wrote the first version of the Scrum Guide in 2010 to help people worldwide understand Scrum. We have evolved the Guide since then through small, functional updates. Together, we stand behind it.',
 'title - Purpose of the Scrum Guide',
 'title - Commitment: Definition of Done',
 'title - Scrum Definition',
 'key_findings - 1. A Product Owner orders the work for a complex problem into a Product Backlog.\n2. The Scrum Team turns a selection of the work into an Increment of value during a Sprint.\n3. The Scrum Team and its stakeholders inspect the results and adjust for the next Sprint.\n4. Repeat',
 'paper_limitations - Scrum is simple. Try it as is and determine if its p

In [None]:
from itext2kg import iText2KG
# Graph builder wired to the chat model and the embedding model set up earlier.
itext2kg = iText2KG(
    llm_model=llm,
    embeddings_model=embeddings,
)

In [None]:
# Construct graph

In [None]:
kg = await itext2kg.build_graph(sections=[semantic_blocks], ent_threshold=0.6, rel_threshold=0.6)

[2025-08-19 14:35:31] [    INFO] [itext2kg.itext2kg.itext2kg] ------- Extracting Entities from the Document 1
[2025-08-19 14:36:40] [    INFO] [itext2kg.itext2kg.itext2kg] ------- Extracting Relations from the Document 1
[2025-08-19 14:38:13] [    INFO] [itext2kg.itext2kg.irelations_extraction.irelations_extractor] Verification of invented entities
[2025-08-19 14:38:13] [    INFO] [itext2kg.itext2kg.irelations_extraction.irelations_extractor] [INVENTED ENTITIES] The entities label='jeff_sutherland' name='jeff sutherland' properties=EntityProperties(embeddings=None) and label='the_scrum_guide' name='scrum guide' properties=EntityProperties(embeddings=None) are invented. Solving them ...
[2025-08-19 14:38:16] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [jeff sutherland:jeff_sutherland] --merged--> [jeff sutherland:Jeff_Sutherland]
[2025-08-19 14:38:16] [    INFO] [itext2kg.itext2kg.graph_matching.matcher] Entity was matched --- [scrum guide:the_scrum_guid

In [None]:
## 3 minutes for 15 pages, approx. 12 sec per page

In [None]:
# Entities of the constructed graph.
kg.entities

[Entity(name=the scrum guide, label=Scrum_Guide, properties=embeddings=array([ 0.00076323, -0.01079603,  0.01234522, ..., -0.01680617,
         0.00379306, -0.00287072])),
 Entity(name=ken schwaber, label=Ken_Schwaber, properties=embeddings=array([-0.0069446 , -0.00501832,  0.02016574, ...,  0.00713102,
         0.01179898, -0.00894241])),
 Entity(name=jeff sutherland, label=Jeff_Sutherland, properties=embeddings=array([-0.01258763, -0.00573601,  0.01123346, ...,  0.00373595,
         0.01075638,  0.00151081]))]

In [None]:
# Relationships of the constructed graph.
kg.relationships

[Relationship(name=Authors, startEntity=label='Jeff_Sutherland' name='jeff sutherland' properties=EntityProperties(embeddings=array([-0.01258763, -0.00573601,  0.01123346, ...,  0.00373595,
         0.01075638,  0.00151081])), endEntity=label='Scrum_Guide' name='the scrum guide' properties=EntityProperties(embeddings=array([ 0.00076323, -0.01079603,  0.01234522, ..., -0.01680617,
         0.00379306, -0.00287072])), properties=embeddings=array([ 0.01162794, -0.02450253, -0.00616446, ...,  0.00347366,
         0.01757184, -0.01354166]) observation_dates=[]),
 Relationship(name=Authors, startEntity=label='Ken_Schwaber' name='ken schwaber' properties=EntityProperties(embeddings=array([-0.0069446 , -0.00501832,  0.02016574, ...,  0.00713102,
         0.01179898, -0.00894241])), endEntity=label='Scrum_Guide' name='the scrum guide' properties=EntityProperties(embeddings=array([ 0.00076323, -0.01079603,  0.01234522, ..., -0.01680617,
         0.00379306, -0.00287072])), properties=embeddings=

In [None]:
# 3 entities and 2 relationships are fewer than expected for the size of the data, likely due to loss of context