## Build Graph

In [1]:
import logging
from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from knowledge_graph.knowledge import KnowledgeBuilder
from knowledge_graph.graph_builder import IterativeKnowledgeGraphBuilder

# Configure console logging before any component is constructed, so that
# records emitted while the objects below are being built already go through
# this configuration (level INFO, timestamp / level / module / message).
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s'
)

# LLM client handed to the graph builder below.
# Alternative backend, kept for reference:
# llm_client = LLMInterface("bedrock", "us.deepseek.r1-v1:0")
llm_client = LLMInterface("openai_like", "qwen3-32b")

# kb_builder is used later to extract knowledge from individual documents;
# graph_builder receives the LLM client and the text-embedding function.
kb_builder = KnowledgeBuilder()
graph_builder = IterativeKnowledgeGraphBuilder(llm_client, get_text_embedding)

  from .autonotebook import tqdm as notebook_tqdm


OpenAI Base URL: http://192.168.206.101:1234/v1


In [2]:
import json
import os

# Path to the JSON file that lists the metadata of every document
config_file_path = 'docs/pdf_metadata.json'


def group_docs_by_client(docs):
    """Group document metadata records by their ``client_name``.

    Args:
        docs: Iterable of dicts. Each record must provide the keys ``path``,
            ``client_name``, ``created_time``, ``modified_time``,
            ``web_view_link`` and ``mime_type``; any other keys are ignored.

    Returns:
        dict mapping each client name to a list of trimmed records with the
        keys ``path``, ``client_name``, ``created_time``, ``modified_time``,
        ``doc_link`` (copied from ``web_view_link``) and ``mime_type``.
        Clients appear in first-seen order and each client's documents keep
        their input order.

    Raises:
        KeyError: If a record is missing one of the required keys.
    """
    grouped = {}
    # Single pass over the records; dict insertion order keeps the result
    # deterministic from run to run (iterating a set of names would not).
    for doc in docs:
        grouped.setdefault(doc['client_name'], []).append({
            'path': doc['path'],  # required
            'client_name': doc['client_name'],
            'created_time': doc['created_time'],
            'modified_time': doc['modified_time'],
            'doc_link': doc['web_view_link'],
            'mime_type': doc['mime_type']
        })
    return grouped


# Stays an empty list when the file cannot be read or parsed
loaded_docs = []

# Read the JSON configuration file; failures are reported, not raised, so the
# rest of the cell still runs with an empty document list.
try:
    with open(config_file_path, 'r', encoding='utf-8') as f:
        loaded_docs = json.load(f)
    print(f"Successfully loaded configuration from: {config_file_path}")
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred while reading the file: {e}")

if len(loaded_docs) > 0:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])
else:
    print("\nConfiguration file is empty.")


# client_docs: client name -> list of trimmed document records
client_docs = group_docs_by_client(loaded_docs)
# Kept as a set of the distinct client names, as before
client_name_list = set(client_docs)
for client_name, docs_for_client in client_docs.items():
    print(f"Client: {client_name}, Number of documents: {len(docs_for_client)}")


Successfully loaded configuration from: docs/pdf_metadata.json

Example: Accessing first document data:
{'path': 'docs/Apple/Apple Customer Questions/Apple Customer Questions.pdf', 'magic_pdf_path': 'docs/Apple/Apple Customer Questions/Apple Customer Questions_magic_pdf', 'pymupdf_path': 'docs/Apple/Apple Customer Questions/Apple Customer Questions_pymupdf', 'magic_pdf_md_path': 'docs/Apple/Apple Customer Questions/Apple Customer Questions_magic_pdf/Apple Customer Questions.md', 'pymupdf_md_path': 'docs/Apple/Apple Customer Questions/Apple Customer Questions_pymupdf/Apple Customer Questions.md', 'client_name': 'Apple', 'created_time': '2023-11-21T20:56:27.253000+00:00', 'modified_time': '2024-01-30T15:52:12.471000+00:00', 'web_view_link': 'https://docs.google.com/document/d/1MG8UTBxv-ZQYWY4AwYTtpQ9uHdfh3ytWaoEg1JN6CaA/edit?usp=drivesdk', 'mime_type': 'application/vnd.google-apps.document'}
Client: Rubrik, Number of documents: 6
Client: Airbnb, Number of documents: 6
Client: Visa - AI P

In [4]:
# Client whose documents are processed in this run
client_name = "Apple"
docs = client_docs[client_name]

topic_name = f"Customer Tracking for {client_name}"

print("step 1: upload docs to knowledge base")
# Successfully extracted sources, keyed by their source_id
topic_docs = {}
for doc in docs:
    file_path = doc['path']
    try:
        # The document's metadata record is passed along with its path
        res = kb_builder.extract_knowledge(
            file_path,
            doc
        )
        if res['status'] == 'success':
            topic_docs[res['source_id']] = {
                "source_id": res['source_id'],
                "source_name": res['source_name'],
                "source_content": res['source_content'],
                "source_link": res['source_link'],
            }
        else:
            # Report through logging: print() rejects the exc_info keyword, so
            # the previous call raised a TypeError that hid the real error.
            # No exception is active on this path, hence no exc_info here.
            logging.error(f"process index {file_path} failed, {res['error']}")

    except Exception as e:
        # Best effort: log the failure with its traceback and move on to the
        # next document instead of aborting the whole run.
        logging.error(f"process index {file_path} failed, {e}", exc_info=True)

topic_docs

[2025-06-12 13:54:36,489] INFO in extract: Using magic_pdf to extract data from PDFs
[2025-06-12 13:54:36,493] INFO in extract: Processing with magic_pdf: docs/Apple/Apple Customer Questions/Apple Customer Questions.pdf, output directory: docs/Apple/Apple Customer Questions/Apple Customer Questions_magic_pdf
[32m2025-06-12 13:54:36.495[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m157[0m - [1mlang: None[0m
[32m2025-06-12 13:54:36.566[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mdoc_analyze[0m:[36m162[0m - [1mBatch 1/1: 7 pages/7 pages[0m


step 1: upload docs to knowledge base


Layout Predict: 100%|██████████| 7/7 [00:01<00:00,  4.52it/s]
MFD Predict: 100%|██████████| 7/7 [00:02<00:00,  2.44it/s]
MFR Predict: 100%|██████████| 4/4 [00:00<00:00, 14.87it/s]
OCR-det Predict: 100%|██████████| 7/7 [00:01<00:00,  3.59it/s]
Table Predict: 0it [00:00, ?it/s]
OCR-rec Predict: 100%|██████████| 212/212 [00:04<00:00, 43.44it/s]
Processing pages: 100%|██████████| 7/7 [00:00<00:00, 39.78it/s]
[2025-06-12 13:54:48,578] INFO in extract: ✅ Successfully processed with magic_pdf: docs/Apple/Apple Customer Questions/Apple Customer Questions.pdf, output directory: docs/Apple/Apple Customer Questions/Apple Customer Questions_magic_pdf, generated files: Apple Customer Questions.md (main markdown), Apple Customer Questions_content_list.json (content structure), Apple Customer Questions_middle.json (metadata)
[2025-06-12 13:54:48,988] INFO in knowledge: Source data already exists for docs/Apple/Apple Customer Questions/Apple Customer Questions.pdf, id: cc5b92b6-ef73-4c4d-8b54-60c610d3

{'cc5b92b6-ef73-4c4d-8b54-60c610d3443d': {'source_id': 'cc5b92b6-ef73-4c4d-8b54-60c610d3443d',
  'source_name': 'Apple Customer Questions',
  'source_content': '# Question\n\nHow does TiCDC handle the processes if a node goes down.\n\n# Answer\n\nWhen a node failure occurs during TiCDC replication, the system takes the following steps to recover:\n\n1.Detect the node failure: TiCDC uses a heartbeat mechanism to detect node failures. If a node does not respond to a heartbeat within a certain timeout, it is considered failed.   \n2．Elect a new leader: Once a node failure is detected, TiCDC will elect a new leader from the remaining nodes.   \n3.Synchronize data with the new leader: The new leader will need to synchronize its data with the failed node before it can start replicating changes.   \n4．Resume replication: Once the new leader is synchronized, it will resume replicating changes from TiDB to the downstream systems.\n\nDuring the recovery process, there may be a brief period of ti

In [9]:
# Topic under which the graph for this client is built
topic_name = f"Customer {client_name} Tracking"

print("step 2: add to graph")
# The builder takes the extracted source records as a plain list
source_documents = list(topic_docs.values())
result = graph_builder.build_iterative_knowledge_graph(
    topic_name,
    source_documents,
)
print(result)

[2025-06-12 14:32:52,246] INFO in graph_builder: Building iterative knowledge graph for topic: Customer Apple Tracking: 9 documents
[2025-06-12 14:32:52,247] INFO in graph_builder: === Stage 0: Generating document summaries ===


step 2: add to graph


[2025-06-12 14:32:54,646] INFO in summarizer: Generating summary for document: Apple Customer Questions
[2025-06-12 14:34:19,986] INFO in _client: HTTP Request: POST http://192.168.206.101:1234/v1/chat/completions "HTTP/1.1 200 OK"
[2025-06-12 14:34:22,553] INFO in summarizer: Using cached summary for document: TiDB's Relationship with MySQL and Security Vulnerabilities
[2025-06-12 14:34:23,697] INFO in summarizer: Using cached summary for document: 1 - Account Discovery Capture Sheet (with instruction)
[2025-06-12 14:34:25,010] INFO in summarizer: Using cached summary for document: Louis FINAL 10102023
[2025-06-12 14:34:26,223] INFO in summarizer: Using cached summary for document: Apple Relevant Features
[2025-06-12 14:34:27,569] INFO in summarizer: Using cached summary for document: Apple POC Joint Execution Plan
[2025-06-12 14:34:28,820] INFO in summarizer: Using cached summary for document:  Technical Discovery Capture Form 
[2025-06-12 14:34:30,189] INFO in summarizer: Using cach

{'topic_name': 'Customer Apple Tracking', 'blueprint_id': '3d4fad24-47d2-493c-a8a0-aef68879e545', 'documents_processed': 9, 'summaries_generated': 9, 'triplets_extracted': 18, 'semantic_triplets': 10, 'structural_triplets': 8, 'entities_created': 1, 'relationships_created': 18, 'skeletal_entities_created': 0, 'skeletal_relationships_created': 0, 'narrative_entities_created': 1, 'narrative_relationships_created': 18, 'skeletal_graph': {'entities_count': 10, 'relationships_count': 10, 'skeletal_entities': [{'description': 'Customer entity evaluating TiDB for tracking systems', 'entity_type': 'Customer', 'name': 'Apple', 'role': 'central_topic'}, {'description': "Distributed database solution evaluated for Apple's tracking needs", 'entity_type': 'Technical Component', 'name': 'TiDB Cloud', 'role': 'key_aspect'}, {'description': 'Change Data Capture tool for replication and synchronization in TiDB', 'entity_type': 'Technical Component', 'name': 'TiCDC', 'role': 'supporting_element'}, {'des