## Prepare Data

In [1]:
import logging
import os

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from setting.db import db_manager
from knowledge_graph.knowledge import KnowledgeBuilder
from knowledge_graph.graph_builder import KnowledgeGraphBuilder

# Configure console logging before anything else is constructed: until the root
# logger is configured only WARNING and above are emitted, so any INFO-level
# messages logged while the objects below are being built would be lost.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(filename)s:%(lineno)d: %(message)s'
)
logger = logging.getLogger(__name__)

# LLM client shared by both builders.
# NOTE(review): the arguments look like (provider, model name) - confirm against LLMInterface.
llm_client = LLMInterface("ollama", "qwen3:32b-fp16")

# Session factory for the database named by the GRAPH_DATABASE_URI environment
# variable. os.getenv returns None when the variable is unset; how
# get_session_factory treats None is not visible from this notebook.
session_factory = db_manager.get_session_factory(os.getenv("GRAPH_DATABASE_URI"))

# Both builders share the same LLM client, embedding function and session factory.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding, session_factory)
graph_builder = KnowledgeGraphBuilder(llm_client, get_text_embedding, session_factory)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import os
import hashlib

# Documentation categories to publish; each one becomes its own topic.
categories = [
    'tidbcloud/API/API Overview',
    'tidbcloud/About TiDB Cloud',
]

# JSON file listing the candidate documents (path, category, link, ...).
config_file_path = '/Users/ian/Work/docs/toc_files_for_tidb_cloud.json'

# Remains an empty list whenever the file cannot be read or parsed.
loaded_docs = []

# Load the document list; every failure is reported and leaves loaded_docs empty.
try:
    with open(config_file_path, encoding='utf-8') as toc_file:
        loaded_docs = json.load(toc_file)
    print(f"Successfully loaded configuration from: {config_file_path}")
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except Exception as err:
    print(f"An unexpected error occurred while reading the file: {err}")

# Show one sample entry so the reader can see the record layout.
if len(loaded_docs) == 0:
    print("\nConfiguration file is empty.")
else:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])


# Map each topic name to the de-duplicated documents of its category.
tidb_product_docs = {}
for category in categories:
    topic_name = f"TiDBCloud Product Documentation - {category}"
    topic_entries = tidb_product_docs[topic_name] = []
    seen_topic_ids = set()

    for doc in loaded_docs:
        # Guard clauses: skip other categories and links already collected.
        if doc['category'] != category:
            continue
        topic_id = f"{category}-{doc['web_view_link']}"
        if topic_id in seen_topic_ids:
            continue
        seen_topic_ids.add(topic_id)

        entry = {
            'topic_name': topic_name,
            'path': doc['path'],  # required
            'doc_link': doc['web_view_link'],  # required
            'category': category,
            'updated_at': doc['modified_time'],
            'mime_type': doc['mime_type'],
            'version': "2025-07-07"
        }
        topic_entries.append(entry)

    print(f"Topic: {topic_name}, Number of documents: {len(topic_entries)}")

Successfully loaded configuration from: /Users/ian/Work/docs/toc_files_for_tidb_cloud.json

Example: Accessing first document data:
{'path': '/Users/ian/Work/docs/tidb-cloud/tidb-cloud-intro.md', 'category': 'tidbcloud/About TiDB Cloud', 'modified_time': '2025-01-09 08:47:10 +0000', 'web_view_link': 'https://docs.pingcap.com/tidbcloud/tidb-cloud-intro/', 'mime_type': 'text/markdown'}
Topic: TiDBCloud Product Documentation - tidbcloud/API/API Overview, Number of documents: 1
Topic: TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud, Number of documents: 3


## RESTful API Example

### Upload documents by topic

The same document can be uploaded to different topics repeatedly, and the backend will automatically handle deduplication.

In [3]:
# Select the topic whose documents the upload cells below will send.
topic_name = "TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud"
# Rebinds topic_docs to this topic's list of document records.
topic_docs = tidb_product_docs[topic_name]
# Bare last expression so the notebook displays the selected records.
topic_docs

[{'topic_name': 'TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud',
  'path': '/Users/ian/Work/docs/tidb-cloud/tidb-cloud-intro.md',
  'doc_link': 'https://docs.pingcap.com/tidbcloud/tidb-cloud-intro/',
  'category': 'tidbcloud/About TiDB Cloud',
  'updated_at': '2025-01-09 08:47:10 +0000',
  'mime_type': 'text/markdown',
  'version': '2025-07-07'},
 {'topic_name': 'TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud',
  'path': '/Users/ian/Work/docs/tidb-cloud/high-availability-with-multi-az.md',
  'doc_link': 'https://docs.pingcap.com/tidbcloud/high-availability-with-multi-az/',
  'category': 'tidbcloud/About TiDB Cloud',
  'updated_at': '2025-04-17 05:54:42 +0000',
  'mime_type': 'text/markdown',
  'version': '2025-07-07'},
 {'topic_name': 'TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud',
  'path': '/Users/ian/Work/docs/mysql-compatibility.md',
  'doc_link': 'https://docs.pingcap.com/tidbcloud/mysql-compatibility/',
  'category': 'tidbcloud/About T

In [None]:
from contextlib import ExitStack

import requests

# Upload every document of the selected topic in a single multipart request.
url = "http://192.168.206.252:23333/api/v1/save"

links = [doc["doc_link"] for doc in topic_docs]

data = {
    'links': links,
    'topic_name': topic_name,
    'database_uri': os.getenv("GRAPH_DATABASE_URI")
}

# ExitStack closes every opened file once the request has finished, including
# when opening a later file or the request itself raises.
with ExitStack() as open_files:
    files = [
        (
            'files',
            (
                os.path.basename(doc["path"]),
                open_files.enter_context(open(doc["path"], 'rb')),
                # Send the document's recorded MIME type instead of labelling
                # every file (including markdown) as a PDF.
                doc.get("mime_type", "application/octet-stream"),
            ),
        )
        for doc in topic_docs
    ]
    # Bounded wait so an unreachable server cannot hang the notebook.
    response = requests.post(url, files=files, data=data, timeout=60)

print(response.status_code)
print(response.json())


In [4]:
import requests
import json
import os
from pathlib import Path

API_ENDPOINT = "http://192.168.206.252:23333/api/v1/save"

# Upload the selected topic's documents one request at a time.
for topic_doc in topic_docs:
    topic_name = topic_doc["topic_name"]
    doc_link = topic_doc["doc_link"]
    source_path = Path(topic_doc["path"])

    # The metadata travels as a JSON-encoded string inside the form fields.
    form_data = {
        "target_type": "knowledge_graph",
        "metadata": json.dumps({"topic_name": topic_name, "link": doc_link}),
    }

    # The file stays open only for the duration of this one request.
    with source_path.open("rb") as doc_file:
        print(f"uploading document '{source_path.name}' to {API_ENDPOINT}...")
        try:
            response = requests.post(
                API_ENDPOINT,
                data=form_data,
                files={"file": (source_path.name, doc_file, "application/octet-stream")},
                timeout=60,
            )
            print(f"response status code: {response.status_code}")

            # Guard clause: report a non-2xx/3xx answer and move to the next document.
            if not response.ok:
                print(f"request failed: {response.text}")
                continue

            print("response content (JSON):")
            print(json.dumps(response.json(), indent=2, ensure_ascii=False))
        except requests.exceptions.RequestException as exc:
            # Connection problems and timeouts are reported, not raised,
            # so the remaining documents are still attempted.
            print(f"request failed: {exc}")


uploading document 'tidb-cloud-intro.md' to http://192.168.206.252:23333/api/v1/save...
response status code: 200
response content (JSON):
{
  "status": "success",
  "data": {
    "id": "cf72bc65d6a8d51c07e3854cfafe299a3988567f636f901a552e508eff22a236",
    "name": "tidb-cloud-intro.md",
    "file_path": "uploads/TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud/tidb-cloud-intro_v1",
    "doc_link": "https://docs.pingcap.com/tidbcloud/tidb-cloud-intro/",
    "file_type": "markdown",
    "status": "uploaded"
  },
  "message": "Successfully processed file for knowledge graph. Status: uploaded"
}
uploading document 'high-availability-with-multi-az.md' to http://192.168.206.252:23333/api/v1/save...
response status code: 200
response content (JSON):
{
  "status": "success",
  "data": {
    "id": "572d84c0c1c102c7baaec99bc6a2cfb532fd3ceb4bd40f7e07d6d53d5e0573a3",
    "name": "high-availability-with-multi-az.md",
    "file_path": "uploads/TiDBCloud Product Documentation - tidbcloud

### Build Graph

After documents are uploaded to the same topic, a build of the corresponding graph can be triggered.

In [5]:
import requests

# Call the trigger-processing API to start processing all uploaded documents for a topic.
url = "http://192.168.206.252:23333/api/v1/knowledge/trigger-processing"
data = {
    "topic_name": topic_name
}

# Bounded wait (same 60 s used by the upload cell) so an unreachable server
# raises a timeout error instead of hanging the notebook indefinitely.
response = requests.post(url, data=data, timeout=60)
print(response.status_code)
print(response.json())

200
{'status': 'success', 'data': {'triggered_count': 3, 'topic_name': 'TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud'}, 'message': "Successfully triggered processing for 3 documents in topic 'TiDBCloud Product Documentation - tidbcloud/About TiDB Cloud'. Processing will begin shortly."}
