In [None]:
### Fix import errors by adding the project root to sys.path

import sys
import os

# The notebook lives in 'examples/', so the project root is the parent of the
# current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Put the project root at the front of sys.path unless it is already listed.
if project_root in sys.path:
    print(f"Project root is already in Python path: {project_root}")
else:
    sys.path.insert(0, project_root)
    print(f"Added project root to Python path: {project_root}")

# Optional sanity check: show the first five sys.path entries.
print("\nVerifying sys.path:")
for position, entry in enumerate(sys.path[:5]):
    print(f"{position}: {entry}")

In [None]:
import json
import re
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex,Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [None]:
# Ollama-backed generation model.
llm = Ollama(
    model="gemma3:12b",
    temperature=0.0,
    context_window=8128,
    request_timeout=120.0,
)

# Ollama-backed embedding model.
embed_model = OllamaEmbedding(
    ollama_additional_kwargs={"mirostat": 0},
    model_name="snowflake-arctic-embed2:latest",
)

# Register the models and the chunking parameters on the global llama_index
# Settings object in one place.
Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 64
Settings.embed_model = embed_model

In [None]:
# Load the list of novels; each entry provides the "corpus_name" used by the
# loops further down. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so non-ASCII text does not depend on the platform's default codec.
with open('../.data/novel.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Modes to process; each one names a subdirectory of the persisted storage
# and graph folders used by the loops in this notebook.
modes = [
    "gli",
    "llm",
    "hybrid",
]

In [None]:
def _find_vis_dataset_start(html_content, variable_name):
    """Return the index where the payload of ``<variable_name> = new vis.DataSet(`` begins.

    The returned index points at the first non-whitespace character after the
    opening parenthesis. Returns None when the assignment is not present.
    """
    opening = re.search(
        re.escape(variable_name) + r' = new vis\.DataSet\(\s*', html_content
    )
    return opening.end() if opening else None


def extract_knowledge_graph_data(file_path):
    """
    Count the nodes and relations stored in a vis.js knowledge-graph HTML file.

    The file is expected to contain the assignments
    ``nodes = new vis.DataSet(<json>)`` and ``edges = new vis.DataSet(<json>)``.
    Each JSON payload is decoded starting right after the opening parenthesis,
    so text inside the data (for example a label containing ``);``) cannot
    truncate it the way a "match up to the first ');'" regex would.

    Args:
        file_path (str): The path to the HTML file (read as UTF-8).

    Returns:
        dict: ``{"nodes": <int>, "relations": <int>}`` on success, otherwise
              ``{"error": <message>}`` when the file is missing, either
              assignment cannot be found, or a payload is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except FileNotFoundError:
        return {"error": f"File not found at {file_path}"}

    # Locate both payloads first so a missing assignment is reported before
    # any parsing is attempted.
    nodes_start = _find_vis_dataset_start(html_content, "nodes")
    edges_start = _find_vis_dataset_start(html_content, "edges")
    if nodes_start is None or edges_start is None:
        return {"error": "Could not find nodes or edges data in the specified format."}

    # raw_decode parses exactly one JSON value beginning at the given index and
    # ignores whatever follows it (the closing ");" and the rest of the page).
    decoder = json.JSONDecoder()
    try:
        nodes_data, _ = decoder.raw_decode(html_content, nodes_start)
        edges_data, _ = decoder.raw_decode(html_content, edges_start)
    except json.JSONDecodeError:
        return {"error": "Failed to parse the data. Check if it is valid JSON."}

    return {
        "nodes": len(nodes_data),
        "relations": len(edges_data)
    }

In [None]:
# Export every persisted index as an HTML graph. This only needs to run when the
# files under .persistent_storage/.graphs/ have not been created yet; the
# node/relation counting in this notebook reads those files.
for mode in modes:
    # On a first run the per-mode output folder may not exist yet, and the export
    # presumably does not create missing parent directories, so create it up front.
    graph_dir = f"./.persistent_storage/.graphs/{mode}"
    os.makedirs(graph_dir, exist_ok=True)
    for novel in data:
        corpus_name = novel["corpus_name"]
        storage_context = StorageContext.from_defaults(
            persist_dir=f"./.persistent_storage/.storage_context/{mode}/{corpus_name}"
        )
        index = load_index_from_storage(storage_context)
        index.property_graph_store.save_networkx_graph(
            name=f"{graph_dir}/{corpus_name}_kg.html"
        )
        

In [None]:
# Sum the node and relation counts over all novels and report one total per mode.
for mode in modes:
    added_nodes, added_rels = 0, 0
    for novel in data:
        corpus_name = novel["corpus_name"]
        html_path = f"./.persistent_storage/.graphs/{mode}/{corpus_name}_kg.html"
        counts = extract_knowledge_graph_data(html_path)
        # A graph that could not be read is reported and left out of the totals.
        if "error" in counts:
            print(counts["error"])
            continue
        added_nodes += counts["nodes"]
        added_rels += counts["relations"]
    print(f"Mode: {mode}")
    print(f"num nodes: {added_nodes}")
    print(f"num rels: {added_rels}")