In [1]:
import os
import sys
from rdflib import Graph, Namespace
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx

# Add the project root to Python's path. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

# Define the path to your turtle files
turtle_dir = os.path.join(project_root, "polianna-processed", "turtle")

# Limits for the optional static preview at the end of this cell:
# the preview is only drawn when the graph has fewer than
# MAX_TRIPLES_TO_VISUALIZE triples, and at most SAMPLE_TRIPLE_COUNT
# triples are turned into edges.
MAX_TRIPLES_TO_VISUALIZE = 1000
SAMPLE_TRIPLE_COUNT = 500

# Create a single combined graph
combined_graph = Graph()

# Get list of all ttl files. os.listdir returns entries in arbitrary order,
# so sort them to make the load order (and any error output) reproducible.
ttl_files = sorted(f for f in os.listdir(turtle_dir) if f.endswith('.ttl'))
print(f"Found {len(ttl_files)} turtle files in {turtle_dir}")

# Load all files into the combined graph with a progress bar.
# A file that fails to parse is reported and skipped (best effort), and
# remembered so the failures can be summarised once loading is finished.
failed_files = []
for ttl_file in tqdm(ttl_files, desc="Loading files"):
    file_path = os.path.join(turtle_dir, ttl_file)
    try:
        combined_graph.parse(file_path, format="turtle")
    except Exception as e:
        failed_files.append(ttl_file)
        print(f"Error loading {ttl_file}: {e}")

if failed_files:
    print(f"\n{len(failed_files)} of {len(ttl_files)} files failed to load: "
          f"{', '.join(failed_files)}")

# Print statistics about the combined graph
print("\nCombined Graph Statistics:")
print(f"Total number of triples: {len(combined_graph)}")
print(f"Number of unique subjects: {len(set(combined_graph.subjects()))}")
print(f"Number of unique predicates: {len(set(combined_graph.predicates()))}")
print(f"Number of unique objects: {len(set(combined_graph.objects()))}")

# If you want to see the namespaces in the combined graph
print("\nNamespaces in combined graph:")
for prefix, namespace in combined_graph.namespaces():
    print(f"{prefix}: {namespace}")

# If you want to save the combined graph
combined_graph_path = os.path.join(project_root, "polianna-processed", "combined_graph.ttl")
combined_graph.serialize(destination=combined_graph_path, format="turtle")
print(f"\nSaved combined graph to {combined_graph_path}")

# Optional: visualize a small sample of the graph if it's not too large
if len(combined_graph) < MAX_TRIPLES_TO_VISUALIZE:
    fig, ax = plt.subplots(figsize=(12, 10))

    # NOTE: nx.Graph is undirected and keeps one edge per node pair, so
    # several predicates between the same subject/object collapse into a
    # single edge (the last label wins). Good enough for a rough preview.
    G = nx.Graph()

    # Add a sample of edges
    for s, p, o in list(combined_graph)[:SAMPLE_TRIPLE_COUNT]:
        G.add_edge(str(s), str(o), label=str(p))

    # Fixed seed so the layout is identical on every run
    pos = nx.spring_layout(G, seed=42)
    nx.draw(G, pos, ax=ax, with_labels=False, node_size=10,
            node_color="skyblue", alpha=0.7, width=0.5)
    ax.set_title("Sample of Combined Knowledge Graph")
    plt.show()
else:
    print("\nGraph is too large to visualize directly.")

Found 412 turtle files in /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/polianna-processed/turtle


Loading files: 100%|██████████| 412/412 [00:00<00:00, 1513.30it/s]



Combined Graph Statistics:
Total number of triples: 5829
Number of unique subjects: 1169
Number of unique predicates: 20
Number of unique objects: 2059

Namespaces in combined graph:
brick: https://brickschema.org/schema/Brick#
csvw: http://www.w3.org/ns/csvw#
dc: http://purl.org/dc/elements/1.1/
dcat: http://www.w3.org/ns/dcat#
dcmitype: http://purl.org/dc/dcmitype/
dcterms: http://purl.org/dc/terms/
dcam: http://purl.org/dc/dcam/
doap: http://usefulinc.com/ns/doap#
foaf: http://xmlns.com/foaf/0.1/
geo: http://www.opengis.net/ont/geosparql#
odrl: http://www.w3.org/ns/odrl/2/
org: http://www.w3.org/ns/org#
prof: http://www.w3.org/ns/dx/prof/
prov: http://www.w3.org/ns/prov#
qb: http://purl.org/linked-data/cube#
schema: https://schema.org/
sh: http://www.w3.org/ns/shacl#
skos: http://www.w3.org/2004/02/skos/core#
sosa: http://www.w3.org/ns/sosa/
ssn: http://www.w3.org/ns/ssn/
time: http://www.w3.org/2006/time#
vann: http://purl.org/vocab/vann/
void: http://rdfs.org/ns/void#
wgs: https:

In [None]:
def _short_name(uri_text):
    """Return a short display name for a URI string.

    Uses the part after the last '#' when the string contains one, otherwise
    the last '/'-separated segment. A trailing separator is ignored, and the
    full string is returned if nothing would be left.
    """
    separator = '#' if '#' in uri_text else '/'
    tail = uri_text.rstrip(separator).rsplit(separator, 1)[-1]
    return tail or uri_text


def interactive_graph_visualization(article_id, turtle_dir=None, output_file=None, 
                                   physics=True, height="750px", width="100%"):
    """
    Create an interactive visualization of an RDF graph using PyVis.

    The article's Turtle file is loaded, its serialized content and some
    statistics are printed, and the graph is written to a standalone HTML file.

    Args:
        article_id: The ID of the article; the file "<article_id>.ttl" is read
        turtle_dir: Directory containing the TTL files (defaults to
            <project_root>/polianna-processed/turtle, using the notebook-level
            ``project_root`` variable)
        output_file: HTML file to save the visualization
            (defaults to "<article_id>_graph.html")
        physics: Enable physics simulation for the graph
        height: Height of the visualization
        width: Width of the visualization

    Returns:
        - ``None`` if the Turtle file does not exist,
        - an ``IPython.display.HTML`` object containing the saved visualization
          on success,
        - the loaded RDF graph if the saved HTML file could not be read back.
    """
    import os

    # Use default directory if none provided
    if turtle_dir is None:
        turtle_dir = os.path.join(project_root, "polianna-processed", "turtle")

    # Default output file
    if output_file is None:
        output_file = f"{article_id}_graph.html"

    # Construct file path and bail out early if there is nothing to load
    file_path = os.path.join(turtle_dir, f"{article_id}.ttl")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Third-party imports are deferred until after the cheap existence check,
    # so a wrong article id is reported without needing these packages.
    from IPython.display import HTML
    from pyvis.network import Network
    from rdflib import Graph, URIRef

    # Load the graph
    g = Graph()
    g.parse(file_path, format="turtle")

    # Print the TTL content. Serialize once; the result may be bytes or str,
    # so decode only when bytes come back.
    ttl_content = g.serialize(format="turtle")
    if isinstance(ttl_content, bytes):
        ttl_content = ttl_content.decode("utf-8")
    print(f"TTL Content for {article_id}:")
    print("-" * 80)
    print(ttl_content)
    print("-" * 80)

    # Create an interactive network with CDN resources set to 'in_line'
    net = Network(notebook=True, cdn_resources='in_line', height=height, width=width, directed=True)
    net.toggle_physics(physics)

    # Map every distinct RDF term to its own integer node id. Keying on the
    # term itself (rather than on its shortened name) keeps different terms
    # that share a local name from being merged into one node.
    node_ids = {}

    def ensure_node(term):
        """Add `term` to the network on first sight and return its node id."""
        if term not in node_ids:
            full_text = str(term)
            # Only URIs are shortened; literal values and blank-node ids are
            # shown unchanged so text containing '/' or '#' is not cut off.
            if isinstance(term, URIRef):
                label = _short_name(full_text)
            else:
                label = full_text
            node_ids[term] = len(node_ids)
            net.add_node(node_ids[term], title=full_text, label=label)
        return node_ids[term]

    # Add edges to the network
    for s, p, o in g:
        source = ensure_node(s)
        target = ensure_node(o)
        # Add edge with the (shortened) predicate as hover title
        net.add_edge(source, target, title=_short_name(str(p)))

    # Save the visualization as a standalone HTML file
    net.save_graph(output_file)

    # Print graph statistics
    print("Graph Statistics:")
    print(f"- Number of triples: {len(g)}")
    print(f"- Number of unique subjects: {len(set(g.subjects()))}")
    print(f"- Number of unique predicates: {len(set(g.predicates()))}")
    print(f"- Number of unique objects: {len(set(g.objects()))}")

    print(f"Interactive visualization saved to {output_file}")

    # Direct HTML display method: read the saved file back and hand it to
    # IPython. If that fails, fall back to returning the graph itself.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error displaying visualization: {e}")
        print(f"The visualization has been saved to {output_file}. Please open it in a web browser.")
        return g

    return HTML(html_content)