# Ground Truth Generation Demo

This notebook demonstrates how to generate ground truth RDF data from policy documents using the POLIANNA ontology.

In [1]:
import os
import json
import sys

# Add the project root to Python's path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)  # Make src importable

from src.ground_truth_generation.generate_ground_truth import (
    generate_turtle_for_article,
    load_ontology_and_data
)

## Step 1: Set up paths and directories

In [2]:
# Set paths for input and output
data_dir = os.path.join(project_root, "polianna-dataset", "data", "03b_processed_to_json")
output_dir = os.path.join(project_root, "polianna-processed", "turtle")
ontology_path = os.path.join(project_root, "ontology", "polianna-ontology.ttl")

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print(f"Data directory: {data_dir}")
print(f"Output directory: {output_dir}")
print(f"Ontology path: {ontology_path}")

Data directory: /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/polianna-dataset/data/03b_processed_to_json
Output directory: /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/polianna-processed/turtle
Ontology path: /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/ontology/polianna-ontology.ttl


## Step 2: List available article folders and select one for demonstration

In [3]:
# List all article folders in the data directory
article_folders = [entry.path for entry in os.scandir(data_dir) if entry.is_dir()]
print(f"Found {len(article_folders)} article folders.")

# Display the first few article folders
for i, folder in enumerate(article_folders[:5]):
    print(f"{i+1}. {os.path.basename(folder)}")

# Select the first folder for demonstration
demo_folder = article_folders[0]
print(f"\nSelected folder for demonstration: {os.path.basename(demo_folder)}")

Found 412 article folders.
1. EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55
2. EU_32012L0027_Title_0_Chapter_3_Section_0_Article_14
3. EU_32008R1099_Title_0_Chapter_0_Section_0_Article_11
4. EU_32006L0032_Title_0_Chapter_3_Section_0_Article_07
5. EU_32014L0094_Title_0_Chapter_0_Section_0_Article_04

Selected folder for demonstration: EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55


## Step 3: Explore the content of the selected folder

In [4]:
# Check if the required files exist
curated_path = os.path.join(demo_folder, "Curated_Annotations.json")
info_path = os.path.join(demo_folder, "Policy_Info.json")
text_path = os.path.join(demo_folder, "Raw_Text.txt")

files_exist = all(os.path.isfile(p) for p in [curated_path, info_path, text_path])
print(f"Required files exist: {files_exist}")

# Display policy information
with open(info_path, encoding="utf-8") as f:
    policy_info = json.load(f)

print("\nPolicy Information:")
for key, value in policy_info.items():
    print(f"{key}: {value}")

# Display first few annotations
with open(curated_path, encoding="utf-8") as f:
    annotations = json.load(f)

print(f"\nTotal annotations: {len(annotations)}")
print("\nFirst 3 annotations:")
for i, ann in enumerate(annotations[:3]):
    print(f"\nAnnotation {i+1}:")
    print(f"  Layer: {ann.get('layer')}")
    print(f"  Feature: {ann.get('feature')}")
    print(f"  Tag: {ann.get('tag')}")
    print(f"  Text: {ann.get('text')[:50]}...")

Required files exist: True

Policy Information:
Titel: EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55
CELEX_Number: 32019L0944
ELI: http://data.europa.eu/eli/dir/2019/944/oj
Annotators: ['A', 'B']

Total annotations: 14

First 3 annotations:

Annotation 1:
  Layer: Instrumenttypes
  Feature: InstrumentType
  Tag: RegulatoryInstr
  Text: right of access to accounts...

Annotation 2:
  Layer: Instrumenttypes
  Feature: InstrumentType
  Tag: RegulatoryInstr
  Text: right of access...

Annotation 3:
  Layer: Instrumenttypes
  Feature: InstrumentType
  Tag: RegulatoryInstr
  Text: right of access...


## Step 4: Generate the Turtle file for the selected article

In [5]:
# Generate the Turtle file
generate_turtle_for_article(demo_folder, output_dir)

# Path to the generated Turtle file
folder_name = os.path.basename(os.path.normpath(demo_folder))
ttl_path = os.path.join(output_dir, f"{folder_name}.ttl")

# Check if the file was created successfully
if os.path.isfile(ttl_path):
    print(f"Successfully created Turtle file at: {ttl_path}")
    print(f"File size: {os.path.getsize(ttl_path) / 1024:.2f} KB")
else:
    print(f"Failed to create Turtle file!")

Created Turtle file: /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/polianna-processed/turtle/EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55.ttl
Successfully created Turtle file at: /Users/oskarkrafft/Desktop/Projects/LLM-policy-knowledge-graphs/polianna-processed/turtle/EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55.ttl
File size: 9.40 KB


## Step 5: Explore the generated Turtle file

In [6]:
# Display the first few lines of the Turtle file
with open(ttl_path, "r", encoding="utf-8") as f:
    ttl_content = f.readlines()

print(f"First 20 lines of the Turtle file:")
for line in ttl_content[:20]:
    print(line.strip())

First 20 lines of the Turtle file:
@prefix polianna: <https://polianna-kg.org/> .
@prefix eli: <http://data.europa.eu/eli/ontology#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://polianna-kg.org/data/document/32019L0944> a polianna:PolicyDocument ;
rdfs:label "32019L0944" .

<https://polianna-kg.org/data/article/EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55> a polianna:PolicyArticle ;
rdfs:label "EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55" ;
polianna:fullText "article 55\nright of access to accounts\n1.   member states or any competent authority that they designate, including the regulatory authorities referred to in article 57, shall, insofar as necessary to carry out their functions, have right of access to the accounts of electricity undertakings as set out in article 56.\n2.   member states and any designated competent authority, including the re

## Step 6: Load the generated Turtle file into an RDF Graph

In [7]:
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS

# Load just the generated Turtle file
g = Graph()
g.parse(ttl_path, format="turtle")

print(f"Loaded {len(g)} triples from the Turtle file.")

Loaded 104 triples from the Turtle file.


## Step 7: Explore the RDF Graph

In [8]:
# Define namespaces
POLIANNA = Namespace("https://polianna-kg.org/")

# Count occurrences of different features
print("Feature counts:")
feature_counts = {}
for s, p, o in g.triples((None, URIRef(POLIANNA + "hasFeature"), None)):
    feature = o.split("#")[-1] if "#" in o else o.split("/")[-1]
    feature_counts[feature] = feature_counts.get(feature, 0) + 1

for feature, count in sorted(feature_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {count}")

# Count occurrences of different tags
print("\nTag counts:")
tag_counts = {}
for s, p, o in g.triples((None, URIRef(POLIANNA + "hasTag"), None)):
    tag = o.split("#")[-1] if "#" in o else o.split("/")[-1]
    tag_counts[tag] = tag_counts.get(tag, 0) + 1

for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{tag}: {count}")

Feature counts:
Actor: 9
InstrumentType: 3
Compliance: 1
Objective: 1

Tag counts:
RegulatoryInstr: 3
Authority_monitoring: 3
Addressee_default: 3
Authority_default: 2
Form_monitoring: 1
Addressee_monitored: 1
Objective_QualIntention_noCCM: 1


## Step 8: Run SPARQL Queries

In [9]:
# Query to get policy article information
article_query = """
PREFIX polianna: <https://polianna-kg.org/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?article ?label WHERE {
    ?article a polianna:PolicyArticle ;
             rdfs:label ?label .
}
"""

print("Policy Article Information:")
for row in g.query(article_query):
    print(f"Article URI: {row.article}")
    print(f"Label: {row.label}")

# Query to get snippet text and its feature
snippet_query = """
PREFIX polianna: <https://polianna-kg.org/>

SELECT ?text ?feature ?tag WHERE {
    ?ann a polianna:SpanAnnotation ;
         polianna:annotatedText ?text ;
         polianna:hasFeature ?feature ;
         polianna:hasTag ?tag .
}
LIMIT 5
"""

print("\nSnippets with Features (first 5):")
for row in g.query(snippet_query):
    feature = row.feature.split("/")[-1]
    tag = row.tag.split("/")[-1]
    text = row.text[:50] + "..." if len(row.text) > 50 else row.text
    print(f"\nText: {text}")
    print(f"Feature: {feature}")
    print(f"Tag: {tag}")

Policy Article Information:
Article URI: https://polianna-kg.org/data/article/EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55
Label: EU_32019L0944_Title_0_Chapter_6_Section_5_Article_55

Snippets with Features (first 5):

Text: right of access to accounts
Feature: InstrumentType
Tag: RegulatoryInstr

Text: right of access
Feature: InstrumentType
Tag: RegulatoryInstr

Text: right of access
Feature: InstrumentType
Tag: RegulatoryInstr

Text: member states
Feature: Actor
Tag: Authority_monitoring

Text: competent authority
Feature: Actor
Tag: Authority_monitoring


## Conclusion

This notebook demonstrated how to:
1. Select a policy document from the dataset
2. Generate an RDF Turtle file using the POLIANNA ontology
3. Load and explore the generated data using RDFLib
4. Query the data using SPARQL

The generated Turtle files can now be loaded into a triple store or used with other RDF tools for more advanced analysis and querying.