In [None]:
# Import necessary libraries
import os
import json
import time
from pathlib import Path
import pandas as pd
from rdflib import Graph, Namespace
from dotenv import load_dotenv

# Add project root to path if needed
import sys
# The notebook is expected to run one level below the project root;
# expose that root on sys.path exactly once so the `src.*` imports resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import inference pipeline
from src.inference.pipeline import KnowledgeGraphPipeline
from src.inference.base_inference import OpenAIClient, OllamaClient

# Resolve the input data, ontology file and results folder relative to the project root
test_data_dir, ontology_path, output_dir = (
    os.path.join(project_root, *parts)
    for parts in (
        ("test_data",),
        ("ontology", "ontology_v17_no_fulltext.ttl"),
        ("experiments", "results", "test_results"),
    )
)
# Create the results folder up front; a pre-existing folder is not an error
os.makedirs(output_dir, exist_ok=True)

# Read secrets from secrets_config.env, located in the parent of the working directory
dotenv_path = os.path.join(os.path.dirname(os.getcwd()), "secrets_config.env")
load_dotenv(dotenv_path)

# Report whether the OpenAI key is available (the key itself is never printed)
api_key = os.getenv("OPENAI_API_KEY")
print("✓ API key loaded successfully" if api_key else "✗ Failed to load API key")

✓ API key loaded successfully


In [2]:
# Load test articles list.
# The CSV is the preferred source; when it is missing, every sub-directory
# of test_data_dir is treated as one test article instead.
try:
    test_articles_df = pd.read_csv(os.path.join(test_data_dir, "test_articles.csv"))
    print(f"Loaded {len(test_articles_df)} test articles")
except FileNotFoundError:
    print("Test articles CSV not found, listing directories instead")
    # os.listdir returns entries in arbitrary order. Sort them so that the
    # "first article" selected below is the same on every machine and run.
    article_dirs = sorted(
        d for d in os.listdir(test_data_dir)
        if os.path.isdir(os.path.join(test_data_dir, d))
    )
    print(f"Found {len(article_dirs)} test article directories")
    test_articles_df = pd.DataFrame({"article_id": article_dirs})

# Select the first article for this demonstration
article_id = test_articles_df["article_id"].iloc[0]
print(f"Selected test article: {article_id}")

# Load article data.
# Both files are opened with an explicit UTF-8 encoding: without it, open()
# uses the platform's locale encoding, which can mis-decode or reject
# non-ASCII characters in the article text on some systems.
# NOTE(review): assumes the test data files are stored as UTF-8 — confirm.
article_dir = os.path.join(test_data_dir, article_id)
with open(os.path.join(article_dir, "policy_info.json"), "r", encoding="utf-8") as f:
    policy_info = json.load(f)
with open(os.path.join(article_dir, "Raw_Text.txt"), "r", encoding="utf-8") as f:
    article_text = f.read()

print(f"Loaded article with {len(article_text)} characters")
print(f"Policy CELEX: {policy_info.get('CELEX_Number', 'Unknown')}")

Loaded 42 test articles
Selected test article: EU_32014R0421_Title_0_Chapter_0_Section_0_Article_01
Loaded article with 5715 characters
Policy CELEX: 32014R0421


In [3]:
# Configure LLM client
# Keep exactly one of the two assignments below active:
# the OpenAI client is the default, the Ollama client is the alternative.
llm_client = OpenAIClient(model="gpt-4o")
# llm_client = OllamaClient(model="llama3")

# Create pipeline: wire the chosen client to the ontology and the results folder
pipeline_kwargs = {
    "llm_client": llm_client,
    "output_dir": output_dir,
    "ontology_path": ontology_path,
}
pipeline = KnowledgeGraphPipeline(**pipeline_kwargs)

# Run inference with different strategies.
# `results` maps each strategy name to the dict returned by the pipeline, or
# to a fallback dict with the same keys when the pipeline call failed.
strategies = ["zero-shot", "one-shot", "few-shot"]
results = {}

for strategy in strategies:
    print(f"\n\n======= Running {strategy} inference =======")
    start_time = time.time()
    
    try:
        result = pipeline.process_article(
            article_text=article_text,
            policy_info=policy_info,
            prompt_strategy=strategy,
            output_format="ttl",
            max_tokens=10000,
            save_results=True
        )
        
        elapsed_time = time.time() - start_time
        results[strategy] = result
        
        print(f"Strategy: {strategy}")
        print(f"Success: {result.get('success', False)}")
        # `or 0` also covers a key that is present but None, which would
        # otherwise break the float formatting below.
        generation_time = result.get('generation_time') or 0
        print(f"Generation time: {generation_time:.2f} seconds")
        print(f"Processing time: {elapsed_time:.2f} seconds")
        
        # Safely handle None graph
        graph = result.get('graph')
        triple_count = len(graph) if graph is not None else 0
        print(f"Triples count: {triple_count}")
        
        print(f"Is valid: {result.get('is_valid', False)}")
        
        if result.get('error'):
            print(f"Error: {result.get('error')}")
        
        # Safely handle output content
        output = result.get('extracted_content', '')
        preview = "\n".join(output.split("\n")[:10]) + "\n..." if output else "No output available"
        print(f"\nPreview of output:\n{preview}")
        
    except Exception as e:
        print(f"Error processing {strategy} strategy: {str(e)}")
        # Record a fallback only when no usable result was stored. If the
        # exception came from the reporting code above, results[strategy]
        # already holds the real pipeline result and must not be discarded.
        if not isinstance(results.get(strategy), dict):
            results[strategy] = {
                'success': False,
                'error': f"Processing error: {str(e)}",
                'generation_time': time.time() - start_time,
                'is_valid': False,
                'graph': [],
                'extracted_content': ''
            }

# Compare metrics with safe handling of None values
def _summary_row(outcome):
    """Collect the headline metrics of one strategy's result for the comparison table."""
    triples = outcome.get("graph")
    return {
        "Success": outcome.get("success", False),
        "Generation time (s)": outcome.get("generation_time", 0),
        # A missing or None graph counts as zero triples
        "Triple count": 0 if triples is None else len(triples),
        "Is valid": outcome.get("is_valid", False),
    }

# One column per strategy; transposed below so each strategy prints as a row
comparison_df = pd.DataFrame({name: _summary_row(results[name]) for name in strategies})

print("\n\n======= Strategy Comparison =======")
print(comparison_df.T)



Strategy: zero-shot
Success: True
Generation time: 14.98 seconds
Processing time: 14.99 seconds
Triples count: 17
Is valid: True

Preview of output:
@prefix : <https://polianna-kg.org/Ontology#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix pol: <https://polianna-kg.org/Ontology#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://data.europa.eu/eli/reg/2014/421/oj> a pol:PolicyDocument ;
    pol:hasArticle <https://polianna-kg.org/Ontology#EU_32014R0421_Title_0_Chapter_0_Section_0_Article_01> .
...


Strategy: one-shot
Success: True
Generation time: 7.15 seconds
Processing time: 7.16 seconds
Triples count: 30
Is valid: True

Preview of output:
@prefix pol: <https://polianna-kg.org/Ontology#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

pol:32014R0421 a pol:PolicyDocument ;
    pol:ha

In [4]:
# Optional: Validate triples with rdflib
def validate_turtle(ttl_content):
    """Check whether a piece of text parses as Turtle with rdflib.

    Args:
        ttl_content: The Turtle document to check. May be None or empty.

    Returns:
        A ``(is_valid, detail)`` tuple. When parsing succeeds, ``detail`` is
        the number of triples in the parsed graph; otherwise it is a string
        describing why validation failed.
    """
    # Missing or blank content must not count as valid: an empty document
    # would otherwise parse into a graph of zero triples and a strategy that
    # produced no output at all would be reported as valid.
    if not ttl_content or (isinstance(ttl_content, str) and not ttl_content.strip()):
        return False, "No Turtle content to validate"
    try:
        g = Graph()
        g.parse(data=ttl_content, format="turtle")
        return True, len(g)
    except Exception as e:
        # Any parser failure is reported as a message rather than raised
        return False, str(e)

print("\n\n======= RDFLib Validation =======")
# Re-parse each strategy's extracted Turtle and print one verdict line per strategy
for strategy in strategies:
    ttl_content = results[strategy].get("extracted_content", "")
    is_valid, result = validate_turtle(ttl_content)
    verdict = "✓ Valid" if is_valid else "✗ Invalid"
    print(f"{strategy}: {verdict}, {result}")



zero-shot: ✓ Valid, 17
one-shot: ✓ Valid, 30
few-shot: ✓ Valid, 44
