# LLM-Based Automatic Policy Text Classifier
This notebook demonstrates a multi-step approach to automatically annotate policy articles using an LLM (Anthropic Claude), informed by the coding scheme in the Appendix and Codebook.

## 0. Install Required Packages

In [None]:
# Install required packages if not already installed
!pip install python-dotenv requests numpy

## 1. Import Required Modules

In [1]:
import os
import sys
import json
from pathlib import Path

# Add parent directory to path to import our modules
sys.path.append('..')

# Import our custom modules
from src.agent.utils import (
    get_project_root, 
    load_coding_scheme, 
    filter_coding_scheme,
    load_raw_text,
    load_curated_annotations,
    create_extended_coding_scheme
)

from src.agent.annotation import (
    annotate_article,
    load_few_shot_examples,
    prepare_annotation_prompt
)



# Load environment variables
from dotenv import load_dotenv
load_dotenv()

ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
assert ANTHROPIC_API_KEY, 'Please set your ANTHROPIC_API_KEY in the .env file.'

## 2. Load and Explore the Coding Scheme

In [7]:
# Load the full coding scheme
coding_scheme = load_coding_scheme()
print(f"Full coding scheme has {len(coding_scheme.get('layers', []))} layers")

# Display all layer and tagset names
for layer in coding_scheme.get('layers', []):
    print(f"\nLayer: {layer.get('layer')}")
    for tagset in layer.get('tagsets', []):
        tag_count = len(tagset.get('tags', []))
        print(f"  - Tagset: {tagset.get('tagset')} ({tag_count} tags)")

Full coding scheme has 3 layers

Layer: Policydesigncharacteristics
  - Tagset: Objective (4 tags)
  - Tagset: Reference (3 tags)
  - Tagset: Actor (8 tags)
  - Tagset: Resource (3 tags)
  - Tagset: Time (5 tags)
  - Tagset: Compliance (2 tags)
  - Tagset: Reversibility (1 tags)

Layer: Technologyandapplicationspecificity
  - Tagset: EnergySpecificity (2 tags)
  - Tagset: ApplicationSpecificity (2 tags)
  - Tagset: TechnologySpecificity (2 tags)

Layer: Instrumenttypes
  - Tagset: InstrumentType (10 tags)


## 3. Filter Coding Scheme for Specific Layers/Tagsets

In [8]:
# Filter to focus on Policydesigncharacteristics/Actor
#target_layer = "Policydesigncharacteristics"
#target_tagset = "Actor"

#target_layer = "Instrumenttypes"
#target_tagset = "InstrumentType"

target_layer = None
target_tagset = None


filtered_scheme = filter_coding_scheme(
    coding_scheme, 
    layers=[target_layer],
    tagsets=[target_tagset]
)

# Display the filtered tags
for layer in filtered_scheme.get('layers', []):
    for tagset in layer.get('tagsets', []):
        print(f"Tags in {layer.get('layer')}/{tagset.get('tagset')}:")
        for tag in tagset.get('tags', []):
            print(f"  - {tag.get('tag_name')}: {tag.get('tag_description')[:100]}...")

ValueError: No layers/tagsets matched your filter. layers=[None], tagsets=[None]

## 4. Create Extended Coding Scheme with Examples

The extended coding scheme enhances the original by adding real-world examples for each tag extracted from annotated data. This helps the LLM better understand what to look for.

In [3]:
# Create the extended coding scheme
extended_scheme_path = create_extended_coding_scheme(
    output_name="Coding_Scheme_Extended",
    min_occurrences=1,  # Each example must appear at least twice
    max_examples=30      # Maximum of 5 examples per tag
)

Extended coding scheme created at /Users/johannesmuller/Documents/github/POLIANNA-AI-CC-Project/data/01_policy_info/Coding_Scheme_Extended.json
Added examples to 51 out of 42 tags
Minimum occurrences: 1, Maximum examples per tag: 30


In [4]:
# Load the extended scheme to examine it
extended_scheme = load_coding_scheme(scheme_name="Coding_Scheme_Extended")

# Count tags with examples
tags_with_examples = 0
total_tags = 0
total_examples = 0

for layer in extended_scheme.get('layers', []):
    for tagset in layer.get('tagsets', []):
        for tag in tagset.get('tags', []):
            total_tags += 1
            examples = tag.get('tag_examples', [])
            if examples:
                tags_with_examples += 1
                total_examples += len(examples)

print(f"Extended scheme has examples for {tags_with_examples} out of {total_tags} tags")
print(f"Total examples: {total_examples}")
print(f"Average examples per tag with examples: {total_examples / tags_with_examples:.2f}" if tags_with_examples > 0 else "No tags have examples yet")

# View examples for our target layer/tagset
print(f"\nExamples for {target_layer}/{target_tagset}:")
for layer in extended_scheme.get('layers', []):
    if layer.get('layer') == target_layer:
        for tagset in layer.get('tagsets', []):
            if tagset.get('tagset') == target_tagset:
                for tag in tagset.get('tags', []):
                    examples = tag.get('tag_examples', [])
                    print(f"\n  - {tag.get('tag_name')}: {len(examples)} examples")
                    for i, example in enumerate(examples):
                        print(f"    {i+1}. \"{example}\"")

Extended scheme has examples for 42 out of 42 tags
Total examples: 1143
Average examples per tag with examples: 27.21


NameError: name 'target_layer' is not defined

## 5. Load a Sample Article

In [5]:
# Article ID to work with
article_id = "EU_32018R1999_Title_0_Chapter_2_Section_0_Article_10"

# Load the raw text
raw_text = load_raw_text(article_id)
print(f"Article text ({len(raw_text)} characters):\n")
print(raw_text)

Article text (1088 characters):

article 10
public consultation
without prejudice to any other union law requirements, each member state shall ensure that the public is given early and effective opportunities to participate in the preparation of the draft integrated national energy and climate plan — as regards the plans for the 2021 to 2030 period, in the preparation of the final plan well before its adoption — as well as of the long-term strategies referred to in article 15. each member state shall attach to the submission of such documents to the commission a summary of the public's views or provisional views. in so far as directive 2001/42/ec is applicable, consultations undertaken on the draft in accordance with that directive shall be deemed to satisfy the obligations to consult the public under this regulation.
each member state shall ensure that the public is informed. each member state shall set reasonable timeframes allowing sufficient time for the public to be informed, to p

## 6. Load Curated Annotations for the Article

In [6]:
# Load the curated annotations
curated_annotations = load_curated_annotations(article_id)

# Filter to our target layer/tagset
filtered_annotations = []
for ann in curated_annotations:
    if ann.get('layer') == target_layer and ann.get('feature') == target_tagset:
        # Create a clean version without metadata fields
        clean_ann = {k: v for k, v in ann.items() if k not in ['span_id', 'tokens']}
        filtered_annotations.append(clean_ann)

print(f"Found {len(filtered_annotations)} annotations in {target_layer}/{target_tagset}:")
for ann in filtered_annotations:
    print(f"- {ann['tag']}: '{ann['text']}'")

NameError: name 'target_layer' is not defined

## 7. Generate Few-Shot Examples

In [None]:
# Load few-shot examples
few_shot_examples = load_few_shot_examples(
    num_examples=2,
    layers=[target_layer],
    tagsets=[target_tagset],
    exclude_article_ids=[article_id]
)

print(f"Loaded {len(few_shot_examples)} few-shot examples")

# Display the first example
if few_shot_examples:
    example = few_shot_examples[0]
    print(f"\nExample Text:\n{example['text'][:200]}...")
    print(f"\nExample Annotations ({len(example['annotations'])}):\n")
    for ann in example['annotations']:
        print(f"- {ann['tag']}: '{ann['text']}'")

## 8. Create LLM Prompt with Extended Scheme

In [None]:
# Create the annotation prompt with extended coding scheme
prompt = prepare_annotation_prompt(
    raw_text=raw_text,
    coding_scheme=extended_scheme,  # Use the extended scheme with examples
    layers=[target_layer],
    tagsets=[target_tagset],
    few_shot_examples=few_shot_examples,
    use_extended_scheme=True  # Enable special formatting for examples
)

# Display a shortened version of the prompt
print(f"Prompt length: {len(prompt)} characters")
print("Prompt preview (first 1000 characters):")
print(prompt[:1000] + "...\n[truncated]...")

print(prompt)

## 9. Annotate the Article with Standard Scheme

In [None]:
# Annotate the article with the standard coding scheme
standard_annotations = annotate_article(
    article_id=article_id,
    layers=[target_layer],
    tagsets=[target_tagset],
    num_examples=2,
    save_result=True,
    use_extended_scheme=False,  # Use standard scheme
    scheme_name="Coding_Scheme"  # Explicitly use the standard scheme
)

print(f"Generated {len(standard_annotations)} annotations with standard scheme")
for ann in standard_annotations:
    print(f"- {ann['layer']}/{ann['feature']}/{ann['tag']}: '{ann['text']}'")

## 10. Annotate the Article with Extended Scheme

In [None]:
# Annotate the article with the extended coding scheme
extended_annotations = annotate_article(
    article_id=article_id,
    layers=[target_layer],
    tagsets=[target_tagset],
    num_examples=2,
    save_result=True,
    use_extended_scheme=True,  # Use extended scheme with examples
    scheme_name="Coding_Scheme_Extended"  # Explicitly use the extended scheme
)

print(f"Generated {len(extended_annotations)} annotations with extended scheme")
for ann in extended_annotations:
    print(f"- {ann['layer']}/{ann['feature']}/{ann['tag']}: '{ann['text']}'")

## 11. Compare Against Curated Annotations

In [None]:
# Display curated, standard, and extended annotations side by side
print("Curated vs. Generated Annotations:\n")

print("Curated Annotations:")
for ann in filtered_annotations:
    print(f"- {ann['tag']}: '{ann['text']}' ({ann['start']}:{ann['stop']})")

print("\nStandard Scheme Annotations:")
for ann in standard_annotations:
    print(f"- {ann['tag']}: '{ann['text']}' ({ann['start']}:{ann['stop']})")
    
print("\nExtended Scheme Annotations:")
for ann in extended_annotations:
    print(f"- {ann['tag']}: '{ann['text']}' ({ann['start']}:{ann['stop']})")

## 12. Safe results 

In [16]:
# Save both annotation sets to different files for comparison
root_dir = get_project_root()
article_dir = os.path.join(root_dir, 'data', '03b_processed_to_json', article_id)

# Save standard annotations to a separate file
standard_path = os.path.join(article_dir, 'Generated_Annotations_Standard.json')
with open(standard_path, 'w') as f:
    json.dump(standard_annotations, f, indent=2)

# Save extended annotations to a separate file
extended_path = os.path.join(article_dir, 'Generated_Annotations_Extended.json')
with open(extended_path, 'w') as f:
    json.dump(extended_annotations, f, indent=2)