# LLM-Based Automatic Policy Text Classifier
This notebook demonstrates a multi-step approach to automatically annotate policy articles using an LLM (Anthropic Claude), informed by the coding scheme in the Appendix and Codebook.

## 1. Load Required Libraries and Environment Variables

In [2]:
import os
import json
from typing import List, Dict
from dotenv import load_dotenv
import requests
import glob
import random

# Read settings from the .env file into the process environment, then
# resolve the API key, model name and sampling temperature from it.
load_dotenv()
ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')
LLM_MODEL = os.environ.get('LLM_MODEL', 'claude-3-opus-20240229')
LLM_TEMPERATURE = float(os.environ.get('LLM_TEMPERATURE', '0.2'))
# Stop here if no key is configured.
assert ANTHROPIC_API_KEY, 'Please set your ANTHROPIC_API_KEY in the .env file.'



ModuleNotFoundError: No module named 'dotenv'

## 2. Test LLM Connection with a Short Query


In [7]:
# Smoke-test the Anthropic Messages API with a tiny prompt. Failures are
# printed rather than raised.
print('Testing LLM connection...')
test_prompt = "Hello! Please return a JSON array with a single object: {\"layer\": \"Test\", \"feature\": \"TestFeature\", \"tag\": \"TestTag\", \"text\": \"test span\"}"

# Seconds to wait for the server to connect / send data. Without a timeout,
# requests can block indefinitely on a stalled connection and hang the cell.
REQUEST_TIMEOUT_SECONDS = 30

# Request headers: API key, API version and JSON content type.
headers = {
    'x-api-key': ANTHROPIC_API_KEY,
    'anthropic-version': '2023-06-01',
    'content-type': 'application/json'
}
# Request body: a single user message with a small max_tokens budget.
data = {
    'model': LLM_MODEL,
    'max_tokens': 128,
    'temperature': LLM_TEMPERATURE,
    'messages': [
        {
            'role': 'user',
            'content': test_prompt
        }
    ]
}
try:
    response = requests.post(
        'https://api.anthropic.com/v1/messages',
        headers=headers,
        json=data,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Turn HTTP 4xx/5xx responses into exceptions so they are reported below.
    response.raise_for_status()
    # The reply text is read from the first entry of the 'content' list.
    content = response.json()['content'][0]['text']
    print('LLM test response:', content)
except Exception as e:
    # Deliberately broad: this cell is only a connectivity check, so any
    # failure (network, HTTP status, timeout, unexpected response shape)
    # is reported instead of aborting execution.
    print('LLM connection test failed:', e)

Testing LLM connection...
LLM test response: Here is the JSON array with a single object as requested:

[
  {
    "layer": "Test",
    "feature": "TestFeature",
    "tag": "TestTag",
    "text": "test span"
  }
]
LLM test response: Here is the JSON array with a single object as requested:

[
  {
    "layer": "Test",
    "feature": "TestFeature",
    "tag": "TestTag",
    "text": "test span"
  }
]


## 3. Load Coding Scheme (from Codebook)

In [5]:
# For demonstration, load coding scheme from JSON (should be extracted from PDF in production)
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open('data/01_policy_info/Coding_Scheme.json', 'r', encoding='utf-8') as f:
    coding_scheme = json.load(f)
print('Loaded coding scheme keys:', list(coding_scheme.keys()))

Loaded coding scheme keys: ['layers']


## 4. Define Anthropic Claude LLM Annotation Function

In [None]:
def filter_coding_scheme(coding_scheme, layers=None, tagsets=None):
    """Restrict a coding scheme to the selected layers and/or tagsets.

    Args:
        coding_scheme: Dict with a "layers" list. Each layer is a dict with a
            "layer" name and an optional "tagsets" list; each tagset is a dict
            with a "tagset" name.
        layers: Layer name or collection of layer names to keep. A single
            non-empty string is treated as one name (exact match). None or
            empty means "do not filter by layer".
        tagsets: Tagset name or collection of tagset names to keep, with the
            same conventions as ``layers``.

    Returns:
        ``coding_scheme`` itself (same object) when neither filter is given.
        Otherwise a new ``{"layers": [...]}`` dict whose layers are shallow
        copies carrying a filtered "tagsets" list; the tagset dicts are shared
        with the input. When a tagset filter is active, layers left with no
        matching tagsets are dropped.

    Raises:
        ValueError: If a filter is given and ``coding_scheme`` has no "layers"
            key, or if nothing matched the filter.
        KeyError: If a layer lacks a "layer" field, or a tagset inside a
            selected layer lacks a "tagset" field.
    """
    if not layers and not tagsets:
        return coding_scheme
    if "layers" not in coding_scheme:
        raise ValueError("Coding scheme JSON does not contain a 'layers' key")

    # A bare string would turn the `in` checks below into substring tests
    # (e.g. "Pol" in "Policy"), so wrap it into a one-element list. The
    # original arguments are kept untouched for the error message.
    layer_names = [layers] if isinstance(layers, str) and layers else layers
    tagset_names = [tagsets] if isinstance(tagsets, str) and tagsets else tagsets

    filtered = {"layers": []}
    for layer in coding_scheme.get("layers", []):
        if "layer" not in layer:
            raise KeyError(f"Layer object missing 'layer' field: {layer}")
        if layer_names and layer["layer"] not in layer_names:
            continue
        # Copy every layer field except "tagsets", which is rebuilt below.
        new_layer = {k: v for k, v in layer.items() if k != "tagsets"}
        new_layer["tagsets"] = []
        for tagset in layer.get("tagsets", []):
            if "tagset" not in tagset:
                raise KeyError(f"Tagset object missing 'tagset' field: {tagset}")
            if tagset_names and tagset["tagset"] not in tagset_names:
                continue
            new_layer["tagsets"].append(tagset)
        # With a tagset filter, keep only layers that still have a tagset;
        # without one, keep the layer even if it has no tagsets at all.
        if new_layer["tagsets"] or not tagset_names:
            filtered["layers"].append(new_layer)
    if not filtered["layers"]:
        raise ValueError(f"No layers/tagsets matched your filter. layers={layers}, tagsets={tagsets}")
    return filtered

def load_few_shot_examples(base_dir, layers=None, tagsets=None, max_lines=6, n=3):
    folders = glob.glob(os.path.join(base_dir, "*/"))
    random.shuffle(folders)
    examples = []
    skipped_count = 0
    
    # Build tag_name lookup for validation
    valid_tags = {}
    for l in coding_scheme["layers"]:
        if not layers or l["layer"] in layers:
            layer_tags = {}
            for ts in l.get("tagsets", []):
                if not tagsets or ts["tagset"] in tagsets:
                    layer_tags[ts["tagset"]] = [t["tag_name"] for t in ts.get("tags", [])]
            if layer_tags:
                valid_tags[l["layer"]] = layer_tags
    
    for folder in folders:
        try:
            with open(os.path.join(folder, "Raw_Text.txt")) as f:
                raw = f.read().strip()
