# CPC definition expander

### convert cpc xml file to json

In [13]:
import xml.etree.ElementTree as ET
import json

def get_title_text(item):
    """Return the title of a classification item as a single string.

    For every ``title-part`` found under the item's ``class-title``, the
    content of its direct ``text`` children is collected first, followed by
    its ``CPC-specific-text/text`` children. All pieces are joined with a
    single space.

    Args:
        item: An ``xml.etree.ElementTree.Element`` for one classification item.

    Returns:
        The space-joined title with surrounding whitespace stripped, or an
        empty string when the item has no ``class-title`` or no title text.
    """
    class_title = item.find("class-title")
    if class_title is None:
        return ""

    pieces = []
    for title_part in class_title.findall(".//title-part"):
        nodes = title_part.findall("text") + title_part.findall("CPC-specific-text/text")
        for node in nodes:
            # itertext() also yields text inside and after inline child
            # elements; `node.text` alone stops at the first child element
            # and would silently truncate titles containing nested markup.
            content = "".join(node.itertext())
            if content:
                pieces.append(content)
    return " ".join(pieces).strip()

def parse_classification_item(item):
    """Recursively convert a ``classification-item`` element into a dict.

    Args:
        item: An ``xml.etree.ElementTree.Element`` that has a
            ``classification-symbol`` child.

    Returns:
        A dict with the keys ``"symbol"`` (text of the symbol element),
        ``"title"`` (from ``get_title_text``) and ``"children"`` (a list of
        dicts of the same shape, one per direct ``classification-item``
        child, in document order).
    """
    symbol_node = item.find("classification-symbol")
    return {
        "symbol": symbol_node.text,
        "title": get_title_text(item),
        # Only direct children are visited here; deeper levels are handled
        # by the recursive call.
        "children": [
            parse_classification_item(sub_item)
            for sub_item in item.findall("./classification-item")
        ],
    }

def xml_to_json(xml_file, symbol="G10L"):
    """Parse a CPC scheme XML file and return one subtree as nested dicts.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.
        symbol: Classification symbol whose ``classification-item`` is used
            as the root of the result. Defaults to ``"G10L"``. The value is
            placed inside a single-quoted XPath predicate, so it must not
            contain a single quote.

    Returns:
        The dict produced by ``parse_classification_item`` for the first
        matching item in document order.

    Raises:
        ValueError: If no ``classification-item`` with the given symbol
            exists below the document root.
    """
    root = ET.parse(xml_file).getroot()

    target = root.find(f".//classification-item[classification-symbol='{symbol}']")
    if target is None:
        # Fail here with a clear message instead of letting the parser hit
        # an AttributeError on None further down.
        raise ValueError(
            f"No classification-item with symbol {symbol!r} found in {xml_file!r}"
        )

    return parse_classification_item(target)

def save_json(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON with 2-space indent.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes.

    Args:
        data: Any JSON-serializable object.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)

def main():
    """Convert the G10L CPC scheme XML file into a JSON hierarchy file.

    Reads ``cpc-scheme-G10L.xml``, writes ``g10l_hierarchy.json`` and prints
    a confirmation message.
    """
    source_xml, target_json = 'cpc-scheme-G10L.xml', 'g10l_hierarchy.json'

    save_json(xml_to_json(source_xml), target_json)
    print(f"G10L hierarchy has been saved to {target_json}")

# Entry point: run the XML-to-JSON conversion when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    main()

G10L hierarchy has been saved to g10l_hierarchy.json


### generate expanded definitions in hierarchical json file

In [20]:
import json
import anthropic
from tqdm import tqdm
import time

# Create the Anthropic API client used by get_expanded_definition below.
# NOTE(review): `apikey` is not defined in this cell; presumably it is set
# elsewhere before this runs (confirm), and it should not be hard-coded here.
client = anthropic.Anthropic(api_key=apikey)

def get_expanded_definition(symbol, title, parent_title="", *,
                            model="claude-3-sonnet-20240229", max_tokens=300):
    """Ask the model for a concise definition of one CPC category.

    Args:
        symbol: CPC classification symbol, e.g. ``"G10L15/00"``.
        title: Title text of the category.
        parent_title: Title of the parent category; when non-empty it is
            mentioned in the prompt as context.
        model: Model identifier passed to the Messages API.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The stripped text of the first content block of the reply, or an
        empty string if the request or the response handling fails (the
        error is printed, not raised).
    """
    # The Messages API carries the turn structure in `messages`, so the
    # legacy "Human:" / "Assistant:" markers of the Text Completions format
    # must not be embedded in the user content.
    sentences = [
        f"Provide a concise definition for the CPC category {symbol}: '{title}'."
    ]
    if parent_title:
        sentences.append(f"This is a subgroup of '{parent_title}'.")
    sentences.append(
        "Focus only on what this specific category covers, "
        "without repeating information from higher levels."
    )
    # Replaces the old pseudo-prefill: ask for the bare definition so the
    # stored text carries no introductory sentence.
    sentences.append(
        "Respond with the definition only, without any introductory sentence."
    )
    prompt = " ".join(sentences)

    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text.strip()
    except Exception as e:
        # Deliberately best-effort: one failed category must not abort the
        # whole hierarchy run, so report it and fall back to an empty string.
        print(f"Error generating definition for {symbol}: {e}")
        return ""

def process_hierarchy(data, parent_title=""):
    """Add an ``expanded_definition`` to every node of a hierarchy, in place.

    Args:
        data: A node dict, a list of nodes, or anything else (ignored).
            A dict containing both ``'symbol'`` and ``'title'`` receives an
            ``'expanded_definition'`` entry from ``get_expanded_definition``
            and a progress line is printed. If its ``'children'`` value is a
            list, each child is processed with this node's title as context.
        parent_title: Title passed to ``get_expanded_definition`` as the
            parent context for ``data``.

    Returns:
        None. ``data`` is modified in place.
    """
    if isinstance(data, list):
        # A bare list has no title of its own; keep the inherited one.
        for entry in data:
            process_hierarchy(entry, parent_title)
        return

    if not isinstance(data, dict):
        return

    if 'symbol' in data and 'title' in data:
        data['expanded_definition'] = get_expanded_definition(
            data['symbol'], data['title'], parent_title
        )
        print(f"Processed: {data['symbol']}")

    children = data.get('children')
    if isinstance(children, list):
        own_title = data.get('title', '')
        for child in children:
            process_hierarchy(child, own_title)

# Load the original JSON file.
# It was written as UTF-8 with ensure_ascii=False, so it must be read as
# UTF-8 explicitly; the platform default encoding may differ.
with open('g10l_hierarchy.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Process the entire hierarchy (adds 'expanded_definition' to nodes in place)
process_hierarchy(data)

# Save the updated JSON file, using the same encoding settings as the
# input file so non-ASCII characters stay readable.
with open('g10l_hierarchy_expanded.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Processing complete. Updated JSON saved as 'g10l_hierarchy_expanded.json'.")

Processed: G10L
Processed: G10L13/00
Processed: G10L13/00
Processed: G10L13/02
Processed: G10L2013/021
Processed: G10L13/027
Processed: G10L13/033
Processed: G10L13/0335
Processed: G10L13/04
Processed: G10L13/047
Processed: G10L13/06
Processed: G10L13/07
Processed: G10L13/08
Processed: G10L2013/083
Processed: G10L13/086
Processed: G10L13/10
Processed: G10L2013/105
Processed: G10L15/00
Processed: G10L15/005
Processed: G10L15/01
Processed: G10L15/02
Processed: G10L2015/022
Processed: G10L2015/025
Processed: G10L2015/027
Processed: G10L15/04
Processed: G10L15/05
Processed: G10L15/06
Processed: G10L15/063
Processed: G10L2015/0631
Processed: G10L2015/0633
Processed: G10L2015/0635
Processed: G10L2015/0636
Processed: G10L2015/0638
Processed: G10L15/065
Processed: G10L15/07
Processed: G10L15/075
Processed: G10L15/08
Processed: G10L2015/081
Processed: G10L15/083
Processed: G10L2015/085
Processed: G10L2015/086
Processed: G10L2015/088
Processed: G10L15/10
Processed: G10L15/12
Processed: G10L15/14