In [20]:
from dotenv import load_dotenv
import cohere
import requests
from dotenv import load_dotenv
import cohere
from time import sleep
import json
from difflib import SequenceMatcher
import os

In [61]:
# `resolve_entities` is a project-local module (presumably under <project_root>/src,
# the folder another cell adds to sys.path -- confirm). This cell sits above that
# cell, so on a fresh kernel (Restart & Run All) the import would fail. Make the
# folder importable here so the cell works regardless of execution order.
import sys
from pathlib import Path

_src_dir = str(Path.cwd().parent / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from resolve_entities import finalize_entities_and_relationships

In [21]:
from pathlib import Path
import sys

# Project root is taken to be the parent of the kernel's working directory
# (i.e. this assumes the notebook is launched from a folder one level below it).
project_root = Path.cwd().parent

# Make modules under <project_root>/src importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate sys.path entries.
_src_dir = str(project_root / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [22]:
# Directory layout used by this notebook, all resolved relative to `project_root`.
DATA_DIR_RAW = project_root.joinpath("data", "raw")              # input files read by the notebook
DATA_DIR_PROCESSED = project_root.joinpath("data", "processed")  # JSON files written at the end
PROMPTS_DIR = project_root.joinpath("prompts")                   # prompt template and response schema

In [23]:
# --- LLM API credentials ---
# Load the variables defined in the .env file into the process environment.
load_dotenv()

# Read the Cohere key from the environment (None when it is not set).
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

# Report only whether a non-empty key was found -- never print the key itself.
print("Cohere key loaded:", True if COHERE_API_KEY else False)

Cohere key loaded: True


### Call LLM on each chunk to identify nodes and relationships

In [24]:
# Cohere v2 client shared by every chat call below; the SDK's
# experimental-feature warning is switched off.
co = cohere.ClientV2(
    api_key=COHERE_API_KEY,
    log_warning_experimental_features=False,
)

In [25]:
# Read the pre-chunked text from the raw data folder. The extraction loop
# treats each element of `chunks` as a string to substitute into the prompt.
chunks_path = DATA_DIR_RAW / "a_study_in_scarlet_chunks.json"
chunks = json.loads(chunks_path.read_text(encoding="utf-8"))

In [26]:
# chunks = chunks[:5]  # Uncomment for a quick test run on only the first 5 chunks

In [27]:
# Base prompt template. The extraction loop substitutes the {CHUNK},
# {ENTITYLIST} and {RELATIONLIST} placeholders for every chunk.
# Opened via a context manager (the bare open(...).read() left the handle
# unclosed) and with an explicit encoding, matching the other file I/O here.
with open(PROMPTS_DIR / "prompt_template.txt", "r", encoding="utf-8") as f:
    prompt_base = f.read()

# JSON schema handed to the model as response_format["schema"].
with open(PROMPTS_DIR / "response_schema.json", "r", encoding="utf-8") as f:
    response_schema = json.load(f)

In [28]:
# ---------------------------------------------------------------------------
# Incremental entity / relationship extraction.
#
# Each chunk is sent to the LLM together with the entities and relation types
# collected so far. The model's JSON answer is then merged into the global
# accumulators below: entities are de-duplicated by lower-cased name and
# relationships by (source_id, relation_type, target_id).
#
# NOTE: every run of this cell resets the accumulators and calls the API once
# per chunk, so re-running it repeats the whole extraction.
# ---------------------------------------------------------------------------

LLM_MODEL = "command-a-03-2025"
RATE_LIMIT_SLEEP_S = 8  # pause between API calls to avoid rate limits

# Global accumulators
global_entities = []           # entity dicts returned by the model, each given an "id"
global_entity_map = {}         # lower-cased entity name -> id (aliases are NOT registered here)
global_relationships = []      # unique relationship dicts with normalized source/target/relation
global_relation_types = set()  # unique lower-cased relation types

existing_rels = set()          # (source_id, relation_type, target_id) tuples already stored
entity_counter = 1             # numeric suffix of the next generated id ("e1", "e2", ...)
failed_chunks = []             # 0-based indices of chunks whose model output was not valid JSON

# Loop over each chunk
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")

    # Serialize what is known so far. The relation types are sorted because a
    # set has no stable order; sorting makes the prompt reproducible across runs.
    entity_list_str = json.dumps({"entities": global_entities}, ensure_ascii=False)
    relation_list_str = json.dumps(sorted(global_relation_types), ensure_ascii=False)

    # Fill the template placeholders with the current chunk and global lists
    prompt = (
        prompt_base
        .replace("{CHUNK}", chunk)
        .replace("{ENTITYLIST}", entity_list_str)
        .replace("{RELATIONLIST}", relation_list_str)
    )

    # Call the LLM, asking for JSON output constrained by the response schema
    response = co.chat(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_format={
            "type": "json_object",
            "schema": response_schema
        }
    )

    # Throttle before the next request; nothing follows the last chunk,
    # so there is no need to wait after it.
    if i < len(chunks) - 1:
        sleep(RATE_LIMIT_SLEEP_S)

    # Parse model output. A malformed answer is logged and the chunk skipped so
    # one bad response does not abort the whole run; skipped chunk indices are
    # kept in `failed_chunks` for a later retry.
    raw_text = response.dict()["message"]["content"][0]["text"]
    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        print(f"  Skipping chunk {i+1}: model output is not valid JSON ({exc})")
        failed_chunks.append(i)
        continue

    # Merge entities: the first occurrence of a name gets a fresh id and is
    # stored; later occurrences only reuse that id (their dict is not stored).
    # A missing "entities" key is treated as an empty list.
    for ent in data.get("entities", []):
        key = ent["name"].lower()
        if key in global_entity_map:
            ent["id"] = global_entity_map[key]
        else:
            ent_id = f"e{entity_counter}"
            ent["id"] = ent_id
            global_entity_map[key] = ent_id
            global_entities.append(ent)
            entity_counter += 1

    # Merge relationships (deduplicate and normalize).
    # A missing "relationships" key is treated as an empty list.
    for rel in data.get("relationships", []):
        # Resolve endpoint names to ids. If the name is unknown, the raw value
        # from the model is kept unchanged as the endpoint.
        src_id = global_entity_map.get(rel["source"].lower(), rel["source"])
        tgt_id = global_entity_map.get(rel["target"].lower(), rel["target"])
        rel_type = rel["relation"].lower()

        rel_key = (src_id, rel_type, tgt_id)
        if rel_key not in existing_rels:
            rel["source"] = src_id
            rel["target"] = tgt_id
            rel["relation"] = rel_type
            global_relationships.append(rel)
            existing_rels.add(rel_key)
            global_relation_types.add(rel_type)


Processing chunk 1/61
Processing chunk 2/61
Processing chunk 3/61
Processing chunk 4/61
Processing chunk 5/61
Processing chunk 6/61
Processing chunk 7/61
Processing chunk 8/61
Processing chunk 9/61
Processing chunk 10/61
Processing chunk 11/61
Processing chunk 12/61
Processing chunk 13/61
Processing chunk 14/61
Processing chunk 15/61
Processing chunk 16/61
Processing chunk 17/61
Processing chunk 18/61
Processing chunk 19/61
Processing chunk 20/61
Processing chunk 21/61
Processing chunk 22/61
Processing chunk 23/61
Processing chunk 24/61
Processing chunk 25/61
Processing chunk 26/61
Processing chunk 27/61
Processing chunk 28/61
Processing chunk 29/61
Processing chunk 30/61
Processing chunk 31/61
Processing chunk 32/61
Processing chunk 33/61
Processing chunk 34/61
Processing chunk 35/61
Processing chunk 36/61
Processing chunk 37/61
Processing chunk 38/61
Processing chunk 39/61
Processing chunk 40/61
Processing chunk 41/61
Processing chunk 42/61
Processing chunk 43/61
Processing chunk 44/

In [63]:
# Post-process the raw extraction with the project's resolver. Judging by its
# log output it merges near-duplicate entity names -- see resolve_entities for
# the exact rules. log=True prints every merge it performs.
resolved = finalize_entities_and_relationships(global_entities, global_relationships, log=True)
final_entities, final_relationships = resolved

[Entity Resolution] Merged 'Dr. Watson' -> 'Watson' (sim=1.00)
[Entity Resolution] Merged 'Sherlock Holmes' test' -> 'Sherlock Holmes' (sim=0.83)
[Entity Resolution] Merged '221B, Baker Street' -> 'Baker Street' (sim=0.80)
[Entity Resolution] Merged '221B Baker Street' -> 'Baker Street' (sim=0.83)
[Entity Resolution] Merged 'Lestrade' -> 'Mr. Lestrade' (sim=1.00)
[Entity Resolution] Merged 'murderer' -> 'the murderer' (sim=0.80)
[Entity Resolution] Merged 'Rache' -> 'Rachel' (sim=0.91)
[Entity Resolution] Merged 'Dr Watson' -> 'Watson' (sim=1.00)
[Entity Resolution] Merged 'Mr. Stangerson' -> 'Stangerson' (sim=1.00)
[Entity Resolution] Merged 'Drebber' -> 'Mr. Drebber' (sim=1.00)
[Entity Resolution] Merged 'Enoch Drebber' -> 'Enoch J. Drebber' (sim=0.90)
[Entity Resolution] Merged 'Leader' -> 'Lieder' (sim=0.83)
[Entity Resolution] Merged 'Elders' -> 'Elder' (sim=0.91)
[Entity Resolution] Merged 'Council of Four' -> 'Sacred Council of Four' (sim=0.81)
[Entity Resolution] Merged 'Ferrie

In [64]:
# Persist the resolved graph as pretty-printed UTF-8 JSON under DATA_DIR_PROCESSED.
# Create the folder first so the cell does not fail when it is missing.
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

# Save entities
with open(DATA_DIR_PROCESSED / "entities.json", "w", encoding="utf-8") as f:
    json.dump(final_entities, f, ensure_ascii=False, indent=2)

# Save relationships
with open(DATA_DIR_PROCESSED / "relationships.json", "w", encoding="utf-8") as f:
    json.dump(final_relationships, f, ensure_ascii=False, indent=2)

# Report the real destination; the previous message named an 'output/' folder
# that this cell never writes to.
print(f"Saved entities and relationships to '{DATA_DIR_PROCESSED}'")

Saved entities and relationships to 'output/' folder
