In [2]:
from dotenv import load_dotenv
import cohere
import requests
from dotenv import load_dotenv
import cohere
from time import sleep
import json
from difflib import SequenceMatcher
import os

In [3]:
import sys
import json
from pathlib import Path
# Make the project's src/ directory importable from this notebook.
# Assumes the kernel's working directory is a direct child of the project
# root (e.g. notebooks/) — TODO confirm for other launch locations.
project_root = Path.cwd().parent
_src_dir = str(project_root / "src")
if _src_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(_src_dir)

# Import your pipeline function
from data_loader import fetch_and_clean, chunk_text

In [4]:
from dedupe_entities import finalize_entities_and_relationships

In [5]:
# Project directories used throughout the notebook, all relative to project_root.
DATA_DIR_RAW = project_root.joinpath("data", "raw")              # input chunk files
DATA_DIR_PROCESSED = project_root.joinpath("data", "processed")  # extracted graph output
PROMPTS_DIR = project_root.joinpath("prompts")                   # prompt template + response schema

In [6]:
# --- LLM API credentials ---
# Pull the variables defined in .env into the process environment.
load_dotenv()

# Read the Cohere key from the environment and report only whether it is set,
# never the value itself.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
print("Cohere key loaded:", bool(COHERE_API_KEY))

Cohere key loaded: True


### Call LLM on each chunk to identify nodes and relationships

In [7]:
# Cohere v2 client used for the chat calls below, created with the API key
# read from the environment.
# NOTE(review): log_warning_experimental_features=False presumably silences
# SDK warnings about experimental features — confirm against the Cohere SDK docs.
co = cohere.ClientV2(COHERE_API_KEY, log_warning_experimental_features=False)

In [9]:
# Load the pre-chunked text from a JSON Lines file: each non-empty line is a
# JSON object with an "id" and a "text" field.
# `chunks` maps chunk id -> chunk text, in file order.
chunks = {}
with open(DATA_DIR_RAW / "a_study_in_scarlet_chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file) instead of crashing inside json.loads.
            continue
        record = json.loads(line)
        chunks[record["id"]] = record["text"]

In [12]:
# Base prompt template. The {CHUNK}, {ENTITYLIST} and {RELATIONLIST} markers
# are substituted per chunk in the extraction loop.
# read_text() closes the file for us; the encoding is explicit so the result
# does not depend on the platform default.
prompt_base = (PROMPTS_DIR / "prompt_template.txt").read_text(encoding="utf-8")

# JSON schema passed to the LLM as the required response format.
with open(PROMPTS_DIR / "response_schema.json", encoding="utf-8") as f:
    response_schema = json.load(f)

In [16]:
# Extraction settings, named here so they are easy to find and tune.
LLM_MODEL = "command-a-03-2025"
RATE_LIMIT_SLEEP_SECONDS = 30   # pause between consecutive API calls

# Accumulators shared across all chunks
global_entities = []            # entity dicts, in first-seen order
global_entity_map = {}          # lower-cased entity name -> id ("e1", "e2", ...)
global_relationships = []       # de-duplicated relationship dicts
global_relation_types = set()   # unique lower-cased relation types

existing_rels = set()           # (source_id, relation_type, target_id) tuples
entity_counter = 1              # numeric suffix for the next new entity id

# Process the chunks in order. Each prompt carries the entities and relation
# types collected so far (presumably so the model can reuse them — the prompt
# template itself is not shown here).
for i, chunk_id in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")

    # Serialise the running global lists for the prompt.
    entity_list_str = json.dumps({"entities": global_entities}, ensure_ascii=False)
    # Sorted so the prompt is reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    relation_list_str = json.dumps(sorted(global_relation_types), ensure_ascii=False)

    chunk_text_for_prompt = f"{chunks[chunk_id]}"
    # Fill the template placeholders with the current chunk and global lists.
    prompt = prompt_base.replace("{CHUNK}", chunk_text_for_prompt)
    prompt = prompt.replace("{ENTITYLIST}", entity_list_str)
    prompt = prompt.replace("{RELATIONLIST}", relation_list_str)

    # Call the LLM, constraining the reply to the JSON response schema.
    response = co.chat(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_format={
            "type": "json_object",
            "schema": response_schema
        }
    )

    # Parse model output: the JSON text is in the first content item of the message.
    data = json.loads(response.dict()["message"]["content"][0]["text"])

    # Avoid rate limits between calls; nothing follows the last chunk, so
    # there is no point in waiting after it.
    if i < len(chunks) - 1:
        sleep(RATE_LIMIT_SLEEP_SECONDS)

    # Merge entities: a name already seen (case-insensitively) reuses its id;
    # a new name gets the next "e<N>" id and is tagged with the chunk where
    # it first appeared.
    for ent in data["entities"]:
        key = ent["name"].lower()
        if key in global_entity_map:
            ent["id"] = global_entity_map[key]
        else:
            ent_id = f"e{entity_counter}"
            ent["id"] = ent_id
            ent["chunk_id"] = chunk_id
            global_entity_map[key] = ent_id
            global_entities.append(ent)
            entity_counter += 1

    # Merge relationships (deduplicate and normalize).
    # Endpoints are mapped to entity ids by lower-cased name; a name with no
    # known entity is kept as the raw string.
    # NOTE(review): such relationships reference no entity id — confirm that
    # downstream processing copes with them.
    for rel in data["relationships"]:
        src_id = global_entity_map.get(rel["source"].lower(), rel["source"])
        tgt_id = global_entity_map.get(rel["target"].lower(), rel["target"])
        rel_type = rel["relation"].lower()

        rel_key = (src_id, rel_type, tgt_id)
        if rel_key not in existing_rels:
            rel["source"] = src_id
            rel["target"] = tgt_id
            rel["relation"] = rel_type
            rel["chunk_id"] = chunk_id   # chunk where this triple was first seen
            global_relationships.append(rel)
            existing_rels.add(rel_key)
            global_relation_types.add(rel_type)


Processing chunk 1/82
Processing chunk 2/82
Processing chunk 3/82
Processing chunk 4/82
Processing chunk 5/82
Processing chunk 6/82
Processing chunk 7/82
Processing chunk 8/82
Processing chunk 9/82
Processing chunk 10/82
Processing chunk 11/82
Processing chunk 12/82
Processing chunk 13/82
Processing chunk 14/82
Processing chunk 15/82
Processing chunk 16/82
Processing chunk 17/82
Processing chunk 18/82
Processing chunk 19/82
Processing chunk 20/82
Processing chunk 21/82
Processing chunk 22/82
Processing chunk 23/82
Processing chunk 24/82
Processing chunk 25/82
Processing chunk 26/82
Processing chunk 27/82
Processing chunk 28/82
Processing chunk 29/82
Processing chunk 30/82
Processing chunk 31/82
Processing chunk 32/82
Processing chunk 33/82
Processing chunk 34/82
Processing chunk 35/82
Processing chunk 36/82
Processing chunk 37/82
Processing chunk 38/82
Processing chunk 39/82
Processing chunk 40/82
Processing chunk 41/82
Processing chunk 42/82
Processing chunk 43/82
Processing chunk 44/

In [17]:
# Run the project's entity-resolution step over the accumulated entities and
# relationships; log=True prints each merge decision.
# NOTE(review): the merge log from the last recorded run includes apparent
# false positives (e.g. 'the floor' -> 'the door', 'Rance' -> 'RACHE'), so the
# similarity threshold used inside dedupe_entities may be too permissive —
# confirm before relying on the output.
final_entities, final_relationships = finalize_entities_and_relationships(
    global_entities, global_relationships, log=True
)

[Entity Resolution] Merged 'Sherlock Holmes' test' -> 'Sherlock Holmes' (sim=0.83)
[Entity Resolution] Merged 'Lestrade' -> 'Mr. Lestrade' (sim=1.00)
[Entity Resolution] Merged 'Commissionaire' -> 'Retired sergeant of Marines' (sim=1.00)
[Entity Resolution] Merged 'Rachel' -> 'RACHE' (sim=0.91)
[Entity Resolution] Merged 'I' -> 'Narrator' (sim=1.00)
[Entity Resolution] Merged 'Rance' -> 'RACHE' (sim=0.80)
[Entity Resolution] Merged 'the hounds' -> 'the house' (sim=0.84)
[Entity Resolution] Merged 'the ring' -> 'the railings' (sim=0.80)
[Entity Resolution] Merged 'the floor' -> 'the door' (sim=0.82)
[Entity Resolution] Merged 'the world' -> 'the wolf' (sim=0.82)
[Entity Resolution] Merged 'the table' -> 'the constable' (sim=0.82)
[Entity Resolution] Merged 'Evening paper' -> 'the evening paper' (sim=0.87)
[Entity Resolution] Merged 'Advertisement' -> 'the advertisement' (sim=0.87)
[Entity Resolution] Merged 'Dr. Watson' -> 'Watson' (sim=1.00)
[Entity Resolution] Merged '221B Baker Stree

In [18]:
# Persist the finalized graph as pretty-printed UTF-8 JSON.
# Create the target directory first so a fresh checkout does not fail on open().
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

# Save entities
with open(DATA_DIR_PROCESSED / "entities.json", "w", encoding="utf-8") as f:
    json.dump(final_entities, f, ensure_ascii=False, indent=2)

# Save relationships
with open(DATA_DIR_PROCESSED / "relationships.json", "w", encoding="utf-8") as f:
    json.dump(final_relationships, f, ensure_ascii=False, indent=2)

# Report the real destination; the previous message named an 'output/' folder
# that this cell never writes to.
print(f"Saved entities and relationships to '{DATA_DIR_PROCESSED}'")

Saved entities and relationships to 'output/' folder
