In [40]:
from dotenv import load_dotenv
import cohere
import requests
from dotenv import load_dotenv
import cohere
from time import sleep
import json
from difflib import SequenceMatcher
import os

In [41]:
import sys
import json
from pathlib import Path
# Add src/ to Python path so we can import modules
project_root = Path.cwd().parent   
sys.path.append(str(project_root / "src"))

# Import your pipeline function
from data_loader import fetch_and_clean, chunk_text

In [42]:
from resolve_entities import finalize_entities_and_relationships

In [43]:
DATA_DIR_RAW = project_root / "data" / "raw"
DATA_DIR_PROCESSED = project_root / "data" / "processed"
PROMPTS_DIR = project_root / "prompts"

In [44]:
# LLM API
# Load all keys from .env
load_dotenv()

# Access environment variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
print("Cohere key loaded:", bool(COHERE_API_KEY))

Cohere key loaded: True


### Call LLM on each chunk to identify nodes and relationships

In [45]:
co = cohere.ClientV2(COHERE_API_KEY, log_warning_experimental_features=False)

In [46]:
chunks = []
with open(DATA_DIR_RAW / "a_study_in_scarlet_chunks.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        chunks.append(json.loads(line))

In [53]:
#chunks = chunks[:5]  # for testing, use only first 5 chunks
#chunks

In [54]:
# Base prompt
prompt_base = open(PROMPTS_DIR /"prompt_template.txt").read()

# Load the response schema
with open(PROMPTS_DIR /"response_schema.json") as f:
    response_schema = json.load(f)

In [None]:
# Global lists of entities and relationships
global_entities = []
global_entity_map = {}      # name/alias -> id
global_relationships = []
global_relation_types = set()  # unique relation types

existing_rels = set()       # (source_id, relation_type, target_id) tuples
entity_counter = 1

# Loop over each chunk
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")

    # Prepare global lists
    entity_list_str = json.dumps({"entities": global_entities}, ensure_ascii=False)
    relation_list_str = json.dumps(list(global_relation_types), ensure_ascii=False)

    chunk_text_for_prompt = f"{chunk['text']}"
    # Update prompt with current chunk and global lists
    prompt = prompt_base.replace("{CHUNK}", chunk_text_for_prompt)
    prompt = prompt.replace("{ENTITYLIST}", entity_list_str)
    prompt = prompt.replace("{RELATIONLIST}", relation_list_str)

    # Call the LLM
    response = co.chat(
        model="command-a-03-2025",
        messages=[{"role": "user", "content": prompt}],
        response_format={
            "type": "json_object",
            "schema": response_schema
        }
    )

    # Parse model output
    data = json.loads(response.dict()["message"]["content"][0]["text"])
    sleep(30)  # avoid rate limits

    # Merge entities
    for ent in data["entities"]:
        key = ent["name"].lower()
        if key in global_entity_map:
            ent["id"] = global_entity_map[key]
        else:
            ent_id = f"e{entity_counter}"
            ent["id"] = ent_id
            ent["chunk_id"] = chunk["id"]
            global_entity_map[key] = ent_id
            global_entities.append(ent)
            entity_counter += 1

    # Merge relationships (deduplicate and normalize)
    for rel in data["relationships"]:
        src_id = global_entity_map.get(rel["source"].lower(), rel["source"])
        tgt_id = global_entity_map.get(rel["target"].lower(), rel["target"])
        rel_type = rel["relation"].lower()

        rel_key = (src_id, rel_type, tgt_id)
        if rel_key not in existing_rels:
            rel["source"] = src_id
            rel["target"] = tgt_id
            rel["relation"] = rel_type
            rel["chunk_id"] = chunk["id"]
            global_relationships.append(rel)
            existing_rels.add(rel_key)
            global_relation_types.add(rel_type)


Processing chunk 1/166
Processing chunk 2/166
Processing chunk 3/166
Processing chunk 4/166
Processing chunk 5/166
Processing chunk 6/166
Processing chunk 7/166
Processing chunk 8/166
Processing chunk 9/166
Processing chunk 10/166
Processing chunk 11/166
Processing chunk 12/166
Processing chunk 13/166
Processing chunk 14/166
Processing chunk 15/166
Processing chunk 16/166
Processing chunk 17/166
Processing chunk 18/166
Processing chunk 19/166
Processing chunk 20/166
Processing chunk 21/166
Processing chunk 22/166
Processing chunk 23/166
Processing chunk 24/166
Processing chunk 25/166


ApiError: headers: {'content-type': 'text/html; charset=UTF-8', 'referrer-policy': 'no-referrer', 'content-length': '332', 'date': 'Tue, 02 Sep 2025 12:16:52 GMT', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'}, status_code: 502, body: 
<html><head>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<title>502 Server Error</title>
</head>
<body text=#000000 bgcolor=#ffffff>
<h1>Error: Server Error</h1>
<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>
<h2></h2>
</body></html>


[{'id': 'e1',
  'name': 'Doctor of Medicine',
  'type': 'Role',
  'aliases': [],
  'span': 'Doctor of Medicine',
  'chunk_id': 'chunk_1'},
 {'id': 'e2',
  'name': 'University of London',
  'type': 'Institution',
  'aliases': [],
  'span': 'University of London',
  'chunk_id': 'chunk_1'},
 {'id': 'e3',
  'name': 'Netley',
  'type': 'Location',
  'aliases': [],
  'span': 'Netley',
  'chunk_id': 'chunk_1'},
 {'id': 'e4',
  'name': 'Fifth Northumberland Fusiliers',
  'type': 'Organization',
  'aliases': ['Northumberland Fusiliers'],
  'span': 'Fifth Northumberland Fusiliers',
  'chunk_id': 'chunk_1'},
 {'id': 'e5',
  'name': 'India',
  'type': 'Location',
  'aliases': [],
  'span': 'India',
  'chunk_id': 'chunk_1'},
 {'id': 'e6',
  'name': 'Afghanistan',
  'type': 'Location',
  'aliases': ['Afghan'],
  'span': 'Afghanistan',
  'chunk_id': 'chunk_1'},
 {'id': 'e7',
  'name': 'Bombay',
  'type': 'Location',
  'aliases': [],
  'span': 'Bombay',
  'chunk_id': 'chunk_1'},
 {'id': 'e8',
  'name'

In [37]:
final_entities, final_relationships = finalize_entities_and_relationships(
    global_entities,
    global_relationships,
    log=True
)

In [64]:
# Save entities
with open(DATA_DIR_PROCESSED / "entities.json", "w", encoding="utf-8") as f:
    json.dump(final_entities, f, ensure_ascii=False, indent=2)

# Save relationships
with open(DATA_DIR_PROCESSED / "relationships.json", "w", encoding="utf-8") as f:
    json.dump(final_relationships, f, ensure_ascii=False, indent=2)

print("Saved entities and relationships to 'output/' folder")

Saved entities and relationships to 'output/' folder
