In [33]:
!pip install wikipedia-api neo4j cohere  --quiet

In [34]:
from neo4j import GraphDatabase
import re
import os
from dotenv import load_dotenv
import cohere
import requests
from dotenv import load_dotenv
import cohere
from time import sleep
import json

In [35]:
from myutils import fetch_raw_text, strip_gutenberg_header_footer, chunk_text

### LLM API and Neo4j DB connections

In [36]:
# Load all keys from .env
load_dotenv()

# Access environment variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Debug check (optional – don’t print secrets in real projects)
print("Cohere key loaded:", bool(COHERE_API_KEY))
print("Neo4j URI:", NEO4J_URI)
print("Neo4j User:", NEO4J_USER)

Cohere key loaded: True
Neo4j URI: bolt://44.200.207.55:7687
Neo4j User: neo4j


In [37]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
#check connection
with driver.session() as session:
    result = session.run("RETURN 1")
    print("Connection test result:", result.single()[0])  # Should print 1 if successful

Connection test result: 1


### Fetch Data


In [54]:
GUTENBERG_TXT_URL = "https://www.gutenberg.org/cache/epub/244/pg244.txt"  # A Study in Scarlet (id=244)
#GUTENBERG_TXT_URL = "https://www.gutenberg.org/cache/epub/18897/pg18897.txt"  # The Epic of Gilgamish (Langdon, id=18897)
#GUTENBERG_TXT_URL = "https://www.gutenberg.org/cache/epub/11000/pg11000.txt"  # An Old Babylonian Version of the Gilgamesh Epic (Jastrow & Clay, id=11000)

raw = fetch_raw_text(GUTENBERG_TXT_URL)
core = strip_gutenberg_header_footer(raw)

with open("data/sherlock_raw.txt", "w", encoding="utf-8") as f:
    f.write(core)
print("Saved cleaned text -> data/sherlock_raw.txt")

Saved cleaned text -> data/sherlock_raw.txt


In [59]:
# Chunk Data
text = open("data/sherlock_raw.txt", encoding="utf-8").read()

# Clean text
# remove /n with .
text = text.replace("\n", " ").replace("\r", " ").replace("  ", " ")
# start text from CHAPTER I. MR. SHERLOCK HOLMES. to avoid preface
start_idx = text.find("In the year 1878 I took my degree of Doctor of Medicine of the")
text = text[start_idx:]


chunks = chunk_text(text, max_chars=250, overlap=150)
print(f"Chunks: {len(chunks)}")

chunks[0]

Chunks: 1251


'In the year 1878 I took my degree of Doctor of Medicine of the University of London, and proceeded to Netley to go through the course prescribed for surgeons in the army.'

### Call LLM on each chunk to identify nodes and relationship

In [57]:
co = cohere.ClientV2(COHERE_API_KEY, log_warning_experimental_features=False)

In [None]:
# Global lists
global_entities = []
global_entity_map = {}      # name/alias -> id
global_relationships = []
global_relation_types = set()  # unique relation types
existing_rels = set()       # (source_id, relation_type, target_id) tuples
entity_counter = 1

# Load the response schema
with open("response_schema.json") as f:
    response_schema = json.load(f)

# Loop over each chunk
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")

    # Prepare global lists
    entity_list_str = json.dumps({"entities": global_entities}, ensure_ascii=False)
    relation_list_str = json.dumps(list(global_relation_types), ensure_ascii=False)

    # Load prompt template
    prompt = open("prompt_template.txt").read()
    #prompt = open("prompt_v2.txt").read()
    prompt = prompt.replace("{CHUNK}", chunk)
    prompt = prompt.replace("{ENTITYLIST}", entity_list_str)
    prompt = prompt.replace("{RELATIONLIST}", relation_list_str)

    # Call the LLM
    response = co.chat(
        model="command-a-03-2025",
        messages=[{"role": "user", "content": prompt}],
        response_format={
            "type": "json_object",
            "schema": response_schema
        }
    )

    # Parse model output
    data = json.loads(response.dict()["message"]["content"][0]["text"])
    sleep(7)  # avoid rate limits

    # 🔹 Merge entities
    for ent in data["entities"]:
        key = ent["name"].lower()
        if key in global_entity_map:
            ent["id"] = global_entity_map[key]
        else:
            ent_id = f"e{entity_counter}"
            ent["id"] = ent_id
            global_entity_map[key] = ent_id
            global_entities.append(ent)
            entity_counter += 1

    # 🔹 Merge relationships (deduplicate and normalize)
    for rel in data["relationships"]:
        src_id = global_entity_map.get(rel["source"].lower(), rel["source"])
        tgt_id = global_entity_map.get(rel["target"].lower(), rel["target"])
        rel_type = rel["relation"].lower()

        rel_key = (src_id, rel_type, tgt_id)
        if rel_key not in existing_rels:
            rel["source"] = src_id
            rel["target"] = tgt_id
            rel["relation"] = rel_type
            global_relationships.append(rel)
            existing_rels.add(rel_key)
            global_relation_types.add(rel_type)

# 🔹 Final merged JSON
final_output = {
    "entities": global_entities,
    "relationships": global_relationships
}

# Save or print
print(json.dumps(final_output, indent=2, ensure_ascii=False))

Processing chunk 1/1251
Processing chunk 2/1251
Processing chunk 3/1251
Processing chunk 4/1251
Processing chunk 5/1251
Processing chunk 6/1251
Processing chunk 7/1251
Processing chunk 8/1251
Processing chunk 9/1251
Processing chunk 10/1251
Processing chunk 11/1251
Processing chunk 12/1251
Processing chunk 13/1251
Processing chunk 14/1251
Processing chunk 15/1251
Processing chunk 16/1251
Processing chunk 17/1251
Processing chunk 18/1251
Processing chunk 19/1251
Processing chunk 20/1251
Processing chunk 21/1251
Processing chunk 22/1251
Processing chunk 23/1251
Processing chunk 24/1251
Processing chunk 25/1251
Processing chunk 26/1251
Processing chunk 27/1251
Processing chunk 28/1251
Processing chunk 29/1251
Processing chunk 30/1251
Processing chunk 31/1251
Processing chunk 32/1251
Processing chunk 33/1251
Processing chunk 34/1251
Processing chunk 35/1251
Processing chunk 36/1251
Processing chunk 37/1251
Processing chunk 38/1251
Processing chunk 39/1251
Processing chunk 40/1251
Processin

JSONDecodeError: Expecting value: line 1310 column 14 (char 25711)

In [24]:
#print(json.dumps(final_output, indent=2, ensure_ascii=False))
#save to file
with open("extracted.json", "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=2, ensure_ascii=False)

In [25]:
# Load your extracted JSON
with open("extracted.json", "r") as f:
    data = json.load(f)

In [27]:
delete_query = "MATCH (n) DETACH DELETE n"

with driver.session() as session:
    session.run(delete_query)

print("All nodes and relationships have been deleted.")

All nodes and relationships have been deleted.


In [28]:
# Create nodes with dynamic labels
query_nodes = """
UNWIND $entities AS entity
CALL apoc.merge.node([entity.type], {id: entity.id}, 
                     {name: entity.name, aliases: entity.aliases, span: entity.span}, 
                     {}) YIELD node
RETURN node
"""

# Create relationships with dynamic types
query_rels = """
UNWIND $relationships AS rel
MATCH (src {id: rel.source})
MATCH (tgt {id: rel.target})
CALL apoc.merge.relationship(src, rel.relation, {}, {evidence: rel.evidence_span}, tgt) YIELD rel AS r
RETURN r
"""

with driver.session() as session:
    session.run(query_nodes, entities=data["entities"])
    session.run(query_rels, relationships=data["relationships"])
