In [1]:
from pathlib import Path
import sys
import json
project_root = Path.cwd().parent   
sys.path.append(str(project_root / "src"))

In [2]:
DATA_DIR_RAW = project_root / "data" / "raw"
DATA_DIR_PROCESSED = project_root / "data" / "processed"

In [15]:
from simple_rag import load_artifacts, retrieve_chunks
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os
import cohere

In [16]:
# LLM
# Load all keys from .env
load_dotenv()

# Access environment variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
print("Cohere key loaded:", bool(COHERE_API_KEY))
co = cohere.ClientV2(COHERE_API_KEY, log_warning_experimental_features=False)

Cohere key loaded: True


In [154]:
Question = "Who introduced Sherlock to Watson?"
Question = "What was the motive for the murders?"

In [155]:
# RAG Retrieval
# Load RAG embeddings, index, and chunks
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
embeddings, index, chunks = load_artifacts(path=DATA_DIR_PROCESSED, prefix="simple_rag_")
rag_retrieved_chunks= retrieve_chunks(Question, index, chunks, model, top_k=3)
rag_retrieved_chunks

['from the hatred and fear expressed upon his face. By the method of exclusion, I had arrived at this result, for no other hypothesis would meet the facts. Do not imagine that it was a very unheard of idea. The forcible administration of poison is by no means a new thing in criminal annals. The cases of Dolsky in Odessa, and of Leturier in Montpellier, will occur at once to any toxicologist. “And now came the great question as to the reason why. Robbery had not been the object of the murder, for nothing was taken. Was it politics, then, or was it a woman? That was the question which confronted me. I was inclined from the first to the latter supposition. Political assassins are only too glad to do their work and to fly. This murder had, on the contrary, been done most deliberately, and the perpetrator had left his tracks all over the room, showing that he had been there all the time. It must have been a private wrong, and not a political one, which called for such a methodical revenge. 

In [156]:
context = "\n\n".join(rag_retrieved_chunks)
prompt = f"""
You are a helpful assistant. Using ONLY the context below, answer the question. Context is retrieved from a set of documents using RAG (Retrieval-Augmented Generation) method. 

Context:
{context}

Question:
{Question}

Just give me the answer. Do not include the context in your answer.
Please answer only based on the above context.
"""
response = co.chat(
        model="command-a-03-2025",
        messages=[{"role": "user", "content": prompt}])

response.dict()["message"]["content"][0]["text"]

'The motive for the murders was a private wrong, likely related to a past romantic rivalry. Specifically, the murderer, Jefferson Hope, was an old rival in love of the victim, Enoch Drebber, and the murder was a methodical revenge.'

## Graph RAG

In [157]:
# GraphRAG Retrieval
# Load GraphRAG -Knowledge Graph
with open(DATA_DIR_PROCESSED /"entities.json", "r", encoding="utf-8") as f:
    entities = json.load(f)
with open(DATA_DIR_PROCESSED /"relationships.json", "r", encoding="utf-8") as f:
    relationships = json.load(f)
unique_relations = set([rel['relation'] for rel in relationships])

In [158]:
# --- Step 1: LLM generates a query to retrive data from KG ---
graph_search_prompt = f"""
You are an intelligent assistant that generates a Python function to retrieve KG facts.

Inputs:
- Entities: {entities}  # Full list with IDs and aliases
- Unique Relations: {unique_relations}
- Question: "{Question}"

Instructions:
1. Map entity names or aliases to their IDs so you can filter KG facts correctly.
2. Generate a Python function called `retrieve_kg_facts(kg_relationships, kg_entities)` that:
   - Filters the KG relationships using entity IDs and the given relations.
   - Returns a list of at most 3 relevant KG facts in the format: "source_name → relation → target_name".
3. The function should only take two arguments: `kg_relationships` and `kg_entities`.
4. Do not assume or invent any facts that are not present in the KG.
5. Think carefully about how to retrieve the most relevant information given the question, entities, and relations.
6. Output ONLY the Python function code. Do not include explanations or extra text.
7. Retrieve the entire relationship object from the KG including evidence_span

Example values in the kg_relationships list:
{relationships[0:5]}
"""


In [159]:
response = co.chat(
        model="command-a-03-2025",
        messages=[{"role": "user", "content": graph_search_prompt}])

function_code = response.dict()["message"]["content"][0]["text"]

In [160]:
function_code = function_code.replace("```python", "").replace("```", "")
kg_entities = entities
kg_relationships = relationships

function_code

"\ndef retrieve_kg_facts(kg_relationships, kg_entities):\n    # Create a dictionary to map entity names and aliases to their IDs\n    entity_map = {}\n    for entity in kg_entities:\n        entity_map[entity['name']] = entity['id']\n        for alias in entity['aliases']:\n            entity_map[alias] = entity['id']\n\n    # Define relevant relations based on the question\n    relevant_relations = {'kills', 'seeks_revenge_for', 'motive_of'}\n\n    # Filter KG relationships using entity IDs and relevant relations\n    relevant_facts = []\n    for relation in kg_relationships:\n        if relation['relation'] in relevant_relations:\n            source_name = next((e['name'] for e in kg_entities if e['id'] == relation['source']), None)\n            target_name = next((e['name'] for e in kg_entities if e['id'] == relation['target']), None)\n            if source_name and target_name:\n                relevant_facts.append({\n                    'source': source_name,\n                   

In [161]:
# Execute the function safely
local_vars = {}
exec(function_code, {}, local_vars)
retrieved_facts = local_vars["retrieve_kg_facts"](kg_relationships, kg_entities)
print("Retrieved KG facts:", retrieved_facts)

Retrieved KG facts: [{'source': 'the murderer', 'relation': 'kills', 'target': 'Enoch J. Drebber', 'evidence_span': 'gratitude for him who had removed its owner from the world'}, {'source': 'milk boy', 'relation': 'kills', 'target': 'terrier', 'evidence_span': 'Stangerson was concerned in the death of Drebber'}, {'source': 'Sierra Nevada', 'relation': 'kills', 'target': 'Nebraska', 'evidence_span': 'the murderer of Enoch Drebber'}]


In [163]:
context = "\n\n".join(retrieved_facts)
prompt = f"""
You are a helpful assistant. Using ONLY the context drawn from the knowelege graph and its evidence span, answer the question. 
Evidence span gives the snippet of text from which the relationship was extracted. Please try to draw best answer from the relationships and their evidence span.

Context:
{context}

Question:
{Question}

Just give me the answer. Do not include the context in your answer.
Please answer only based on the above context. 
"""
response = co.chat(
        model="command-a-03-2025",
        messages=[{"role": "user", "content": prompt}])

response.dict()["message"]["content"][0]["text"]

TypeError: sequence item 0: expected str instance, dict found