In [1]:
from pathlib import Path
import sys
import json
# Project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a folder directly under the root — TODO confirm).
project_root = Path.cwd().parent

# Make modules under <project_root>/src importable. The membership check keeps
# the cell idempotent: re-running it does not stack duplicate sys.path entries.
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
# Data directories, both resolved relative to the project root.
DATA_DIR_RAW = project_root.joinpath("data", "raw")
DATA_DIR_PROCESSED = project_root.joinpath("data", "processed")

In [15]:
from simple_rag import load_artifacts, retrieve_chunks
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os
import cohere

In [16]:
# LLM client setup
# Load variables from the .env file into the process environment.
load_dotenv()

# Read the Cohere API key from the environment; only its presence is printed,
# never the key itself.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
print("Cohere key loaded:", bool(COHERE_API_KEY))
co = cohere.ClientV2(
    api_key=COHERE_API_KEY,
    log_warning_experimental_features=False,
)

Cohere key loaded: True


In [17]:
# The single question used for both the plain-RAG and the GraphRAG runs below.
Question = (
    "How did Dr. Watson first meet Sherlock Holmes, "
    "and which events and locations led to their shared lodgings "
    "at 221B Baker Street?"
)

In [19]:
# RAG retrieval
# Sentence-embedding model handed to the retriever.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Load the saved RAG artifacts (embeddings, index, chunks) stored under the
# "simple_rag_" prefix in the processed-data directory.
embeddings, index, chunks = load_artifacts(
    path=DATA_DIR_PROCESSED,
    prefix="simple_rag_",
)

# Fetch the top 3 chunks for the question.
rag_retrieved_chunks = retrieve_chunks(Question, index, chunks, model, top_k=3)

In [28]:
# Build the grounded prompt: the retrieved chunks, separated by blank lines,
# are the only context the model is told to use.
context = "\n\n".join(rag_retrieved_chunks)
prompt = f"""
You are a helpful assistant. Using ONLY the context below, answer the question.

Context:
{context}

Question:
{Question}

If you don't find the answer, ONLY state that answer not found. Do not try to make up an answer.
"""
response = co.chat(
    model="command-a-03-2025",
    messages=[{"role": "user", "content": prompt}],
)

# Read the reply text through the response object's attributes instead of
# serialising the whole response to a dict, and bind it to its own name so it
# is still available after `response` is reused by a later cell.
rag_answer = response.message.content[0].text
rag_answer

'Answer not found.'

In [51]:
# GraphRAG retrieval
# Load the knowledge graph (entities and relationships) from the processed-data
# directory; both files are read as UTF-8 JSON.
entities = json.loads(
    (DATA_DIR_PROCESSED / "entities.json").read_text(encoding="utf-8")
)
relationships = json.loads(
    (DATA_DIR_PROCESSED / "relationships.json").read_text(encoding="utf-8")
)
kg_relationships = relationships

# Distinct values of the 'relation' field across all relationships.
unique_relations = {rel["relation"] for rel in relationships}

In [64]:
# --- Step 1: LLM generates a query to retrieve data from KG ---
# The prompt embeds the entity list, the distinct relation labels and the
# question, and asks the model for the source of a function named
# `retrieve_kg_facts`. The requested signature takes two arguments
# (kg_relationships, kg_entities) because that is how the function is invoked
# after being exec'd; instruction 2 and instruction 5 must agree on this.
graph_search_prompt = f"""
You are an intelligent assistant that generates a Python function to retrieve KG facts.

Inputs:
- Entities: {entities}  # Full list with IDs and aliases
- Unique Relations: {unique_relations}
- Question: "{Question}"

Instructions:
1. First, map entity names or aliases to their IDs so you can filter KG facts correctly.
2. Then, generate a Python function called `retrieve_kg_facts(kg_relationships, kg_entities)` that:
   - Filters the KG relationships using entity IDs and the given relations.
   - Returns relevant KG facts in the format: "source_name → relation → target_name".
3. Do not assume facts that are not in the KG.
4. Output ONLY the Python function code, no explanations and no extra text.
5. function should take two arguments: kg_relationships and kg_entities.
"""

response = co.chat(
    model="command-a-03-2025",
    messages=[{"role": "user", "content": graph_search_prompt}],
)

# Raw model reply; it may still be wrapped in Markdown code fences at this point.
function_code = response.message.content[0].text

In [65]:
import re

# The model may wrap its answer in a Markdown code fence, possibly with a
# language tag other than "python" or with prose around it. Keep only the
# fenced content when a complete fence is present; otherwise fall back to
# simply deleting the fence markers. Re-running this cell is a no-op once the
# fences are gone.
fenced_blocks = re.findall(r"```[^\n`]*\n(.*?)```", function_code, flags=re.DOTALL)
if fenced_blocks:
    function_code = "\n".join(fenced_blocks)
else:
    function_code = function_code.replace("```python", "").replace("```", "")

# Names passed to the generated function.
kg_entities = entities
kg_relationships = relationships

In [None]:
# Execute the LLM-generated code and call the function it defines.
# SECURITY NOTE: exec() runs model-generated code with the full privileges of
# this kernel; it is NOT sandboxed. Inspect `function_code` before running.
#
# A single dict is used as the execution namespace. With separate globals and
# locals, top-level imports or helpers in the generated code would be stored
# in the locals dict while the function looks names up in the (empty) globals
# dict, raising NameError when it is called.
local_vars = {}
exec(function_code, local_vars)
retrieved_facts = local_vars["retrieve_kg_facts"](kg_relationships, kg_entities)
print("Retrieved KG facts:", retrieved_facts)

Retrieved KG facts: ['Watson → meets → Sherlock Holmes', 'Sherlock Holmes → mentions → Baker Street']
