## STEP 1 — INSTALL DEPENDENCIES

In [None]:
%%bash
# Install the three packages this notebook imports from; -q keeps pip quiet.
# Haystack 2.x core: the `haystack` package imported in STEP 2.
pip install -q haystack-ai
# Backend for the SentenceTransformers document/text embedders.
pip install -q "sentence-transformers>=2.2.0"
# Provides `haystack_integrations.components.generators.google_ai`.
pip install -q google-ai-haystack

## STEP 2 — IMPORTS

In [None]:
import os
import json

from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator

print("Imports complete.")

## STEP 3 — DOCUMENT STORE & EMBEDDER SETUP

In [None]:
# Both embedders are built from this single model name.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# In-memory store that ingestion writes to and the retriever reads from.
document_store = InMemoryDocumentStore()

# top_k=50 so broad queries ("all damage scenarios") pull sufficient context
retriever = InMemoryEmbeddingRetriever(
    document_store=document_store,
    top_k=50,
)

# Query-side embedder (added to the pipeline later) and document-side embedder.
text_embedder = SentenceTransformersTextEmbedder(model=MODEL)
doc_embedder = SentenceTransformersDocumentEmbedder(model=MODEL)

# The document embedder is run directly during ingestion, outside any
# pipeline, so it is warmed up explicitly here.
doc_embedder.warm_up()

print("Setup complete.")

## STEP 4 — DATA INGESTION

Stores **full** node and edge objects (including `style`) so the LLM can reproduce them exactly.  
Only UI-state fields that carry no semantic value are stripped:  
`dragging`, `resizing`, `selected` from nodes and `selected` from edges (boolean interaction states, not data).

Everything else — including `style`, `position`, `positionAbsolute`, `height`, `width`,  
`isAsset`, `parentId`, `properties`, `markerStart`, `markerEnd` — is preserved.

In [None]:
# Input files, opened relative to the current working directory.
ITEM_PATH = "item_defination.json"
DAMAGE_PATH = "Damage_scenarios.json"

# Pure interaction-state flags: the only keys removed before an object is
# serialised into a Document. Everything else is kept untouched.
NODE_STRIP = {"dragging", "resizing", "selected"}
EDGE_STRIP = {"selected"}


def _drop_keys(record, unwanted):
    """Return a shallow copy of *record* without the keys in *unwanted*.

    Key order of the remaining entries is preserved and *record* itself is
    not modified.
    """
    kept = {}
    for key, value in record.items():
        if key in unwanted:
            continue
        kept[key] = value
    return kept


def clean_node(node):
    """Copy *node* minus the NODE_STRIP keys; style and data fields survive."""
    return _drop_keys(node, NODE_STRIP)


def clean_edge(edge):
    """Copy *edge* minus the EDGE_STRIP keys."""
    return _drop_keys(edge, EDGE_STRIP)

# Accumulates every Document from both datasets before embedding.
docs = []

# ===========================================================================
# DATASET 1 — item_defination.json
# Each node -> 1 Document  |  Each edge -> 1 Document
# ===========================================================================
with open(ITEM_PATH, "r", encoding="utf-8") as f:
    item_data = json.load(f)

# Model identity is taken from the first "Models" entry. `or []` / `.get`
# cover a missing, null, or empty list as well as an entry without the key.
models = item_data.get("Models") or []
first_model = models[0] if models else {}
model_name = first_model.get("name", "unknown")
model_id = first_model.get("_id")

item_start = len(docs)
# JSON keys may be present but null; `or []` / `or {}` treats that the same
# as a missing key instead of raising inside the loop.
for asset in item_data.get("Assets") or []:
    asset_id = asset.get("_id")
    template = asset.get("template") or {}

    for node in template.get("nodes") or []:
        cn = clean_node(node)
        docs.append(Document(
            # The cleaned node is stored verbatim as JSON text.
            content=json.dumps(cn, ensure_ascii=False),
            meta={
                "source":     "item_definition",
                "type":       "node",
                "model_name": model_name,
                "model_id":   model_id,
                "asset_id":   asset_id,
                "node_id":    node.get("id"),
                "node_label": (node.get("data") or {}).get("label", ""),
            }
        ))

    for edge in template.get("edges") or []:
        ce = clean_edge(edge)
        docs.append(Document(
            content=json.dumps(ce, ensure_ascii=False),
            meta={
                "source":      "item_definition",
                "type":        "edge",
                "model_id":    model_id,
                "asset_id":    asset_id,
                "edge_id":     edge.get("id"),
                "source_node": edge.get("source"),
                "target_node": edge.get("target"),
            }
        ))

print(f"Item Definition  -> {len(docs) - item_start} docs (nodes + edges)")

# ===========================================================================
# DATASET 2 — Damage_scenarios.json
# Each Derivation -> 1 Document  |  Each Detail -> 1 Document
# ===========================================================================
with open(DAMAGE_PATH, "r", encoding="utf-8") as f:
    damage_data = json.load(f)

damage_start = len(docs)
# Every list below is read with `or []` so that a key which is present but
# null is handled exactly like a missing key.
for ds in damage_data.get("Damage_scenarios") or []:
    # Scenario-level fields copied into the meta of each child document.
    ds_type  = ds.get("type", "")
    ds_id    = ds.get("_id")
    ds_model = ds.get("model_id")

    for deriv in ds.get("Derivations") or []:
        docs.append(Document(
            # The derivation object is stored verbatim as JSON text.
            content=json.dumps(deriv, ensure_ascii=False),
            meta={
                "source":    "damage_scenarios",
                "type":      "derivation",
                "ds_type":   ds_type,
                "ds_id":     ds_id,
                "model_id":  ds_model,
                "ds_id_ref": deriv.get("id"),
                "node_id":   deriv.get("nodeId"),
            }
        ))

    for detail in ds.get("Details") or []:
        docs.append(Document(
            content=json.dumps(detail, ensure_ascii=False),
            meta={
                "source":   "damage_scenarios",
                "type":     "detail",
                "ds_type":  ds_type,
                "ds_id":    ds_id,
                "model_id": ds_model,
                "node_id":  detail.get("nodeId"),
                # The name is looked up under "Name" first, then "name".
                "name":     detail.get("Name") or detail.get("name"),
            }
        ))

print(f"Damage Scenarios -> {len(docs) - damage_start} docs (derivations + details)")
print(f"Total            -> {len(docs)} documents")

# Imported here because only this step needs it.
from haystack.document_stores.types import DuplicatePolicy

# Compute an embedding for every collected Document.
embedded_docs = doc_embedder.run(documents=docs)["documents"]

# The store's default policy rejects a Document whose ID already exists, so
# re-running this cell against the same `document_store` (or ingesting two
# identical objects) would raise DuplicateDocumentError. OVERWRITE makes the
# step safe to repeat.
document_store.write_documents(embedded_docs, policy=DuplicatePolicy.OVERWRITE)
print(f"All {len(embedded_docs)} documents embedded and stored.")

## STEP 5 — PROMPT TEMPLATE (NATURAL LANGUAGE INTERPRETER + NATIVE STRUCTURE MIRROR)

In [None]:
# Jinja2 prompt template rendered by PromptBuilder. It uses two variables:
#   documents — iterated in the CONTEXT DOCUMENTS section; each document's
#               `content` is emitted as-is
#   question  — inserted in the USER QUERY section
# The instructions ask the model for a single JSON object rooted at "result".
template = """
You are an intelligent JSON data extraction engine for a BMS (Battery Management System) TARA knowledge base.

You will receive:
1. Context documents — raw JSON objects representing nodes, edges, derivations, and damage details
2. A natural language user query

YOUR TASK:
- Interpret the user's intent from the query
- Collect ALL relevant documents from the context that match the query
- Return them verbatim inside the appropriate section — DO NOT remap, rename, or restructure any fields
- Every field present in the source document MUST appear in the output with its original key and value

CRITICAL RULES:
- Return ONLY valid JSON — no markdown, no backticks, no explanation
- DO NOT rename any key (e.g. keep "id" as "id", keep "data" as "data", keep "position" as "position")
- DO NOT invent or hallucinate any values — copy them exactly from the source document
- If a source field has a value (even false / 0 / empty string) — include it as-is
- If a field is genuinely absent from the source document — omit it entirely (do not write null)
- Preserve style, position, positionAbsolute, height, width, isAsset, parentId and all nested objects exactly
- The root key must always be "result"

OUTPUT FORMAT:
{
  "result": {
    "query_intent": "<one-line description of what you understood the user wants>",
    "assets": [ <full verbatim node objects from context that match the query> ],
    "edges":  [ <full verbatim edge objects from context that match the query> ],
    "damage_scenarios": [ <full verbatim derivation objects from context that match the query> ],
    "damage_details":   [ <full verbatim detail objects from context that match the query> ]
  }
}

SECTION SELECTION — include a section only when the query asks for it:
- "assets" / "nodes" / "components" in query -> include assets
- "edges" / "connections" / "links" in query -> include edges
- "damage scenarios" / "derivations" / "loss" in query -> include damage_scenarios
- "details" / "cyber losses" / "safety" / "threats" in query -> include damage_details
- "properties" in query -> include assets (properties live inside node objects)
- "all" / "everything" / "full" / "report" in query -> include ALL four sections
- Specific node name in query -> filter all included sections to only that node
- Omit sections that are entirely irrelevant to the query

CONTEXT DOCUMENTS:
{% for document in documents %}
{{ document.content }}
{% endfor %}

USER QUERY:
{{ question }}

Return ONLY valid JSON starting with {"result":. No markdown. No explanation.
"""

# Both template variables are declared as required inputs of the builder.
prompt_builder = PromptBuilder(template=template, required_variables=["documents", "question"])
print("Prompt builder configured.")

## STEP 6 — LLM SETUP

In [None]:
# The generator takes its credentials from the GOOGLE_API_KEY environment
# variable. Keep a key that is already exported; otherwise ask for it with
# a no-echo prompt so the secret is never written into the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass

    os.environ["GOOGLE_API_KEY"] = getpass("Google AI API key: ")

generator = GoogleAIGeminiGenerator(
    model="gemini-2.0-flash",
    # GoogleAIGeminiGenerator accepts sampling options through
    # `generation_config` (it has no `generation_kwargs` parameter).
    # Temperature 0.0 favours repeatable, verbatim-style output.
    generation_config={"temperature": 0.0},
)
print("Gemini generator initialised.")

## STEP 7 — BUILD RAG PIPELINE

`text_embedder -> retriever (top_k=50) -> prompt_builder -> llm`

In [None]:
rag_pipeline = Pipeline()

# Register the four stages under the names used when running the pipeline.
for stage_name, stage in (
    ("text_embedder", text_embedder),
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", generator),
):
    rag_pipeline.add_component(stage_name, stage)

# Wire the stages: query embedding -> retriever -> prompt -> LLM.
# Where no socket name is given, the connection is left for Haystack to resolve.
for sender, receiver in (
    ("text_embedder.embedding", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    rag_pipeline.connect(sender, receiver)

print("RAG pipeline built.")

## STEP 8 — SINGLE QUERY (one-shot)

Example natural language queries:
- `"give me all damage scenarios assets and properties"`
- `"what are the cyber loss properties for CellMonitoring"`
- `"show all nodes and their connections"`
- `"full report on BatteryPack"`
- `"list all derivations related to loss of integrity"`

In [None]:
def strip_code_fences(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if any.

    Models frequently wrap JSON in ```json ... ``` even when told not to.
    Text that does not start with a fence is returned stripped of
    surrounding whitespace and otherwise unchanged.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (``` or ```json) ...
    body = stripped.splitlines()[1:]
    # ... and the closing fence line, when the model emitted one.
    if body and body[-1].strip().startswith("```"):
        body = body[:-1]
    return "\n".join(body).strip()


def ask(query: str, pretty: bool = True) -> dict:
    """Run the RAG pipeline for a natural-language query.

    Args:
        query: The user's question; it is both embedded for retrieval and
            inserted into the prompt.
        pretty: When true, print the parsed JSON with indentation.

    Returns:
        The parsed JSON object returned by the model, or ``{}`` when the
        model returned no reply, the reply is not valid JSON, or the JSON
        is not an object. A short summary (intent and per-section item
        counts) is printed on success.
    """
    result = rag_pipeline.run({
        "text_embedder":  {"text": query},
        "prompt_builder": {"question": query},
    })

    replies = result["llm"]["replies"]
    if not replies:
        # Nothing to parse; avoid an IndexError on replies[0].
        print("[Empty reply] the generator returned no text")
        return {}
    raw = replies[0]

    try:
        parsed = json.loads(strip_code_fences(raw))
    except json.JSONDecodeError as e:
        print("[JSON parse error]", e)
        print(raw)
        return {}

    if not isinstance(parsed, dict):
        # Valid JSON, but not the {"result": ...} object the prompt requires.
        print("[Unexpected JSON shape] expected an object with a 'result' key")
        print(raw)
        return {}

    if pretty:
        print(json.dumps(parsed, indent=2, ensure_ascii=False))

    # A missing or null "result" is summarised as an empty object.
    inner = parsed.get("result")
    if not isinstance(inner, dict):
        inner = {}

    print(f"\n[Intent] {inner.get('query_intent', 'n/a')}")
    for section in ("assets", "edges", "damage_scenarios", "damage_details"):
        items = inner.get(section)
        # Only count sections that really are arrays.
        if isinstance(items, list):
            print(f"  {section}: {len(items)} item(s)")
    return parsed


# One-shot example — edit the query string and re-run the cell.
# `response` holds whatever ask() returns for later inspection.
response = ask(
    "give me all damage scenarios assets and properties",
)

## STEP 9 — INTERACTIVE CHATBOT

Ask anything in natural language. Type `exit` (or `quit`) to end the session, or `history` to list past queries.  
Every response that parses successfully is stored in `history` as `{query, result}`.

**Example queries:**
- `give me all damage scenarios assets and properties`
- `what edges are connected to CellMonitoring`
- `show all cyber loss properties for every node`
- `list all derivations related to loss of integrity`
- `full report on BatteryPack`
- `everything`

In [None]:
# Successful exchanges in the order asked; each entry is
# {"query": <text typed by the user>, "result": <dict returned by ask()>}.
history = []

for banner_line in (
    "BMS RAG Chatbot — ask anything in natural language.",
    "Commands: 'exit' to quit | 'history' to list past queries",
    "-" * 60,
):
    print(banner_line)

while True:
    try:
        user_input = input("\nQuery: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End-of-input or an interrupt at the prompt ends the session.
        print("\nSession ended.")
        break

    # Commands are matched case-insensitively.
    command = user_input.lower()

    if command in ("exit", "quit"):
        print("Goodbye.")
        break

    if command == "history":
        if history:
            for position, entry in enumerate(history, start=1):
                print(f"  [{position}] {entry['query']}")
        else:
            print("No history yet.")
        continue

    # Blank input is ignored; anything else is sent to the pipeline.
    if user_input:
        print()
        parsed = ask(user_input)
        # ask() returns {} on failure, so only non-empty results are kept.
        if parsed:
            history.append({"query": user_input, "result": parsed})
        print("-" * 60)