# Setup and Install Required Libraries  

In [None]:
%pip install rdflib owlready2 spacy transformers torch neo4j pandas markdown beautifulsoup4

# Download spaCy language model
%python -m spacy download en_core_web_sm


# Section 2: Load OntoCAPE ontology 


In [None]:
'''from owlready2 import get_ontology

onto = get_ontology("ontology\OntoCAPE\OntoCAPE.owl").load()

# Extract ontology class names and object properties
onto_classes = list(onto.classes())
onto_labels = [cls.name for cls in onto_classes]
onto_obj_props = list(onto.object_properties())

print("Example OntoCAPE Classes:", onto_labels[:10])
print("Example OntoCAPE Object Properties:", [p.name for p in onto_obj_props[:10]])
'''


from rdflib import Graph, RDF, RDFS, OWL
from urllib.parse import urlparse
def extract_name(uri):
    return uri.split('#')[-1] if '#' in uri else uri.split('/')[-1]
    
g = Graph()
g.parse('ontology\OntoCAPE\OntoCAPE.owl') # Try 'xml' for RDF/XML


owl_classes = set(g.subjects(RDF.type, OWL.Class))
rdfs_classes = set(g.subjects(RDF.type, RDFS.Class))
all_classes = owl_classes.union(rdfs_classes)
class_labels = [extract_name(str(c)) for c in all_classes]
print(f'Total classes found: {len(class_labels)}')
print('Example class names:', class_labels[:10])
obj_props = set(g.subjects(RDF.type, OWL.ObjectProperty))
obj_labels = [extract_name(str(p)) for p in obj_props]
print(f'Total object properties found: {len(obj_labels)}')
print('Example property names:', obj_labels[:10])

Total classes found: 783
Example class names: ['N2491f9a091d646bc9bbc050271567271', 'ModelVariableSpecification', 'N17426ab650f745b390869d0b29163c5b', 'Exchange', 'Nc610ab085aa4431fb88992ebf13af6df', 'N3175ebab14504755933b55b25acc0e24', 'IntensiveThermodynamicStateVariable', 'Nec5bae67fa8b42239cf70136cd4443a2', 'SecondLevelSubsystem', 'PhaseInterfaceProperty']
Total object properties found: 168
Example property names: ['isDefinedBy', 'fulfills', 'isIndexOf', 'has_length', 'isInfluencedBy', 'indicatesMultiplicityOf', 'isPropertyOf', 'hasDirectSubsystem', 'hasFunctionalAspect', 'hasReaction']


# Section 3: Load and Parse Process Description 

In [3]:
import markdown
from bs4 import BeautifulSoup

# Load .md file
with open("descriptions\HAZOP_011_process_description.md", "r") as f:
    md_text = f.read()

html = markdown.markdown(md_text)
text = BeautifulSoup(html, "html.parser").get_text()

print(text[:500])



Crude Oil Production Unit (COPU) Process Description
Overview
The crude oil production unit is designed to process, separate, and condition crude oil from field production, separating associated gas and produced water, and preparing the oil for storage and export. The unit has a nominal capacity of 10 Mbpd (million barrels per day) and handles crude oil with an API gravity of 33°.
Main Process Sections
1. Incoming Crude Oil and Initial Separation

Field Production Feed: 
Crude oil from wells arr


# Section 4: Extract Entities and Relations from Text
We assume a basic rule-based or LLM-based approach here.

## Using keyword based rules to extract entities

In [32]:
import re

# Example: rule-based extraction
equipment_keywords = ["reactor", "pump", "heat exchanger", "distillation", "compressor", "column"]
connections = []

for line in text.split("\n"):
    for kw in equipment_keywords:
        if kw in line.lower():
            print(f"Found equipment: {kw} in line -> {line}")

# You can also extract relations manually or use spaCy/transformers


Found equipment: pump in line -> The treated oil is stored in tanks before being pumped to the custody transfer point for export[1].
Found equipment: pump in line -> The treated water is pumped and re-injected into a disposal well[1].


## Using Spacy/transformers 

In [4]:
import spacy
import re
from transformers import pipeline

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load transformer for relation extraction
relation_extractor = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Custom equipment types (can be extended)
equipment_types = [
    "reactor", "heat exchanger", "pump", "distillation column",
    "compressor", "valve", "vessel", "tank"
]

# Entity & relation extraction
doc = nlp(text)
entities = set()
relations = []

for sent in doc.sents:
    s_text = sent.text.strip()
    for eq1 in equipment_types:
        for eq2 in equipment_types:
            if eq1 != eq2 and eq1 in s_text.lower() and eq2 in s_text.lower():
                # Use zero-shot to extract relation type
                candidate_labels = ["feeds", "cools", "heats", "connects to", "transfers to"]
                result = relation_extractor(s_text, candidate_labels)
                rel = result['labels'][0]
                relations.append((eq1.title(), rel.replace(" ", "_").upper(), eq2.title()))
                entities.add(eq1.title())
                entities.add(eq2.title())

print("Entities:", entities)
print("Relations:", relations)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


Entities: {'Tank', 'Pump'}
Relations: [('Pump', 'TRANSFERS_TO', 'Tank'), ('Tank', 'TRANSFERS_TO', 'Pump')]


# Section 5: Map to OntoCAPE Ontology Classes

In [5]:
# Map each entity to an OntoCAPE class
equipment_to_ontocape = {
    "Reactor": "ontoCAPE.ChemicalReactor",
    "Heat Exchanger": "ontoCAPE.HeatExchanger",
    "Pump": "ontoCAPE.Pump",
    "Distillation Column": "ontoCAPE.DistillationColumn",
    "Compressor": "ontoCAPE.Compressor",
    "Valve": "ontoCAPE.Valve",
    "Vessel": "ontoCAPE.Vessel",
    "Tank": "ontoCAPE.Tank"
}

nodes = [(ent, equipment_to_ontocape.get(ent, "ontoCAPE.Equipment")) for ent in entities]


# Section 6: Build Graph Model (Nodes + Relationships)

In [6]:
def to_cypher_node(name, label):
    return f"MERGE (:{label.split('.')[-1]} {{name: '{name}'}})"

def to_cypher_relation(src, rel, tgt):
    return f"MATCH (a {{name: '{src}'}}), (b {{name: '{tgt}'}}) MERGE (a)-[:{rel}]->(b)"

cypher_nodes = [to_cypher_node(n, l) for n, l in nodes]
cypher_edges = [to_cypher_relation(s, r, t) for s, r, t in relations]


# Section 7: Generate Cypher Code

In [8]:
def to_cypher_node(name, label):
    return f"MERGE (:{label} {{name: '{name}'}})"

def to_cypher_relation(src, rel, tgt):
    return f"MATCH (a {{name: '{src}'}}), (b {{name: '{tgt}'}}) MERGE (a)-[:{rel.upper()}]->(b)"

cypher_nodes = [to_cypher_node(n, l.split(".")[-1]) for n, l in nodes]
cypher_edges = [to_cypher_relation(s, r, t) for s, r, t in relations]

# Print Cypher code
for line in cypher_nodes + cypher_edges:
    print(line)

'''
# Save generated Cypher code
with open("generated_kg.cypher", "w") as f:
    f.write("// Nodes\n")
    f.write("\n".join(cypher_nodes))
    f.write("\n\n// Relationships\n")
    f.write("\n".join(cypher_edges))

print("Cypher file 'generated_kg.cypher' saved successfully.")
'''

MERGE (:Tank {name: 'Tank'})
MERGE (:Pump {name: 'Pump'})
MATCH (a {name: 'Pump'}), (b {name: 'Tank'}) MERGE (a)-[:TRANSFERS_TO]->(b)
MATCH (a {name: 'Tank'}), (b {name: 'Pump'}) MERGE (a)-[:TRANSFERS_TO]->(b)


'\n# Save generated Cypher code\nwith open("generated_kg.cypher", "w") as f:\n    f.write("// Nodes\n")\n    f.write("\n".join(cypher_nodes))\n    f.write("\n\n// Relationships\n")\n    f.write("\n".join(cypher_edges))\n\nprint("Cypher file \'generated_kg.cypher\' saved successfully.")\n'

In [1]:
%pip install -q neo4j pandas networkx lxml xlsxwriter openai sentence-transformers faiss-cpu tqdm

Note: you may need to restart the kernel to use updated packages.


# Automation of HAZOP analysis 

using RAG with the normalised causes,consequences, and safeguards as a databse and generatinh HAZOP KG and excel reoprt 

In [None]:
# main.py
# This script orchestrates the entire automated HAZOP generation pipeline.
# It calls the individual modules in the correct sequence to perform
# each step of the process, from initial data ingestion to final report generation.

import os
import subprocess
import config

def run_script(script_name):
    """Helper function to run a python script and handle errors."""
    print(f"--- Running {script_name} ---")
    try:
        # We use subprocess to run each script as a separate process.
        # This ensures a clean state for each step of the pipeline.
        result = subprocess.run(['python', script_name], check=True, capture_output=True, text=True)
        print(result.stdout)
        print(f"--- Finished {script_name} successfully ---")
        return True
    except subprocess.CalledProcessError as e:
        print(f"!!! Error running {script_name} !!!")
        print(e.stderr)
        return False
    except FileNotFoundError:
        print(f"!!! Error: {script_name} not found. Make sure all scripts are in the same directory. !!!")
        return False

def main():
    """Main function to execute the HAZOP automation pipeline."""
    print("======================================================")
    print("  Automated HAZOP Generation Framework - Pipeline     ")
    print("======================================================")
    print("\nNOTE: This pipeline assumes you have already loaded your\nP&ID data into the Neo4j database.\n")

    # Define the sequence of scripts to be executed
    # 01_pid_to_graph.py has been removed as per user request.
    pipeline_scripts = [
        "02_semantic_enrichment.py",
        "03_manual_noding.py",
        "04_hazop_analysis_engine.py",
        "05_report_generator.py"
    ]

    # Execute each script in the pipeline
    for script in pipeline_scripts:
        if not run_script(script):
            print(f"\nPipeline stopped due to an error in {script}.")
            break
        print("\n")

    print("======================================================")
    print("          Pipeline execution complete.                ")
    print("======================================================")
    print(f"Check the Neo4j database for the HAZOP Knowledge Graph.")
    print(f"Check the project directory for the generated '{config.EXCEL_REPORT_PATH}' file.")

if __name__ == "__main__":
    # Ensure necessary files exist before starting
    # DEXPI_FILE_PATH check removed.
    required_files = [
        config.PROCESS_DESC_PATH,
        config.MSDS_FILE_PATH,
        config.CAUSES_CSV_PATH,
        config.CONSEQUENCES_CSV_PATH,
        config.SAFEGUARDS_CSV_PATH,
        config.PARAMETER_GUIDEWORD_CSV_PATH
    ]
    
    files_missing = False
    for f in required_files:
        if not os.path.exists(f):
            print(f"Error: Required file not found at path: {f}")
            files_missing = True
            
    if not files_missing:
        main()
    else:
        print("\nPlease ensure all required files are present and paths in config.py are correct.")

# --------------------------------------------------------------------------
'''
# config.py
# Configuration file for the Automated HAZOP Generation Framework.
# Store all sensitive information and file paths here.
# IMPORTANT: Fill in your actual credentials and paths before running the scripts.

# --- Neo4j Database Configuration ---
# Your Neo4j AuraDB URI, username, and password.
# Example URI: "neo4j+s://xxxxxxxx.databases.neo4j.io"
NEO4J_URI = "YOUR_NEO4J_URI"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "YOUR_NEO4J_PASSWORD"

# --- Large Language Model (LLM) API Configuration ---
# Using OpenAI's API as an example.
# Replace with your API key.
LLM_API_KEY = "YOUR_OPENAI_API_KEY"
# Specify the model you want to use. gpt-4-turbo is recommended for its reasoning capabilities.
LLM_MODEL_NAME = "gpt-4-turbo"

# --- File Paths Configuration ---
# Paths to all input files required by the pipeline.
# Ensure these files are in your project directory or provide absolute paths.

# Path to the process description markdown file.
PROCESS_DESC_PATH = "path/to/your/process_description.md" # Replace with your actual file path

# Path to the Material Safety Data Sheet (MSDS) file.
MSDS_FILE_PATH = "path/to/your/msds.txt" # Replace with your actual file path

# Paths to the reference CSV files containing historical HAZOP data.
CAUSES_CSV_PATH = "Causes.csv"
CONSEQUENCES_CSV_PATH = "Consequences.csv"
SAFEGUARDS_CSV_PATH = "Safeguards.csv"

# Path to the CSV file defining parameters and their applicable guidewords.
PARAMETER_GUIDEWORD_CSV_PATH = "Parameter and Guideword.csv"

# --- Output File Path Configuration ---
# Path for the generated Excel HAZOP report.
EXCEL_REPORT_PATH = "HAZOP_Report_Generated.xlsx"

# --- HAZOP Analysis Parameters ---
# Confidence score threshold for highlighting in the Excel report.
CONFIDENCE_THRESHOLD = 0.75

# Hardcoded lists for guidewords and parameters have been removed.
# The system now reads them from PARAMETER_GUIDEWORD_CSV_PATH.
'''
# --------------------------------------------------------------------------
'''
# requirements.txt
# List of Python packages required to run the HAZOP automation scripts.
# Install these packages using pip:
# pip install -r requirements.txt

# For connecting to and interacting with the Neo4j database.
neo4j

# For handling data in tabular format, especially for CSVs and Excel generation.
pandas

# For creating and manipulating the graph structure in memory before loading to Neo4j.
networkx

# For parsing the DEXPI XML file.
lxml

# For generating formatted Excel files. Works with pandas.
xlsxwriter

# For making API calls to the Large Language Model (e.g., OpenAI).
openai

# For creating vector embeddings for semantic search.
sentence-transformers

# For efficient similarity search in the RAG step.
faiss-cpu

# For progress bars to monitor long-running processes.
tqdm
'''
# --------------------------------------------------------------------------

# 02_semantic_enrichment.py
# This script handles the semantic enrichment of the Knowledge Graph.
# It uses an LLM to parse unstructured documents (process description, MSDS)
# and adds the extracted information as properties to the existing nodes in Neo4j.

import config
from neo4j import GraphDatabase
import openai
import json

class SemanticEnricher:
    """
    A class to enrich the Process KG with data from documents using an LLM.
    """
    def __init__(self, uri, user, password, api_key):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        openai.api_key = api_key

    def close(self):
        self.driver.close()

    def get_text_from_file(self, file_path):
        """Reads content from a text file."""
        try:
            with open(file_path, 'r') as f:
                return f.read()
        except FileNotFoundError:
            print(f"Error: File not found at {file_path}")
            return None

    def extract_info_with_llm(self, text_content, prompt_template):
        """
        Uses an LLM to extract structured information from text.
        """
        print("Contacting LLM for information extraction...")
        try:
            response = openai.chat.completions.create(
                model=config.LLM_MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a data extraction expert. Your task is to read the provided text and extract information in a structured JSON format. Do not add any explanatory text outside of the JSON object."},
                    {"role": "user", "content": f"{prompt_template}\n\n---DOCUMENT TEXT---\n{text_content}"}
                ],
                response_format={"type": "json_object"}
            )
            extracted_json = json.loads(response.choices[0].message.content)
            print("Successfully extracted information from LLM.")
            return extracted_json
        except Exception as e:
            print(f"An error occurred during LLM API call: {e}")
            return None

    def enrich_graph(self, extracted_data):
        """
        Adds the extracted information as properties to nodes in the Neo4j graph.
        """
        print("Enriching the Neo4j graph with extracted data...")
        with self.driver.session() as session:
            # Enrich equipment with process parameters
            if 'equipment_parameters' in extracted_data:
                for item in extracted_data['equipment_parameters']:
                    # The user's graph uses 'name' as the identifier.
                    name = item.get('name')
                    params = item.get('parameters', {})
                    if name:
                        query = f"""
                        MATCH (n {{name: $name}})
                        SET n += $params
                        """
                        session.run(query, name=name, params=params)
                        print(f"Enriched component '{name}' with parameters.")

            # Add chemical data and link to vessels
            if 'chemical_data' in extracted_data:
                for chem in extracted_data['chemical_data']:
                    name = chem.get('chemical_name')
                    properties = chem.get('properties', {})
                    # The user's graph uses 'name' as the identifier.
                    vessel_name = chem.get('stored_in_vessel_name')
                    if name:
                        # Create or merge the chemical node
                        session.run("MERGE (c:Chemical {name: $name}) SET c += $props", name=name, props=properties)
                        print(f"Added/Updated chemical: {name}")
                        # Link chemical to the vessel it's stored in
                        if vessel_name:
                            session.run("""
                                MATCH (v {{name: $vessel_name}})
                                MATCH (c:Chemical {{name: $chem_name}})
                                MERGE (v)-[:CONTAINS]->(c)
                            """, vessel_name=vessel_name, chem_name=name)
                            print(f"Linked '{name}' to vessel '{vessel_name}'.")

def main():
    """Main execution function for Phase 2."""
    enricher = SemanticEnricher(config.NEO4J_URI, config.NEO4J_USERNAME, config.NEO4J_PASSWORD, config.LLM_API_KEY)

    # --- Process Description Enrichment ---
    process_desc_text = enricher.get_text_from_file(config.PROCESS_DESC_PATH)
    if process_desc_text:
        process_prompt = """
        From the process description text provided, extract the operating parameters for each piece of major equipment.
        Identify equipment by its name (e.g., 'V-001', 'P-203').
        For each piece of equipment, extract parameters like 'operatingPressure', 'operatingTemperature', and 'designPressure'.
        
        Format the output as a JSON object with a single key "equipment_parameters", which is a list of objects.
        Each object in the list should have a "name" and a "parameters" object.
        
        Example JSON format:
        {
          "equipment_parameters": [
            {
              "name": "V-002",
              "parameters": {
                "operatingPressure": "10 barg",
                "operatingTemperature": "150 C",
                "designPressure": "15 barg"
              }
            }
          ]
        }
        """
        extracted_process_data = enricher.extract_info_with_llm(process_desc_text, process_prompt)
        if extracted_process_data:
            enricher.enrich_graph(extracted_process_data)

    # --- MSDS Enrichment ---
    msds_text = enricher.get_text_from_file(config.MSDS_FILE_PATH)
    if msds_text:
        msds_prompt = """
        From the Material Safety Data Sheet (MSDS) text provided, extract key safety and physical properties for the chemical.
        Also, identify which vessel name it is primarily stored or used in, if mentioned.
        
        Format the output as a JSON object with a single key "chemical_data", which is a list of objects.
        Each object should have "chemical_name", "stored_in_vessel_name", and a "properties" object.
        
        Example JSON format:
        {
          "chemical_data": [
            {
              "chemical_name": "Methanol",
              "stored_in_vessel_name": "V-001",
              "properties": {
                "flashPoint": "11 C",
                "boilingPoint": "64.7 C",
                "healthHazard": "Toxic if swallowed or inhaled"
              }
            }
          ]
        }
        """
        extracted_msds_data = enricher.extract_info_with_llm(msds_text, msds_prompt)
        if extracted_msds_data:
            enricher.enrich_graph(extracted_msds_data)

    enricher.close()
    print("Phase 2: Semantic Enrichment complete.")

if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------

# 03_manual_noding.py
# This script allows the user to manually select HAZOP nodes.
# It queries all components from the graph, presents them to the user,
# and creates HAZOPNode entities based on the user's selection.

import config
from neo4j import GraphDatabase

class ManualNoder:
    """
    A class to facilitate manual selection of HAZOP nodes.
    """
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def get_all_components(self):
        """Fetches all potential components for HAZOP analysis from the graph."""
        print("Fetching all components from the graph...")
        with self.driver.session() as session:
            # This query fetches any node with a 'name' property.
            # This is based on the user-provided Cypher query.
            query = """
            MATCH (n) WHERE exists(n.name)
            RETURN n.name AS name, labels(n)[0] AS type
            ORDER BY type, name
            """
            result = session.run(query)
            return [record.data() for record in result]

    def select_nodes(self, components):
        """Presents components to the user and gets their selection."""
        if not components:
            print("No components found in the database. Please load your P&ID graph first.")
            return []

        print("\nPlease select the components to be treated as HAZOP Nodes:")
        for i, comp in enumerate(components):
            print(f"  [{i+1}] {comp['name']} (Type: {comp['type']})")
        
        while True:
            try:
                selection_str = input("\nEnter the numbers of the nodes to analyze, separated by commas (e.g., 1, 3, 5): ")
                selected_indices = [int(s.strip()) - 1 for s in selection_str.split(',')]
                
                selected_components = []
                for i in selected_indices:
                    if 0 <= i < len(components):
                        selected_components.append(components[i])
                    else:
                        print(f"Warning: Index {i+1} is out of range and will be ignored.")
                
                if selected_components:
                    print("\nYou have selected the following nodes for HAZOP analysis:")
                    for comp in selected_components:
                        print(f"  - {comp['name']}")
                    confirm = input("Is this correct? (yes/no): ").lower()
                    if confirm == 'yes':
                        return selected_components
                else:
                    print("No valid nodes selected. Please try again.")

            except ValueError:
                print("Invalid input. Please enter numbers separated by commas.")

    def create_hazop_nodes(self, selected_components):
        """Creates HAZOPNode entities in the graph for the selected components."""
        print("\nCreating HAZOPNode entities in the graph...")
        with self.driver.session() as session:
            # First, ensure no old HAZOP nodes exist to avoid confusion
            session.run("MATCH (n:HAZOPNode) DETACH DELETE n")
            print("Cleared any pre-existing HAZOP nodes.")

            for comp in selected_components:
                comp_name = comp['name']
                query = """
                MATCH (c {name: $comp_name})
                MERGE (n:HAZOPNode {nodeID: 'Node-' + $comp_name, description: 'Node for ' + labels(c)[0] + ' ' + $comp_name})
                MERGE (n)-[:ANALYZES]->(c)
                """
                session.run(query, comp_name=comp_name)
            print(f"Successfully created {len(selected_components)} HAZOPNode entities.")

def main():
    """Main execution function for Phase 3."""
    noder = ManualNoder(config.NEO4J_URI, config.NEO4J_USERNAME, config.NEO4J_PASSWORD)
    all_components = noder.get_all_components()
    selected = noder.select_nodes(all_components)
    if selected:
        noder.create_hazop_nodes(selected)
    else:
        print("No nodes were selected. Halting pipeline.")
        # Exit gracefully if no nodes are chosen
        exit()
    noder.close()
    print("Phase 3: Manual Noding complete.")

if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------

# 04_hazop_analysis_engine.py
# This is the core of the framework. It performs the LLM-powered HAZOP analysis.
# - Systematically generates deviations for each HAZOP node based on a CSV file.
# - Uses Retrieval-Augmented Generation (RAG) to assemble context.
# - Prompts the LLM for causal analysis.
# - Simulates confidence scoring.
# - Populates the results back into the Neo4j Knowledge Graph.

import config
from neo4j import GraphDatabase
import openai
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm

class DeviationGenerator:
    """Generates deviations based on the 'Parameter and Guideword.csv' file."""
    def __init__(self, csv_path):
        print("Initializing Deviation Generator...")
        self.param_guideword_map = self._load_param_guideword_map(csv_path)
        print("Deviation Generator initialized successfully.")

    def _load_param_guideword_map(self, csv_path):
        """
        Loads and parses the CSV to map parameters to applicable guidewords.
        This version is designed for a clean, matrix-style CSV.
        """
        try:
            df = pd.read_csv(csv_path)
            # The first column is the parameter, the rest are guidewords.
            param_col = df.columns[0]
            guideword_cols = df.columns[1:]
            
            df = df.set_index(param_col)
            
            param_map = {}
            for param, row in df.iterrows():
                # Clean up parameter name (remove trailing spaces, etc.)
                param_clean = str(param).strip()
                if param_clean:
                    # An applicable guideword is a column where the cell value contains '✔'
                    applicable_guidewords = [gw for gw in guideword_cols if pd.notna(row[gw]) and '✔' in str(row[gw])]
                    param_map[param_clean] = applicable_guidewords
            
            print(f"Loaded {len(param_map)} parameters from CSV.")
            return param_map
        except Exception as e:
            print(f"Error loading or parsing {csv_path}: {e}")
            return {}

    def generate_deviations_for_node(self, node_id):
        """Generates all valid deviations for a given node."""
        deviations = []
        for param, guidewords in self.param_guideword_map.items():
            for guideword in guidewords:
                deviation_desc = f"{guideword} {param}"
                deviation_id = f"{node_id}-{guideword.replace(' ','')}-{param.replace(' ','')}"
                deviations.append({
                    "deviationID": deviation_id,
                    "description": deviation_desc,
                    "guideword": guideword,
                    "parameter": param,
                    "nodeID": node_id
                })
        return deviations

class RAGContextBuilder:
    """Builds the context for the RAG prompt."""
    def __init__(self):
        print("Initializing RAG Context Builder...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.cause_df = pd.read_csv(config.CAUSES_CSV_PATH).dropna(how='all').stack().reset_index(drop=True).to_frame('description')
        self.consequence_df = pd.read_csv(config.CONSEQUENCES_CSV_PATH).dropna(how='all').stack().reset_index(drop=True).to_frame('description')
        self.safeguard_df = pd.read_csv(config.SAFEGUARDS_CSV_PATH).dropna(how='all').stack().reset_index(drop=True).to_frame('description')
        
        self.cause_embeddings = self.model.encode(self.cause_df['description'].tolist())
        self.consequence_embeddings = self.model.encode(self.consequence_df['description'].tolist())
        self.safeguard_embeddings = self.model.encode(self.safeguard_df['description'].tolist())

        self.cause_index = self._create_faiss_index(self.cause_embeddings)
        self.consequence_index = self._create_faiss_index(self.consequence_embeddings)
        self.safeguard_index = self._create_faiss_index(self.safeguard_embeddings)
        print("RAG Context Builder initialized successfully.")

    def _create_faiss_index(self, embeddings):
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(np.array(embeddings, dtype=np.float32))
        return index

    def search(self, query, k=3):
        """Performs semantic search to find relevant historical data."""
        query_embedding = self.model.encode([query])
        
        _, cause_indices = self.cause_index.search(np.array(query_embedding, dtype=np.float32), k)
        retrieved_causes = self.cause_df.iloc[cause_indices[0]]['description'].tolist()

        _, cons_indices = self.consequence_index.search(np.array(query_embedding, dtype=np.float32), k)
        retrieved_consequences = self.consequence_df.iloc[cons_indices[0]]['description'].tolist()

        _, safe_indices = self.safeguard_index.search(np.array(query_embedding, dtype=np.float32), k)
        retrieved_safeguards = self.safeguard_df.iloc[safe_indices[0]]['description'].tolist()

        return {
            "causes": retrieved_causes,
            "consequences": retrieved_consequences,
            "safeguards": retrieved_safeguards
        }

class AnalysisEngine:
    """The main engine for performing the HAZOP analysis."""
    def __init__(self, uri, user, password, api_key):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        openai.api_key = api_key
        self.rag_builder = RAGContextBuilder()
        self.deviation_generator = DeviationGenerator(config.PARAMETER_GUIDEWORD_CSV_PATH)

    def close(self):
        self.driver.close()

    def get_hazop_nodes(self):
        """Fetches all identified HAZOP nodes from the graph."""
        with self.driver.session() as session:
            result = session.run("MATCH (n:HAZOPNode) RETURN n.nodeID AS nodeID, n.description AS description")
            return [record.data() for record in result]

    def get_graph_context(self, node_id):
        """Retrieves the local topology for a given HAZOP node."""
        with self.driver.session() as session:
            # This query is adapted for the user's schema
            query = """
            MATCH (n:HAZOPNode {nodeID: $node_id})-[:ANALYZES]->(comp)
            OPTIONAL MATCH path = (comp)-[:Connected_to*1..2]-(neighbor)
            WITH n, comp, collect(DISTINCT {
                name: neighbor.name, 
                type: labels(neighbor)[0], 
                opPressure: neighbor.operatingPressure
            }) AS neighbors
            RETURN 'Component ' + comp.name + ' (' + labels(comp)[0] + ') is part of ' + n.description + '. ' +
                   'It is connected to: ' + apoc.convert.toString(neighbors) AS context
            """
            result = session.run(query, node_id=node_id)
            record = result.single()
            return record['context'] if record else "No specific graph context found."

    def perform_analysis(self, deviation):
        """Performs the full RAG and LLM analysis for a single deviation."""
        graph_context = self.get_graph_context(deviation['nodeID'])
        rag_context = self.rag_builder.search(f"{deviation['description']} in {deviation['nodeID']}")

        prompt = f"""
        You are a world-class Process Safety Engineer conducting a HAZOP study.
        Your task is to analyze the following process deviation.
        
        **HAZOP Node:** {deviation['nodeID']}
        **Deviation to Analyze:** {deviation['description']}
        
        **Context from P&ID and Process KG:**
        {graph_context}
        
        **Context from Historical HAZOP Data (Reference Files):**
        - Similar Causes Found: {rag_context['causes']}
        - Similar Consequences Found: {rag_context['consequences']}
        - Similar Safeguards Found: {rag_context['safeguards']}

        **Your Task:**
        1. Identify all credible **Causes** for this deviation.
        2. Identify all potential **Consequences**.
        3. Identify all existing **Safeguards**.
        4. **Critical Instruction:** For every item, provide a `source` tag.
           - If from context, cite the source (e.g., `source: "Causes.csv"`, `source: "P&ID"`).
           - If you generate a novel item based on your engineering reasoning, label it `source: "LLM-Inferred"`.
        5. Format your complete output as a single, valid JSON object with three keys: "causes", "consequences", and "safeguards". 
           Each key should contain a list of objects, where each object has "description" and "source" fields.
        """

        try:
            response = openai.chat.completions.create(
                model=config.LLM_MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a process safety expert. Output only valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            analysis_result = json.loads(response.choices[0].message.content)
            
            for category in analysis_result:
                for item in analysis_result[category]:
                    item['confidenceLevel'] = np.random.uniform(0.6, 1.0)
            
            return analysis_result
        except Exception as e:
            print(f"Error during LLM analysis for {deviation['deviationID']}: {e}")
            return None

    def populate_graph_with_analysis(self, deviation, analysis_result):
        """Writes the analysis results back to the Neo4j graph."""
        with self.driver.session() as session:
            session.run("""
                MATCH (n:HAZOPNode {nodeID: $nodeID})
                MERGE (d:Deviation {deviationID: $devID})
                ON CREATE SET d.description = $desc, d.guideword = $gw, d.parameter = $p
                MERGE (n)-[:HAS_DEVIATION]->(d)
            """, nodeID=deviation['nodeID'], devID=deviation['deviationID'], desc=deviation['description'],
                 gw=deviation['guideword'], p=deviation['parameter'])

            for category in ['causes', 'consequences', 'safeguards']:
                node_label = category.capitalize()[:-1]
                for item in analysis_result.get(category, []):
                    query = f"""
                    MATCH (d:Deviation {{deviationID: $devID}})
                    MERGE (item:{node_label} {{description: $desc}})
                    ON CREATE SET item.source = $source, item.confidenceLevel = $conf, item.validationStatus = 'Unreviewed'
                    """
                    if node_label == 'Cause':
                        query += " MERGE (item)-[:CAUSES]->(d)"
                    elif node_label == 'Consequence':
                        query += " MERGE (d)-[:LEADS_TO]->(item)"
                    elif node_label == 'Safeguard':
                        query += " MERGE (item)-[:PROTECTS_AGAINST]->(d)"
                    
                    session.run(query, devID=deviation['deviationID'], **item)

def main():
    """Main execution function for Phase 4."""
    engine = AnalysisEngine(config.NEO4J_URI, config.NEO4J_USERNAME, config.NEO4J_PASSWORD, config.LLM_API_KEY)
    
    hazop_nodes = engine.get_hazop_nodes()
    if not hazop_nodes:
        print("No HAZOP nodes found. Please run the manual noding script (03) first.")
        return

    print(f"Found {len(hazop_nodes)} HAZOP nodes to analyze.")

    for node in tqdm(hazop_nodes, desc="Analyzing HAZOP Nodes"):
        deviations = engine.deviation_generator.generate_deviations_for_node(node['nodeID'])
        for deviation in tqdm(deviations, desc=f"Deviations for {node['nodeID']}", leave=False):
            analysis = engine.perform_analysis(deviation)
            if analysis:
                engine.populate_graph_with_analysis(deviation, analysis)

    engine.close()
    print("Phase 4: HAZOP Analysis Engine complete.")

if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------

# 05_report_generator.py
# This script generates the final, traditional Excel HAZOP report.
# It queries the populated HAZOP KG, flattens the data using pandas,
# and writes it to a formatted Excel file using xlsxwriter.

import config
from neo4j import GraphDatabase
import pandas as pd

class ReportGenerator:
    """
    Generates a formatted Excel report from the HAZOP KG.
    """
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def fetch_hazop_data(self):
        """
        Fetches and flattens the HAZOP data from the graph using a Cypher query.
        """
        print("Fetching HAZOP data from Knowledge Graph...")
        with self.driver.session() as session:
            query = """
            MATCH (n:HAZOPNode)-[:HAS_DEVIATION]->(d:Deviation)
            OPTIONAL MATCH (c:Cause)-[:CAUSES]->(d)
            OPTIONAL MATCH (d)-[:LEADS_TO]->(cons:Consequence)
            OPTIONAL MATCH (s:Safeguard)-[:PROTECTS_AGAINST]->(d)
            
            WITH n, d,
                 collect(DISTINCT {desc: c.description, conf: c.confidenceLevel, source: c.source}) AS causes,
                 collect(DISTINCT {desc: cons.description, conf: cons.confidenceLevel, source: cons.source}) AS consequences,
                 collect(DISTINCT {desc: s.description, conf: s.confidenceLevel, source: s.source}) AS safeguards
            
            RETURN {
                node: n.description,
                guideword: d.guideword,
                parameter: d.parameter,
                deviation: d.description,
                causes: [c IN causes WHERE c.desc IS NOT NULL],
                consequences: [co IN consequences WHERE co.desc IS NOT NULL],
                safeguards: [sfg IN safeguards WHERE sfg.desc IS NOT NULL]
            } AS result
            ORDER BY n.nodeID, d.parameter, d.guideword
            """
            results = session.run(query)
            return [record['result'] for record in results]

    def generate_excel_report(self, hazop_data, output_path):
        """
        Processes the fetched data and writes it to a formatted Excel file.
        """
        if not hazop_data:
            print("No HAZOP data found in the graph to generate a report.")
            return

        print(f"Generating Excel report at {output_path}...")
        
        flat_data = []
        for item in hazop_data:
            def format_list(items_list):
                return "\n".join([f"- {i['desc']} (Conf: {i.get('confidenceLevel', 0):.2f}, Src: {i.get('source', 'N/A')})" for i in items_list])

            flat_data.append({
                "Node": item['node'],
                "Guideword": item['guideword'],
                "Parameter": item['parameter'],
                "Deviation": item['deviation'],
                "Causes": format_list(item['causes']),
                "Consequences": format_list(item['consequences']),
                "Safeguards": format_list(item['safeguards'])
            })

        df = pd.DataFrame(flat_data)

        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name='HAZOP Worksheet', index=False)
            workbook = writer.book
            worksheet = writer.sheets['HAZOP Worksheet']

            header_format = workbook.add_format({'bold': True, 'text_wrap': True, 'valign': 'top', 'fg_color': '#D7E4BC', 'border': 1})
            cell_format = workbook.add_format({'text_wrap': True, 'valign': 'top'})
            low_confidence_format = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})

            for col_num, value in enumerate(df.columns.values):
                worksheet.write(0, col_num, value, header_format)

            worksheet.set_column('A:A', 40, cell_format)
            worksheet.set_column('B:D', 15, cell_format)
            worksheet.set_column('E:G', 50, cell_format)
            worksheet.freeze_panes(1, 0)
            
            worksheet.conditional_format(f'E2:G{len(df)+1}', {
                'type': 'text',
                'criteria': 'containing',
                'value': '(Conf: 0.',
                'format': low_confidence_format
            })

        print("Excel report generated successfully.")

def main():
    """Main execution function for Phase 5."""
    reporter = ReportGenerator(config.NEO4J_URI, config.NEO4J_USERNAME, config.NEO4J_PASSWORD)
    hazop_data = reporter.fetch_hazop_data()
    reporter.generate_excel_report(hazop_data, config.EXCEL_REPORT_PATH)
    reporter.close()
    print("Phase 5: Report Generation complete.")

if __name__ == "__main__":
    main()
