# Setup and Install Required Libraries  

In [None]:
%pip install rdflib owlready2 spacy transformers torch neo4j pandas markdown beautifulsoup4

# Download spaCy language model
%python -m spacy download en_core_web_sm


# Section 2: Load OntoCAPE ontology 


In [None]:
'''from owlready2 import get_ontology

onto = get_ontology("ontology\OntoCAPE\OntoCAPE.owl").load()

# Extract ontology class names and object properties
onto_classes = list(onto.classes())
onto_labels = [cls.name for cls in onto_classes]
onto_obj_props = list(onto.object_properties())

print("Example OntoCAPE Classes:", onto_labels[:10])
print("Example OntoCAPE Object Properties:", [p.name for p in onto_obj_props[:10]])
'''


from rdflib import Graph, RDF, RDFS, OWL
from urllib.parse import urlparse
def extract_name(uri):
    return uri.split('#')[-1] if '#' in uri else uri.split('/')[-1]
    
g = Graph()
g.parse('ontology\OntoCAPE\OntoCAPE.owl') # Try 'xml' for RDF/XML


owl_classes = set(g.subjects(RDF.type, OWL.Class))
rdfs_classes = set(g.subjects(RDF.type, RDFS.Class))
all_classes = owl_classes.union(rdfs_classes)
class_labels = [extract_name(str(c)) for c in all_classes]
print(f'Total classes found: {len(class_labels)}')
print('Example class names:', class_labels[:10])
obj_props = set(g.subjects(RDF.type, OWL.ObjectProperty))
obj_labels = [extract_name(str(p)) for p in obj_props]
print(f'Total object properties found: {len(obj_labels)}')
print('Example property names:', obj_labels[:10])

Total classes found: 783
Example class names: ['N2491f9a091d646bc9bbc050271567271', 'ModelVariableSpecification', 'N17426ab650f745b390869d0b29163c5b', 'Exchange', 'Nc610ab085aa4431fb88992ebf13af6df', 'N3175ebab14504755933b55b25acc0e24', 'IntensiveThermodynamicStateVariable', 'Nec5bae67fa8b42239cf70136cd4443a2', 'SecondLevelSubsystem', 'PhaseInterfaceProperty']
Total object properties found: 168
Example property names: ['isDefinedBy', 'fulfills', 'isIndexOf', 'has_length', 'isInfluencedBy', 'indicatesMultiplicityOf', 'isPropertyOf', 'hasDirectSubsystem', 'hasFunctionalAspect', 'hasReaction']


# Section 3: Load and Parse Process Description 

In [3]:
import markdown
from bs4 import BeautifulSoup

# Load .md file
with open("descriptions\HAZOP_011_process_description.md", "r") as f:
    md_text = f.read()

html = markdown.markdown(md_text)
text = BeautifulSoup(html, "html.parser").get_text()

print(text[:500])



Crude Oil Production Unit (COPU) Process Description
Overview
The crude oil production unit is designed to process, separate, and condition crude oil from field production, separating associated gas and produced water, and preparing the oil for storage and export. The unit has a nominal capacity of 10 Mbpd (million barrels per day) and handles crude oil with an API gravity of 33°.
Main Process Sections
1. Incoming Crude Oil and Initial Separation

Field Production Feed: 
Crude oil from wells arr


# Section 4: Extract Entities and Relations from Text
We assume a basic rule-based or LLM-based approach here.

## Using keyword based rules to extract entities

In [32]:
import re

# Example: rule-based extraction
equipment_keywords = ["reactor", "pump", "heat exchanger", "distillation", "compressor", "column"]
connections = []

for line in text.split("\n"):
    for kw in equipment_keywords:
        if kw in line.lower():
            print(f"Found equipment: {kw} in line -> {line}")

# You can also extract relations manually or use spaCy/transformers


Found equipment: pump in line -> The treated oil is stored in tanks before being pumped to the custody transfer point for export[1].
Found equipment: pump in line -> The treated water is pumped and re-injected into a disposal well[1].


## Using Spacy/transformers 

In [4]:
import spacy
import re
from transformers import pipeline

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load transformer for relation extraction
relation_extractor = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Custom equipment types (can be extended)
equipment_types = [
    "reactor", "heat exchanger", "pump", "distillation column",
    "compressor", "valve", "vessel", "tank"
]

# Entity & relation extraction
doc = nlp(text)
entities = set()
relations = []

for sent in doc.sents:
    s_text = sent.text.strip()
    for eq1 in equipment_types:
        for eq2 in equipment_types:
            if eq1 != eq2 and eq1 in s_text.lower() and eq2 in s_text.lower():
                # Use zero-shot to extract relation type
                candidate_labels = ["feeds", "cools", "heats", "connects to", "transfers to"]
                result = relation_extractor(s_text, candidate_labels)
                rel = result['labels'][0]
                relations.append((eq1.title(), rel.replace(" ", "_").upper(), eq2.title()))
                entities.add(eq1.title())
                entities.add(eq2.title())

print("Entities:", entities)
print("Relations:", relations)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


Entities: {'Tank', 'Pump'}
Relations: [('Pump', 'TRANSFERS_TO', 'Tank'), ('Tank', 'TRANSFERS_TO', 'Pump')]


# Section 5: Map to OntoCAPE Ontology Classes

In [5]:
# Map each entity to an OntoCAPE class
equipment_to_ontocape = {
    "Reactor": "ontoCAPE.ChemicalReactor",
    "Heat Exchanger": "ontoCAPE.HeatExchanger",
    "Pump": "ontoCAPE.Pump",
    "Distillation Column": "ontoCAPE.DistillationColumn",
    "Compressor": "ontoCAPE.Compressor",
    "Valve": "ontoCAPE.Valve",
    "Vessel": "ontoCAPE.Vessel",
    "Tank": "ontoCAPE.Tank"
}

nodes = [(ent, equipment_to_ontocape.get(ent, "ontoCAPE.Equipment")) for ent in entities]


# Section 6: Build Graph Model (Nodes + Relationships)

In [6]:
def to_cypher_node(name, label):
    return f"MERGE (:{label.split('.')[-1]} {{name: '{name}'}})"

def to_cypher_relation(src, rel, tgt):
    return f"MATCH (a {{name: '{src}'}}), (b {{name: '{tgt}'}}) MERGE (a)-[:{rel}]->(b)"

cypher_nodes = [to_cypher_node(n, l) for n, l in nodes]
cypher_edges = [to_cypher_relation(s, r, t) for s, r, t in relations]


# Section 7: Generate Cypher Code

In [8]:
def to_cypher_node(name, label):
    return f"MERGE (:{label} {{name: '{name}'}})"

def to_cypher_relation(src, rel, tgt):
    return f"MATCH (a {{name: '{src}'}}), (b {{name: '{tgt}'}}) MERGE (a)-[:{rel.upper()}]->(b)"

cypher_nodes = [to_cypher_node(n, l.split(".")[-1]) for n, l in nodes]
cypher_edges = [to_cypher_relation(s, r, t) for s, r, t in relations]

# Print Cypher code
for line in cypher_nodes + cypher_edges:
    print(line)

'''
# Save generated Cypher code
with open("generated_kg.cypher", "w") as f:
    f.write("// Nodes\n")
    f.write("\n".join(cypher_nodes))
    f.write("\n\n// Relationships\n")
    f.write("\n".join(cypher_edges))

print("Cypher file 'generated_kg.cypher' saved successfully.")
'''

MERGE (:Tank {name: 'Tank'})
MERGE (:Pump {name: 'Pump'})
MATCH (a {name: 'Pump'}), (b {name: 'Tank'}) MERGE (a)-[:TRANSFERS_TO]->(b)
MATCH (a {name: 'Tank'}), (b {name: 'Pump'}) MERGE (a)-[:TRANSFERS_TO]->(b)


'\n# Save generated Cypher code\nwith open("generated_kg.cypher", "w") as f:\n    f.write("// Nodes\n")\n    f.write("\n".join(cypher_nodes))\n    f.write("\n\n// Relationships\n")\n    f.write("\n".join(cypher_edges))\n\nprint("Cypher file \'generated_kg.cypher\' saved successfully.")\n'