# Data
First, let's check out the directories that hold the data.

In the case of the Plant Breakdown Structure use case, we are working with data from 2 sites: Bukom and Moerdijk.

The data for the second use case is located in the `sigraph` directory.

In [0]:
%ls -lt /dbfs/mnt/bclearer/temp/neo4j

total 16
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 moerdijk/
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 recordings/
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 sigraph/
drwxrwxrwx 2 nobody nogroup 4096 Aug 18 14:29 bukom/


For Bukom we have gone through 4 iterations, adding more complexity to the models with each iteration.

In [0]:
%ls -lt /dbfs/mnt/bclearer/temp/neo4j/bukom/

total 16
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 model_iteration_3/
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 model_iteration_4/
drwxrwxrwx 2 nobody nogroup 4096 Sep 18 12:21 model_iteration_2/
drwxrwxrwx 2 nobody nogroup 4096 Sep 18 12:21 model_iteration_1/


Let's have a look at the structure of one of these directories:

In [0]:
%ls -lt /dbfs/mnt/bclearer/temp/neo4j/bukom/model_iteration_4/

total 8
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 bclearer_edges/
drwxrwxrwx 2 nobody nogroup 4096 Feb  8 12:26 bclearer_nodes/



# Global variables

In [0]:
# Reference Cypher for a full-text search index. It is kept as an inert string
# literal: nothing in this cell sends it to Neo4j.
"""
CREATE FULLTEXT INDEX search_term IF NOT EXISTS FOR (n:Entity|Site|Plant|ProcessUnit|Tag|Equipment) ON EACH [n.name, n.description]
"""

# --- Neo4j host selection -------------------------------------------------
IP_ADDRESS_RAFAEL_PRIVATE = "10.1.0.5"
IP_ADDRESS_RAFAEL = "20.76.138.131"
# IP_ADDRESS_MANGI = "20.61.80.250"

# The address actually used to build the connection URLs below.
IP_ADDRESS = IP_ADDRESS_RAFAEL_PRIVATE

# --- Dataset selection ----------------------------------------------------
ITERATION = 3

# Iterations 1-4 are the Bukom model iterations; any other value selects the
# mansoors_folder_neo4j_output dataset, which is the only one that needs the
# issue_type handling.
use_issue_type = ITERATION not in (1, 2, 3, 4)

# SUB_DIR is the prefix of the "<SUB_DIR>_nodes" / "<SUB_DIR>_edges" folders.
# Options are: bclearer_neo4j (iters 1,2,3), bclearer (iter 4) or all (for
# mansoors_folder_neo4j_output).
if use_issue_type:
    SUB_DIR = "all"
    DATA_CORE_PATH = "/mnt/bclearer/temp/mansoors_folder_neo4j_output/"
    DATABASE = "pbs-neo4j-output"
else:
    SUB_DIR = "bclearer" if ITERATION == 4 else "bclearer_neo4j"
    DATA_CORE_PATH = f"/mnt/bclearer/temp/neo4j/bukom/model_iteration_{ITERATION}/"
    DATABASE = f"pbs-iter{ITERATION}"

# --- Connection settings --------------------------------------------------
# NOTE(review): credentials are hard-coded in the notebook; consider moving
# them to a secret store.
DATABASE_USERNAME = "neo4j"
DATABASE_PASSWORD = "bclearer"

DATABASE_CONNECTION_URL = f"bolt://{IP_ADDRESS}:7687"
NEO4J_WEBAPP = f"http://{IP_ADDRESS}:7474/"

# Show the browser URL so it can be opened from the cell output.
print(NEO4J_WEBAPP)

http://10.1.0.5:7474/


In [0]:
%ls /dbfs/mnt/bclearer/temp/neo4j/bukom/model_iteration_1/bclearer_neo4j_nodes

[0m[34;42m_delta_log[0m/
[01;32mpart-00000-560da62d-fb98-4741-ab2d-47cba2d40d2e-c000.snappy.parquet[0m*
[01;32mpart-00001-0ebdd305-9007-4cf3-854d-eefd40f9c13f-c000.snappy.parquet[0m*
[01;32mpart-00002-fda7b3a0-879e-4006-b859-6d13243d4bb3-c000.snappy.parquet[0m*
[01;32mpart-00003-445bc8c1-2e39-41e2-9c75-0a78f55d4e5a-c000.snappy.parquet[0m*
[01;32mpart-00004-ac02cb28-42a6-44b9-929a-a971eac4f70e-c000.snappy.parquet[0m*
[01;32mpart-00005-78d8c9f2-f2de-4e03-b216-fa12040b2a63-c000.snappy.parquet[0m*
[01;32mpart-00006-0bfbaf40-7ce5-4172-9a35-f14198bde1ab-c000.snappy.parquet[0m*
[01;32mpart-00007-1fac24fe-58b8-44a4-ae46-9993e05abc38-c000.snappy.parquet[0m*
[01;32mpart-00008-7054d47e-c7a8-4590-9b32-6ab01320aa82-c000.snappy.parquet[0m*


In [0]:
%ls /dbfs/mnt/bclearer/temp/mansoors_folder_neo4j_output/all_nodes/

[0m[34;42m_delta_log[0m/
[01;32mpart-00000-06749869-923b-4057-b44a-4809f0e22e58-c000.snappy.parquet[0m*
[01;32mpart-00001-b7857ff3-81b3-46ce-88bd-a4aa4f31296d-c000.snappy.parquet[0m*
[01;32mpart-00002-0c1eb72f-b4ea-4d96-9714-47fbdabd3a48-c000.snappy.parquet[0m*
[01;32mpart-00003-0e67a4b6-5381-424b-ab47-ad1861c73280-c000.snappy.parquet[0m*
[01;32mpart-00004-eaaa07c9-af03-40e2-a770-35a9346ec98c-c000.snappy.parquet[0m*
[01;32mpart-00005-5049ba97-67db-41c2-a56b-e92ffa9b7841-c000.snappy.parquet[0m*
[01;32mpart-00006-452f2da5-70de-4ca2-b009-8dbc5c050683-c000.snappy.parquet[0m*
[01;32mpart-00007-2ef8c154-2c92-4ac4-84b2-dd8385f3f034-c000.snappy.parquet[0m*
[01;32mpart-00008-982886ba-041c-4ea3-92fb-abbba124094e-c000.snappy.parquet[0m*
[01;32mpart-00009-03a054b9-6432-401e-aaa5-dcc06335bdc8-c000.snappy.parquet[0m*
[01;32mpart-00010-4f0e1de6-b728-4be3-8dde-7c7a305988e4-c000.snappy.parquet[0m*



# Imports

In [0]:
from neo4j import GraphDatabase
from pyspark.sql import DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


# DB management functions

In [0]:
# (username, password) pair passed as ``auth=`` to every GraphDatabase.driver
# call in the functions below.
AUTH = (
    DATABASE_USERNAME,
    DATABASE_PASSWORD,
)


def delete_database(db_name: str) -> None:
    """Drop the Neo4j database ``db_name`` if it exists.

    A short-lived driver is opened against ``DATABASE_CONNECTION_URL`` with
    the ``AUTH`` credentials and closed again when the ``with`` block exits.

    Args:
        db_name: Name of the database to drop. It is passed as the ``$db``
            query parameter, not interpolated into the Cypher text.
    """
    with GraphDatabase.driver(DATABASE_CONNECTION_URL, auth=AUTH) as driver:
        driver.execute_query(
            "DROP DATABASE $db IF EXISTS",
            db=db_name,
            # DROP DATABASE is an administration command; target the system
            # database explicitly instead of relying on the session default.
            database_="system",
        )


def create_database(db_name: str) -> None:
    """Create the Neo4j database ``db_name`` unless it already exists.

    A short-lived driver is opened against ``DATABASE_CONNECTION_URL`` with
    the ``AUTH`` credentials and closed again when the ``with`` block exits.

    Args:
        db_name: Name of the database to create. It is passed as the ``$db``
            query parameter, not interpolated into the Cypher text.
    """
    with GraphDatabase.driver(DATABASE_CONNECTION_URL, auth=AUTH) as driver:
        driver.execute_query(
            "CREATE DATABASE $db IF NOT EXISTS",
            db=db_name,
            # CREATE DATABASE is an administration command; target the system
            # database explicitly instead of relying on the session default.
            database_="system",
        )


# Empties a database without dropping it
def delete_nodes_edges(db_name: str):
    """Delete every node in ``db_name`` together with its relationships.

    Args:
        db_name: Database the ``MATCH (n) DETACH DELETE n`` query runs against.
    """
    wipe_query = """
            MATCH (n)
            DETACH DELETE n
            """
    neo4j_driver = GraphDatabase.driver(DATABASE_CONNECTION_URL, auth=AUTH)
    # The driver is a context manager; leaving the block closes it.
    with neo4j_driver as driver:
        driver.execute_query(wipe_query, database_=db_name)


# Load dataset

In [0]:
def load_dataframe(
    path: "str",
) -> "DataFrame":
    """Read the Delta table stored at ``path``.

    Uses the global ``spark`` session, which is not defined in this file
    (presumably supplied by the notebook runtime).

    Args:
        path: Location of the Delta table to load.

    Returns:
        The result of ``spark.read.format("delta").load(path)``.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.load(path)

In [0]:
# Both tables live next to each other under DATA_CORE_PATH and share the
# SUB_DIR prefix: "<SUB_DIR>_nodes" and "<SUB_DIR>_edges".
nodes_dataframe_input = load_dataframe(f"{DATA_CORE_PATH}/{SUB_DIR}_nodes")

edges_dataframe = load_dataframe(f"{DATA_CORE_PATH}/{SUB_DIR}_edges")

In [0]:
if not use_issue_type:
    # No issue_type handling: every node gets the "bclearer" asset name.
    nodes_dataframe = nodes_dataframe_input.withColumn("asset_name", lit("bclearer"))

else:
    # Rank rows per source_primary_key_hash, highest asset_name first, so that
    # exactly one issue_type row per hash can be kept.
    window_spec = Window.partitionBy("source_primary_key_hash").orderBy(
        desc("asset_name"),
    )

    issue_type_rows = nodes_dataframe_input.where(col("type") == "issue_type")
    ranked_issue_types = issue_type_rows.withColumn(
        "row_number",
        row_number().over(window_spec),
    )
    first_per_hash = ranked_issue_types.where(col("row_number") == 1).drop(
        "row_number",
    )

    # The surviving issue_type rows are re-labelled with the "bclearer" asset
    # name; all other rows keep the asset_name they were loaded with.
    nodes_dataframe_issue_types = first_per_hash.withColumn(
        "asset_name",
        lit("bclearer"),
    )

    other_rows = nodes_dataframe_input.where(col("type") != "issue_type")
    nodes_dataframe = other_rows.union(nodes_dataframe_issue_types)


# Reset data in Neo4j

In [0]:
# Start from a clean slate: drop the target database (if it exists) and then
# recreate it, in that order, before any nodes or edges are written.
delete_database(DATABASE)
create_database(DATABASE)


# Save nodes to Neo4j

In [0]:
def create_node_of_type(dataframe, entity_type_input, asset_name):
    """Write the nodes of one type and asset name to Neo4j.

    Rows are selected where ``type == entity_type_input`` and
    ``asset_name == asset_name``, then written through the
    ``org.neo4j.spark.DataSource`` format in ``overwrite`` mode, keyed on
    ``source_primary_key_hash``.

    Each node receives three labels: ``Entity``, the title-cased type
    (``str.title()``, so ``"process_unit"`` becomes ``"Process_Unit"``) and
    the asset name.

    Args:
        dataframe: Nodes DataFrame with at least ``type``, ``asset_name`` and
            ``source_primary_key_hash`` columns.
        entity_type_input: Value of the ``type`` column to write.
        asset_name: Value of the ``asset_name`` column to write.
    """
    print(f"creating entity {entity_type_input}, {asset_name}")

    cypher_variable = entity_type_input.lower()
    type_label = entity_type_input.title()

    # Schema statements handed to the connector through its "script" option:
    # a uniqueness constraint for this type and a shared index on :Entity.
    script = f"""
CREATE CONSTRAINT IF NOT EXISTS FOR ({cypher_variable}:{type_label}) REQUIRE {cypher_variable}.source_primary_key_hash IS UNIQUE;
CREATE INDEX IF NOT EXISTS FOR (entity:Entity) ON entity.source_primary_key_hash;"""

    matches_type = col("type") == entity_type_input
    matches_asset = col("asset_name") == asset_name
    selected_nodes = dataframe.where(matches_type).where(matches_asset)

    connector_options = {
        "url": DATABASE_CONNECTION_URL,
        "authentication.basic.username": DATABASE_USERNAME,
        "authentication.basic.password": DATABASE_PASSWORD,
        "database": DATABASE,
        "labels": f"Entity:{type_label}:{asset_name}",
        "node.keys": "source_primary_key_hash",
        "script": script,
    }

    writer = selected_nodes.write.format("org.neo4j.spark.DataSource").mode(
        "overwrite",
    )
    for option_name, option_value in connector_options.items():
        writer = writer.option(option_name, option_value)
    writer.save()

    print(f"created entity {entity_type_input}, {asset_name}")

In [0]:
# One connector write per distinct (type, asset_name) combination.
type_asset_pairs = nodes_dataframe.select("type", "asset_name").distinct().collect()
for pair in type_asset_pairs:
    create_node_of_type(nodes_dataframe, pair["type"], pair["asset_name"])

creating entity plant, bclearer
created entity plant, bclearer
creating entity process_unit, bclearer
created entity process_unit, bclearer
creating entity tag, bclearer
created entity tag, bclearer
creating entity site, bclearer
created entity site, bclearer
creating entity issues, bclearer
created entity issues, bclearer
creating entity issue_type, bclearer
created entity issue_type, bclearer
creating entity refinery, bclearer
created entity refinery, bclearer



# Save edges to Neo4j

In [0]:
def create_edges_with_relation_type(dataframe, relation_type):
    """Write all edges of one relation type to Neo4j.

    Rows where ``relation_type`` equals the given value are sent through the
    ``org.neo4j.spark.DataSource`` format using a custom Cypher query. For
    each row the query matches the two ``:Entity`` nodes whose
    ``source_primary_key_hash`` equals the row's ``source`` and
    ``destination`` values, and MERGEs a relationship of the given type from
    source to destination.

    Args:
        dataframe: Edges DataFrame with ``relation_type``, ``source`` and
            ``destination`` columns.
        relation_type: Value of the ``relation_type`` column to write; it is
            also used as the Neo4j relationship type.
    """
    print(f"creating relation_type {relation_type}")

    # A relationship type cannot be a query parameter, so it has to be placed
    # in the Cypher text. Backtick-quote it (doubling any embedded backtick)
    # so that values containing spaces, hyphens or other non-identifier
    # characters still produce valid Cypher.
    escaped_relation_type = str(relation_type).replace("`", "``")
    query = (
        "MATCH (source:Entity {source_primary_key_hash: event.source}), "
        "(destination:Entity {source_primary_key_hash: event.destination})"
        f" MERGE (source)-[:`{escaped_relation_type}`]->(destination)"
    )

    dataframe = dataframe.where(col("relation_type") == relation_type)
    (
        # Collapse to a single partition so the write is issued from one task.
        dataframe.repartition(1)
        .write.format("org.neo4j.spark.DataSource")
        .mode("append")
        .option("url", DATABASE_CONNECTION_URL)
        .option("authentication.basic.username", DATABASE_USERNAME)
        .option("authentication.basic.password", DATABASE_PASSWORD)
        .option("database", DATABASE)
        .option("query", query)
        .save()
    )

    print(f"created relation_type {relation_type}")

In [0]:
# One connector write per distinct relation type found in the edges table.
distinct_relation_types = edges_dataframe.select("relation_type").distinct().collect()
for record in distinct_relation_types:
    create_edges_with_relation_type(edges_dataframe, record["relation_type"])

creating relation_type SAME_AS
