# Transforming the Meta-DataFrame into a Neo4j Event Log Class Diagram Graph
The following script demonstrates how the Meta-DataFrame can be modelled in Neo4j as an Event Log Class Diagram Graph of a given event log.

In [18]:
# import the libraries
import json
import pandas as pd
import numpy
import re #regular expressions for working with text data, especially extracting information from a piece of text

In [19]:
# Rename the Meta-DataFrame titles to circumvent certain Neo4j whitespace issues.
# index_to_neo4j pairs each Meta-DataFrame title (key) with the property name
# intended for the Neo4j graph (value).
index_to_neo4j = dict(
    Title='title',
    Data_type='data_type',
    Number_of_entries='num_entries',
    Number_of_unique_entries='num_unique_entries',
    Number_of_duplicate_entries='num_duplicate_entries',
    Number_of_undefined_entries='num_undefined_entries',
    Percentage_of_undefined_entries='percentage_undefined_entries',
)
index_to_neo4j

{'Title': 'title',
 'Data_type': 'data_type',
 'Number_of_entries': 'num_entries',
 'Number_of_unique_entries': 'num_unique_entries',
 'Number_of_duplicate_entries': 'num_duplicate_entries',
 'Number_of_undefined_entries': 'num_undefined_entries',
 'Percentage_of_undefined_entries': 'percentage_undefined_entries'}

In [20]:
# Name of the event log whose Meta-DataFrame is processed; change accordingly for each event log.
event_log_name = 'BPIC_Synthetic_Split_2'

# Print the Cypher statement that clears all nodes carrying the event log label
# (together with their relationships), followed by an empty line.
print(
  '// Delete all nodes and their relationships:',
  f'MATCH (node:{event_log_name}) DETACH DELETE node;',
  '',
  sep='\n',
)

// Delete all nodes and their relationships:
MATCH (node:BPIC_Synthetic_Split_2) DETACH DELETE node;



In [21]:
# Load the Meta-DataFrame CSV that belongs to event_log_name, using its
# 'Property' column as the index, and switch rows and columns in the same step.
overview_path = f'analysis/overview_{event_log_name}.csv'
overview_table = pd.read_csv(overview_path, index_col='Property').transpose()
overview_table

Property,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Document Type,case Sub spend area text,case Purch. Doc. Category name,case Vendor,case Item Type,...,case Spend classification text,case Source,case Name,case GR-Based Inv. Verif.,case Item,case concept:name,case Goods Receipt,event User,event org:resource,event Cumulative net worth (EUR)
Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,...,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining,Supplier2_Comparative_Mining
Title,Case ID,Timestamp,Activity,eventID,case Spend area text,case Document Type,case Sub spend area text,case Purch. Doc. Category name,case Vendor,case Item Type,...,case Spend classification text,case Source,case Name,case GR-Based Inv. Verif.,case Item,case concept:name,case Goods Receipt,event User,event org:resource,event Cumulative net worth (EUR)
Importance,Mandatory attribute,Mandatory attribute,Mandatory attribute,Mandatory attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,...,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute
Level,Case level,Event level,Event level,Event level,Case level,Case level,Case level,Case level,Case level,Case level,...,Case level,Case level,Case level,Case level,Case level,Case level,Case level,Event level,Event level,Event level
Data type,string,datetime64[ns],string,float64,string,string,string,string,string,string,...,string,string,string,int64,int64,string,string,string,string,float64
Number of entries,349526,349526,349526,349526,349526,349526,349526,349526,349526,349526,...,349526,349526,349526,349526,349526,349526,349526,349526,349526,349526
Number of unique entries,56127,42173,28,243,18,2,109,1,1109,5,...,4,1,1050,2,233,53733,1,333,333,11330
List up to 100 unique entries to view the structure of the event attributes,"['4507016139 1', '4507016140 10', '4507016140 ...",[numpy.datetime64('2018-03-14T03:11:00.0000000...,"['Record Service Entry Sheet', 'Record Goods R...","[227000000000000.0, nan, 228000000000000.0, 22...","['Logistics', 'Packaging', 'Sales', 'CAPEX & S...","['Standard PO', 'Framework order', <NA>]","['Road Packed', 'Metal Containers & Lids < 30L...","['Purchase order', <NA>]","['vendorID_0472', 'vendorID_0106', 'vendorID_0...","['Service', 'Standard', 'Third-party', 'Subcon...",...,"['NPR', 'PR', 'OTHER', 'undefined', <NA>]","['sourceSystemID_0000', <NA>]","['vendor_0457', 'vendor_0106', 'vendor_0103', ...","[1, 0]","[1, 10, 20, 30, 100, 40, 50, 60, 70, 80, 90, 1...","['4507016139_00001', '4507016140_00010', '4507...","['True', <NA>]","[<NA>, 'batch_07', 'batch_03', 'user_029', 'us...","[<NA>, 'batch_07', 'batch_03', 'user_029', 'us...","[72.0, 30957.0, 1157.0, 2386.0, 21699.0, 144.0..."
Length of the list of up to 100 unique entries,100,100,29,100,19,3,100,2,100,6,...,5,2,100,2,100,100,2,100,100,100


In [22]:
# Gather overview_columns: the column labels of overview_table, which the
# command-generation cell below iterates over (one Neo4j node per column label).
overview_columns = overview_table.columns

# View the collected column labels
overview_columns


Index(['caseID', 'event time:timestamp', 'event concept:name', 'eventID ',
       'case Spend area text', 'case Document Type',
       'case Sub spend area text', 'case Purch. Doc. Category name',
       'case Vendor', 'case Item Type', 'case Item Category',
       'case Spend classification text', 'case Source', 'case Name',
       'case GR-Based Inv. Verif.', 'case Item', 'case concept:name',
       'case Goods Receipt', 'event User', 'event org:resource',
       'event Cumulative net worth (EUR)'],
      dtype='object', name='Property')

<h1>Creating the Neo4j Commands</h1>
The Neo4j commands are hard-coded as strings so that they can be changed easily whenever adjustments to the Neo4j database instance are required.

In [23]:
def _cypher_string(value):
  """Render value as a double-quoted Cypher string literal with special characters escaped."""
  # json.dumps escapes double quotes, backslashes and control characters, so data
  # containing those characters can no longer terminate the literal early.
  return json.dumps(str(value), ensure_ascii=False)


def _cypher_name(name):
  """Render name as a backtick-quoted Cypher identifier; embedded backticks are doubled."""
  return '`' + str(name).replace('`', '``') + '`'


# Two lists: the Cypher lines written to the output file, and the columns
# whose 'Importance' entry marks them as mandatory attributes.
neo4j_lines = []
mandatory_columns = []

# One CREATE clause per Meta-DataFrame column: the column becomes a node whose labels
# are the column name, its level, its importance and the event log name.
for overview_column in overview_columns:
  # Round-trip through JSON to obtain plain Python values;
  # entries that are not JSON-serialisable are converted with str.
  column_json_string = overview_table[overview_column].to_json(default_handler=str)
  neo4j_properties = json.loads(column_json_string)

  overview_level = neo4j_properties['Level']

  property_label_identifier = neo4j_properties['Importance']
  if property_label_identifier == 'Mandatory attribute':
    mandatory_columns.append(overview_column)

  properties_list_of_strings = [
    f'{_cypher_name(key)}: {_cypher_string(value)}'
    for (key, value) in neo4j_properties.items()
  ]
  properties_text = ', '.join(properties_list_of_strings)

  node_variable = _cypher_name(overview_column)
  node_labels = ':'.join([
    node_variable,
    _cypher_name(overview_level),
    _cypher_name(property_label_identifier),
    event_log_name,
  ])

  neo4j_create_command = f'CREATE ({node_variable}:{node_labels} '
  neo4j_create_command += f'{{name: {_cypher_string(overview_column)}, {properties_text}}})'

  neo4j_lines.append(neo4j_create_command)

# Fixed relationships between the mandatory attributes.
# The trailing space in 'eventID ' is intentional: it matches the column label in the data.
neo4j_lines.append('CREATE (`caseID`) - [:CASE_TO_EVENT] -> (`eventID `)')
neo4j_lines.append('CREATE (`eventID `) - [:EVENT_RELATIONSHIP] -> (`event concept:name`)')
neo4j_lines.append('CREATE (`eventID `) - [:EVENT_RELATIONSHIP] -> (`event time:timestamp`)')

# Attach every non-mandatory attribute to the event node and/or the case node.
# NOTE(review): the level is derived from a substring match on the column name, so a
# name containing both 'event' and 'case' receives both relationships - confirm intended.
for overview_column in overview_columns:
  if overview_column in mandatory_columns:
    continue
  if 'event' in overview_column:
    neo4j_create_command = f'CREATE (`eventID `) - [:EVENT_RELATIONSHIP] -> ({_cypher_name(overview_column)})'
    neo4j_lines.append(neo4j_create_command)
  if 'case' in overview_column:
    neo4j_create_command = f'CREATE (`caseID`) - [:CASE_RELATIONSHIP] -> ({_cypher_name(overview_column)})'
    neo4j_lines.append(neo4j_create_command)

# Terminate the single Cypher statement formed by the preceding CREATE clauses.
neo4j_lines.append(';')

neo4j_command = '// Create all nodes and their relationships\n' + '\n'.join(neo4j_lines)

# Write UTF-8 explicitly so non-ASCII attribute values do not depend on the platform default.
with open(f'analysis/overview_{event_log_name}.cypher', 'w', encoding='utf-8') as overview_file:
  print(neo4j_command, file=overview_file)

In [24]:
# Print the Cypher statement that returns every node carrying the event log label,
# followed by an empty line.
print(
  '// Show all nodes and their relationships:',
  f'MATCH (event_log:{event_log_name}) RETURN event_log;',
  '',
  sep='\n',
)

// Show all nodes and their relationships:
MATCH (event_log:BPIC_Synthetic_Split_2) RETURN event_log;



In [25]:
# Exploratory activity: model the longest case as Neo4j commands to visualize all events
# contained in that case. The case table is an event table that only contains the events
# belonging to that one case.
case_table = pd.read_csv(f'analysis/longest_case_in_{event_log_name}.csv', index_col='EventIndex')

# Columns written as properties of every event node, in output order.
# The trailing space in 'eventID ' is intentional: it matches the column label in the data.
event_property_columns = [
  'caseID',
  'event time:timestamp',
  'event concept:name',
  'eventID ',
  'case Spend area text',
  'case Document Type',
  'case Sub spend area text',
  'case Purch. Doc. Category name',
  'case Vendor',
  'case Item Type',
  'case Item Category',
  'case Spend classification text',
  'case Source',
  'case Name',
  'case GR-Based Inv. Verif.',
  'case Item',
  'case concept:name',
  'case Goods Receipt',
  'event User',
  'event org:resource',
  'event Cumulative net worth (EUR)',
]

# One node pattern per event; the node label is the event's activity name.
node_patterns = []
for event_index in case_table.index:
  event = case_table.loc[event_index]
  # Double embedded backticks so the activity name stays a valid quoted label.
  activity_label = str(event['event concept:name']).replace('`', '``')
  # json.dumps yields a double-quoted literal with quotes/backslashes/control characters escaped.
  property_lines = ',\n'.join(
    f'    `{column}`: {json.dumps(str(event[column]), ensure_ascii=False)}'
    for column in event_property_columns
  )
  node_patterns.append(
    f'  (event_{event_index}:`{activity_label}` {{\n{property_lines}\n  }})'
  )

# Chain consecutive events of the case in table order.
follows_patterns = [
  f'  (event_{first_event_index})-[:DIRECTLY_FOLLOWS]->(event_{second_event_index})'
  for first_event_index, second_event_index in zip(case_table.index, case_table.index[1:])
]

all_patterns = node_patterns + follows_patterns
if all_patterns:
  # Joining the patterns avoids having to strip a trailing comma afterwards.
  cypher_code = 'CREATE\n' + ',\n'.join(all_patterns) + '\n;'
else:
  # An empty case table would otherwise produce a truncated, invalid CREATE statement.
  cypher_code = '// No events found in the case table; nothing to create.'

# Write UTF-8 explicitly so non-ASCII attribute values do not depend on the platform default.
with open(f'analysis/longest_case_in_{event_log_name}.cypher', 'w', encoding='utf-8') as case_file:
  print(cypher_code, file=case_file)


# End of Script 2 Transforming the Meta-DataFrame into a Neo4j Event Log Class Diagram Graph
Author: Kyle Smith <br>
Script: For Master's thesis <br>
University of Camerino & University of Applied Sciences Northwestern Switzerland