# Transforming the Meta-DataFrame into a Neo4j Event Log Class Diagram Graph
The following script demonstrates how the Meta-DataFrame can be modelled in Neo4j as an Event Log Class Diagram Graph of a given event log.

In [13]:
# import the libraries
import json
import pandas as pd
import numpy
import re #regular expressions for working with text data, especially extracting information from a piece of text

In [14]:
# Lookup table from Meta-DataFrame property titles (keys) to short,
# whitespace-free Neo4j property names (values). Renaming the titles this way
# circumvents certain Neo4j whitespace issues.
index_to_neo4j = dict(
    Title='title',
    Data_type='data_type',
    Number_of_entries='num_entries',
    Number_of_unique_entries='num_unique_entries',
    Number_of_duplicate_entries='num_duplicate_entries',
    Number_of_undefined_entries='num_undefined_entries',
    Percentage_of_undefined_entries='percentage_undefined_entries',
)
index_to_neo4j

{'Title': 'title',
 'Data_type': 'data_type',
 'Number_of_entries': 'num_entries',
 'Number_of_unique_entries': 'num_unique_entries',
 'Number_of_duplicate_entries': 'num_duplicate_entries',
 'Number_of_undefined_entries': 'num_undefined_entries',
 'Percentage_of_undefined_entries': 'percentage_undefined_entries'}

In [15]:
# Name of the event log. It is used as the Neo4j node label and as part of the
# input/output file names, so change it accordingly for each event log.
event_log_name = 'BPI_C_2019_3_final_synthetic'

# Print the Cypher command that clears all nodes carrying this label (together
# with their relationships), followed by a blank line.
print(
    '// Delete all nodes and their relationships:',
    f'MATCH (node:{event_log_name}) DETACH DELETE node;',
    '',
    sep='\n',
)

// Delete all nodes and their relationships:
MATCH (node:BPI_C_2019_3_final_synthetic) DETACH DELETE node;



In [16]:
# Load the Meta-DataFrame of the event log selected by event_log_name. The CSV
# is indexed by its 'Property' column and then transposed, so that every
# event-log attribute becomes a column and every meta property becomes a row.
# NOTE(review): the displayed output contains an 'Unnamed: 0' row, which
# suggests the CSV was saved together with its positional index - confirm
# whether that row is wanted as a node property further down.
overview_table = pd.read_csv(
    f'analysis/overview_{event_log_name}.csv',
    index_col='Property',
).transpose()
overview_table

Property,caseID,event time,event concept:name,event ID,case Spend area text,Company,case Document Type,case Sub spend area text,case Purchasing Document,case Vendor,...,case Spend classification text,case Source,case Name,C_GR-Based,case Item,case concept:name,case Goods Receipt,event User Account,event org type resource,Currency Costs in Dollars
Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,...,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier,BPI_C_2019_3rd_Supplier
Title,Case ID,Timestamp,Activity,Event ID,case Spend area text,Company,case Document Type,case Sub spend area text,case Purchasing Document,case Vendor,...,case Spend classification text,case Source,case Name,C_GR-Based,case Item,case concept:name,case Goods Receipt,event User Account,event org type resource,Currency Costs in Dollars
Importance,Mandatory attribute,Mandatory attribute,Mandatory attribute,Mandatory attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,...,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute
Level,Case level,Event level,Event level,Event level,Case level,Case level,Case level,Case level,Case level,Case level,...,Case level,Case level,Case level,Case level,Case level,Case level,Case level,Event level,Event level,Event level
Data type,string,"datetime64[ns, pytz.FixedOffset(-120)]",string,float64,string,string,string,string,float64,string,...,string,string,string,string,float64,string,string,string,string,int64
Number of entries,349526,349526,349526,349526,349526,349526,349526,349526,349526,349526,...,349526,349526,349526,349526,349526,349526,349526,349526,349526,349526
Number of unique entries,33958,31,37,1504,19,3,3,131,33621,1465,...,3,1,1401,2,320,109813,2,431,431,14473
List up to 100 unique entries to view the structure of the event attributes,"['4507038569.0 Standard', '4507038753.0 Servic...","[Timestamp('1970-01-01 00:00:25-0200', tz='pyt...","['Receive Order Confirmation', 'Record Service...","[555000000000000.0, 558000000000000.0, 5570000...","['Packaging Type Case', 'Supply Chain Logistic...","['company ID_A', 'company ID_C', <NA>, 'compan...","['Standard PO', 'EC Purchase order', 'Framewor...","['Packaging - Other', 'Road Packed', 'Products...","[4507038569.0, 4507038753.0, 4507038755.0, 450...","['vendorID_0104', 'vendorID_0541', 'vendorID_0...",...,"['PR', 'NPR', 'OTHER', <NA>]","['sourceSystemID_0000', <NA>]","['vendor_0104', 'vendor_0525', 'vendor_0693', ...","['False', 'True', <NA>]","[220.0, 350.0, 1.0, 300.0, 210.0, 370.0, 340.0...","['4507038569_00220', '4507038569_00350', '4507...","['True', 'False', <NA>]","['user_063', 'NONE', 'user_200', 'user_164', '...","['user_063', 'NONE', 'user_200', 'user_164', '...","[2346, 15640, 1829, 9950, 3519, 3659, 11876, 1..."
Length of the list of up to 100 unique entries,100,31,37,100,20,4,4,100,100,100,...,4,2,100,3,100,100,3,100,100,100


In [17]:
# Column labels of the transposed Meta-DataFrame, i.e. the names of the
# event-log attributes.
overview_columns = overview_table.columns

# Display the labels.
overview_columns


Index(['caseID', 'event time', 'event concept:name', 'event ID ',
       'case Spend area text', 'Company', 'case Document Type',
       'case Sub spend area text', 'case Purchasing Document', 'case Vendor',
       'case Item Type', 'case Item Category',
       'case Spend classification text', 'case Source', 'case Name',
       'C_GR-Based', 'case Item', 'case concept:name', 'case Goods Receipt',
       'event User Account', 'event org type resource',
       'Currency Costs in Dollars'],
      dtype='object', name='Property')

<h1>Creating the Neo4j Commands</h1>
The Neo4j commands are deliberately hard-coded so that they can be adjusted easily whenever changes to the Neo4j database instance are required.

In [18]:
# Build the Cypher script that creates the Event Log Class Diagram Graph: one
# node per event-log attribute, plus relationships linking every attribute to
# the case node (`caseID`) or to the event node (`event ID `).


def _cypher_identifier(name):
  """Return name as a backtick-quoted Cypher identifier.

  Backticks inside the name are doubled so they cannot end the identifier early.
  """
  return '`' + str(name).replace('`', '``') + '`'


def _attribute_owner(column, level):
  """Return 'event' or 'case' for the node an attribute belongs to, else None.

  The Meta-DataFrame's Level value decides. Relying on the column name alone
  left attributes such as 'Company' or 'Currency Costs in Dollars' without any
  relationship, because their names contain neither 'case' nor 'event'. The
  column name is only consulted when the Level value is not recognised.
  """
  normalized_level = str(level).strip().lower()
  for owner in ('event', 'case'):
    if normalized_level.startswith(owner):
      return owner
  for owner in ('event', 'case'):
    if owner in str(column):
      return owner
  return None


neo4j_lines = []        # Cypher clauses, one per line of the final script
mandatory_columns = []  # attributes flagged as 'Mandatory attribute'
column_levels = {}      # attribute name -> Level value from the Meta-DataFrame

for overview_column in overview_columns:
  # Round-trip through JSON to obtain a plain {property: value} dict for this
  # attribute; default_handler=str covers values JSON cannot encode natively.
  column_series = overview_table[overview_column]
  neo4j_properties = json.loads(column_series.to_json(default_handler=str))

  overview_level = neo4j_properties['Level']
  column_levels[overview_column] = overview_level

  property_label_identifier = neo4j_properties['Importance']
  if property_label_identifier == 'Mandatory attribute':
    mandatory_columns.append(overview_column)

  # json.dumps produces a double-quoted literal in which quotes, backslashes
  # and control characters are escaped, so a value can no longer break out of
  # its string. For ordinary values the text is the same as f'"{value}"'.
  properties_list_of_strings = [
    f'{_cypher_identifier(key)}: {json.dumps(str(value), ensure_ascii=False)}'
    for key, value in neo4j_properties.items()
  ]
  properties_clause = ', '.join(properties_list_of_strings)

  # The node carries the attribute name, its level and its importance as
  # labels, plus the event log name so all nodes of one log can be matched.
  node_labels = ':'.join(
    _cypher_identifier(label)
    for label in (overview_column, overview_level, property_label_identifier)
  )
  escaped_name = str(overview_column).replace('\\', '\\\\').replace("'", "\\'")

  neo4j_create_command = (
    f'CREATE ({_cypher_identifier(overview_column)}:{node_labels}:{event_log_name} '
    f"{{name: '{escaped_name}', {properties_clause}}})"
  )
  neo4j_lines.append(neo4j_create_command)

# Fixed backbone of the diagram: the case owns its events, and every event has
# an activity name and a timestamp.
neo4j_lines.append('CREATE (`caseID`) - [:CASE_TO_EVENT] -> (`event ID `)')
neo4j_lines.append('CREATE (`event ID `) - [:EVENT_RELATIONSHIP] -> (`event concept:name`)')
neo4j_lines.append('CREATE (`event ID `) - [:EVENT_RELATIONSHIP] -> (`event time`)')

# Attach every additional (non-mandatory) attribute to the case or event node.
relationship_templates = {
  'event': 'CREATE (`event ID `) - [:EVENT_RELATIONSHIP] -> ({target})',
  'case': 'CREATE (`caseID`) - [:CASE_RELATIONSHIP] -> ({target})',
}
for overview_column in overview_columns:
  if overview_column in mandatory_columns:
    continue
  owner = _attribute_owner(overview_column, column_levels[overview_column])
  if owner is not None:
    neo4j_create_command = relationship_templates[owner].format(
      target=_cypher_identifier(overview_column)
    )
    neo4j_lines.append(neo4j_create_command)

# Terminate the preceding sequence of CREATE clauses, which together form one
# single Cypher statement.
neo4j_lines.append(';')

neo4j_command = '// Create all nodes and their relationships\n' + '\n'.join(neo4j_lines)

# Write the script as UTF-8 so that non-ASCII attribute values are stored the
# same way on every platform.
with open(f'analysis/overview_{event_log_name}.cypher', 'w', encoding='utf-8') as overview_file:
  print(neo4j_command, file=overview_file)

In [19]:
# Print the Cypher query that returns every node labelled with the event log
# name, followed by a blank line.
print(
    '// Show all nodes and their relationships:',
    f'MATCH (event_log:{event_log_name}) RETURN event_log;',
    '',
    sep='\n',
)

// Show all nodes and their relationships:
MATCH (event_log:BPI_C_2019_3_final_synthetic) RETURN event_log;



In [20]:
# Exploratory step: model the longest case of the event log in Neo4j so that
# all events contained in that case can be visualised.
# The case table is an event table that only contains the events belonging to
# one case; its index (EventIndex) is the index of an event.
case_table = pd.read_csv(f'analysis/longest_case_in_{event_log_name}.csv', index_col='EventIndex')

# Event attributes written as node properties, in output order.
case_event_properties = [
  'caseID',
  'event time',
  'event concept:name',
  'event ID ',
  'case Spend area text',
  'Company',
  'case Document Type',
  'case Sub spend area text',
  'case Purchasing Document',
  'case Vendor',
  'case Item Type',
  'case Item Category',
  'case Spend classification text',
  'case Source',
  'case Name',
  'C_GR-Based',
  'case Item',
  'case concept:name',
  'case Goods Receipt',
  'event User Account',
  'event org type resource',
  'Currency Costs in Dollars',
]

# Each entry is one pattern of the single CREATE clause: first one node per
# event, then one DIRECTLY_FOLLOWS relationship per pair of consecutive events.
create_patterns = []

for event_index in case_table.index:
  event = case_table.loc[event_index]
  # The activity name is used as the node label; doubling backticks keeps it a
  # single quoted identifier.
  activity_label = str(event['event concept:name']).replace('`', '``')
  # json.dumps yields a double-quoted literal with quotes, backslashes and
  # control characters escaped; ordinary values come out as "value" unchanged.
  property_lines = ',\n'.join(
    f'    `{property_name}`: {json.dumps(str(event[property_name]), ensure_ascii=False)}'
    for property_name in case_event_properties
  )
  create_patterns.append(f'  (event_{event_index}:`{activity_label}` {{\n{property_lines}\n  }})')

for first_event_index, second_event_index in zip(case_table.index, case_table.index[1:]):
  create_patterns.append(
    f'  (event_{first_event_index})-[:DIRECTLY_FOLLOWS]->(event_{second_event_index})'
  )

# Joining the patterns avoids slicing a trailing comma off the string, which
# corrupted the CREATE keyword itself when the case table had no rows.
if create_patterns:
  cypher_code = 'CREATE\n' + ',\n'.join(create_patterns) + '\n;'
else:
  cypher_code = '// The case table contains no events; nothing to create.'

# Write the script as UTF-8 so that non-ASCII attribute values are stored the
# same way on every platform.
with open(f'analysis/longest_case_in_{event_log_name}.cypher', 'w', encoding='utf-8') as case_file:
  print(cypher_code, file=case_file)

# End of Script 2 Transforming the Meta-DataFrame into a Neo4j Event Log Class Diagram Graph
Author: Kyle Smith <br>
Script: For Master's thesis <br>
University of Camerino & University of Applied Sciences Northwestern Switzerland