# First Script: Creating the Meta-DataFrame
In this script the author analyzes each column of the synthetic BPI_C 2019 event logs.

In [133]:
#import libraries
import json
import os
import pandas as pd
import numpy
import re

In [134]:
event_log_name = 'BPI_C_2019_FEL_synthetic_' #addd the name of the event log - change for each event log
event_log_file_path = f'event_logs/{event_log_name}.csv' #string interpolation, one has to create a folder called 'event_logs' to import the event log.csv
print(event_log_file_path) # review the path

event_logs/BPI_C_2019_FEL_synthetic_.csv


In [135]:
#import the event log
event_log_table = pd.read_csv(event_log_file_path) #optionally this can be added; header = 0, encoding = 'ISO-8859-1') if encoding issues arise
event_log_table.head(20) #view the first row

Unnamed: 0,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,caseID,case Goods Receipt,event org:resource,event concept:name,event time:timestamp
0,65800000000000.0,"['Packaging Type Case', 'Supply Chain Logistic...",vendorID,"Standard', 'Service', 'Consignment', 'Third-pa...","3-way match, invoice before GR', '3-way match,...",4507004931_00020,"TRUE, FALSE",Sales Excellence,"Vendor creates invoice', 'Vendor creates debit...",02-01-2018 09:04:00.000


In [136]:
#review changes on the last column
event_log_table

Unnamed: 0,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,caseID,case Goods Receipt,event org:resource,event concept:name,event time:timestamp
0,65800000000000.0,"['Packaging Type Case', 'Supply Chain Logistic...",vendorID,"Standard', 'Service', 'Consignment', 'Third-pa...","3-way match, invoice before GR', '3-way match,...",4507004931_00020,"TRUE, FALSE",Sales Excellence,"Vendor creates invoice', 'Vendor creates debit...",02-01-2018 09:04:00.000


In [137]:
event_log_table.columns

Index(['eventID ', 'case Spend area text', 'case Vendor', 'case Item Type',
       'case Item Category', 'caseID', 'case Goods Receipt',
       'event org:resource', 'event concept:name', 'event time:timestamp'],
      dtype='object')

In [138]:
#optionally rename column headers if they were parsed wrongly - 'old column name':'new column name' 
event_log_table.rename(columns={'eventID ': 'eventID'})

event_log_table.columns

Index(['eventID ', 'case Spend area text', 'case Vendor', 'case Item Type',
       'case Item Category', 'caseID', 'case Goods Receipt',
       'event org:resource', 'event concept:name', 'event time:timestamp'],
      dtype='object')

In [139]:
#Review all og the event log columns, to decide which ones are mandatory and which ones are additional

log_columns = event_log_table.columns
log_columns # this variable is important for the Meta-DataFrame generation

Index(['eventID ', 'case Spend area text', 'case Vendor', 'case Item Type',
       'case Item Category', 'caseID', 'case Goods Receipt',
       'event org:resource', 'event concept:name', 'event time:timestamp'],
      dtype='object')

In [140]:
# This has to be tailored to each event log
#caseID is created from 'case Purchasing Document' and 'case Item', define the mandatory columns based on domain knowledge analysis

mandatory_columns = ['caseID', 'event time:timestamp', 'event concept:name', 'eventID ']

#Create an object/dictionary and corresponding key:value pairs to add the semantic titles which the csv titles represent in a variable  which are relevant for the Meta-DataFrame
titles = {
  'caseID': 'Case ID',
  'event concept:name': 'Activity',
  'eventID ': 'Event ID',
  'event time:timestamp': 'Timestamp',
}

print(mandatory_columns)

['caseID', 'event time:timestamp', 'event concept:name', 'eventID ']


In [141]:
#Review titles object which are relevant for the Meta-DataFrame
titles

{'caseID': 'Case ID',
 'event concept:name': 'Activity',
 'eventID ': 'Event ID',
 'event time:timestamp': 'Timestamp'}

In [142]:
#Review the datatypes for datatype harmonization
event_log_table.dtypes

eventID                 float64
case Spend area text     object
case Vendor              object
case Item Type           object
case Item Category       object
caseID                   object
case Goods Receipt       object
event org:resource       object
event concept:name       object
event time:timestamp     object
dtype: object

In [143]:
#This step is optional to minimize the work needed in harmonization

#Translate object data types of timestamp data into a datetime64[ns] data type for the timestamp. This is a unique datatype from pandas.
timestamp_objects = event_log_table['event time:timestamp'] #declare timestamp to a variable
event_log_table['event time:timestamp']=pd.to_datetime(timestamp_objects)

#Review changes made
#event_log_table['event time:timestamp'].dtypes
event_log_table.dtypes

eventID                        float64
case Spend area text            object
case Vendor                     object
case Item Type                  object
case Item Category              object
caseID                          object
case Goods Receipt              object
event org:resource              object
event concept:name              object
event time:timestamp    datetime64[ns]
dtype: object

In [144]:
#This step is optional to minimize the work needed in harmonization 

#Translate object data types of object data types (which contains different possible formats) to a clear string data type format
#for loops where used as they enable somewhat automated transformations 
# = for assigning values
# == operator is used to test equality
for log_column in log_columns:
    if event_log_table[log_column].dtype == 'object': #check if the column has an object data type
        event_log_table[log_column] = event_log_table[log_column].astype('string') #change it to a string

#Review changes
#event_log_table['event time:timestamp'].dtypes
event_log_table.dtypes


eventID                        float64
case Spend area text            string
case Vendor                     string
case Item Type                  string
case Item Category              string
caseID                          string
case Goods Receipt              string
event org:resource              string
event concept:name              string
event time:timestamp    datetime64[ns]
dtype: object

In [145]:
# Reorder the columns of the event log table.
# Checking the log columns list contains each item on the mandatory columns list with the assert statement raises an 'AssertionError' if any of the elements are missing. 
# With the delete method and the == operator, the items are removed in the mandatory columns list from the log columns list.
# Then  using the + operator to combine the mandatory columns list and the log columns list (which has been turned into a list using the tolist method). By doing so, a fresh list of columns in # the desired order is created. Using the [] operator to select only the columns in the newly created list, subsetting the event log table DataFrame. By doing this, the columns in the new #DataFrame are created in the desired order.

for mandatory_column in mandatory_columns:
  assert mandatory_column in log_columns 
  log_columns = log_columns.delete(log_columns == mandatory_column)
log_columns = mandatory_columns + log_columns.tolist()

event_log_table = event_log_table[log_columns]
event_log_table #the for loop above resultes in a move of the mandatory columns to the left side (changes order)

Unnamed: 0,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
0,4507004931_00020,2018-02-01 09:04:00,"Vendor creates invoice', 'Vendor creates debit...",65800000000000.0,"['Packaging Type Case', 'Supply Chain Logistic...",vendorID,"Standard', 'Service', 'Consignment', 'Third-pa...","3-way match, invoice before GR', '3-way match,...","TRUE, FALSE",Sales Excellence


Here we are going to reset the index and reorder the event log with a new index

Setting the inplace parameter to True and using the reset index method to reset the DataFrame's index. 
This adds a new column to the DataFrame containing the old index values and resets the index to a range of sequential integers beginning at 0. 
Using the rename method and the inplace parameter set to True, rename the column containing the previous index values. 
A dictionary that maps the old column name "index" to the new column name "EventIndex" makes up the columns parameter.
Using the set index method and the inplace parameter set to True, the DataFrame's index is changed to the 'EventIndex' column. 
This will remove the "EventIndex" column from the data and make it the new index of the DataFrame.

In [146]:
# Number every single event in the event log.
# Every single event is represented by a line in the event log file.
event_log_table.reset_index(inplace=True)
event_log_table.rename(columns={'index': 'EventIndex'}, inplace=True)
event_log_table.set_index('EventIndex', inplace=True)

#review changes
event_log_table

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  event_log_table.rename(columns={'index': 'EventIndex'}, inplace=True)


Unnamed: 0_level_0,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
EventIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4507004931_00020,2018-02-01 09:04:00,"Vendor creates invoice', 'Vendor creates debit...",65800000000000.0,"['Packaging Type Case', 'Supply Chain Logistic...",vendorID,"Standard', 'Service', 'Consignment', 'Third-pa...","3-way match, invoice before GR', '3-way match,...","TRUE, FALSE",Sales Excellence


<h2>Construction of the Meta-DataFrame with the overview_table-DataFrame</h2>

In [147]:
#defining a new dictionary  overview_columns with log columns as the elements
content_overview_columns = [(log_column, []) for log_column in log_columns]
extra_overview_columns = [('Event log column property', [])]
overview_columns = dict(extra_overview_columns + content_overview_columns)
overview_table = pd.DataFrame(overview_columns)

# To add a vertically indented name to the index in the console output.
overview_table.set_index('Event log column property', inplace=True)
overview_columns

{'Event log column property': [],
 'caseID': [],
 'event time:timestamp': [],
 'event concept:name': [],
 'eventID ': [],
 'case Spend area text': [],
 'case Vendor': [],
 'case Item Type': [],
 'case Item Category': [],
 'case Goods Receipt': [],
 'event org:resource': []}

In [148]:
#view the new overview_table to construct the Meta-DataFrame
overview_table 

Unnamed: 0_level_0,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
Event log column property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


<h2>Multiple variables based on event data proprocessing which are used for the Meta-DataFrame</h2>

The author uses the pandas .loc function which is an indexer in Pandas is used to choose rows and columns from a DataFrame based on labels.

The snippet below is called title property X as it will be added to the overview table (Meta-DataFrame) with the log columns list's elements as keys and the log column itself as a value, if the title associated with the log column is present in the titles dictionary (if it does not exist in the titles dictionary). 

Then, a new row with the label "Title" and the values from the title property dictionary is added to the overview table DataFrame. The assignment will add the new row to the DataFrame, and the.loc indexer will be used to select the row by its label. 

When this code block is run, a new row with the label "Title" and values representing the names of each log column will appear in the overview table DataFrame (or the log column itself if no title is provided).

In [149]:
# Overview for property: Titles - For the Meta-DataFrame
# Depending on whether the title for the log column is present in the titles dictionary, the values of the log columns list are either the log column itself or its associated title (if it does not exist in the titles dictionary).
title_property = {}

for log_column in log_columns:
    if log_column in titles:
        title = titles[log_column]
    else:
        title = log_column
    title_property[log_column] = title


Then, a new row with the label "Title" and the values from the title property dictionary is added to the overview table DataFrame. The assignment will add the new row to the DataFrame, and the.loc indexer will be used to select the row by its label. 
When this code block is run, a new row with the label "Title" and values representing the names of each log column will appear in the overview table DataFrame (or the log column itself if no title is provided).

In [150]:
overview_table.loc['Title'] = title_property
overview_table.loc['Title']

caseID                               Case ID
event time:timestamp               Timestamp
event concept:name                  Activity
eventID                             Event ID
case Spend area text    case Spend area text
case Vendor                      case Vendor
case Item Type                case Item Type
case Item Category        case Item Category
case Goods Receipt        case Goods Receipt
event org:resource        event org:resource
Name: Title, dtype: object

<h2>Dictionary {for mandatory and additional attributes}</h2>

This code block generates a new dictionary called importance property whose keys are the items in the list of log columns and whose values are either "Mandatory attribute" or "Additional attribute" depending on whether the log column is included in the mandatory columns list or not. 
Then, a new row with the label "Importance" and values taken from the importance property dictionary is added to the overview table DataFrame. The assignment will add the new row to the DataFrame, and the.loc indexer will be used to select the row by its label. 
When this code block is run, a new row with the label "Importance" and the values "Mandatory attribute" or "Additional attribute" for each log column, depending on whether that column is in the mandatory columns list or not, will appear in the overview table DataFrame.

The concept is used for other data attributes as well

In [151]:
# Overview property: Label for columns either 'Mandatory attribute' or 'additional_attributes' - For the Meta-DataFrame

importance_property = {}
for log_column in log_columns:
    if log_column in mandatory_columns:
        importance_property[log_column] = 'Mandatory attribute'
    else:
        importance_property[log_column] = 'Additional attribute'

overview_table.loc['Importance'] = importance_property
overview_table.loc['Importance']


caseID                   Mandatory attribute
event time:timestamp     Mandatory attribute
event concept:name       Mandatory attribute
eventID                  Mandatory attribute
case Spend area text    Additional attribute
case Vendor             Additional attribute
case Item Type          Additional attribute
case Item Category      Additional attribute
case Goods Receipt      Additional attribute
event org:resource      Additional attribute
Name: Importance, dtype: object

The goal of this code snippet is to add a new row to the Meta-Dataframe termed overview table, where the keys are the columns in a list called log columns and the values are strings indicating whether the column is at the "Event level" or the "Case level." The values are determined by examining the column name for the string "event" or "case," respectively.

In [152]:
# Overview property: The level of the event data attribute in the overview table
level_property = {}
for log_column in log_columns:
  if 'event' in log_column:
    level_property[log_column] = 'Event level'
  if 'case' in log_column:
    level_property[log_column] = 'Case level'

overview_table.loc['Level'] = level_property
overview_table.loc['Level']

caseID                   Case level
event time:timestamp    Event level
event concept:name      Event level
eventID                 Event level
case Spend area text     Case level
case Vendor              Case level
case Item Type           Case level
case Item Category       Case level
case Goods Receipt       Case level
event org:resource      Event level
Name: Level, dtype: object

In [153]:
# Overview property: Data type
data_type_property = {}
datatypes = event_log_table.dtypes
for log_column in log_columns:
  data_type_property[log_column] = datatypes[log_column]

overview_table.loc['Data type'] = data_type_property

This snippet adds a new row to the Meta-Dataframe in Pandas, with the keys being the columns in a list called log columns and the values being the data types of the corresponding columns in an event log table DataFrame.

In [154]:
# Overview property: Data type
data_type_property = {}
datatypes = event_log_table.dtypes
for log_column in log_columns:
    data_type_property[log_column] = datatypes[log_column]

overview_table.loc['Data type'] = data_type_property
overview_table.loc['Data type']

caseID                          string
event time:timestamp    datetime64[ns]
event concept:name              string
eventID                        float64
case Spend area text            string
case Vendor                     string
case Item Type                  string
case Item Category              string
case Goods Receipt              string
event org:resource              string
Name: Data type, dtype: object

In [155]:
# Overview property: Count the number of entries for the Meta-Dataframe
number_of_entries_property = {}
for log_column in log_columns:
    number_of_entries_property[log_column] = len(event_log_table)

overview_table.loc['Number of entries'] = number_of_entries_property
overview_table.loc['Number of entries']

caseID                  1
event time:timestamp    1
event concept:name      1
eventID                 1
case Spend area text    1
case Vendor             1
case Item Type          1
case Item Category      1
case Goods Receipt      1
event org:resource      1
Name: Number of entries, dtype: object

In [156]:
# Overview property: Number of unique entries
number_of_unique_entries_property = {}
for log_column in log_columns:
  count = event_log_table[log_column].value_counts().count()
  number_of_unique_entries_property[log_column] = count

overview_table.loc['Number of unique entries'] = number_of_unique_entries_property
overview_table.loc['Number of unique entries']

caseID                  1
event time:timestamp    1
event concept:name      1
eventID                 1
case Spend area text    1
case Vendor             1
case Item Type          1
case Item Category      1
case Goods Receipt      1
event org:resource      1
Name: Number of unique entries, dtype: object

In [157]:
#Overview property: List of up to 100 unique entries (to grasp an understanding of how the data looks and it is structured and not just metrics)
unique_entries_property = {}
for log_column in log_columns:
    unique_entries = event_log_table[log_column].unique()
    unique_entries = list(unique_entries)
    unique_entries = unique_entries[:100]
    unique_entries_property[log_column] = unique_entries

overview_table.loc['List up to 100 unique entries to view the structure of the event attributes']= unique_entries_property
unique_entries_property

{'caseID': ['4507004931_00020'],
 'event time:timestamp': [numpy.datetime64('2018-02-01T09:04:00.000000000')],
 'event concept:name': ["Vendor creates invoice', 'Vendor creates debit memo', 'Create Purchase Requisition Item', 'Create Purchase Order Item', 'Receive Order Confirmation', ''SRM: Created', 'SRM: Document Completed', 'SRM: Awaiting Approval', 'SRM: Complete', 'SRM: In Transfer to Execution Syst.', 'SRM: Ordered', 'Record Goods Receipt', 'Record Invoice Receipt', 'Release Purchase Order', 'SRM: Transaction Completed', 'Record Subsequent Invoice'"],
 'eventID ': [65800000000000.0],
 'case Spend area text': ["['Packaging Type Case', 'Supply Chain Logistics', 'Sales', 'Latex & Monomers', 'Trading & End Products', 'Additives', 'Solvents', 'CAPEX & SOCS', 'Specialty Resins', 'Pigments & Colorants', 'Titanium Dioxides', 'Real Estate', 'Commodity Resins', 'Marketing', 'Workforce Services', 'Spend Area Unidentified', 'Enterprise Services', 'Energy']"],
 'case Vendor': ['vendorID'],
 

In [158]:
# Overview property: Length of the list of up to 100 unique entries
num_displayed_unique_entries_property = {}
for log_column in log_columns:
  unique_entries = event_log_table[log_column].unique()
  unique_entries = list(unique_entries)
  unique_entries = unique_entries[:100]
  num_displayed_unique_entries_property[log_column] = len(unique_entries)

overview_table.loc['Length of the list of up to 100 unique entries'] = num_displayed_unique_entries_property
num_displayed_unique_entries_property

{'caseID': 1,
 'event time:timestamp': 1,
 'event concept:name': 1,
 'eventID ': 1,
 'case Spend area text': 1,
 'case Vendor': 1,
 'case Item Type': 1,
 'case Item Category': 1,
 'case Goods Receipt': 1,
 'event org:resource': 1}

In [159]:
# Inspect the overview table
overview_table

Unnamed: 0_level_0,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
Event log column property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Title,Case ID,Timestamp,Activity,Event ID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
Importance,Mandatory attribute,Mandatory attribute,Mandatory attribute,Mandatory attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute
Level,Case level,Event level,Event level,Event level,Case level,Case level,Case level,Case level,Case level,Event level
Data type,string,datetime64[ns],string,float64,string,string,string,string,string,string
Number of entries,1,1,1,1,1,1,1,1,1,1
Number of unique entries,1,1,1,1,1,1,1,1,1,1
List up to 100 unique entries to view the structure of the event attributes,[4507004931_00020],[2018-02-01T09:04:00.000000000],"[Vendor creates invoice', 'Vendor creates debi...",[65800000000000.0],"[['Packaging Type Case', 'Supply Chain Logisti...",[vendorID],"[Standard', 'Service', 'Consignment', 'Third-p...","[3-way match, invoice before GR', '3-way match...","[TRUE, FALSE]",[Sales Excellence]
Length of the list of up to 100 unique entries,1,1,1,1,1,1,1,1,1,1


In [160]:
# Overview property: Number of duplicate entries
number_of_duplicate_entries_property = {}
for log_column in log_columns:
  number_of_duplicate_entries_property[log_column] = number_of_entries_property[log_column] - number_of_unique_entries_property[log_column]

overview_table.loc['Number of duplicate entries'] = number_of_duplicate_entries_property
overview_table.loc['Number of duplicate entries']

caseID                  0
event time:timestamp    0
event concept:name      0
eventID                 0
case Spend area text    0
case Vendor             0
case Item Type          0
case Item Category      0
case Goods Receipt      0
event org:resource      0
Name: Number of duplicate entries, dtype: object

In [161]:
# Overview property: Number of undefined entries
number_of_undefined_entries_property = {}
nan_info = event_log_table.isna().sum()
for log_column in log_columns:
    number_of_undefined_entries_property[log_column] = nan_info[log_column]

overview_table.loc['Number of undefined entries'] = number_of_undefined_entries_property
number_of_undefined_entries_property

{'caseID': 0,
 'event time:timestamp': 0,
 'event concept:name': 0,
 'eventID ': 0,
 'case Spend area text': 0,
 'case Vendor': 0,
 'case Item Type': 0,
 'case Item Category': 0,
 'case Goods Receipt': 0,
 'event org:resource': 0}

In [162]:
# Overview propperty: Number of undefined entries in percentages
percentage_of_undefined_entries_property = {}
nan_info = event_log_table.isna().sum()
for log_column in log_columns:
    percentage = nan_info[log_column]/len(event_log_table)*100 #len calculates the length of a variable
    percentage = f'{percentage:.2f}'
    percentage_of_undefined_entries_property[log_column] = percentage

overview_table.loc['Percentage of undefined entries'] = percentage_of_undefined_entries_property
percentage_of_undefined_entries_property

{'caseID': '0.00',
 'event time:timestamp': '0.00',
 'event concept:name': '0.00',
 'eventID ': '0.00',
 'case Spend area text': '0.00',
 'case Vendor': '0.00',
 'case Item Type': '0.00',
 'case Item Category': '0.00',
 'case Goods Receipt': '0.00',
 'event org:resource': '0.00'}

In [163]:
overview_table

Unnamed: 0_level_0,caseID,event time:timestamp,event concept:name,eventID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
Event log column property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Title,Case ID,Timestamp,Activity,Event ID,case Spend area text,case Vendor,case Item Type,case Item Category,case Goods Receipt,event org:resource
Importance,Mandatory attribute,Mandatory attribute,Mandatory attribute,Mandatory attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute,Additional attribute
Level,Case level,Event level,Event level,Event level,Case level,Case level,Case level,Case level,Case level,Event level
Data type,string,datetime64[ns],string,float64,string,string,string,string,string,string
Number of entries,1,1,1,1,1,1,1,1,1,1
Number of unique entries,1,1,1,1,1,1,1,1,1,1
List up to 100 unique entries to view the structure of the event attributes,[4507004931_00020],[2018-02-01T09:04:00.000000000],"[Vendor creates invoice', 'Vendor creates debi...",[65800000000000.0],"[['Packaging Type Case', 'Supply Chain Logisti...",[vendorID],"[Standard', 'Service', 'Consignment', 'Third-p...","[3-way match, invoice before GR', '3-way match...","[TRUE, FALSE]",[Sales Excellence]
Length of the list of up to 100 unique entries,1,1,1,1,1,1,1,1,1,1
Number of duplicate entries,0,0,0,0,0,0,0,0,0,0
Number of undefined entries,0,0,0,0,0,0,0,0,0,0


In [164]:
# Switching the index of the Meta-DataFrame and table structure
# The current layout of the Meta-Dataframe is transformed, that the columns become the rows, and the rows become the columns.
overview_table = overview_table.T
overview_table.reset_index(inplace = True)
overview_table.rename(columns={'index': 'Property'}, inplace = True)
overview_table

Event log column property,Property,Title,Importance,Level,Data type,Number of entries,Number of unique entries,List up to 100 unique entries to view the structure of the event attributes,Length of the list of up to 100 unique entries,Number of duplicate entries,Number of undefined entries,Percentage of undefined entries
0,caseID,Case ID,Mandatory attribute,Case level,string,1,1,[4507004931_00020],1,0,0,0.0
1,event time:timestamp,Timestamp,Mandatory attribute,Event level,datetime64[ns],1,1,[2018-02-01T09:04:00.000000000],1,0,0,0.0
2,event concept:name,Activity,Mandatory attribute,Event level,string,1,1,"[Vendor creates invoice', 'Vendor creates debi...",1,0,0,0.0
3,eventID,Event ID,Mandatory attribute,Event level,float64,1,1,[65800000000000.0],1,0,0,0.0
4,case Spend area text,case Spend area text,Additional attribute,Case level,string,1,1,"[['Packaging Type Case', 'Supply Chain Logisti...",1,0,0,0.0
5,case Vendor,case Vendor,Additional attribute,Case level,string,1,1,[vendorID],1,0,0,0.0
6,case Item Type,case Item Type,Additional attribute,Case level,string,1,1,"[Standard', 'Service', 'Consignment', 'Third-p...",1,0,0,0.0
7,case Item Category,case Item Category,Additional attribute,Case level,string,1,1,"[3-way match, invoice before GR', '3-way match...",1,0,0,0.0
8,case Goods Receipt,case Goods Receipt,Additional attribute,Case level,string,1,1,"[TRUE, FALSE]",1,0,0,0.0
9,event org:resource,event org:resource,Additional attribute,Event level,string,1,1,[Sales Excellence],1,0,0,0.0


In [165]:
#Now we will out put this csv in the analysis folder

if not os.path.exists('analysis'):
    os.mkdir('analysis')
overview_table.to_csv(f'analysis/overview_{event_log_name}.csv')

(f'Written: datatype table to analysis/overview_{event_log_name}.csv')

'Written: datatype table to analysis/overview_BPI_C_2019_FEL_synthetic_.csv'

# End of Script 1 to generate the Meta-DataFrame
Author: Kyle Smith <br>
Script: For Masterthesis <br>
University: University of Camerino & University of Applied Sciences Northwestern Switzerland