In [1]:
import pandas as pd
import pretsa as sanatizeAlg
import countVariantsInLog as variantHelper

# Loading and preprocessing the CoSeLog event log
pm4py library is used to load the event logs from the .xes file format. 

In [4]:
# eventlogURL = 'https://data.4tu.nl/collections/_/5065529/1'

eventlog = pd.read_csv('./Data/CoseLog.csv')
# The PRETSA algorithm requires certain column names to work properly. 
# These columns are hard coded in pretsa.py and can be changed if needed. 
eventlog.rename(columns={'time:timestamp':'Complete_Timestamp', 'concept:name':'Activity', 'case:concept:name':'Case_ID'}, inplace=True)
# write the event log to file 
Dataset = 'CoSeLog'
filePath = "./Data/CoSeLog.csv"
eventlog.to_csv(filePath, index=False)
print("Finished loading eventlog")
eventlog

Finished loading eventlog


Unnamed: 0,org:group,concept:instance,org:resource,Activity,Complete_Timestamp,lifecycle:transition,case:startdate,case:responsible,case:enddate_planned,case:department,case:group,Case_ID,case:deadline,case:channel,case:enddate
0,Group 1,task-42933,Resource21,Confirmation of receipt,2011-10-11 11:45:40.276000+00:00,complete,2011-10-11 11:42:22.688000+00:00,Resource21,2011-12-06 12:41:31.788000+00:00,General,Group 2,case-10011,2011-12-06 12:41:31.788000+00:00,Internet,
1,Group 4,task-42935,Resource10,T02 Check confirmation of receipt,2011-10-12 06:26:25.398000+00:00,complete,2011-10-11 11:42:22.688000+00:00,Resource21,2011-12-06 12:41:31.788000+00:00,General,Group 2,case-10011,2011-12-06 12:41:31.788000+00:00,Internet,
2,Group 1,task-42957,Resource21,T03 Adjust confirmation of receipt,2011-11-24 14:36:51.302000+00:00,complete,2011-10-11 11:42:22.688000+00:00,Resource21,2011-12-06 12:41:31.788000+00:00,General,Group 2,case-10011,2011-12-06 12:41:31.788000+00:00,Internet,
3,Group 4,task-47958,Resource21,T02 Check confirmation of receipt,2011-11-24 14:37:16.553000+00:00,complete,2011-10-11 11:42:22.688000+00:00,Resource21,2011-12-06 12:41:31.788000+00:00,General,Group 2,case-10011,2011-12-06 12:41:31.788000+00:00,Internet,
4,EMPTY,task-43021,Resource30,Confirmation of receipt,2011-10-18 11:46:39.679000+00:00,complete,2011-10-10 23:06:40.020000+00:00,Resource04,2011-12-06 00:06:40.010000+00:00,General,Group 5,case-10017,2011-12-06 00:06:40+00:00,Internet,2011-10-18 11:56:55.943000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8572,Group 4,task-43560,Resource06,T02 Check confirmation of receipt,2011-10-18 07:04:48.732000+00:00,complete,2011-10-05 23:06:40.020000+00:00,Resource06,2011-12-01 00:06:40.010000+00:00,General,Group 5,case-9997,2011-12-01 00:06:40+00:00,Internet,2011-10-20 12:19:44.448000+00:00
8573,Group 3,task-43562,Resource06,T04 Determine confirmation of receipt,2011-10-18 07:05:12.359000+00:00,complete,2011-10-05 23:06:40.020000+00:00,Resource06,2011-12-01 00:06:40.010000+00:00,General,Group 5,case-9997,2011-12-01 00:06:40+00:00,Internet,2011-10-20 12:19:44.448000+00:00
8574,Group 2,task-43563,Resource06,T05 Print and send confirmation of receipt,2011-10-18 07:05:30.196000+00:00,complete,2011-10-05 23:06:40.020000+00:00,Resource06,2011-12-01 00:06:40.010000+00:00,General,Group 5,case-9997,2011-12-01 00:06:40+00:00,Internet,2011-10-20 12:19:44.448000+00:00
8575,Group 1,task-43561,Resource06,T06 Determine necessity of stop advice,2011-10-18 07:06:01.468000+00:00,complete,2011-10-05 23:06:40.020000+00:00,Resource06,2011-12-01 00:06:40.010000+00:00,General,Group 5,case-9997,2011-12-01 00:06:40+00:00,Internet,2011-10-20 12:19:44.448000+00:00


As seen, the event log contains data about execution of certain activities within the organization.


An important data for process mining is the duration of each activity 

In [5]:
# Adding duration column to the event logs 
! python3 add_annotation_duration.py {Dataset} {filePath}
modifiedEventlog = pd.read_csv(filePath.replace(".csv",'_duration.csv'), delimiter=';')

./Data/CoSeLog_duration.csv
Duration notation added, Available at ./Data/CoSeLog_duration.csv


# Running PRETSA

In [6]:
# Initializing the privacy parameters 
k = 50
t = 1
l = 15
# setting path for the sanitized eventlog 
targetFilePath = filePath.replace(".csv","_t%s_k%s_pretsa.csv" % (t,k))
print("Starting experiments")
# Instantiate the Pretsa class, creating the prefix tree as describe in the paper 
pretsa = sanatizeAlg.Pretsa(modifiedEventlog)
# Apply the algorithm to the event log with the given privacy parameters 
cutOutCases = pretsa.runPretsa(int(k),float(t), int(l))
print("Modified " + str(len(cutOutCases[0])) + " cases for k=" + str(k) + " l="+ str(l)+" t="+str(t))
# Generate the anonymized eventlog from the tree
privateEventLog = pretsa.getPrivatisedEventLog(applydp=False)
print("Event log sanitization finished.")
privateEventLog

Starting experiments
Generated Distance Matrix
Modified 268 cases for k=50 l=15 t=1
Event log sanitization finished.


Unnamed: 0,Activity,Case_ID,Duration,Event_Nr
408,Confirmation of receipt,case-10011,0.000000,1
1770,T02 Check confirmation of receipt,case-10011,0.778300,2
3171,T04 Determine confirmation of receipt,case-10011,0.861162,3
6127,T06 Determine necessity of stop advice,case-10011,0.000158,4
398,Confirmation of receipt,case-10017,0.000000,1
...,...,...,...,...
1750,T02 Check confirmation of receipt,case-9997,0.001116,2
3131,T04 Determine confirmation of receipt,case-9997,0.000273,3
4086,T05 Print and send confirmation of receipt,case-9997,0.000206,4
4925,T06 Determine necessity of stop advice,case-9997,0.000362,5


# Result
To measure the effect of this feature on the event log, we compare the number of variants and cases before and after applying l-diversity with PRETSA.

In [7]:
print("Number of variant in the event log: ",variantHelper.count_variants(modifiedEventlog))
print("Number of cases in the event log: ", len(modifiedEventlog))
print("Number of variant in the sanitized event log: ",variantHelper.count_variants(privateEventLog))
print("Number of cases in the sanitized event log: ", len(privateEventLog))

Number of variant in the event log:  116
Number of cases in the event log:  8577
Number of variant in the sanitized event log:  14
Number of cases in the sanitized event log:  7702
