This notebook contains code that was used at some point to format XES event logs into CSV, and split them into 80-20 training and test sets.

# Setup

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import pm4py

In [None]:
def time_spread(df, df_test):
  """
  Display overlaid histograms of the DATAEV values of a train and a test set.

  df is plotted as 'TRAIN' and df_test as 'TEST'. The figure is shown, nothing is returned.
  """
  # Both columns go into one frame so that a single histogram call can draw them together
  timestamps = {'TRAIN': df['DATAEV'], 'TEST': df_test['DATAEV']}
  side_by_side = pd.DataFrame(timestamps)

  histogram = px.histogram(side_by_side, x=["TRAIN", "TEST"])
  # 'overlay' draws the two distributions on top of each other instead of stacking them
  histogram.update_layout(barmode='overlay', bargap=0.2)
  histogram.show()

In [None]:
def my_train_test_split(df):
  """
  Split an event log into a train and a test set of approximately 80-20 size.

  A temporal split is tried first. Candidate cut-off dates are taken from the
  events located at 2.5%, 5%, ..., 97.5% of the rows (rounded to the day). For
  each candidate, the boundary case is the highest NUMPRO whose last event is
  not later than that date; cases up to and including the boundary case form
  the train set. The candidate whose train share is closest to 80% is kept.

  If that share is not strictly between 70% and 90%, or if no candidate has any
  finished case, the cases are ordered by their earliest event instead and the
  first 80% of the cases form the train set.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs a case identifier column NUMPRO and a timestamp column DATAEV
      (datetime-like, or strings that pd.to_datetime can parse).
      NOTE: the temporal split compares NUMPRO with <= and >, so it is only
      meaningful when case identifiers increase with time and the rows are in
      chronological case order; otherwise the fallback is what usually applies.

  Returns
  -------
  (df_train, df_test) : two DataFrames with a fresh 0..n-1 index. Rows sharing
      a NUMPRO always end up in the same set.

  Prints "Cut-off date: YYYY-MM-DD" when the temporal split is used.
  """
  target = 0.8
  n = len(df)

  # Last event of every case. Independent of the candidate cut-off, so computed only once.
  case_end = pd.to_datetime(df.groupby("NUMPRO").DATAEV.max())

  best = None # (distance to the target share, cut-off date, boundary case)
  for i in (np.arange(0.025, 1., 0.025) if n else []):
    row = min(int(n*i), n - 1) # guard against a float step landing on n
    t = pd.to_datetime(df.iloc[row].DATAEV).round('D')
    finished = case_end[case_end <= t].index # We want cases in the train set to end no later than t
    if len(finished) == 0:
      continue # no case is finished yet at this date
    ind = finished[-1]
    ratio = (df.NUMPRO <= ind).sum()/n
    # Strict '<' keeps the earliest candidate when several are equally close
    if best is None or abs(ratio - target) < best[0]:
      best = (abs(ratio - target), t, ind)

  if best is not None:
    # Reuse the exact cut-off that was evaluated, with the same <= / > rule,
    # so the returned split has the share that made this candidate the best one.
    _, t, ind = best
    df_train = df[df.NUMPRO <= ind].reset_index(drop=True)
    df_test = df[df.NUMPRO > ind].reset_index(drop=True)

    final_split = len(df_train)/n # We want df_train to be approximately 80% as big as df

    if (0.7 < final_split < 0.9):
      print("Cut-off date:", t.strftime('%Y-%m-%d'))
      return df_train, df_test

  # too much overlap to have a nice temporal split
  cases_ordered = df.groupby("NUMPRO").DATAEV.min().sort_values().keys() # cases numbers ordered by earliest event
  cutoff = int(0.8*len(cases_ordered))
  df_train = df[df.NUMPRO.isin(cases_ordered[:cutoff])].reset_index(drop=True)
  df_test = df[df.NUMPRO.isin(cases_ordered[cutoff:])].reset_index(drop=True)

  return df_train, df_test

Train-Test split at ~80%

# BPIC12

In [None]:
dataset = "BPIC12"

https://data.4tu.nl/articles/dataset/BPI_Challenge_2012/12689204

In [None]:
df = pm4py.read_xes('BPI_Challenge_2012.xes')
df

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
1,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
2,112,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
3,112,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
...,...,...,...,...,...,...,...
262195,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+00:00,2012-02-29 23:51:16.799000+00:00,214376,15000
262196,112,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+00:00,2012-02-29 23:51:16.799000+00:00,214376,15000
262197,11169,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+00:00,2012-02-29 23:51:16.799000+00:00,214376,15000
262198,11169,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+00:00,2012-02-29 23:51:16.799000+00:00,214376,15000


In [None]:
# Keep the case id, activity, timestamp and resource columns, under the names used in the rest of the notebook
xes_to_csv_names = {"case:concept:name": "NUMPRO", "concept:name": "CCDOEV", "time:timestamp": "DATAEV", "org:resource": "NUMGIU"}
df = df[list(xes_to_csv_names)].rename(columns=xes_to_csv_names)
# Timestamps become 'YYYY-MM-DD HH:MM:SS' strings (the sub-second part and UTC offset are dropped)
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')
# Case ids ordered by the timestamp of their earliest event
cases_ordered = df.groupby("NUMPRO")["DATAEV"].min().sort_values().index
# With these categories, sorting on NUMPRO follows the chronological case order rather than the id order
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])
# Case ids are numeric strings in this log
df["NUMPRO"] = df["NUMPRO"].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,173688,A_SUBMITTED,2011-10-01 00:38:44,112
1,173688,A_PARTLYSUBMITTED,2011-10-01 00:38:44,112
2,173688,A_PREACCEPTED,2011-10-01 00:39:37,112
3,173688,W_Completeren aanvraag,2011-10-01 00:39:38,112
4,173688,W_Completeren aanvraag,2011-10-01 11:36:46,
...,...,...,...,...
262195,214376,A_PARTLYSUBMITTED,2012-02-29 23:51:17,112
262196,214376,W_Afhandelen leads,2012-02-29 23:52:01,112
262197,214376,W_Afhandelen leads,2012-03-01 09:26:46,11169
262198,214376,A_DECLINED,2012-03-01 09:27:37,11169


In [None]:
df_train, df_test = my_train_test_split(df)

Cut-off date: 2012-02-02


In [None]:
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,173688,A_SUBMITTED,2011-10-01 00:38:44,112
1,173688,A_PARTLYSUBMITTED,2011-10-01 00:38:44,112
2,173688,A_PREACCEPTED,2011-10-01 00:39:37,112
3,173688,W_Completeren aanvraag,2011-10-01 00:39:38,112
4,173688,W_Completeren aanvraag,2011-10-01 11:36:46,
...,...,...,...,...
210410,205782,O_ACCEPTED,2012-03-01 11:47:41,10809
210411,205782,A_APPROVED,2012-03-01 11:47:41,10809
210412,205782,A_REGISTERED,2012-03-01 11:47:41,10809
210413,205782,A_ACTIVATED,2012-03-01 11:47:41,10809


In [None]:
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,205785,A_SUBMITTED,2012-02-01 22:35:33,112
1,205785,A_PARTLYSUBMITTED,2012-02-01 22:35:35,112
2,205785,A_DECLINED,2012-02-01 22:36:20,112
3,205788,A_SUBMITTED,2012-02-01 23:06:24,112
4,205788,A_PARTLYSUBMITTED,2012-02-01 23:06:25,112
...,...,...,...,...
51780,214376,A_PARTLYSUBMITTED,2012-02-29 23:51:17,112
51781,214376,W_Afhandelen leads,2012-02-29 23:52:01,112
51782,214376,W_Afhandelen leads,2012-03-01 09:26:46,11169
51783,214376,A_DECLINED,2012-03-01 09:27:37,11169


In [None]:
time_spread(df_train, df_test)

In [None]:
# The train file goes into the "<dataset>_prepared" sub-folder, the test file one level above it.
out_dir = Path(f'./{dataset}/{dataset}')
prepared_dir = out_dir / f'{dataset}_prepared'
# to_csv does not create missing folders, so make sure both target folders exist first
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / f'{dataset}-TRAIN-CLEAN.csv', index=False)
df_test.to_csv(out_dir / f'{dataset}-TEST-CLEAN.csv', index=False)

# BPIC17

In [None]:
dataset = "BPIC17"

https://data.4tu.nl/articles/dataset/BPI_Challenge_2017/12696884

In [None]:
df = pm4py.read_xes('BPI Challenge 2017.xes')
df

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Deleted,User_1,W_Call after offers,Workflow,Workitem_1817549786,ate_abort,2017-01-06 06:33:02.212000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202263,Created,User_1,W_Call after offers,Workflow,Workitem_363876066,schedule,2017-01-06 06:33:02.221000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202264,statechange,User_28,A_Cancelled,Application,ApplState_1869071797,complete,2017-01-16 09:51:21.114000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202265,statechange,User_28,O_Cancelled,Offer,OfferState_420066181,complete,2017-01-16 09:51:21.139000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,Offer_1580299144


In [None]:
# Keep the case id, activity, timestamp and resource columns, under the names used in the rest of the notebook
xes_to_csv_names = {"case:concept:name": "NUMPRO", "concept:name": "CCDOEV", "time:timestamp": "DATAEV", "org:resource": "NUMGIU"}
df = df[list(xes_to_csv_names)].rename(columns=xes_to_csv_names)
# Timestamps become 'YYYY-MM-DD HH:MM:SS' strings (the sub-second part and UTC offset are dropped)
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')
# Case ids ordered by the timestamp of their earliest event
cases_ordered = df.groupby("NUMPRO")["DATAEV"].min().sort_values().index
# With these categories, sorting on NUMPRO follows the chronological case order rather than the id order
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])
# 'Application_652823628' -> 652823628: keep the part after the last underscore
df["NUMPRO"] = df["NUMPRO"].str.split('_').str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,652823628,A_Create Application,2016-01-01 09:51:15,User_1
1,652823628,A_Submitted,2016-01-01 09:51:15,User_1
2,652823628,W_Handle leads,2016-01-01 09:51:15,User_1
3,652823628,W_Handle leads,2016-01-01 09:52:36,User_1
4,652823628,W_Complete application,2016-01-01 09:52:36,User_1
...,...,...,...,...
1202262,1350494635,W_Call after offers,2017-01-06 06:33:02,User_1
1202263,1350494635,W_Call after offers,2017-01-06 06:33:02,User_1
1202264,1350494635,A_Cancelled,2017-01-16 09:51:21,User_28
1202265,1350494635,O_Cancelled,2017-01-16 09:51:21,User_28


In [None]:
# We cannot split BPIC17 80-20 without significant overlap between the temporal frames of train and test
# So no cutoff date in particular, and train and test cover similar periods
# (We could still cut at some date, but most cases would then be cut in two between the two sets)
df_train, df_test = my_train_test_split(df)

In [None]:
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,652823628,A_Create Application,2016-01-01 09:51:15,User_1
1,652823628,A_Submitted,2016-01-01 09:51:15,User_1
2,652823628,W_Handle leads,2016-01-01 09:51:15,User_1
3,652823628,W_Handle leads,2016-01-01 09:52:36,User_1
4,652823628,W_Complete application,2016-01-01 09:52:36,User_1
...,...,...,...,...
961203,1109722425,W_Call after offers,2016-10-25 06:24:39,User_5
961204,1109722425,W_Call after offers,2016-10-25 06:25:18,User_5
961205,1109722425,A_Cancelled,2016-11-21 07:00:34,User_1
961206,1109722425,O_Cancelled,2016-11-21 07:00:34,User_1


In [None]:
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,424064154,A_Create Application,2016-10-18 23:42:39,User_1
1,424064154,A_Submitted,2016-10-18 23:42:40,User_1
2,424064154,W_Handle leads,2016-10-18 23:42:40,User_1
3,424064154,W_Handle leads,2016-10-19 05:54:43,User_26
4,424064154,W_Handle leads,2016-10-19 05:56:36,User_26
...,...,...,...,...
241054,1350494635,W_Call after offers,2017-01-06 06:33:02,User_1
241055,1350494635,W_Call after offers,2017-01-06 06:33:02,User_1
241056,1350494635,A_Cancelled,2017-01-16 09:51:21,User_28
241057,1350494635,O_Cancelled,2017-01-16 09:51:21,User_28


In [None]:
time_spread(df_train, df_test)

In [None]:
# The train file goes into the "<dataset>_prepared" sub-folder, the test file one level above it.
out_dir = Path(f'./{dataset}/{dataset}')
prepared_dir = out_dir / f'{dataset}_prepared'
# to_csv does not create missing folders, so make sure both target folders exist first
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / f'{dataset}-TRAIN-CLEAN.csv', index=False)
df_test.to_csv(out_dir / f'{dataset}-TEST-CLEAN.csv', index=False)

# BPIC20

In [None]:
dataset = "BPIC20"

https://data.4tu.nl/collections/_/5065541/1

## DomesticDeclarations

In [None]:
subset = "DomesticDeclarations"

In [None]:
df = pm4py.read_xes('DomesticDeclarations.xes')
df

parsing log, completed traces ::   0%|          | 0/10500 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:id,case:concept:name,case:BudgetNumber,case:DeclarationNumber,case:Amount
0,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:49:50+00:00,EMPLOYEE,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
1,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 11:27:48+00:00,SUPERVISOR,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
2,dd_declaration 86791_19,SYSTEM,Request Payment,2017-01-10 09:34:44+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
3,dd_declaration 86791_20,SYSTEM,Payment Handled,2017-01-12 17:31:22+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
4,st_step 86798_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 10:26:14+00:00,EMPLOYEE,declaration 86795,declaration 86795,budget 86566,declaration number 86796,182.464172
...,...,...,...,...,...,...,...,...,...,...
56432,st_step 138363_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2018-12-29 17:50:14+00:00,EMPLOYEE,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576
56433,st_step 138361_0,STAFF MEMBER,Declaration APPROVED by ADMINISTRATION,2018-12-29 17:56:13+00:00,ADMINISTRATION,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576
56434,st_step 138362_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 08:55:52+00:00,SUPERVISOR,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576
56435,dd_declaration 138359_19,SYSTEM,Request Payment,2019-01-08 08:20:28+00:00,UNDEFINED,declaration 138359,declaration 138359,budget 86566,declaration number 138360,190.404576


In [None]:
# Keep the case id, activity, timestamp and resource columns, under the names used in the rest of the notebook
xes_to_csv_names = {"case:concept:name": "NUMPRO", "concept:name": "CCDOEV", "time:timestamp": "DATAEV", "org:resource": "NUMGIU"}
df = df[list(xes_to_csv_names)].rename(columns=xes_to_csv_names)
# Timestamps become 'YYYY-MM-DD HH:MM:SS' strings (the UTC offset is dropped)
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')
# Case ids ordered by the timestamp of their earliest event
cases_ordered = df.groupby("NUMPRO")["DATAEV"].min().sort_values().index
# With these categories, sorting on NUMPRO follows the chronological case order rather than the id order
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])
# 'declaration 86791' -> 86791: keep the last whitespace-separated token
df["NUMPRO"] = df["NUMPRO"].str.split().str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,86791,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:49:50,STAFF MEMBER
1,86791,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 11:27:48,STAFF MEMBER
2,86791,Request Payment,2017-01-10 09:34:44,SYSTEM
3,86791,Payment Handled,2017-01-12 17:31:22,SYSTEM
4,86795,Declaration SUBMITTED by EMPLOYEE,2017-01-09 10:26:14,STAFF MEMBER
...,...,...,...,...
56432,138359,Declaration SUBMITTED by EMPLOYEE,2018-12-29 17:50:14,STAFF MEMBER
56433,138359,Declaration APPROVED by ADMINISTRATION,2018-12-29 17:56:13,STAFF MEMBER
56434,138359,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 08:55:52,STAFF MEMBER
56435,138359,Request Payment,2019-01-08 08:20:28,SYSTEM


In [None]:
df_train, df_test = my_train_test_split(df)

In [None]:
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,86791,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:49:50,STAFF MEMBER
1,86791,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 11:27:48,STAFF MEMBER
2,86791,Request Payment,2017-01-10 09:34:44,SYSTEM
3,86791,Payment Handled,2017-01-12 17:31:22,SYSTEM
4,86795,Declaration SUBMITTED by EMPLOYEE,2017-01-09 10:26:14,STAFF MEMBER
...,...,...,...,...
44756,136625,Declaration SUBMITTED by EMPLOYEE,2018-10-11 21:31:29,STAFF MEMBER
44757,136625,Declaration APPROVED by ADMINISTRATION,2018-10-11 21:36:07,STAFF MEMBER
44758,136625,Declaration FINAL_APPROVED by SUPERVISOR,2018-10-15 14:15:04,STAFF MEMBER
44759,136625,Request Payment,2018-10-15 15:42:36,SYSTEM


In [None]:
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,136630,Declaration SUBMITTED by EMPLOYEE,2018-10-11 21:44:58,STAFF MEMBER
1,136630,Declaration APPROVED by ADMINISTRATION,2018-10-11 21:48:56,STAFF MEMBER
2,136630,Declaration FINAL_APPROVED by SUPERVISOR,2018-10-18 17:02:47,STAFF MEMBER
3,136630,Request Payment,2018-10-22 08:46:59,SYSTEM
4,136630,Payment Handled,2018-10-25 17:31:41,SYSTEM
...,...,...,...,...
11671,138359,Declaration SUBMITTED by EMPLOYEE,2018-12-29 17:50:14,STAFF MEMBER
11672,138359,Declaration APPROVED by ADMINISTRATION,2018-12-29 17:56:13,STAFF MEMBER
11673,138359,Declaration FINAL_APPROVED by SUPERVISOR,2019-01-03 08:55:52,STAFF MEMBER
11674,138359,Request Payment,2019-01-08 08:20:28,SYSTEM


In [None]:
df_train.dtypes

NUMPRO     int32
CCDOEV    object
DATAEV    object
NUMGIU    object
dtype: object

In [None]:
time_spread(df_train, df_test)

In [None]:
# The train file goes into the "<subset>_prepared" sub-folder, the test file one level above it.
out_dir = Path(f'./{dataset}/{subset}')
prepared_dir = out_dir / f'{subset}_prepared'
# to_csv does not create missing folders, so make sure both target folders exist first
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / f'{subset}-TRAIN-CLEAN.csv', index=False)
df_test.to_csv(out_dir / f'{subset}-TEST-CLEAN.csv', index=False)

## InternationalDeclarations

In [None]:
subset = "InternationalDeclarations"

In [None]:
df = pm4py.read_xes('InternationalDeclarations.xes')
df

parsing log, completed traces ::   0%|          | 0/6449 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Permit travel permit number,case:DeclarationNumber,case:Amount,case:RequestedAmount,case:Permit TaskNumber,...,case:concept:name,case:Permit OrganizationalEntity,case:travel permit number,case:Permit RequestedBudget,case:id,case:Permit ID,case:Permit id,case:BudgetNumber,case:Permit ActivityNumber,case:AdjustedAmount
0,rv_travel permit 76455_6,STAFF MEMBER,Start trip,2016-10-05 00:00:00+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
1,rv_travel permit 76455_7,STAFF MEMBER,End trip,2016-10-05 00:00:00+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
2,st_step 76459_0,STAFF MEMBER,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
3,st_step 76460_0,STAFF MEMBER,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28+00:00,SUPERVISOR,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
4,st_step 76461_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14+00:00,EMPLOYEE,travel permit number 76456,declaration number 76458,39.664561,39.664561,UNKNOWN,...,declaration 76457,organizational unit 65458,travel permit number 76456,41.613445,declaration 76457,travel permit 76455,travel permit 76455,budget 144133,activity 46005,39.664561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72146,st_step 13239_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2018-12-18 15:06:50+00:00,EMPLOYEE,travel permit number 13227,UNKNOWN,0.000000,0.000000,task 427,...,declaration 13232,organizational unit 65455,travel permit number 13227,1727.559756,declaration 13232,travel permit 13226,travel permit 13226,budget 147449,UNKNOWN,0.000000
72147,st_step 13241_0,STAFF MEMBER,Declaration REJECTED by ADMINISTRATION,2018-12-18 15:06:57+00:00,ADMINISTRATION,travel permit number 13227,UNKNOWN,0.000000,0.000000,task 427,...,declaration 13232,organizational unit 65455,travel permit number 13227,1727.559756,declaration 13232,travel permit 13226,travel permit 13226,budget 147449,UNKNOWN,0.000000
72148,st_step 13240_0,STAFF MEMBER,Declaration REJECTED by EMPLOYEE,2018-12-19 14:05:36+00:00,EMPLOYEE,travel permit number 13227,UNKNOWN,0.000000,0.000000,task 427,...,declaration 13232,organizational unit 65455,travel permit number 13227,1727.559756,declaration 13232,travel permit 13226,travel permit 13226,budget 147449,UNKNOWN,0.000000
72149,rv_travel permit 13226_6,STAFF MEMBER,Start trip,2019-02-19 00:00:00+00:00,EMPLOYEE,travel permit number 13227,UNKNOWN,0.000000,0.000000,task 427,...,declaration 13232,organizational unit 65455,travel permit number 13227,1727.559756,declaration 13232,travel permit 13226,travel permit 13226,budget 147449,UNKNOWN,0.000000


In [None]:
# Keep the case id, activity, timestamp and resource columns, under the names used in the rest of the notebook
xes_to_csv_names = {"case:concept:name": "NUMPRO", "concept:name": "CCDOEV", "time:timestamp": "DATAEV", "org:resource": "NUMGIU"}
df = df[list(xes_to_csv_names)].rename(columns=xes_to_csv_names)
# Timestamps become 'YYYY-MM-DD HH:MM:SS' strings (the UTC offset is dropped)
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')
# Case ids ordered by the timestamp of their earliest event
cases_ordered = df.groupby("NUMPRO")["DATAEV"].min().sort_values().index
# With these categories, sorting on NUMPRO follows the chronological case order rather than the id order
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])
# 'declaration 76457' -> 76457: keep the last whitespace-separated token
df["NUMPRO"] = df["NUMPRO"].str.split().str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,76457,Start trip,2016-10-05 00:00:00,STAFF MEMBER
1,76457,End trip,2016-10-05 00:00:00,STAFF MEMBER
2,76457,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,STAFF MEMBER
3,76457,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,STAFF MEMBER
4,76457,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,STAFF MEMBER
...,...,...,...,...
72146,13232,Declaration SUBMITTED by EMPLOYEE,2018-12-18 15:06:50,STAFF MEMBER
72147,13232,Declaration REJECTED by ADMINISTRATION,2018-12-18 15:06:57,STAFF MEMBER
72148,13232,Declaration REJECTED by EMPLOYEE,2018-12-19 14:05:36,STAFF MEMBER
72149,13232,Start trip,2019-02-19 00:00:00,STAFF MEMBER


In [None]:
df_train, df_test = my_train_test_split(df)

In [None]:
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,76457,Start trip,2016-10-05 00:00:00,STAFF MEMBER
1,76457,End trip,2016-10-05 00:00:00,STAFF MEMBER
2,76457,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,STAFF MEMBER
3,76457,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,STAFF MEMBER
4,76457,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,STAFF MEMBER
...,...,...,...,...
56983,68066,Declaration SUBMITTED by EMPLOYEE,2018-10-15 11:11:15,STAFF MEMBER
56984,68066,Declaration APPROVED by ADMINISTRATION,2018-10-15 11:18:33,STAFF MEMBER
56985,68066,Declaration FINAL_APPROVED by SUPERVISOR,2018-10-15 13:49:16,STAFF MEMBER
56986,68066,Request Payment,2018-10-15 13:59:03,SYSTEM


In [None]:
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,20408,Permit SUBMITTED by EMPLOYEE,2018-08-01 15:29:29,STAFF MEMBER
1,20408,Permit APPROVED by ADMINISTRATION,2018-08-01 15:29:40,STAFF MEMBER
2,20408,Permit FINAL_APPROVED by SUPERVISOR,2018-08-02 09:53:38,STAFF MEMBER
3,20408,Start trip,2018-10-07 00:00:00,STAFF MEMBER
4,20408,End trip,2018-10-10 00:00:00,STAFF MEMBER
...,...,...,...,...
15158,13232,Declaration SUBMITTED by EMPLOYEE,2018-12-18 15:06:50,STAFF MEMBER
15159,13232,Declaration REJECTED by ADMINISTRATION,2018-12-18 15:06:57,STAFF MEMBER
15160,13232,Declaration REJECTED by EMPLOYEE,2018-12-19 14:05:36,STAFF MEMBER
15161,13232,Start trip,2019-02-19 00:00:00,STAFF MEMBER


In [None]:
time_spread(df_train, df_test)

In [None]:
# The train file goes into the "<subset>_prepared" sub-folder, the test file one level above it.
out_dir = Path(f'./{dataset}/{subset}')
prepared_dir = out_dir / f'{subset}_prepared'
# to_csv does not create missing folders, so make sure both target folders exist first
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / f'{subset}-TRAIN-CLEAN.csv', index=False)
df_test.to_csv(out_dir / f'{subset}-TEST-CLEAN.csv', index=False)

## PermitLog

In [None]:
subset = "PermitLog"

In [None]:
df = pm4py.read_xes('PermitLog.xes')
df

parsing log, completed traces ::   0%|          | 0/7065 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:OrganizationalEntity,case:ProjectNumber,case:TaskNumber,case:dec_id_0,case:ActivityNumber,...,case:Cost Type_14,case:Cost Type_10,case:Cost Type_11,case:Cost Type_12,case:Task_5,case:Task_4,case:Task_9,case:Task_8,case:Task_7,case:Task_6
0,rv_travel permit 76455_6,STAFF MEMBER,Start trip,2016-10-05 00:00:00+00:00,EMPLOYEE,organizational unit 65458,UNKNOWN,UNKNOWN,declaration 76457,activity 46005,...,,,,,,,,,,
1,rv_travel permit 76455_7,STAFF MEMBER,End trip,2016-10-05 00:00:00+00:00,EMPLOYEE,organizational unit 65458,UNKNOWN,UNKNOWN,declaration 76457,activity 46005,...,,,,,,,,,,
2,st_step 76459_0,STAFF MEMBER,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10+00:00,EMPLOYEE,organizational unit 65458,UNKNOWN,UNKNOWN,declaration 76457,activity 46005,...,,,,,,,,,,
3,st_step 76460_0,STAFF MEMBER,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28+00:00,SUPERVISOR,organizational unit 65458,UNKNOWN,UNKNOWN,declaration 76457,activity 46005,...,,,,,,,,,,
4,st_step 76461_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14+00:00,EMPLOYEE,organizational unit 65458,UNKNOWN,UNKNOWN,declaration 76457,activity 46005,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86576,st_step 13113_0,STAFF MEMBER,Permit APPROVED by ADMINISTRATION,2018-12-30 15:11:51+00:00,ADMINISTRATION,organizational unit 65464,project 13110,task 427,,UNKNOWN,...,,,,,,,,,,
86577,rv_travel permit 13108_6,STAFF MEMBER,Start trip,2019-01-02 00:00:00+00:00,EMPLOYEE,organizational unit 65464,project 13110,task 427,,UNKNOWN,...,,,,,,,,,,
86578,st_step 13112_0,STAFF MEMBER,Permit APPROVED by SUPERVISOR,2019-01-02 09:11:23+00:00,SUPERVISOR,organizational unit 65464,project 13110,task 427,,UNKNOWN,...,,,,,,,,,,
86579,st_step 13114_0,STAFF MEMBER,Permit FINAL_APPROVED by DIRECTOR,2019-01-07 14:03:29+00:00,DIRECTOR,organizational unit 65464,project 13110,task 427,,UNKNOWN,...,,,,,,,,,,


In [None]:
# Keep the case id, activity, timestamp and resource columns, under the names used in the rest of the notebook
xes_to_csv_names = {"case:concept:name": "NUMPRO", "concept:name": "CCDOEV", "time:timestamp": "DATAEV", "org:resource": "NUMGIU"}
df = df[list(xes_to_csv_names)].rename(columns=xes_to_csv_names)
# Timestamps become 'YYYY-MM-DD HH:MM:SS' strings (the UTC offset is dropped)
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')
# Case ids ordered by the timestamp of their earliest event
cases_ordered = df.groupby("NUMPRO")["DATAEV"].min().sort_values().index
# With these categories, sorting on NUMPRO follows the chronological case order rather than the id order
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])
# Keep the last whitespace-separated token of the case name as the integer case id
df["NUMPRO"] = df["NUMPRO"].str.split().str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,76455,Start trip,2016-10-05 00:00:00,STAFF MEMBER
1,76455,End trip,2016-10-05 00:00:00,STAFF MEMBER
2,76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,STAFF MEMBER
3,76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,STAFF MEMBER
4,76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,STAFF MEMBER
...,...,...,...,...
86576,13108,Permit APPROVED by ADMINISTRATION,2018-12-30 15:11:51,STAFF MEMBER
86577,13108,Start trip,2019-01-02 00:00:00,STAFF MEMBER
86578,13108,Permit APPROVED by SUPERVISOR,2019-01-02 09:11:23,STAFF MEMBER
86579,13108,Permit FINAL_APPROVED by DIRECTOR,2019-01-07 14:03:29,STAFF MEMBER


In [None]:
df_train, df_test = my_train_test_split(df)

In [None]:
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,76455,Start trip,2016-10-05 00:00:00,STAFF MEMBER
1,76455,End trip,2016-10-05 00:00:00,STAFF MEMBER
2,76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,STAFF MEMBER
3,76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,STAFF MEMBER
4,76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,STAFF MEMBER
...,...,...,...,...
71817,60144,Declaration SUBMITTED by EMPLOYEE,2018-10-01 13:47:10,STAFF MEMBER
71818,60144,Declaration APPROVED by ADMINISTRATION,2018-10-01 13:56:55,STAFF MEMBER
71819,60144,Declaration FINAL_APPROVED by SUPERVISOR,2018-10-03 15:47:18,STAFF MEMBER
71820,60144,Request Payment,2018-10-09 07:31:19,SYSTEM


In [None]:
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,60154,Permit SUBMITTED by EMPLOYEE,2018-09-08 08:22:27,STAFF MEMBER
1,60154,Permit APPROVED by ADMINISTRATION,2018-09-08 08:23:08,STAFF MEMBER
2,60154,Permit APPROVED by BUDGET OWNER,2018-09-11 12:53:25,STAFF MEMBER
3,60154,Permit FINAL_APPROVED by SUPERVISOR,2018-09-17 13:36:15,STAFF MEMBER
4,60154,Start trip,2018-10-21 00:00:00,STAFF MEMBER
...,...,...,...,...
14754,13108,Permit APPROVED by ADMINISTRATION,2018-12-30 15:11:51,STAFF MEMBER
14755,13108,Start trip,2019-01-02 00:00:00,STAFF MEMBER
14756,13108,Permit APPROVED by SUPERVISOR,2019-01-02 09:11:23,STAFF MEMBER
14757,13108,Permit FINAL_APPROVED by DIRECTOR,2019-01-07 14:03:29,STAFF MEMBER


In [None]:
time_spread(df_train, df_test)

In [None]:
# The train file goes into the "<subset>_prepared" sub-folder, the test file one level above it.
out_dir = Path(f'./{dataset}/{subset}')
prepared_dir = out_dir / f'{subset}_prepared'
# to_csv does not create missing folders, so make sure both target folders exist first
prepared_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(prepared_dir / f'{subset}-TRAIN-CLEAN.csv', index=False)
df_test.to_csv(out_dir / f'{subset}-TEST-CLEAN.csv', index=False)

## PrepaidTravelCost

In [None]:
subset = "PrepaidTravelCost"

In [None]:
df = pm4py.read_xes('PrepaidTravelCost.xes')
df

parsing log, completed traces ::   0%|          | 0/2099 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Rfp_id,case:Permit travel permit number,case:Task,case:OrganizationalEntity,case:RequestedAmount,...,case:Permit BudgetNumber,case:Permit ProjectNumber,case:Project,case:concept:name,case:Permit OrganizationalEntity,case:Permit RequestedBudget,case:Cost Type,case:Permit id,case:Permit ActivityNumber,case:RfpNumber
0,st_step 73555_0,STAFF MEMBER,Permit SUBMITTED by EMPLOYEE,2017-01-09 14:48:43+00:00,EMPLOYEE,request for payment 73550,UNKNOWN,task 71977,organizational unit 65463,854.579838,...,budget 6198,UNKNOWN,project 503,request for payment 73550,organizational unit 65455,1979.272104,0,travel permit 73549,UNKNOWN,request for payment number 73551
1,st_step 73554_0,STAFF MEMBER,Permit FINAL_APPROVED by SUPERVISOR,2017-01-09 14:48:55+00:00,SUPERVISOR,request for payment 73550,UNKNOWN,task 71977,organizational unit 65463,854.579838,...,budget 6198,UNKNOWN,project 503,request for payment 73550,organizational unit 65455,1979.272104,0,travel permit 73549,UNKNOWN,request for payment number 73551
2,st_step 73558_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2017-01-12 11:40:27+00:00,EMPLOYEE,request for payment 73550,UNKNOWN,task 71977,organizational unit 65463,854.579838,...,budget 6198,UNKNOWN,project 503,request for payment 73550,organizational unit 65455,1979.272104,0,travel permit 73549,UNKNOWN,request for payment number 73551
3,st_step 73559_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-12 11:41:59+00:00,SUPERVISOR,request for payment 73550,UNKNOWN,task 71977,organizational unit 65463,854.579838,...,budget 6198,UNKNOWN,project 503,request for payment 73550,organizational unit 65455,1979.272104,0,travel permit 73549,UNKNOWN,request for payment number 73551
4,st_step 73557_0,STAFF MEMBER,Request For Payment REJECTED by MISSING,2017-01-12 11:53:07+00:00,MISSING,request for payment 73550,UNKNOWN,task 71977,organizational unit 65463,854.579838,...,budget 6198,UNKNOWN,project 503,request for payment 73550,organizational unit 65455,1979.272104,0,travel permit 73549,UNKNOWN,request for payment number 73551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18241,st_step 186614_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2018-12-30 20:16:15+00:00,EMPLOYEE,request for payment 186612,UNKNOWN,UNKNOWN,organizational unit 65462,96.739075,...,UNKNOWN,UNKNOWN,project 147531,request for payment 186612,UNKNOWN,0.000000,0,UNKNOWN,UNKNOWN,request for payment number 186613
18242,st_step 186616_0,STAFF MEMBER,Request For Payment APPROVED by ADMINISTRATION,2018-12-30 20:16:25+00:00,ADMINISTRATION,request for payment 186612,UNKNOWN,UNKNOWN,organizational unit 65462,96.739075,...,UNKNOWN,UNKNOWN,project 147531,request for payment 186612,UNKNOWN,0.000000,0,UNKNOWN,UNKNOWN,request for payment number 186613
18243,st_step 186615_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-14 15:09:11+00:00,SUPERVISOR,request for payment 186612,UNKNOWN,UNKNOWN,organizational unit 65462,96.739075,...,UNKNOWN,UNKNOWN,project 147531,request for payment 186612,UNKNOWN,0.000000,0,UNKNOWN,UNKNOWN,request for payment number 186613
18244,rp_request for payment 186612_15,SYSTEM,Request Payment,2019-01-15 07:02:45+00:00,UNDEFINED,request for payment 186612,UNKNOWN,UNKNOWN,organizational unit 65462,96.739075,...,UNKNOWN,UNKNOWN,project 147531,request for payment 186612,UNKNOWN,0.000000,0,UNKNOWN,UNKNOWN,request for payment number 186613


In [None]:
# Keep the four XES attributes needed downstream and give them the short
# column names the rest of the notebook expects.
xes_to_short = {
    "case:concept:name": "NUMPRO",
    "concept:name": "CCDOEV",
    "time:timestamp": "DATAEV",
    "org:resource": "NUMGIU",
}
df = df[list(xes_to_short)].rename(columns=xes_to_short)

# Timestamps are stored as plain 'YYYY-MM-DD HH:MM:SS' strings.
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')

# Case names ranked by the timestamp of their earliest event.
first_event_per_case = df.groupby("NUMPRO")["DATAEV"].min()
cases_ordered = first_event_per_case.sort_values().keys()

# Listing the categories in that ranking makes the sort below order cases by
# earliest event; events within a case are then ordered by timestamp.
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])

# Keep only the last whitespace-separated token of the case name, as an integer.
df["NUMPRO"] = df["NUMPRO"].str.split().str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,73550,Permit SUBMITTED by EMPLOYEE,2017-01-09 14:48:43,STAFF MEMBER
1,73550,Permit FINAL_APPROVED by SUPERVISOR,2017-01-09 14:48:55,STAFF MEMBER
2,73550,Request For Payment SUBMITTED by EMPLOYEE,2017-01-12 11:40:27,STAFF MEMBER
3,73550,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-12 11:41:59,STAFF MEMBER
4,73550,Request For Payment REJECTED by MISSING,2017-01-12 11:53:07,STAFF MEMBER
...,...,...,...,...
18241,186612,Request For Payment SUBMITTED by EMPLOYEE,2018-12-30 20:16:15,STAFF MEMBER
18242,186612,Request For Payment APPROVED by ADMINISTRATION,2018-12-30 20:16:25,STAFF MEMBER
18243,186612,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-14 15:09:11,STAFF MEMBER
18244,186612,Request Payment,2019-01-15 07:02:45,SYSTEM


In [None]:
# Split the formatted log into train and test sets with my_train_test_split
# (defined earlier in this notebook), which aims for an approximately 80-20 split.
df_train, df_test = my_train_test_split(df)

In [None]:
# Preview the training split.
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,73550,Permit SUBMITTED by EMPLOYEE,2017-01-09 14:48:43,STAFF MEMBER
1,73550,Permit FINAL_APPROVED by SUPERVISOR,2017-01-09 14:48:55,STAFF MEMBER
2,73550,Request For Payment SUBMITTED by EMPLOYEE,2017-01-12 11:40:27,STAFF MEMBER
3,73550,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-12 11:41:59,STAFF MEMBER
4,73550,Request For Payment REJECTED by MISSING,2017-01-12 11:53:07,STAFF MEMBER
...,...,...,...,...
14673,63662,Request For Payment SUBMITTED by EMPLOYEE,2018-08-10 11:41:35,STAFF MEMBER
14674,63662,Request For Payment APPROVED by ADMINISTRATION,2018-08-10 11:42:14,STAFF MEMBER
14675,63662,Request For Payment FINAL_APPROVED by SUPERVISOR,2018-08-10 12:42:06,STAFF MEMBER
14676,63662,Request Payment,2018-08-16 07:07:58,SYSTEM


In [None]:
# Preview the test split.
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,63680,Permit SUBMITTED by EMPLOYEE,2018-08-06 14:10:58,STAFF MEMBER
1,63680,Permit APPROVED by ADMINISTRATION,2018-08-06 14:11:12,STAFF MEMBER
2,63680,Permit APPROVED by BUDGET OWNER,2018-08-13 09:50:52,STAFF MEMBER
3,63680,Permit FINAL_APPROVED by SUPERVISOR,2018-08-20 05:15:27,STAFF MEMBER
4,63680,Request For Payment SUBMITTED by EMPLOYEE,2018-09-04 14:29:31,STAFF MEMBER
...,...,...,...,...
3563,186612,Request For Payment SUBMITTED by EMPLOYEE,2018-12-30 20:16:15,STAFF MEMBER
3564,186612,Request For Payment APPROVED by ADMINISTRATION,2018-12-30 20:16:25,STAFF MEMBER
3565,186612,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-14 15:09:11,STAFF MEMBER
3566,186612,Request Payment,2019-01-15 07:02:45,SYSTEM


In [None]:
# Overlay the DATAEV histograms of the two splits to see how they are spread over time.
time_spread(df_train, df_test)

In [None]:
# Write both splits to CSV without the index column.
# NOTE(review): the train file goes into the `{subset}_prepared` sub-folder while
# the test file goes into its parent folder - confirm this asymmetry is intended.
# NOTE(review): `dataset` is not set in this section; presumably defined in an
# earlier cell, and the target folders are assumed to exist already - verify.
train_csv_path = f'./{dataset}/{subset}/{subset}_prepared/{subset}-TRAIN-CLEAN.csv'
test_csv_path = f'./{dataset}/{subset}/{subset}-TEST-CLEAN.csv'
df_train.to_csv(train_csv_path, index=False)
df_test.to_csv(test_csv_path, index=False)

## RequestForPayment

In [None]:
# Name of the event log handled in this section; it is interpolated into the
# output CSV paths at the end of the section.
subset = "RequestForPayment"

In [None]:
# Load the XES event log with pm4py and display the resulting table.
df = pm4py.read_xes('RequestForPayment.xes')
df

parsing log, completed traces ::   0%|          | 0/6886 [00:00<?, ?it/s]

Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Rfp_id,case:Project,case:Task,case:concept:name,case:OrganizationalEntity,case:Cost Type,case:RequestedAmount,case:Activity,case:RfpNumber
0,st_step 148220_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 09:17:18+00:00,EMPLOYEE,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215
1,st_step 148221_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 09:18:00+00:00,SUPERVISOR,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215
2,st_step 148222_0,STAFF MEMBER,Request For Payment REJECTED by MISSING,2017-01-10 12:42:32+00:00,MISSING,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215
3,st_step 148219_0,STAFF MEMBER,Request For Payment SUBMITTED by EMPLOYEE,2017-03-03 09:51:13+00:00,EMPLOYEE,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215
4,st_step 148218_0,STAFF MEMBER,Request For Payment APPROVED by PRE_APPROVER,2017-03-03 09:51:42+00:00,PRE_APPROVER,request for payment 148214,project 148216,UNKNOWN,request for payment 148214,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36791,st_step 185004_0,STAFF MEMBER,Request For Payment APPROVED by ADMINISTRATION,2018-12-29 12:35:02+00:00,ADMINISTRATION,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001
36792,st_step 185003_0,STAFF MEMBER,Request For Payment APPROVED by BUDGET OWNER,2019-01-03 09:27:20+00:00,BUDGET OWNER,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001
36793,st_step 185005_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-08 09:00:39+00:00,SUPERVISOR,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001
36794,rp_request for payment 185000_15,SYSTEM,Request Payment,2019-01-08 09:29:14+00:00,UNDEFINED,request for payment 185000,project 147860,task 152704,request for payment 185000,organizational unit 65468,0,15.409660,activity 505,request for payment number 185001


In [None]:
# Keep the four XES attributes needed downstream and give them the short
# column names the rest of the notebook expects.
xes_to_short = {
    "case:concept:name": "NUMPRO",
    "concept:name": "CCDOEV",
    "time:timestamp": "DATAEV",
    "org:resource": "NUMGIU",
}
df = df[list(xes_to_short)].rename(columns=xes_to_short)

# Timestamps are stored as plain 'YYYY-MM-DD HH:MM:SS' strings.
df["DATAEV"] = pd.to_datetime(df["DATAEV"]).dt.strftime('%Y-%m-%d %H:%M:%S')

# Case names ranked by the timestamp of their earliest event.
first_event_per_case = df.groupby("NUMPRO")["DATAEV"].min()
cases_ordered = first_event_per_case.sort_values().keys()

# Listing the categories in that ranking makes the sort below order cases by
# earliest event; events within a case are then ordered by timestamp.
df["NUMPRO"] = pd.Categorical(df["NUMPRO"], categories=list(cases_ordered))
df = df.sort_values(by=["NUMPRO", "DATAEV"])

# Keep only the trailing number of the case name
# (e.g. "request for payment 148214" -> 148214).
df["NUMPRO"] = df["NUMPRO"].str.split().str[-1].astype(int)
df

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,148214,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 09:17:18,STAFF MEMBER
1,148214,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 09:18:00,STAFF MEMBER
2,148214,Request For Payment REJECTED by MISSING,2017-01-10 12:42:32,STAFF MEMBER
3,148214,Request For Payment SUBMITTED by EMPLOYEE,2017-03-03 09:51:13,STAFF MEMBER
4,148214,Request For Payment APPROVED by PRE_APPROVER,2017-03-03 09:51:42,STAFF MEMBER
...,...,...,...,...
36791,185000,Request For Payment APPROVED by ADMINISTRATION,2018-12-29 12:35:02,STAFF MEMBER
36792,185000,Request For Payment APPROVED by BUDGET OWNER,2019-01-03 09:27:20,STAFF MEMBER
36793,185000,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-08 09:00:39,STAFF MEMBER
36794,185000,Request Payment,2019-01-08 09:29:14,SYSTEM


In [None]:
# Split the formatted log into train and test sets with my_train_test_split
# (defined earlier in this notebook), which aims for an approximately 80-20 split.
df_train, df_test = my_train_test_split(df)

In [None]:
# Preview the training split.
df_train

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,148214,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 09:17:18,STAFF MEMBER
1,148214,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 09:18:00,STAFF MEMBER
2,148214,Request For Payment REJECTED by MISSING,2017-01-10 12:42:32,STAFF MEMBER
3,148214,Request For Payment SUBMITTED by EMPLOYEE,2017-03-03 09:51:13,STAFF MEMBER
4,148214,Request For Payment APPROVED by PRE_APPROVER,2017-03-03 09:51:42,STAFF MEMBER
...,...,...,...,...
29300,182954,Request Payment,2018-10-29 08:19:38,SYSTEM
29301,182954,Payment Handled,2018-11-01 17:31:17,SYSTEM
29302,182960,Request For Payment SUBMITTED by EMPLOYEE,2018-10-19 10:13:57,STAFF MEMBER
29303,182960,Request For Payment REJECTED by ADMINISTRATION,2018-10-19 10:57:01,STAFF MEMBER


In [None]:
# Preview the test split.
df_test

Unnamed: 0,NUMPRO,CCDOEV,DATAEV,NUMGIU
0,179254,Request For Payment SAVED by EMPLOYEE,2018-10-19 11:05:40,STAFF MEMBER
1,176152,Request For Payment SUBMITTED by EMPLOYEE,2018-10-19 11:23:43,STAFF MEMBER
2,176152,Request For Payment APPROVED by ADMINISTRATION,2018-10-19 11:23:53,STAFF MEMBER
3,176152,Request For Payment FINAL_APPROVED by SUPERVISOR,2018-10-24 10:12:33,STAFF MEMBER
4,176152,Request Payment,2018-10-24 19:57:37,SYSTEM
...,...,...,...,...
7486,185000,Request For Payment APPROVED by ADMINISTRATION,2018-12-29 12:35:02,STAFF MEMBER
7487,185000,Request For Payment APPROVED by BUDGET OWNER,2019-01-03 09:27:20,STAFF MEMBER
7488,185000,Request For Payment FINAL_APPROVED by SUPERVISOR,2019-01-08 09:00:39,STAFF MEMBER
7489,185000,Request Payment,2019-01-08 09:29:14,SYSTEM


In [None]:
# Overlay the DATAEV histograms of the two splits to see how they are spread over time.
time_spread(df_train, df_test)

In [None]:
# Write both splits to CSV without the index column.
# NOTE(review): the train file goes into the `{subset}_prepared` sub-folder while
# the test file goes into its parent folder - confirm this asymmetry is intended.
# NOTE(review): `dataset` is not set in this section; presumably defined in an
# earlier cell, and the target folders are assumed to exist already - verify.
train_csv_path = f'./{dataset}/{subset}/{subset}_prepared/{subset}-TRAIN-CLEAN.csv'
test_csv_path = f'./{dataset}/{subset}/{subset}-TEST-CLEAN.csv'
df_train.to_csv(train_csv_path, index=False)
df_test.to_csv(test_csv_path, index=False)