# Data preprocessing for event attributes





In [1]:
# optional 
# import pip
# def install(package):
#     pip.main(["install",package])
# install("pm4py")

# install("missingno")
# import missingno as msno

# install("pycox")
# install("scikit-learn")  # PyPI name of the package imported as `sklearn`
# install("torch")
# install("sklearn_pandas")


In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch

In [None]:
# Seed every random number generator this notebook imports so that a
# Restart & Run All reproduces the same results.
# The stdlib `random` module is imported above but was never seeded; it now
# shares NumPy's seed. The NumPy and torch seeds are kept at their original
# values so previously generated results do not change.
random.seed(1234)
np.random.seed(1234)
_ = torch.manual_seed(123)  # bind the return value so the cell shows no output

In [None]:
# Load the raw event log (one row per event) from the working directory.
df = pd.read_csv('2017.csv')

# Sorted distinct values of each categorical event attribute. The bucketing
# loop further down creates one count column per value listed here.
# Missing values are dropped first: NaN is a float, so sorting it together
# with strings would raise a TypeError, and NaN is never counted by
# `value_counts` anyway.
unique_values = {
    column: sorted(df[column].dropna().unique())
    for column in ['Action', 'concept:name', 'EventOrigin']
}

In [5]:
# Preview the first rows of the raw event log to check how the columns were parsed.
df.head()

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,


In [6]:
# Derive the working frame from the raw log without touching `df` itself:
# `assign` returns a new frame in which 'time:timestamp' holds parsed
# datetimes and every other column is carried over unchanged.
data = df.assign(**{'time:timestamp': pd.to_datetime(df['time:timestamp'])})

## Prefix length bucketing 



In [None]:
# Build one feature table ("bucket") per prefix length 1..MAX_PREFIX_LENGTH.
MAX_PREFIX_LENGTH = 20
CASE_COL = 'case:concept:name'
TIME_COL = 'time:timestamp'


def build_prefix_bucket(events, prefix_length, attribute_values):
    """Summarise the first `prefix_length` events of every case.

    Parameters
    ----------
    events : pd.DataFrame
        Event log whose rows are already in chronological order, so that the
        first rows of each case are its earliest events. Must contain
        CASE_COL, a datetime TIME_COL and every key of `attribute_values`.
    prefix_length : int
        Maximum number of events kept per case; shorter cases keep all
        of their events.
    attribute_values : dict
        Maps an event-attribute column name to the list of values that each
        get their own count column. Value names are assumed to be unique
        across attributes; a repeated name would produce duplicate columns.

    Returns
    -------
    pd.DataFrame
        One row per case, ordered by case id, with the columns CASE_COL
        (as str), 'Duration' (seconds between the first and last event of
        the prefix) and one integer count column per listed attribute value.
    """
    prefix = events.groupby(CASE_COL, sort=False).head(prefix_length)

    # Elapsed time covered by the prefix, computed with vectorised min/max.
    span = prefix.groupby(CASE_COL)[TIME_COL].agg(['min', 'max'])
    duration = (span['max'] - span['min']).dt.total_seconds().rename('Duration')

    # One contingency table per attribute. Reindexing guarantees the same
    # rows (every case) and columns (every known value) in every bucket,
    # with 0 for combinations that never occur in this prefix.
    count_tables = [
        pd.crosstab(prefix[CASE_COL], prefix[column])
        .reindex(index=span.index, columns=values, fill_value=0)
        for column, values in attribute_values.items()
    ]

    # Every piece shares `span.index`, so a column-wise concat lines up
    # exactly and no merge is required.
    features = pd.concat([duration, *count_tables], axis=1)
    features.index = features.index.astype(str)
    return features.rename_axis(CASE_COL).reset_index()


# Order the log chronologically once, before any prefix is taken, so that
# `head(n)` really selects the first n events of each case. Mergesort is
# stable: events with identical timestamps keep their order from the file.
events_sorted = data.sort_values(TIME_COL, kind='mergesort')

dfs = [
    build_prefix_bucket(events_sorted, prefix_length, unique_values)
    for prefix_length in range(1, MAX_PREFIX_LENGTH + 1)
]


In [None]:
# Expose every bucket as a notebook-level variable data1, data2, ... and
# write it to ./buckets/data<k>.csv.
# The loop variable is deliberately not called `df`: that name holds the raw
# event log loaded at the top of the notebook and must not be overwritten,
# otherwise re-running the earlier `data = df.copy()` cell would silently
# start from the last bucket instead of the raw log.
os.makedirs('./buckets', exist_ok=True)  # to_csv does not create folders

for i, bucket in enumerate(dfs, start=1):
    # Dynamic globals are kept so that code referring to data1..dataN keeps
    # working; `dfs[k - 1]` is the equivalent direct access.
    globals()[f'data{i}'] = bucket
    bucket.to_csv(f'./buckets/data{i}.csv', index=False)