# Import packages

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
from preprocessing import sort_log, debiasing, mapping_case_id, add_soc_eoc,\
    create_time_features, train_log_normalize, test_log_normalize,\
    train_mapping_event_name, test_mapping_event_name
from train_test_split import get_train_test_split_point, get_discard_case_list, \
    create_table_without_discard_case, get_train_val_case_list
from create_prefix_suffix import create_log_prefix, create_trace_prefix, \
create_trace_suffix, create_log_next

# Define parameters

## BPIC2017

In [None]:
# BPIC2017 settings: csv_path is read with pd.read_csv; the other four values
# are passed to debiasing() further down.
csv_path = '.../BPIC2017.csv'  # placeholder path -- point it at the local copy of the log
start_date = None  # no start date is supplied for this dataset
end_date = '2017-01'
max_duration = 47.81  # NOTE(review): unit is not visible here (presumably days) -- confirm in debiasing()
max_len = 87  # the prefix/suffix lengths below are derived as max_len + 1

num_act =
trace_prefix_len = 88  # max_len + 1 (SOC)
# This line used to assign trace_prefix_len a second time. The
# create_trace_suffix() calls read trace_suffix_len, which was never defined,
# so the EOC-side length is bound to that name instead.
trace_suffix_len = 88  # max_len + 1 (EOC)

## BPIC2019

In [None]:
# BPIC2019 settings: csv_path is read with pd.read_csv; the other four values
# are passed to debiasing() further down.
csv_path = '.../BPIC2019.csv'  # placeholder path -- point it at the local copy of the log
start_date = '2018-01'
end_date = '2019-02'
max_duration = 143.33  # NOTE(review): unit is not visible here (presumably days) -- confirm in debiasing()
max_len = 13  # the prefix/suffix lengths are derived as max_len + 1

num_act =
# Both lengths follow the max_len + 1 rule used for the other datasets
# (max_len = 13 for BPIC2019). The second line used to repeat trace_prefix_len,
# leaving trace_suffix_len (read by create_trace_suffix) undefined.
trace_prefix_len = 14  # max_len + 1 (SOC)
trace_suffix_len = 14  # max_len + 1 (EOC)

## BAC

In [None]:
# BAC settings: csv_path is read with pd.read_csv; start_date and end_date are
# passed to debiasing() further down.
csv_path = '.../BAC.csv'  # placeholder path -- point it at the local copy of the log
start_date = None  # no start date is supplied for this dataset
end_date = None  # no end date is supplied for this dataset
max_duration = 
max_len = 

num_act =
trace_prefix_len =
trace_prefix_len =

In [None]:
# test_ratio is passed to get_train_test_split_point() and the discard-case
# helpers; val_ratio is passed to get_train_val_case_list().
# NOTE: both are assigned again, with the same values, in a later parameter cell.
test_ratio = 0.2
val_ratio = 0.2

In [None]:
# NOTE: log_prefix_len is reassigned to 100 in the next parameter cell, so the
# value set here only applies if that cell is not run.
log_prefix_len = 200
trace_prefix_len = 14  # max_len + 1 (SOC)
# This line used to assign trace_prefix_len a second time, leaving
# trace_suffix_len (read by every create_trace_suffix call) undefined.
trace_suffix_len = 14  # max_len + 1 (EOC)
num_act = 40 # number of activity labels in training set (+4): 36

In [None]:
# Split ratios (same values as the earlier parameter cell).
test_ratio = 0.2
val_ratio = 0.2

# NOTE: this overrides the log_prefix_len = 200 assigned in the earlier cell.
log_prefix_len = 100

# Column lists handed to the log-level and trace-level tensor builders.
log_col_name = ['concept:name', 'log_ts_pre']
trace_col_name = ['concept:name', 'trace_ts_start', 'trace_ts_pre']
# categorical_features goes to the tensor builders; continuous_features goes to
# the log-normalisation helpers and the summary-statistics loop.
categorical_features = ['concept:name']
continuous_features = ['log_ts_pre', 'trace_ts_pre', 'trace_ts_start']
# Column names used to address the event log throughout the notebook.
case_id = 'case:concept:name'
timestamp = 'time:timestamp'
event_name = 'concept:name'
event_idx = 'event_idx'  # passed to create_time_features() and the tensor builders

# Prepare preprocessed full event log

In [None]:
# 1. Transform csv to dataframe
orn_df = pd.read_csv(csv_path)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
orn_df.info()
# orn_df = orn_df.loc[:,[case_id, timestamp, event_name]]
print("Number of cases:", orn_df[case_id].nunique())
print("Number of activity labels:", orn_df[event_name].nunique())

In [None]:
# 2. Sort dataframe by timestamp
# sort_log comes from the local preprocessing module; the raw frame orn_df is
# kept under its own name and the sorted result continues as df.
df = sort_log(orn_df,timestamp)

In [None]:
# 3. Remove duplicates
# Drop the first column by position before de-duplicating.
# NOTE(review): presumably this is a row-index column stored in the CSV, which
# would otherwise make every row unique -- confirm against the file.
df_withdup = df.iloc[:, 1:]

In [None]:
# Rows are compared on all remaining columns; the first occurrence of each
# duplicated row is kept.
df = df_withdup.drop_duplicates(keep='first')

In [None]:
# Keep only the case id, timestamp and activity-name columns (in that order).
df = df.loc[:,[case_id, timestamp, event_name]]
df.info()

In [None]:
# 4. Debiasing and cleaning
# debiasing() returns the filtered log together with end_timestamp, which is
# used later to locate the end of the test range.
df, end_timestamp = debiasing(df, start_date, end_date, max_duration, max_len, case_id, timestamp)
# info() prints directly and returns None; print(df.info()) added a stray "None".
df.info()
print("Number of retaining cases:", df[case_id].nunique())

In [None]:
print(end_timestamp)

In [None]:
print("Number of activity labels:", df[event_name].nunique())

In [None]:
# 5. Map case ID to numbers
# mapping_case_id returns the updated frame together with the mapping
# dictionary; the dictionary size and the first rows are printed as a check.
df, case_id_dict= mapping_case_id(df, case_id)
print(len(case_id_dict))
print(df.head(20))

In [None]:
# 6. Insert SOC and EOC rows
df = add_soc_eoc(df, case_id, timestamp, event_name)
# info() prints directly and returns None; print(df.info()) added a stray "None".
df.info()
print(df.head(20))

In [None]:
# 7. Create time features
df = create_time_features(df, case_id, timestamp, event_idx)
# info() prints directly and returns None; print(df.info()) added a stray "None".
df.info()
print(df.head(20))

In [None]:
print(df[df[event_name] == 'SOC'])

In [None]:
print(df[df[event_name] == 'EOC'])

In [None]:
# Print min, quartiles, mean, max, median and standard deviation for every
# continuous time feature, one statistic per line.
for feature in continuous_features:
    series = df[feature]
    summary = (
        ('Min', series.min()),
        ('Q1:', series.quantile(0.25)),
        ('Mean', series.mean()),
        ('Q3:', series.quantile(0.75)),
        ('Max', series.max()),
        ('Median', series.median()),
        ('Std', series.std()),
    )
    for label, value in summary:
        print(feature, label, value)

In [None]:
# 8. Obtain max_value and min_value from training set
## First locate the train/test boundary: the helper returns the split
## timestamp and the matching row position in df.
train_test_split_time, train_test_split_idx = get_train_test_split_point(
    df, test_ratio, case_id, timestamp
)

In [None]:
print(train_test_split_time)

In [None]:
print(train_test_split_idx)

In [None]:
print(df.iloc[train_test_split_idx-3:train_test_split_idx+5])

In [None]:
# Partition events by timestamp: strictly earlier than the split time goes to
# the train/validation side, everything at or after it to the test side.
df_before_split = df[df[timestamp] < train_test_split_time]
df_after_split = df[df[timestamp] >= train_test_split_time]

In [None]:
# Event and case counts on the train/validation side of the split.
# (A stray double quote inside the printed labels has been removed.)
print('Number of events before train/test split:', len(df_before_split))
print('Number of cases before train/test split:', df_before_split[case_id].nunique())

In [None]:
# Event and case counts on the test side of the split.
# (A stray double quote inside the printed labels has been removed.)
print('Number of events after train/test split:', len(df_after_split))
print('Number of cases after train/test split:', df_after_split[case_id].nunique())

In [None]:
## Divide the pre-split cases into a training list and a validation list
train_case_list, val_case_list = get_train_val_case_list(
    df_before_split, val_ratio, case_id, timestamp
)

In [None]:
print('Number of training cases:', len(train_case_list))
print('Number of validation cases:', len(val_case_list))

In [None]:
## Get training cases, based on which max_dict, min_dict and event_name_dict will be computed
## (the validation events are separated out the same way)
training_df = df_before_split[df_before_split[case_id].isin(train_case_list)]
val_df = df_before_split[df_before_split[case_id].isin(val_case_list)]

In [None]:
print('Number of training events:', len(training_df))
print('Number of validation events:', len(val_df))
print("Number of activity labels in training_df:", training_df[event_name].nunique())

In [None]:
## Get max value and min value for all continuous features from training set
## Only the two dictionaries are kept; the first return value is discarded.
_, max_dict, min_dict = train_log_normalize(training_df, continuous_features)

In [None]:
print(max_dict)

In [None]:
print(min_dict)

In [None]:
# torch.save(max_dict, 'max_dict_2019.pt')
# torch.save(min_dict, 'min_dict_2019.pt')

In [None]:
# 9. Log-normalize time features
# The whole log (training, validation and test events) is transformed with the
# max/min values obtained from the training cases only.
df = test_log_normalize(df,
                        max_dict,
                        min_dict,
                        continuous_features)

In [None]:
df.info()

In [None]:
print(df[df[event_name] == 'SOC'])

In [None]:
print(df[df[event_name] == 'EOC'])

In [None]:
# 10. Map event name to numbers
# The mapping is built from the training cases only and then applied to the
# whole log; the second helper also returns the dictionary it ended up using.
_, event_name_dict = train_mapping_event_name(training_df, event_name)
df, test_event_name_dict = test_mapping_event_name(df, event_name_dict, event_name)

In [None]:
print(event_name_dict)

In [None]:
test_event_name_dict

In [None]:
df.head(20)

In [None]:
print("Number of activity labels:", df[event_name].nunique())

# Create train tensors and validation tensors

## Train tensors

In [None]:
# Log-prefix tensor for the training cases. Rows are bounded by start_idx=0 and
# end_idx=len(df_before_split) (presumably the events before the train/test split).
train_log_prefix_tensor = create_log_prefix(
    df=df,
    log_prefix_len=log_prefix_len,
    case_list=train_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    num_act=num_act,
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)
print(train_log_prefix_tensor.shape)

In [None]:
# Trace-prefix tensor for the training cases, over the same row range as the
# training log prefixes.
train_trace_prefix_tensor = create_trace_prefix(
    df=df,
    trace_prefix_len=trace_prefix_len,
    case_list=train_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    num_act=num_act,
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(train_trace_prefix_tensor.shape)

In [None]:
# Suffix tensors for the training cases: one for activities and two for the
# time features (start-based and previous-event-based, going by the names).
(
    train_suffix_act_tensor,
    train_suffix_time_start_tensor,
    train_suffix_time_pre_tensor,
) = create_trace_suffix(
    df=df,
    trace_suffix_len=trace_suffix_len,
    case_list=train_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(train_suffix_act_tensor.shape)
print(train_suffix_time_start_tensor.shape)
print(train_suffix_time_pre_tensor.shape)

In [None]:
# Log-level next-activity and next-time tensors for the training cases.
train_log_next_act_tensor, train_log_next_time_tensor = create_log_next(
    df=df,
    case_list=train_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(train_log_next_act_tensor.shape)
print(train_log_next_time_tensor.shape)

In [None]:
# Write the training log-prefix tensor to a .pt file (relative path, i.e. the working directory).
torch.save(train_log_prefix_tensor, 'train_log_prefix_tensor_0101.pt')

In [None]:
# Write the training trace-prefix tensor to a .pt file in the working directory.
torch.save(train_trace_prefix_tensor,'train_trace_prefix_tensor_0101.pt')

In [None]:
# Write the training suffix-activity tensor to a .pt file in the working directory.
torch.save(train_suffix_act_tensor, 'train_suffix_act_tensor_0101.pt')

In [None]:
# Write the training suffix time-start tensor to a .pt file in the working directory.
torch.save(train_suffix_time_start_tensor, 'train_suffix_time_start_tensor_0101.pt')

In [None]:
# Write the training suffix time-pre tensor to a .pt file in the working directory.
torch.save(train_suffix_time_pre_tensor, 'train_suffix_time_pre_tensor_0101.pt')

In [None]:
# Write the training next-activity tensor to a .pt file in the working directory.
torch.save(train_log_next_act_tensor, 'train_log_next_act_tensor_0101.pt')

In [None]:
# Write the training next-time tensor to a .pt file in the working directory.
torch.save(train_log_next_time_tensor, 'train_log_next_time_tensor_0101.pt')

## Validation tensors

In [None]:
# Log-prefix tensor for the validation cases; same row range as the training
# tensors, only the case list differs.
val_log_prefix_tensor = create_log_prefix(
    df=df,
    log_prefix_len=log_prefix_len,
    case_list=val_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    num_act=num_act,
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(val_log_prefix_tensor.shape)

In [None]:
# Trace-prefix tensor for the validation cases.
val_trace_prefix_tensor = create_trace_prefix(
    df=df,
    trace_prefix_len=trace_prefix_len,
    case_list=val_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    num_act=num_act,
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(val_trace_prefix_tensor.shape)

In [None]:
# Suffix tensors (activity, time-start, time-pre) for the validation cases.
(
    val_suffix_act_tensor,
    val_suffix_time_start_tensor,
    val_suffix_time_pre_tensor,
) = create_trace_suffix(
    df=df,
    trace_suffix_len=trace_suffix_len,
    case_list=val_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(val_suffix_act_tensor.shape)
print(val_suffix_time_start_tensor.shape)
print(val_suffix_time_pre_tensor.shape)

In [None]:
# Log-level next-activity and next-time tensors for the validation cases.
val_log_next_act_tensor, val_log_next_time_tensor = create_log_next(
    df=df,
    case_list=val_case_list,
    start_idx=0,
    end_idx=len(df_before_split),
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)

In [None]:
print(val_log_next_act_tensor.shape)
print(val_log_next_time_tensor.shape)

In [None]:
# Write the validation log-prefix tensor to a .pt file in the working directory.
torch.save(val_log_prefix_tensor, 'val_log_prefix_tensor_0101.pt')

In [None]:
# Write the validation trace-prefix tensor to a .pt file in the working directory.
torch.save(val_trace_prefix_tensor, 'val_trace_prefix_tensor_0101.pt')

In [None]:
# Write the validation suffix-activity tensor to a .pt file in the working directory.
torch.save(val_suffix_act_tensor, 'val_suffix_act_tensor_0101.pt')

In [None]:
# Write the validation suffix time-start tensor to a .pt file in the working directory.
torch.save(val_suffix_time_start_tensor, 'val_suffix_time_start_tensor_0101.pt')

In [None]:
# Write the validation suffix time-pre tensor to a .pt file in the working directory.
torch.save(val_suffix_time_pre_tensor, 'val_suffix_time_pre_tensor_0101.pt')

In [None]:
# Write the validation next-activity tensor to a .pt file in the working directory.
torch.save(val_log_next_act_tensor, 'val_log_next_act_tensor_0101.pt')

In [None]:
# Write the validation next-time tensor to a .pt file in the working directory.
torch.save(val_log_next_time_tensor, 'val_log_next_time_tensor_0101.pt')

# Create test tensors

In [None]:
# Case ids that the split helper marks for removal.
# NOTE(review): presumably cases that straddle the train/test boundary --
# confirm in train_test_split.get_discard_case_list.
discard_case_list = get_discard_case_list(df, test_ratio, case_id, timestamp)

In [None]:
# Events belonging to the discarded cases, summarised for inspection only.
discard_cases = df[df[case_id].isin(discard_case_list)]
discard_cases.info()

In [None]:
print("Number of discard cases:", len(discard_case_list))

In [None]:
# Build the test frame: take the log without the discarded cases (as returned
# by the helper) and keep only events at or after the train/test split time.
df_no_discard = create_table_without_discard_case(df, test_ratio, case_id, timestamp)
test_df = df_no_discard[df_no_discard[timestamp] >= train_test_split_time]

In [None]:
print(test_df.head(20))

In [None]:
# Distinct case ids that have at least one event in test_df, as a plain list.
test_case_list = test_df[case_id].unique().tolist()

In [None]:
# Index of the first event whose timestamp is later than end_timestamp; passed
# as end_idx to the test tensor builders.
# NOTE(review): .index[0] yields an index *label* and raises IndexError when no
# event lies after end_timestamp -- confirm that df carries a 0..n-1 index at
# this point and that such an event always exists.
test_end_idx = df[df[timestamp] > end_timestamp].index[0]
test_end_idx

In [None]:
# Log-prefix tensor for the test cases, over rows from train_test_split_idx up
# to test_end_idx.
test_log_prefix_tensor = create_log_prefix(
    df=df,
    log_prefix_len=log_prefix_len,
    case_list=test_case_list,
    start_idx=train_test_split_idx,
    end_idx=test_end_idx,
    num_act=num_act,
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)
print(test_log_prefix_tensor.shape)

In [None]:
# Write the test log-prefix tensor to a .pt file in the working directory.
torch.save(test_log_prefix_tensor, 'test_log_prefix_tensor_0101.pt')

In [None]:
# Trace-prefix tensor for the test cases, over the same row range as the test
# log prefixes.
test_trace_prefix_tensor = create_trace_prefix(
    df=df,
    trace_prefix_len=trace_prefix_len,
    case_list=test_case_list,
    start_idx=train_test_split_idx,
    end_idx=test_end_idx,
    num_act=num_act,
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)
print(test_trace_prefix_tensor.shape)

In [None]:
# Write the test trace-prefix tensor to a .pt file in the working directory.
torch.save(test_trace_prefix_tensor, 'test_trace_prefix_tensor_0101.pt')

In [None]:
# Suffix tensors (activity, time-start, time-pre) for the test cases, followed
# by a shape check on each.
(
    test_suffix_act_tensor,
    test_suffix_time_start_tensor,
    test_suffix_time_pre_tensor,
) = create_trace_suffix(
    df=df,
    trace_suffix_len=trace_suffix_len,
    case_list=test_case_list,
    start_idx=train_test_split_idx,
    end_idx=test_end_idx,
    trace_col_name=trace_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)
print(test_suffix_act_tensor.shape)
print(test_suffix_time_start_tensor.shape)
print(test_suffix_time_pre_tensor.shape)

In [None]:
# Write the three test suffix tensors to .pt files in the working directory.
torch.save(test_suffix_act_tensor, 'test_suffix_act_tensor_0101.pt')
torch.save(test_suffix_time_start_tensor, 'test_suffix_time_start_tensor_0101.pt')
torch.save(test_suffix_time_pre_tensor, 'test_suffix_time_pre_tensor_0101.pt')

In [None]:
# Log-level next-activity and next-time tensors for the test cases, followed by
# a shape check on each.
test_log_next_act_tensor, test_log_next_time_tensor = create_log_next(
    df=df,
    case_list=test_case_list,
    start_idx=train_test_split_idx,
    end_idx=test_end_idx,
    log_col_name=log_col_name,
    categorical_features=categorical_features,
    case_id=case_id,
    event_name=event_name,
    event_idx=event_idx,
)
print(test_log_next_act_tensor.shape)
print(test_log_next_time_tensor.shape)

In [None]:
# Write the two test next-event tensors to .pt files in the working directory.
torch.save(test_log_next_act_tensor, 'test_log_next_act_tensor_0101.pt')
torch.save(test_log_next_time_tensor, 'test_log_next_time_tensor_0101.pt')