# Import packages

In [76]:
# import packages

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import os
import random
import math
from datetime import datetime
import ast
from collections import Counter
from collections import defaultdict
from pm4py.objects.conversion.log import converter as log_converter
import scipy.stats as stats
from scipy.stats import chi2, norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Input data and set thresholds (the only cell that needs to be updated by the user)

In [78]:
# name of the folder (joined to the current working directory below) that stores the train logs
train_log_set_package_name = 'train_log_set'

# name of the folder (joined to the current working directory below) that stores the test logs
test_log_set_package_name = 'test_log_set'

# name of the folder that stores the outputs (the intermediate output after Step 2, and the final event logs with inferred resources after Step 3)
generated_log_set_package_name = 'generated_log_set'

# define the number of distinct resources to infer
num_resources = 20

# Read and preprocess data

In [79]:
# all input/output folders are resolved relative to the current working directory
current_dir = os.getcwd()
timestamp_format = "%Y-%m-%d %H:%M:%S"

train_set_path = os.path.join(current_dir, train_log_set_package_name)
train_logs_list = os.listdir(train_set_path)

# dictionary {log name: (dataframe, timestamp format)} holding the train logs;
# timestamps are parsed as UTC, made timezone-naive and stored back as strings
df_dic_with_time_train = {}
for train_log in train_logs_list:
    log_name = train_log.replace(".csv", "")
    train_log_path = os.path.join(train_set_path, train_log)
    df_train = pd.read_csv(train_log_path)
    df_train['timestamp'] = (
        pd.to_datetime(df_train['timestamp'], utc=True)
        .dt.tz_localize(None)
        .dt.strftime(timestamp_format)
    )
    df_dic_with_time_train[log_name] = (df_train, timestamp_format)

test_set_path = os.path.join(current_dir, test_log_set_package_name)
test_logs_list = os.listdir(test_set_path)

# dictionary {log name: (dataframe, timestamp format)} holding the test logs,
# normalised in exactly the same way as the train logs
df_dic_with_time_test = {}
for test_log in test_logs_list:
    log_name = test_log.replace(".csv", "")
    test_log_path = os.path.join(test_set_path, test_log)
    df_test = pd.read_csv(test_log_path)
    df_test['timestamp'] = (
        pd.to_datetime(df_test['timestamp'], utc=True)
        .dt.tz_localize(None)
        .dt.strftime(timestamp_format)
    )
    df_dic_with_time_test[log_name] = (df_test, timestamp_format)

# record the path of the output package (it stores the intermediate output after Step 2, and the final event logs with inferred resources after Step 3)
package_path = os.path.join(current_dir, generated_log_set_package_name)

# Extract basic knowledge from train logs (functions)

In [80]:
# this cell extracts resource and activity knowledge from event logs

# extract basic resource and handover knowledge from a single train log
def get_case_resource_act_knowledge(mas_log):
    """Summarise every trace of a log with four counts.

    Args:
        mas_log: sequence of traces; every trace is a sequence of events that
            expose 'concept:name' (activity) and 'agent_id' (resource).

    Returns:
        dict mapping 'trace_<position in mas_log>' to the list
        [number of events, number of distinct activities,
        number of distinct resources, number of handovers], where a handover
        is a pair of consecutive events with different 'agent_id' values.
    """
    case_knowledge = {}
    for trace_idx, trace in enumerate(mas_log):
        activities = [event['concept:name'] for event in trace]
        agents = [event['agent_id'] for event in trace]
        # compare every event with its direct successor inside the same trace
        handovers = sum(
            1
            for prev_agent, next_agent in zip(agents, agents[1:])
            if prev_agent != next_agent
        )
        case_knowledge[f'trace_{trace_idx}'] = [
            len(activities),
            len(set(activities)),
            len(set(agents)),
            handovers,
        ]

    return case_knowledge


# extract basic activity and resource knowledge from a single train log
def get_theory_one_information_from_log(mas_log):
    """Count how often every resource executes every activity in a log.

    Args:
        mas_log: sequence of traces; every trace is a sequence of events that
            expose 'agent_id' (resource) and 'concept:name' (activity).

    Returns:
        A 5-tuple:
        - resource_action_dic: {resource: {activity: count}}
        - action_resource_dic: {activity: {resource: count}}
        - diff_resource_list: distinct resources in order of first appearance
        - diff_task_list: distinct activities in order of first appearance
        - whole_act_num: total number of events in the log
    """
    resource_action_dic = {}
    action_resource_dic = {}
    whole_act_num = 0

    for trace in mas_log:
        for event in trace:
            whole_act_num += 1
            resource = event['agent_id']
            task_name = event['concept:name']

            tasks_of_resource = resource_action_dic.setdefault(resource, {})
            tasks_of_resource[task_name] = tasks_of_resource.get(task_name, 0) + 1

            resources_of_task = action_resource_dic.setdefault(task_name, {})
            resources_of_task[resource] = resources_of_task.get(resource, 0) + 1

    # dicts keep insertion order, so their keys are the first-seen ordering
    diff_resource_list = list(resource_action_dic)
    diff_task_list = list(action_resource_dic)

    return resource_action_dic, action_resource_dic, diff_resource_list, diff_task_list, whole_act_num

# Phase 1: Extraction
### Extract Domain 1: handover count frequency per case (functions)

In [81]:
# Phase 1: Extraction

# extract the Domain 1 (D1: handover count frequency per case)
def get_case_hand_knowledge(df_dic):
    """Extract Domain 1 (D1): the handover count frequency per case.

    For every train log the number of handovers per case is computed, four
    candidate distributions (Poisson, Exponential, Geometric, Constant) are
    fitted, and the one with the smallest squared distance to the empirical
    frequencies "wins" that log. The distribution winning most logs is
    returned; ties are broken by the smaller sum of distances.

    Args:
        df_dic: dict {log name: (dataframe, timestamp format)}. Only the
            dataframe is used; its columns 'case_id', 'activity_type' and
            'timestamp' are renamed to the pm4py names before conversion and
            'agent_id' is read by get_case_resource_act_knowledge.

    Returns:
        list [distribution name, (ci_low, ci_high), zero_share_per_log] where
        the interval spans from the smallest lower to the largest upper 95%
        confidence bound of the parameter over the logs won by the chosen
        distribution, and zero_share_per_log holds, per log, the share of
        cases without any handover.

    Raises:
        ValueError: if a train log contains no cases.
    """
    alpha = 0.05  # two-sided 95% confidence intervals
    z_value = norm.ppf(1 - alpha / 2)

    def get_square_distance(l1, l2):
        diff = np.asarray(l1, dtype=float) - np.asarray(l2, dtype=float)
        return float(np.sum(diff ** 2))

    def get_poisson_CI(data):
        # exact (chi-square based) interval for the Poisson mean
        total = sum(data)
        # the lower chi-square quantile is undefined for 0 degrees of freedom
        lower = 0.5 * chi2.ppf(alpha / 2, 2 * total) if total > 0 else 0.0
        upper = 0.5 * chi2.ppf(1 - alpha / 2, 2 * total + 2)
        return (lower / len(data), upper / len(data))

    def get_exp_CI(data):
        # interval for the rate: chi2 quantiles with 2n degrees of freedom / (2 * sum)
        n = len(data)
        total = sum(data)
        if total == 0:
            # the rate estimate 1 / mean is unbounded when every value is 0
            return (np.inf, np.inf)
        lower = chi2.ppf(alpha / 2, df=2 * n) / (2 * total)
        upper = chi2.ppf(1 - alpha / 2, df=2 * n) / (2 * total)
        return (lower, upper)

    def get_geom_CI(data):
        n = len(data)
        x_bar = np.mean(data)
        s = np.std(data, ddof=1)
        p_hat = 1 / (x_bar + 1)
        # delta method: d/dx [1 / (x + 1)] = -1 / (x + 1)^2 = -p_hat^2
        se_p_hat = s * p_hat ** 2 / np.sqrt(n)
        return (p_hat - z_value * se_p_hat, p_hat + z_value * se_p_hat)

    def get_cons_CI(data):
        centre = np.mean(data)
        margin_error = z_value * np.std(data, ddof=1) / np.sqrt(len(data))
        return (centre - margin_error, centre + margin_error)

    def get_possible_interval(win_num_dic):
        # widest interval over all logs won by each distribution
        methods_ci_dic = {}
        for key, tups in win_num_dic.items():
            if tups:
                methods_ci_dic[key] = (
                    min(tup[3][0] for tup in tups),
                    max(tup[3][1] for tup in tups),
                )
        return methods_ci_dic

    def get_sum_distance(tup_l):
        return sum(tup[1] for tup in tup_l)

    win_num_dic = {'Poisson': [], 'Exponential': [], 'Geometric': [], 'Constant': []}
    num_zero_hand_case_percent = []
    for df_name, df_data in df_dic.items():
        df = df_data[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
        unclustered_log = log_converter.apply(df)
        exp_1 = get_case_resource_act_knowledge(unclustered_log)

        # index 3 of every case entry is its handover count
        data = [value[3] for value in exp_1.values()]
        if not data:
            raise ValueError(f"train log '{df_name}' contains no cases")

        mean_value = np.mean(data)
        lambda_hat = mean_value  # Poisson MLE
        p_geom_hat = 1 / (mean_value + 1)  # Geometric MLE (support starting at 0)
        c_constant = mean_value

        # empirical frequencies on 0..max(data), the largest value included
        freq = Counter(data)
        values_l = list(range(max(data) + 1))
        counts_prob_l = np.array([freq.get(v, 0) for v in values_l], dtype=float) / len(data)
        num_zero_hand_case_percent.append(freq.get(0, 0) / len(data))

        poisson_pred_l = stats.poisson.pmf(values_l, lambda_hat)
        # scipy's geometric distribution starts at 1; shift it so that it starts at 0
        geom_pred_l = stats.geom.pmf(values_l, p_geom_hat, loc=-1)
        constant_pred_l = [1 / len(values_l)] * len(values_l)

        pos_v = get_square_distance(counts_prob_l, poisson_pred_l)
        geom_v = get_square_distance(counts_prob_l, geom_pred_l)
        const_v = get_square_distance(counts_prob_l, constant_pred_l)
        if mean_value > 0:
            lambda_exp_hat = 1 / mean_value
            # scipy parameterises the exponential by scale = 1 / rate
            exp_v = get_square_distance(counts_prob_l, stats.expon.pdf(values_l, scale=mean_value))
        else:
            # no finite rate exists when every case has 0 handovers
            lambda_exp_hat = np.inf
            exp_v = np.inf

        distances = {
            'Poisson': (pos_v, lambda_hat, get_poisson_CI(data)),
            'Exponential': (exp_v, lambda_exp_hat, get_exp_CI(data)),
            'Geometric': (geom_v, p_geom_hat, get_geom_CI(data)),
            'Constant': (const_v, c_constant, get_cons_CI(data)),
        }
        best_model = min(distances, key=lambda k: distances[k][0])
        MLE_distance, param_estimate, ci = distances[best_model]
        win_num_dic[best_model].append((df_name, MLE_distance, param_estimate, ci))

    # our strategy is to compare the number of wins of these distributions first; if several distributions have the same number, we compare their sum(distance)
    win_max_num = max(len(v) for v in win_num_dic.values())
    methods_ci_dic = get_possible_interval(win_num_dic)
    win_distri_l = [k for k, v in win_num_dic.items() if len(v) == win_max_num]

    best_method = win_distri_l[0]
    min_dist_sum = get_sum_distance(win_num_dic[best_method])
    for method in win_distri_l[1:]:
        dist_sum = get_sum_distance(win_num_dic[method])
        if dist_sum <= min_dist_sum:
            min_dist_sum = dist_sum
            best_method = method
    return [best_method, methods_ci_dic[best_method], num_zero_hand_case_percent]
    

### Extract Domain 2: distinct resource count frequency per case (functions)

In [82]:
# Phase 1: Extraction

# extract the Domain 2 (D2: distinct resource count frequency per case)
def get_case_resource_knowledge(df_dic):
    """Extract Domain 2 (D2): the distinct resource count frequency per case.

    For every train log the number of distinct resources per case minus one
    is computed (so that the values start at 0), four candidate distributions
    (Poisson, Exponential, Geometric, Constant) are fitted, and the one with
    the smallest squared distance to the empirical frequencies "wins" that
    log. The distribution winning most logs is returned; ties are broken by
    the smaller sum of distances.

    Args:
        df_dic: dict {log name: (dataframe, timestamp format)}. Only the
            dataframe is used; its columns 'case_id', 'activity_type' and
            'timestamp' are renamed to the pm4py names before conversion and
            'agent_id' is read by get_case_resource_act_knowledge.

    Returns:
        list [distribution name, (ci_low, ci_high), zero_share_per_log] where
        the interval spans from the smallest lower to the largest upper 95%
        confidence bound of the parameter over the logs won by the chosen
        distribution, and zero_share_per_log holds, per log, the share of
        cases whose shifted value is 0 (exactly one distinct resource).

    Raises:
        ValueError: if a train log contains no cases.
    """
    alpha = 0.05  # two-sided 95% confidence intervals
    z_value = norm.ppf(1 - alpha / 2)

    def get_square_distance(l1, l2):
        diff = np.asarray(l1, dtype=float) - np.asarray(l2, dtype=float)
        return float(np.sum(diff ** 2))

    def get_poisson_CI(data):
        # exact (chi-square based) interval for the Poisson mean
        total = sum(data)
        # the lower chi-square quantile is undefined for 0 degrees of freedom
        lower = 0.5 * chi2.ppf(alpha / 2, 2 * total) if total > 0 else 0.0
        upper = 0.5 * chi2.ppf(1 - alpha / 2, 2 * total + 2)
        return (lower / len(data), upper / len(data))

    def get_exp_CI(data):
        # interval for the rate: chi2 quantiles with 2n degrees of freedom / (2 * sum)
        n = len(data)
        total = sum(data)
        if total == 0:
            # the rate estimate 1 / mean is unbounded when every value is 0
            return (np.inf, np.inf)
        lower = chi2.ppf(alpha / 2, df=2 * n) / (2 * total)
        upper = chi2.ppf(1 - alpha / 2, df=2 * n) / (2 * total)
        return (lower, upper)

    def get_geom_CI(data):
        n = len(data)
        x_bar = np.mean(data)
        s = np.std(data, ddof=1)
        p_hat = 1 / (x_bar + 1)
        # delta method: d/dx [1 / (x + 1)] = -1 / (x + 1)^2 = -p_hat^2
        se_p_hat = s * p_hat ** 2 / np.sqrt(n)
        return (p_hat - z_value * se_p_hat, p_hat + z_value * se_p_hat)

    def get_cons_CI(data):
        centre = np.mean(data)
        margin_error = z_value * np.std(data, ddof=1) / np.sqrt(len(data))
        return (centre - margin_error, centre + margin_error)

    def get_possible_interval(win_num_dic):
        # widest interval over all logs won by each distribution
        methods_ci_dic = {}
        for key, tups in win_num_dic.items():
            if tups:
                methods_ci_dic[key] = (
                    min(tup[3][0] for tup in tups),
                    max(tup[3][1] for tup in tups),
                )
        return methods_ci_dic

    def get_sum_distance(tup_l):
        return sum(tup[1] for tup in tup_l)

    # NOTE(review): 'Logarithmic' is listed but never fitted in this function, so it cannot win a log
    win_num_dic = {'Poisson': [], 'Exponential': [], 'Logarithmic': [], 'Geometric': [], 'Constant': []}
    num_zero_res_case_percent = []
    for df_name, df_data in df_dic.items():
        df = df_data[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
        unclustered_log = log_converter.apply(df)
        exp_1 = get_case_resource_act_knowledge(unclustered_log)

        # index 2 of every case entry is its distinct resource count;
        # subtract 1 so the values start at 0 like the fitted distributions
        data = [value[2] - 1 for value in exp_1.values()]
        if not data:
            raise ValueError(f"train log '{df_name}' contains no cases")

        mean_value = np.mean(data)
        lambda_hat = mean_value  # Poisson MLE
        p_geom_hat = 1 / (mean_value + 1)  # Geometric MLE (support starting at 0)
        c_constant = mean_value

        # empirical frequencies on 0..max(data), the largest value included
        freq = Counter(data)
        values_l = list(range(max(max(data), 0) + 1))
        counts_prob_l = np.array([freq.get(v, 0) for v in values_l], dtype=float) / len(data)
        num_zero_res_case_percent.append(freq.get(0, 0) / len(data))

        poisson_pred_l = stats.poisson.pmf(values_l, lambda_hat)
        # scipy's geometric distribution starts at 1; shift it so that it starts at 0
        geom_pred_l = stats.geom.pmf(values_l, p_geom_hat, loc=-1)
        constant_pred_l = [1 / len(values_l)] * len(values_l)

        pos_v = get_square_distance(counts_prob_l, poisson_pred_l)
        geom_v = get_square_distance(counts_prob_l, geom_pred_l)
        const_v = get_square_distance(counts_prob_l, constant_pred_l)
        if mean_value > 0:
            lambda_exp_hat = 1 / mean_value
            # scipy parameterises the exponential by scale = 1 / rate
            exp_v = get_square_distance(counts_prob_l, stats.expon.pdf(values_l, scale=mean_value))
        else:
            # no finite rate exists when every case has a single resource
            lambda_exp_hat = np.inf
            exp_v = np.inf

        distances = {
            'Poisson': (pos_v, lambda_hat, get_poisson_CI(data)),
            'Exponential': (exp_v, lambda_exp_hat, get_exp_CI(data)),
            'Geometric': (geom_v, p_geom_hat, get_geom_CI(data)),
            'Constant': (const_v, c_constant, get_cons_CI(data)),
        }
        best_model = min(distances, key=lambda k: distances[k][0])
        MLE_distance, param_estimate, ci = distances[best_model]
        win_num_dic[best_model].append((df_name, MLE_distance, param_estimate, ci))

    # our strategy is to compare the number of wins of these distributions first; if several distributions have the same number, we compare their sum(distance)
    win_max_num = max(len(v) for v in win_num_dic.values())
    methods_ci_dic = get_possible_interval(win_num_dic)
    win_distri_l = [k for k, v in win_num_dic.items() if len(v) == win_max_num]

    best_method = win_distri_l[0]
    min_dist_sum = get_sum_distance(win_num_dic[best_method])
    for method in win_distri_l[1:]:
        dist_sum = get_sum_distance(win_num_dic[method])
        if dist_sum <= min_dist_sum:
            min_dist_sum = dist_sum
            best_method = method
    return [best_method, methods_ci_dic[best_method], num_zero_res_case_percent]


### Extract Domain 3: event trigger frequency per resource (functions)

In [83]:
# Phase 1: Extraction

# extract the Domain 3 (D3: event trigger frequency per resource)
def extract_aver_resource_rank_activity_freq_without_plot(df_dic_with_time_train, max_rank=20):
    """Extract Domain 3 (D3): the event trigger frequency per resource rank.

    In every train log the resources are ranked by the number of events they
    execute (rank 0 = most events) and each resource's share of all events of
    that log is recorded. The shares are then averaged per rank over the logs
    that have a resource at that rank.

    Args:
        df_dic_with_time_train: dict {log name: (dataframe, timestamp format)};
            only the dataframe is used.
        max_rank: number of top ranks to report (defaults to 20).

    Returns:
        dict {rank: average event share} with one entry for every rank in
        0..max_rank-1. A rank that no train log reaches gets 0.0.
    """
    shares_per_rank = {rank: [] for rank in range(max_rank)}

    for df in df_dic_with_time_train.values():
        df1 = df[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
        unclustered_log = log_converter.apply(df1)

        resource_action_dic, _, _, _, whole_act_num = get_theory_one_information_from_log(unclustered_log)
        # total number of events per resource, largest first
        event_counts = sorted(
            (sum(act_counts.values()) for act_counts in resource_action_dic.values()),
            reverse=True,
        )
        for rank, event_count in enumerate(event_counts[:max_rank]):
            shares_per_rank[rank].append(event_count / whole_act_num)

    # a rank without observations would otherwise divide by zero
    return {
        rank: (sum(shares) / len(shares) if shares else 0.0)
        for rank, shares in shares_per_rank.items()
    }


### Extract Domain 4: case involvement frequency per resource (functions)

In [84]:
# Phase 1: Extraction

# extract the Domain 4 (D4: case involvement frequency per resource)
def get_resource_involve_case_num_without_plot(df_dic_with_time_train, max_rank=20):
    """Extract Domain 4 (D4): the case involvement frequency per resource rank.

    In every train log the resources are ranked by the number of cases they
    take part in (rank 0 = most cases). The case counts of the top max_rank
    resources are normalised by their sum, and the resulting shares are
    averaged per rank over the logs that have a resource at that rank.

    Args:
        df_dic_with_time_train: dict {log name: (dataframe, timestamp format)};
            only the dataframe is used.
        max_rank: number of top ranks to report (defaults to 20).

    Returns:
        dict {rank: average case involvement share} with one entry for every
        rank in 0..max_rank-1. A rank that no train log reaches gets 0.0.
    """
    shares_per_rank = {rank: [] for rank in range(max_rank)}

    for df in df_dic_with_time_train.values():
        df1 = df[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
        unclustered_log = log_converter.apply(df1)

        # number of cases in which every resource executes at least one event
        cases_per_resource = Counter()
        for trace in unclustered_log:
            cases_per_resource.update({event['agent_id'] for event in trace})

        top_case_counts = sorted(cases_per_resource.values(), reverse=True)[:max_rank]
        # normalise by the sum over the reported ranks only
        top_total = sum(top_case_counts)
        for rank, case_count in enumerate(top_case_counts):
            shares_per_rank[rank].append(case_count / top_total)

    # a rank without observations would otherwise divide by zero
    return {
        rank: (sum(shares) / len(shares) if shares else 0.0)
        for rank, shares in shares_per_rank.items()
    }

### Train logistic regression classifier 1 (functions)

In [85]:
# Phase 1: Extraction
# logistic regression model 1 training (will be applied to detect the handover)

# construct the training datasets using all logs from the train set
def extract_df_for_train(dic_for_train):
    """Build the training table for the handover classifier (model 1).

    Every pair of consecutive events inside a case of every train log becomes
    one row.

    Args:
        dic_for_train: dict {log name: (dataframe, timestamp format)}. The
            timestamps are expected to be strings; their first 19 characters
            are parsed with the timestamp format.

    Returns:
        A 3-tuple (var_df, case_num, handover_num):
        - var_df: DataFrame with the columns 'case name', 'event location',
          'activity same or not', 'time duration' (elapsed time divided by
          the average elapsed time of the same activity pair in that log, or
          0 when that average is 0), 'continuous event number' (length of the
          current run of events by the same resource) and 'handover or not'.
        - case_num: total number of cases over all logs.
        - handover_num: total number of handovers over all logs.
    """
    def _elapsed_seconds(earlier_event, later_event, time_format):
        # only the first 19 characters ("YYYY-mm-dd HH:MM:SS") are parsed
        start = datetime.strptime(earlier_event['time:timestamp'][:19], time_format)
        end = datetime.strptime(later_event['time:timestamp'][:19], time_format)
        return (end - start).total_seconds()

    case_num, handover_num = 0, 0
    var_dic = {'case name':[], 'event location':[], 'activity same or not':[], 'time duration':[], 'continuous event number':[], 'handover or not':[]}
    for df_name, df in dic_for_train.items():
        df1 = df[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
        unclustered_log = log_converter.apply(df1)
        time_format = df[1]
        case_num += len(unclustered_log)

        # first pass: average elapsed time for every (activity, next activity) pair of this log
        time_per_act_pair = {}
        for trace in unclustered_log:
            for position in range(len(trace) - 1):
                first_event, second_event = trace[position], trace[position + 1]
                elapsed = _elapsed_seconds(first_event, second_event, time_format)
                act_pair = (first_event['concept:name'], second_event['concept:name'])
                time_per_act_pair.setdefault(act_pair, []).append(elapsed)

        aver_time_per_act_pair = {
            act_pair: sum(time_l) / len(time_l)
            for act_pair, time_l in time_per_act_pair.items()
        }

        # second pass: one row per consecutive event pair
        for trace_idx, trace in enumerate(unclustered_log):
            trace_name = f'trace_{trace_idx}'
            run_length = 1
            for position in range(len(trace) - 1):
                first_event, second_event = trace[position], trace[position + 1]
                elapsed = _elapsed_seconds(first_event, second_event, time_format)
                first_act = first_event['concept:name']
                second_act = second_event['concept:name']
                first_res = first_event['agent_id']
                second_res = second_event['agent_id']

                var_dic['case name'].append(trace_name)
                var_dic['event location'].append(position)
                var_dic['activity same or not'].append(1 if first_act == second_act else 0)

                pair_average = aver_time_per_act_pair[(first_act, second_act)]
                var_dic['time duration'].append(0 if pair_average == 0 else elapsed / pair_average)

                # the run length is recorded before it is updated for this pair
                var_dic['continuous event number'].append(run_length)
                if second_res == first_res:
                    var_dic['handover or not'].append(0)
                    run_length += 1
                else:
                    var_dic['handover or not'].append(1)
                    handover_num += 1
                    run_length = 1

    var_df = pd.DataFrame(var_dic)

    return var_df, case_num, handover_num


# train the logistic regression model 1
def train_datasets_store_in_dic(dic_for_train):
    """Train logistic regression model 1 (handover detection).

    Args:
        dic_for_train: dict {log name: (dataframe, timestamp format)} with
            the train logs, passed on to extract_df_for_train.

    Returns:
        The fitted LogisticRegression model. The features 'event location',
        'activity same or not' and 'time duration' are standardised with a
        StandardScaler before fitting; the scaler itself is not returned.
    """
    feature_columns = ['event location', 'activity same or not', 'time duration']
    var_df, _, _ = extract_df_for_train(dic_for_train)

    # standardise the features, then fit on 'handover or not'
    scaled_features = StandardScaler().fit_transform(var_df[feature_columns])
    handover_labels = var_df['handover or not']

    model = LogisticRegression()
    model.fit(scaled_features, handover_labels)

    return model


### Train logistic regression classifier 2 (functions)

In [86]:
# Phase 1: Extraction
# logistic regression model 2 training (will be applied to label the resource per case)

# extract the event sequence pairs
def extract_same_and_diff_resource_pair_from_one_log(log_tuple):
    """Split every case of a log into the event sequences of its resources.

    A pattern is a maximal run of consecutive events of one case that are
    executed by the same resource ('agent_id'); it is stored as the list of
    the activity names of these events.

    Args:
        log_tuple: tuple (dataframe, timestamp format); only the dataframe is
            used. Its columns 'case_id', 'activity_type' and 'timestamp' are
            renamed to the pm4py names before conversion.

    Returns:
        dict {'trace_<index>': {resource: [pattern, ...]}}. Every pattern is
        filed under the resource that executed it, including the last pattern
        of a case and the only pattern of a single-event case. A case without
        events maps to an empty dict.
    """
    df1 = log_tuple[0].rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
    unclustered_log = log_converter.apply(df1)

    log_pattern_dic = {}
    for trace_id, current_trace in enumerate(unclustered_log):
        case_pattern_dic = {}
        pattern = []
        pattern_res = None
        for position, event in enumerate(current_trace):
            event_res = event['agent_id']
            # a different resource closes the running pattern of the previous resource
            if position > 0 and event_res != pattern_res:
                case_pattern_dic.setdefault(pattern_res, []).append(pattern)
                pattern = []
            pattern_res = event_res
            pattern.append(event['concept:name'])

        # flush the pattern that is still open at the end of the case;
        # it belongs to the resource of the last event
        if pattern:
            case_pattern_dic.setdefault(pattern_res, []).append(pattern)

        log_pattern_dic['trace_' + str(trace_id)] = case_pattern_dic

    return log_pattern_dic


# build the labelled training pairs: every pair of event sequences gets the Y variable 1 (same resource) or 0 (different resources)
def build_pairs_of_patterns(log_pattern_dic):
    """Build labelled pairs of event sequences for classifier 2.

    Only cases that involve more than one resource contribute pairs.

    Args:
        log_pattern_dic: dict {case name: {resource: [pattern, ...]}} where a
            pattern is a list of activity names.

    Returns:
        DataFrame with the columns 'item1', 'item2' and 'label'. All label-1
        rows come first: every unordered pair of patterns of the same
        resource within a case. They are followed by the label-0 rows: for
        every unordered pair of distinct resources within a case, one
        randomly chosen pattern of each resource.
    """
    pattern_1, pattern_2, label_l = [], [], []

    # label 1: two patterns of the same resource in the same case
    for re_dic in log_pattern_dic.values():
        if len(re_dic) > 1:
            for pattern_l in re_dic.values():
                for i in range(len(pattern_l) - 1):
                    for j in range(i + 1, len(pattern_l)):
                        pattern_1.append(pattern_l[i])
                        pattern_2.append(pattern_l[j])
                        label_l.append(1)

    # label 0: one random pattern from each of two different resources in the same case
    for re_dic in log_pattern_dic.values():
        if len(re_dic) > 1:
            patterns_by_resource = list(re_dic.values())
            for r1 in range(len(patterns_by_resource) - 1):
                # start right after r1 so that every resource pair is used exactly once
                for r2 in range(r1 + 1, len(patterns_by_resource)):
                    pattern_1.append(random.choice(patterns_by_resource[r1]))
                    pattern_2.append(random.choice(patterns_by_resource[r2]))
                    label_l.append(0)

    pattern_dic = {'item1': pattern_1, 'item2': pattern_2, 'label': label_l}
    pattern_df = pd.DataFrame(pattern_dic)

    return pattern_df


# preprocess the data (define the three features of a pattern pair)
# 1. common activities (multiset intersection, keeps minimum counts; 2 * common / (length_1 + length_2))
# 2. absolute length difference of the two patterns (|length_1 - length_2|)
# 3. common activity dependencies (2 * common directly-follows pairs of the two patterns / all directly-follows pairs of both patterns)
def preprocess_pattern_df(pattern_df, exists_df = ''):
    """Turn labelled pattern pairs into the feature table of classifier 2.

    Args:
        pattern_df: DataFrame whose first three columns are, in this order,
            the first pattern, the second pattern (both lists of activity
            names) and the label.
        exists_df: feature table of previously processed logs to append to;
            the default '' (or None) means there is none yet.

    Returns:
        DataFrame with the columns
        - 'common_act_value': 2 * shared activities (multiset intersection)
          / (length of pattern 1 + length of pattern 2)
        - 'length_diff_value': absolute length difference of the patterns
        - 'common_dep_value': 2 * shared consecutive activity pairs / number
          of consecutive pairs of both patterns, or 0 when either pattern
          has a single activity
        - 'label_l': the label of the pair
        The new rows are appended to exists_df when one is given.
    """
    common_act_value, length_diff_value, common_dep_value, label_l = [], [], [], []

    # read the columns by position; integer keys on a label-indexed row are
    # deprecated in recent pandas versions
    first_patterns = pattern_df.iloc[:, 0]
    second_patterns = pattern_df.iloc[:, 1]
    labels = pattern_df.iloc[:, 2]

    for pa_1, pa_2, label in zip(first_patterns, second_patterns, labels):
        label_l.append(label)

        common_elements = Counter(pa_1) & Counter(pa_2)  # Intersection: keeps minimum counts
        num_common_elements = sum(common_elements.values())
        common_act_value.append(num_common_elements*2 / (len(pa_1) + len(pa_2)))

        length_diff_value.append(abs(len(pa_1)-len(pa_2)))

        if len(pa_1) == 1 or len(pa_2) == 1:
            # a single-activity pattern has no consecutive activity pair
            num_common_dep = 0
        else:
            dep_1 = list(zip(pa_1, pa_1[1:]))
            dep_2 = list(zip(pa_2, pa_2[1:]))
            common_dep = Counter(dep_1) & Counter(dep_2)
            num_common_dep = 2 * sum(common_dep.values()) / (len(dep_1) + len(dep_2))

        common_dep_value.append(num_common_dep)

    processed_dic = {'common_act_value': common_act_value, 'length_diff_value': length_diff_value, 'common_dep_value': common_dep_value, 'label_l': label_l}
    processed_df = pd.DataFrame(processed_dic)

    # concat dfs
    if exists_df is None or (isinstance(exists_df, str) and exists_df == ''):
        output_df = processed_df
    else:
        output_df = pd.concat([exists_df, processed_df], ignore_index=True)

    return output_df


# train the logistic regression model 2
def apply_logistic_regression_get_model(processed_df):
    """Fit logistic regression model 2 on the pattern-pair features.

    The three feature columns are standardized, 10% of the rows are held out
    (fixed random_state=42), and a default LogisticRegression is fitted on
    the remaining rows.

    Returns the fitted model together with the held-out (standardized)
    features and their labels.
    """
    feature_cols = ['common_act_value', 'length_diff_value', 'common_dep_value']
    features = processed_df[feature_cols]
    labels = processed_df['label_l']

    # NOTE(review): the scaler is fitted on all rows (before the split) and is
    # not returned, so callers cannot apply the same scaling at inference
    # time -- confirm this is intended.
    standardized = StandardScaler().fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(
        standardized, labels, test_size=0.1, random_state=42
    )

    logistic_model = LogisticRegression()
    logistic_model.fit(X_train, y_train)

    return logistic_model, X_test, y_test


# the same logistic regression model 2 (trained on all training logs) is applied to every test log
def second_step_train_on_most_datasets(dic_for_train):
    """Train logistic regression model 2 on the pattern pairs of all training logs.

    For every entry of ``dic_for_train`` the same-/different-resource pattern
    pairs are extracted and converted to features; the rows of all logs are
    stacked into one frame on which a single model is fitted.

    Returns only the fitted model (the held-out split is discarded).
    """
    # '' is the "no rows yet" sentinel understood by preprocess_pattern_df.
    stacked_features = ''
    for inform_t in dic_for_train.values():
        pairs_of_log = build_pairs_of_patterns(
            extract_same_and_diff_resource_pair_from_one_log(inform_t)
        )
        stacked_features = preprocess_pattern_df(pairs_of_log, stacked_features)

    fitted = apply_logistic_regression_get_model(stacked_features)
    return fitted[0]


# Phase 2: Inference
### Apply Step 1 --- handover detection (use Domain 1 and logistic regression classifier model 1) (functions)

In [87]:
# Phase 2: Inference
# Step 1 --- handover detection

# construct the test datasets for a single test log
def extract_df_for_test(log_for_test):
    """Build the per-transition feature table of one test log.

    ``log_for_test`` is indexed as ``(dataframe, timestamp_format)``. Every
    pair of consecutive events inside a trace yields one row with:
    the trace name ('trace_<index>'), the position of the first event, whether
    both events have the same activity, the elapsed time relative to the mean
    elapsed time of that activity pair, and whether 'agent_id' differs.

    Returns the feature DataFrame and the number of traces in the log.
    """
    event_log = log_converter.apply(log_for_test[0].rename(columns={
        'case_id': 'case:concept:name',     # Replace with actual case column name
        'activity_type': 'concept:name',  # Replace with actual activity column name
        'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
    }))
    case_num = len(event_log)

    def seconds_between(first_event, second_event):
        # Timestamps are sliced to 19 characters before being parsed with the
        # format supplied alongside the log.
        start = datetime.strptime(first_event['time:timestamp'][:19], log_for_test[1])
        end = datetime.strptime(second_event['time:timestamp'][:19], log_for_test[1])
        return (end - start).total_seconds()

    # Pass 1: mean elapsed time per (activity, next activity) pair.
    gaps_per_pair = defaultdict(list)
    for trace in event_log:
        for pos in range(len(trace) - 1):
            first_event, second_event = trace[pos], trace[pos + 1]
            gap = seconds_between(first_event, second_event)
            gaps_per_pair[(first_event['concept:name'], second_event['concept:name'])].append(gap)
    mean_gap_per_pair = {pair: sum(gaps) / len(gaps) for pair, gaps in gaps_per_pair.items()}

    # Pass 2: one feature row per consecutive event pair.
    case_col, location_col, same_act_col, duration_col, handover_col = [], [], [], [], []
    for trace_idx, trace in enumerate(event_log):
        trace_name = 'trace_' + str(trace_idx)
        for pos in range(len(trace) - 1):
            first_event, second_event = trace[pos], trace[pos + 1]
            gap = seconds_between(first_event, second_event)
            act_pair = (first_event['concept:name'], second_event['concept:name'])
            first_res = first_event['agent_id']
            second_res = second_event['agent_id']

            case_col.append(trace_name)
            location_col.append(pos)
            same_act_col.append(1 if act_pair[0] == act_pair[1] else 0)

            # A zero mean gap would divide by zero; such pairs get ratio 0.
            mean_gap = mean_gap_per_pair[act_pair]
            duration_col.append(0 if mean_gap == 0 else gap / mean_gap)

            handover_col.append(0 if second_res == first_res else 1)

    var_test_df = pd.DataFrame({
        'case name': case_col,
        'event location': location_col,
        'activity same or not': same_act_col,
        'time duration': duration_col,
        'handover or not': handover_col,
    })

    return var_test_df, case_num


# detect the handover for the test logs in the test log dictionary
def handover_detection_for_test_dic(dic_for_test, domain_1, regression_model_1):
    """Predict handovers for every test log, choosing the decision threshold per log.

    Parameters
    ----------
    dic_for_test : dict
        Maps a log name to the value expected by ``extract_df_for_test``.
    domain_1 : tuple
        ``(distri_name, ci, num_zero_hand_case_percent)``: the distribution
        family ('Poisson', 'Exponential', 'Geometric' or 'Constant'), an
        interval ``(low, high)`` for the fitted parameter, and a collection of
        zero-handover case shares whose maximum is used as an upper bound.
    regression_model_1 : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    dict
        Log name -> ``(var_test_df, predictions)`` where ``predictions`` is
        the 0/1 array obtained with the selected threshold.

    Threshold selection (0.01 .. 0.99): thresholds whose share of
    zero-handover cases exceeds the bound are skipped. Among the rest, a
    threshold whose fitted parameter lies inside ``ci`` wins with the
    smallest squared distance between the observed and fitted distribution;
    if none lies inside, the one whose parameter is closest to ``ci`` wins.
    """
    distri_name, ci, num_zero_hand_case_percent = domain_1
    max_zero_hand_percent = max(num_zero_hand_case_percent)
    threshold_l = [round(0.01 * k, 2) for k in range(1, 100)]

    def get_distri_para_and_MLE(distri_name, data):
        """Return (fitted parameter, squared distance to the empirical histogram)."""
        values, counts = np.unique(data, return_counts=True)
        count_by_value = dict(zip(values, counts))
        # NOTE(review): range(max(values)) leaves out the largest observed
        # value, so the histogram is normalized over 0 .. max-1 only. This
        # looks like an off-by-one but is kept because changing it shifts the
        # selected thresholds -- confirm the intent.
        values_l = list(range(max(values)))
        counts_l = [count_by_value.get(j, 0) for j in values_l]

        mean_value = np.mean(data)
        if distri_name in ('Poisson', 'Constant'):
            lambda_hat = mean_value
        elif distri_name == 'Exponential':
            lambda_hat = 1 / (mean_value + 1)
        elif distri_name == 'Geometric':
            lambda_hat = 1 / mean_value
        else:
            raise ValueError(f"unknown distribution name: {distri_name!r}")

        total = sum(counts_l)
        if total == 0:
            # No observation below the maximum: the fit cannot be measured,
            # so rank this candidate last instead of failing on the division.
            return lambda_hat, float("inf")
        counts_prob_l = np.asarray(counts_l) / total

        if distri_name == 'Poisson':
            pred_l = stats.poisson.pmf(values_l, lambda_hat)
        elif distri_name == 'Exponential':
            # scipy parameterizes the exponential by scale = 1 / rate; the
            # second positional argument of expon.pdf is `loc`, not the rate.
            pred_l = stats.expon.pdf(values_l, scale=1 / lambda_hat)
        elif distri_name == 'Geometric':
            pred_l = stats.geom.pmf(values_l, lambda_hat)
        else:  # 'Constant'
            pred_l = [1 / len(values_l)] * len(values_l)

        distance = sum((observed - fitted) ** 2 for observed, fitted in zip(counts_prob_l, pred_l))
        return lambda_hat, distance

    hand_dic = {}
    for log_name, log_for_test in dic_for_test.items():
        print(f"{datetime.now().strftime('%H:%M:%S')} {log_name} handover detection Start")

        # get the test df
        var_test_df, _ = extract_df_for_test(log_for_test)

        # scale the test data (the scaler is fitted on this test log itself)
        X_prac = var_test_df[['event location', 'activity same or not', 'time duration']]
        X_prac_scaled = StandardScaler().fit_transform(X_prac)

        # The probabilities do not depend on the threshold: predict once.
        y_prac_pred_proba = regression_model_1.predict_proba(X_prac_scaled)[:, 1]
        # extract_df_for_test names every trace 'trace_<index>', so grouping
        # by case name yields one handover count per trace, in log order.
        case_names = var_test_df['case name'].to_numpy()

        thres_best = threshold_l[0]
        lambda_best_diff = float("inf")
        lambda_in_ci_thres_dist_dic = {}
        for threshold in threshold_l:
            predicted = (y_prac_pred_proba >= threshold).astype(int)
            hand_num_l = pd.Series(predicted).groupby(case_names, sort=False).sum().tolist()

            zero_percent = hand_num_l.count(0) / len(hand_num_l)
            if zero_percent > max_zero_hand_percent:
                continue

            lambda_hat, distance = get_distri_para_and_MLE(distri_name, hand_num_l)
            if lambda_hat > ci[1]:
                if (lambda_hat - ci[1]) < lambda_best_diff:
                    thres_best = threshold
                    lambda_best_diff = lambda_hat - ci[1]
            elif lambda_hat < ci[0]:
                if (ci[0] - lambda_hat) < lambda_best_diff:
                    thres_best = threshold
                    lambda_best_diff = ci[0] - lambda_hat
            elif lambda_hat >= ci[0] and lambda_hat <= ci[1]:
                lambda_in_ci_thres_dist_dic[threshold] = distance

        if lambda_in_ci_thres_dist_dic:
            # min() returns the first minimum, i.e. the lowest threshold on ties.
            thres_best = min(lambda_in_ci_thres_dist_dic, key=lambda_in_ci_thres_dist_dic.get)

        y_prac_pred_adjusted = (y_prac_pred_proba >= thres_best).astype(int)

        hand_dic[log_name] = (var_test_df, y_prac_pred_adjusted)
        print(f"{datetime.now().strftime('%H:%M:%S')} {log_name} handover detection Done")

    # return the handover detection results
    return hand_dic


### Apply Step 2 --- resource labeling per case (use Domain 2 and logistic regression classifier model 2) (functions)

In [88]:
# Phase 2: Inference
# Step 2 --- resource labeling per case

# preprocess the output results from step 1 (make the handover detection results as an input to this Step 2)
def construct_log_with_handover(df, var_test_df, y_prac_pred_adjusted):
    """Split every trace into activity patterns at the predicted handovers.

    Parameters
    ----------
    df : pandas.DataFrame
        The test log with 'case_id', 'activity_type' and 'timestamp' columns.
    var_test_df : pandas.DataFrame
        Step 1 feature table with 'case name' and 'event location' columns.
        As before, a 'predict handover' column is added to it in place.
    y_prac_pred_adjusted : array-like
        0/1 handover prediction for every row of ``var_test_df``.

    Returns
    -------
    dict
        'trace_<index>' -> list of patterns, each pattern being the list of
        activity names between two predicted handovers. A handover at event
        location p closes the current pattern after the event at position p.
    """
    var_test_df['predict handover'] = y_prac_pred_adjusted
    df1 = df.rename(columns={
            'case_id': 'case:concept:name',     # Replace with actual case column name
            'activity_type': 'concept:name',  # Replace with actual activity column name
            'timestamp': 'time:timestamp'  # Replace with actual timestamp column name
        })
    unclustered_log = log_converter.apply(df1)

    # Index the predicted handover positions once per case instead of
    # filtering the whole frame again for every trace.
    handover_positions = defaultdict(set)
    flagged = var_test_df[var_test_df['predict handover'] == 1]
    for case_name, location in zip(flagged['case name'], flagged['event location']):
        handover_positions[case_name].add(location)

    log_pattern_dic = {}
    for trace_id in range(len(unclustered_log)):
        trace_name = 'trace_' + str(trace_id)
        current_trace = unclustered_log[trace_id]
        cut_after = handover_positions.get(trace_name, ())

        case_pattern_l = []
        pattern_l = []
        for task_id in range(len(current_trace)):
            pattern_l.append(current_trace[task_id]['concept:name'])
            if task_id in cut_after:
                case_pattern_l.append(pattern_l)
                pattern_l = []
        # Close the pattern that is still open after the last event.
        if len(current_trace) > 0:
            case_pattern_l.append(pattern_l)

        log_pattern_dic[trace_name] = case_pattern_l

    return log_pattern_dic


# preprocess the test dataset
def preprocess_pattern_df_test(pattern_df):
    """Turn unlabelled pattern pairs into the three similarity features of model 2.

    Parameters
    ----------
    pattern_df : pandas.DataFrame
        One row per pair; the first two columns hold the two patterns
        (sequences of activity names). Columns are read by position.

    Returns
    -------
    pandas.DataFrame
        Columns 'common_act_value', 'length_diff_value' and 'common_dep_value',
        one row per input pair.
    """
    common_act_value, length_diff_value, common_dep_value = [], [], []

    # Read the columns by position with .iloc. Indexing a row Series with an
    # integer (item[0]) relies on a positional fallback that pandas deprecated
    # for label-indexed Series such as the 'item1'/'item2' rows built by the caller.
    first_patterns = pattern_df.iloc[:, 0]
    second_patterns = pattern_df.iloc[:, 1]

    for pa_1, pa_2 in zip(first_patterns, second_patterns):
        # Feature 1: shared activities (multiset intersection), Dice-normalized.
        total_len = len(pa_1) + len(pa_2)
        num_common_elements = sum((Counter(pa_1) & Counter(pa_2)).values())
        # Two empty patterns share nothing; avoid dividing by zero.
        common_act_value.append(num_common_elements * 2 / total_len if total_len else 0.0)

        # Feature 2: absolute difference of the pattern lengths.
        length_diff_value.append(abs(len(pa_1) - len(pa_2)))

        # Feature 3: shared directly-follows pairs, Dice-normalized.
        # A pattern with fewer than two activities has no such pair.
        if len(pa_1) < 2 or len(pa_2) < 2:
            num_common_dep = 0
        else:
            dep_1 = [(pa_1[k], pa_1[k + 1]) for k in range(len(pa_1) - 1)]
            dep_2 = [(pa_2[k], pa_2[k + 1]) for k in range(len(pa_2) - 1)]
            common_dep = Counter(dep_1) & Counter(dep_2)
            num_common_dep = 2 * sum(common_dep.values()) / (len(dep_1) + len(dep_2))
        common_dep_value.append(num_common_dep)

    return pd.DataFrame({
        'common_act_value': common_act_value,
        'length_diff_value': length_diff_value,
        'common_dep_value': common_dep_value,
    })


# get the prediction results (in the dataframe format)
def apply_logistic_detect_resource_for_case(log_pattern_dic, logistic_model, domain_2, file_path):
    """Assign case-local resource labels ('R1', 'R2', ...) to the patterns of every case.

    Parameters
    ----------
    log_pattern_dic : dict
        Case name -> list of patterns (lists of activity names), in order.
    logistic_model : object
        Fitted classifier exposing ``predict_proba``; column 1 is used as the
        probability that two patterns belong to the same resource.
    domain_2 : tuple
        ``(distri_name, ci, zero_res_percent)``: distribution family, interval
        ``(low, high)`` for the fitted parameter, and a collection of shares
        whose maximum bounds the share of cases with a zero resource count.
    file_path : str
        Where the resulting table is written as CSV.

    Returns
    -------
    pandas.DataFrame
        Columns 'whole case list', 'pattern list', 'res list', one row per
        pattern, for the selected threshold.

    Raises
    ------
    ValueError
        If ``log_pattern_dic`` is empty or the distribution name is unknown.

    Labelling: the first two patterns of a case get R1 and R2. Each later
    pattern joins the resource of the most probable earlier pattern, provided
    that probability exceeds the threshold and that pattern does not belong
    to the resource of the directly preceding pattern; otherwise a new
    resource is opened. The threshold (0.05 .. 0.95) is chosen like in Step 1:
    inside ``ci`` the smallest distribution distance wins, otherwise the
    parameter closest to ``ci``.
    """
    if not log_pattern_dic:
        raise ValueError("log_pattern_dic is empty: there are no cases to label")

    # set the threshold set, later we will choose the best one from this set
    threshold_l = [round(0.05 * k, 2) for k in range(1, 20)]
    distri_name, ci, zero_res_percent = domain_2
    max_zero_res_percent = max(zero_res_percent)
    feature_cols = ['common_act_value', 'length_diff_value', 'common_dep_value']

    def get_distri_para_and_MLE(distri_name, data):
        """Return (fitted parameter, squared distance to the empirical histogram)."""
        values, counts = np.unique(data, return_counts=True)
        count_by_value = dict(zip(values, counts))
        # NOTE(review): range(max(values)) leaves out the largest observed
        # value, so the histogram is normalized over 0 .. max-1 only. This
        # looks like an off-by-one but is kept because changing it shifts the
        # selected thresholds -- confirm the intent.
        values_l = list(range(max(values)))
        counts_l = [count_by_value.get(j, 0) for j in values_l]

        mean_value = np.mean(data)
        if distri_name in ('Poisson', 'Constant'):
            lambda_hat = mean_value
        elif distri_name == 'Exponential':
            lambda_hat = 1 / (mean_value + 1)
        elif distri_name == 'Geometric':
            lambda_hat = 1 / mean_value
        else:
            raise ValueError(f"unknown distribution name: {distri_name!r}")

        total = sum(counts_l)
        if total == 0:
            # No observation below the maximum: the fit cannot be measured,
            # so rank this candidate last instead of failing on the division.
            return lambda_hat, float("inf")
        counts_prob_l = np.asarray(counts_l) / total

        if distri_name == 'Poisson':
            pred_l = stats.poisson.pmf(values_l, lambda_hat)
        elif distri_name == 'Exponential':
            # scipy parameterizes the exponential by scale = 1 / rate; the
            # second positional argument of expon.pdf is `loc`, not the rate.
            pred_l = stats.expon.pdf(values_l, scale=1 / lambda_hat)
        elif distri_name == 'Geometric':
            pred_l = stats.geom.pmf(values_l, lambda_hat)
        else:  # 'Constant'
            pred_l = [1 / len(values_l)] * len(values_l)

        distance = sum((observed - fitted) ** 2 for observed, fitted in zip(counts_prob_l, pred_l))
        return lambda_hat, distance

    # The pairwise same-resource probabilities do not depend on the threshold,
    # so they are predicted once per case instead of once per threshold.
    pair_prob_per_case = {}
    for case, pattern_l in log_pattern_dic.items():
        if len(pattern_l) <= 1:
            continue
        i_j_l = [(i, j) for i in range(len(pattern_l) - 1) for j in range(i + 1, len(pattern_l))]
        pattern_df = pd.DataFrame({
            'item1': [pattern_l[i] for i, _ in i_j_l],
            'item2': [pattern_l[j] for _, j in i_j_l],
        })
        X_test = preprocess_pattern_df_test(pattern_df)[feature_cols]
        # NOTE(review): the features are passed to the model without scaling;
        # if the model was fitted on standardized features, the same scaling
        # should be applied here -- confirm against the training code.
        y_pred_proba = logistic_model.predict_proba(X_test)[:, 1]
        pair_prob_per_case[case] = dict(zip(i_j_l, y_pred_proba))

    thres_best = threshold_l[0]
    lambda_best_diff = float("inf")
    lambda_in_ci_thres_dist_dic = {}
    best_whole_case_list, best_pattern_list, best_res_list = [], [], []

    for threshold in threshold_l:
        whole_case_list, pattern_list, res_list = [], [], []
        res_num_l = []
        for case, pattern_l in log_pattern_dic.items():
            # segment_res[k] is the resource number of the k-th pattern.
            if len(pattern_l) <= 1:
                segment_res = [1]
            else:
                segment_res = [1, 2]
                pair_prob = pair_prob_per_case[case]
                for i2 in range(2, len(pattern_l)):
                    res_before = segment_res[i2 - 1]
                    max_prob = threshold
                    max_res = -1
                    for i3 in range(i2):
                        # Patterns of the directly preceding resource are not
                        # candidates: a handover was detected in between.
                        if segment_res[i3] == res_before:
                            continue
                        prob_v = pair_prob[(i3, i2)]
                        if prob_v > max_prob:
                            max_res = segment_res[i3]
                            max_prob = prob_v
                    if max_res == -1:
                        # No earlier pattern is similar enough: new resource.
                        max_res = max(segment_res) + 1
                    segment_res.append(max_res)

            for seg_idx, res_number in enumerate(segment_res):
                whole_case_list.append(case)
                pattern_list.append(pattern_l[seg_idx])
                res_list.append('R' + str(res_number))

            # minus the res_num by 1, for better fits. This is applied to
            # every case, including the last one, which used to be appended
            # without the decrement.
            res_num_l.append(max(segment_res) - 1)

        # The first threshold is the fallback when no threshold qualifies.
        if threshold == threshold_l[0]:
            best_whole_case_list = whole_case_list
            best_pattern_list = pattern_list
            best_res_list = res_list

        zero_percent = res_num_l.count(0) / len(res_num_l)
        if zero_percent <= max_zero_res_percent:
            lambda_hat, distance = get_distri_para_and_MLE(distri_name, res_num_l)
            if lambda_hat > ci[1]:
                if (lambda_hat - ci[1]) < lambda_best_diff:
                    thres_best = threshold
                    lambda_best_diff = lambda_hat - ci[1]
                    best_whole_case_list = whole_case_list
                    best_pattern_list = pattern_list
                    best_res_list = res_list
            elif lambda_hat < ci[0]:
                if (ci[0] - lambda_hat) < lambda_best_diff:
                    thres_best = threshold
                    lambda_best_diff = ci[0] - lambda_hat
                    best_whole_case_list = whole_case_list
                    best_pattern_list = pattern_list
                    best_res_list = res_list
            elif lambda_hat >= ci[0] and lambda_hat <= ci[1]:
                lambda_in_ci_thres_dist_dic[threshold] = (distance, whole_case_list, pattern_list, res_list)

    if lambda_in_ci_thres_dist_dic:
        thres_best = min(lambda_in_ci_thres_dist_dic, key=lambda k: lambda_in_ci_thres_dist_dic[k][0])
        best_whole_case_list, best_pattern_list, best_res_list = lambda_in_ci_thres_dist_dic[thres_best][1:4]

    pattern_with_resource_df = pd.DataFrame({
        'whole case list': best_whole_case_list,
        'pattern list': best_pattern_list,
        'res list': best_res_list,
    })
    pattern_with_resource_df.to_csv(file_path)

    return pattern_with_resource_df


# finalize the Step 2 and store the output of this step into a file path
def create_event_df_after_step_two(df_dic_with_time_train, df_dic_with_time_test, test_df_with_y_pred_dic, domain_2, lrc_model_2, f_path):
    """Run Step 2 for every test log and store its intermediate CSV output.

    For each log in ``df_dic_with_time_test`` the Step 1 result is looked up
    in ``test_df_with_y_pred_dic``, the log is split into patterns at the
    predicted handovers, and the per-case resource labelling is written to
    ``<f_path>/<log name>_middle_output.csv``. Progress is printed.
    ``df_dic_with_time_train`` is accepted but not used here. Returns None.
    """
    def report(message):
        # Every progress line is prefixed with the current wall-clock time.
        print(f"{datetime.now().strftime('%H:%M:%S')} {message}")

    for eve_log in list(df_dic_with_time_test):
        report(f"{eve_log} resource labeling per case Start")

        step_one_df, step_one_pred = test_df_with_y_pred_dic[eve_log]
        patterns_per_case = construct_log_with_handover(
            df_dic_with_time_test[eve_log][0], step_one_df, step_one_pred
        )
        middle_output_path = f_path + '/' + str(eve_log) + '_middle_output.csv'
        apply_logistic_detect_resource_for_case(patterns_per_case, lrc_model_2, domain_2, middle_output_path)

        report(f"{eve_log} Step 2 middle outputs stored successfully")
        report(f"{eve_log} resource labeling per case Done")


### Apply Step 3 --- resource assignment for the log (use Domain 3 and Domain 4) (functions)

In [89]:
# Phase 2: Inference
# Step 3 --- resource assignment for the log

# collect the domain 3 and domain 4 information
def get_aver_res_rank_act_and_case_from_train_set(dic_for_train):
    """Collect the Domain 3 and Domain 4 information from the training logs.

    Returns a pair: the result of
    ``extract_aver_resource_rank_activity_freq_without_plot`` followed by the
    result of ``get_resource_involve_case_num_without_plot``, both called
    with ``dic_for_train``.
    """
    return (
        extract_aver_resource_rank_activity_freq_without_plot(dic_for_train),
        get_resource_involve_case_num_without_plot(dic_for_train),
    )


# case_ids = [0, 0, 1, 1, 2, 2]  # 6 events across 3 cases
# event_groups = [[0, 1], [2, 3], [4, 5]]  # Each group spans a case
# target_event_counts = [2, 2, 2]  # For 3 resources
# target_case_counts = [2, 2, 2]
# the main routine of the simulated annealing step
def simulated_annealing_with_groups(
    case_ids,
    target_event_counts,
    target_case_counts,
    num_resources,
    event_groups,  # List of groups, e.g., [[0,1], [2,3]] where events in the same group must share a resource
    case_event_normalize, # use it to normalize the result (normalize = total case num / total event num)
    initial_temp=10000,
    cooling_rate=0.95,
    max_iter=1000,
):
    """Assign a resource to every event by simulated annealing.

    The cost of an assignment is, summed over all resources,
    ``(case_event_normalize * (events - target_events)) ** 2
    + (cases - target_cases) ** 2``. The weight on the event term is applied
    uniformly: to the initial cost and to both kinds of move.

    Parameters
    ----------
    case_ids : sequence
        Case identifier of every event; its length is the number of events.
    target_event_counts, target_case_counts : sequence
        Desired number of events / distinct cases per resource; each must
        have at least ``num_resources`` entries.
    num_resources : int
        Number of resources; resources are numbered 0 .. num_resources - 1.
    event_groups : list of list of int
        Event indices that must share a resource. All events of a group are
        assumed to belong to the case of the group's first event.
    case_event_normalize : float
        Weight bringing event-count errors to the scale of case-count errors.
    initial_temp, cooling_rate, max_iter
        Annealing schedule: the temperature starts at ``initial_temp`` and is
        multiplied by ``cooling_rate`` after each of the ``max_iter`` steps.

    Returns
    -------
    (list of int, number)
        The best assignment found (resource index per event) and its cost.
        When no move is possible (fewer than two resources, or nothing to
        move) the random initial assignment is returned.
    """
    num_events = len(case_ids)
    current_assignments = [0] * num_events

    # Initialize grouped events with the same resource
    for group in event_groups:
        r = random.randint(0, num_resources - 1)
        for event_idx in group:
            current_assignments[event_idx] = r

    # Initialize non-grouped events randomly
    all_grouped_events = {event_idx for group in event_groups for event_idx in group}
    non_group_events = [idx for idx in range(num_events) if idx not in all_grouped_events]
    for idx in non_group_events:
        current_assignments[idx] = random.randint(0, num_resources - 1)

    # Empty groups have nothing to move and no first event to inspect.
    movable_groups = [group for group in event_groups if group]
    event_weight = case_event_normalize ** 2

    # Track event counts and case assignments efficiently
    event_counts = np.zeros(num_resources, dtype=int)
    case_event_counts = [defaultdict(int) for _ in range(num_resources)]
    case_sets = [set() for _ in range(num_resources)]
    for i, r in enumerate(current_assignments):
        event_counts[r] += 1
        case_event_counts[r][case_ids[i]] += 1
        case_sets[r].add(case_ids[i])

    def squared_error_change(count, target, shift):
        # Change of (count - target) ** 2 when count moves by `shift`.
        return (count + shift - target) ** 2 - (count - target) ** 2

    # Compute initial cost
    current_cost = 0
    for r in range(num_resources):
        current_cost += event_weight * (event_counts[r] - target_event_counts[r]) ** 2
        current_cost += (len(case_sets[r]) - target_case_counts[r]) ** 2

    best_assignments = current_assignments.copy()
    best_cost = current_cost
    temperature = initial_temp

    # A move needs a different target resource and something to move.
    if num_resources < 2 or not (non_group_events or movable_groups):
        return best_assignments, best_cost

    for _ in range(max_iter):
        # Decide whether to modify a group or an ungrouped event. A single
        # event is handled as a group of size one, so both kinds of move
        # share the same cost bookkeeping.
        if non_group_events and (not movable_groups or random.random() < 0.5):
            moved = [random.choice(non_group_events)]
        else:
            moved = random.choice(movable_groups)

        old_r = current_assignments[moved[0]]
        new_r = random.choice([r for r in range(num_resources) if r != old_r])
        case_id = case_ids[moved[0]]
        moved_size = len(moved)

        # Calculate delta event cost
        delta_event = (
            squared_error_change(event_counts[old_r], target_event_counts[old_r], -moved_size)
            + squared_error_change(event_counts[new_r], target_event_counts[new_r], moved_size)
        )

        # old_r loses the case only if the moved events are all it has there;
        # new_r gains the case only if it has no event of that case yet.
        delta_case = 0
        if case_event_counts[old_r][case_id] == moved_size:
            delta_case += squared_error_change(len(case_sets[old_r]), target_case_counts[old_r], -1)
        if case_event_counts[new_r][case_id] == 0:
            delta_case += squared_error_change(len(case_sets[new_r]), target_case_counts[new_r], 1)

        total_delta = delta_event * event_weight + delta_case

        # Accept/reject
        if total_delta < 0 or random.random() < math.exp(-total_delta / temperature):
            # Update assignments and trackers for all moved events
            for event_idx in moved:
                current_assignments[event_idx] = new_r

            event_counts[old_r] -= moved_size
            event_counts[new_r] += moved_size

            case_event_counts[old_r][case_id] -= moved_size
            if case_event_counts[old_r][case_id] == 0:
                case_sets[old_r].remove(case_id)
            case_event_counts[new_r][case_id] += moved_size
            case_sets[new_r].add(case_id)

            current_cost += total_delta
            if current_cost < best_cost:
                best_assignments = current_assignments.copy()
                best_cost = current_cost

        temperature *= cooling_rate

    return best_assignments, best_cost


# case_ids = [0, 0, 1, 1, 2, 2]  # 6 events across 3 cases
# event_groups = [[0, 1], [2, 3], [4, 5]]  # Each group spans a case
# target_event_counts = [2, 2, 2]  # For 3 resources
# target_case_counts = [2, 2, 2]
# preprocess the outputs from Step 2 (the stored file), to make the data as the input for Step 3
def assign_res_for_a_single_log(log_name, res_rank_act_perc_dic, res_rank_case_perc_dic, file_p):
    """Convert the stored Step 2 output of one log into the inputs of Step 3.

    Reads ``<file_p>/<log_name>_middle_output.csv`` (columns 'whole case list',
    'pattern list', 'res list'). Events are numbered consecutively in file
    order, one event per activity of every pattern. Consecutive rows with the
    same case name form one case.

    Parameters
    ----------
    log_name : str
        Name of the log; selects the CSV file.
    res_rank_act_perc_dic, res_rank_case_perc_dic : dict
        Their values are the shares of events / cases per resource rank.
    file_p : str
        Directory holding the Step 2 output.

    Returns
    -------
    tuple
        ``(case_ids, event_groups, target_event_l, target_case_l,
        case_event_normalize)``: the case index of every event; the event
        indices that share a case-local resource label (one group per label
        and case); the truncated target event and case counts per resource;
        and ``number of cases / number of events``.

    Raises
    ------
    ValueError
        If the file contains no events.
    """
    # get the csv file path after the step 2
    file_path = f'{file_p}/{log_name}_middle_output.csv'
    data = pd.read_csv(file_path)
    # Patterns were written as the text form of Python lists.
    data['pattern list'] = data['pattern list'].apply(ast.literal_eval)

    case_ids, event_groups = [], []
    case_id = -1
    event_id = 0
    previous_case = None
    groups_of_case = {}  # case-local resource label -> event indices
    rows = zip(data['whole case list'], data['pattern list'], data['res list'])
    for case_name, pattern, res_label in rows:
        if case_id == -1 or case_name != previous_case:
            # A new case starts: store the groups of the finished case.
            event_groups.extend(groups_of_case.values())
            groups_of_case = {}
            case_id += 1
            previous_case = case_name

        group = groups_of_case.setdefault(res_label, [])
        for _ in pattern:
            case_ids.append(case_id)
            group.append(event_id)
            event_id += 1

    # Store the groups of the last case as well; without this its events
    # would not be tied together in Step 3.
    event_groups.extend(groups_of_case.values())

    # deal with the act and case dics
    event_num = event_id
    case_num = case_id + 1
    if event_num == 0:
        raise ValueError(f"no events found in {file_path}")
    case_event_normalize = case_num / event_num
    target_event_l = [int(event_num * per) for per in res_rank_act_perc_dic.values()]
    target_case_l = [int(case_num * per) for per in res_rank_case_perc_dic.values()]

    return case_ids, event_groups, target_event_l, target_case_l, case_event_normalize


# construct final output event logs using the assigned resource attributes
def add_resource_attr_to_csv(res_assignments, data, new_file_path, df_name, log_name, whether_TSI=False):
    """Write a copy of ``data`` with inferred resource columns to a CSV file.

    Any existing ``agent_id`` / ``agent_activity_type`` columns are dropped
    and replaced by the inferred ones: resource index ``r`` becomes the label
    ``'A{r + 1}'`` and ``agent_activity_type`` is ``'<label>|<activity_type>'``.
    Assignments are paired with the rows of ``data`` by position, and the
    caller's DataFrame is left unmodified.

    Args:
        res_assignments: One integer resource index per row of ``data``.
        data: Event DataFrame; must contain an ``activity_type`` column.
        new_file_path: Destination path of the CSV file.
        df_name: Log name used in the message printed when ``whether_TSI``
            is true.
        log_name: File name used in the message printed otherwise.
        whether_TSI: Selects which of the two status messages is printed.
    """
    # drop() returns a new frame even when nothing is removed, so adding the
    # columns below never mutates the caller's DataFrame
    output = data.drop(columns=["agent_id", "agent_activity_type"], errors="ignore")

    # add the attributes with assigned resources
    res_l = ['A' + str(r + 1) for r in res_assignments]
    # pair by position rather than by index label, so a non-default index works
    res_act_type_l = [res + "|" + act for res, act in zip(res_l, data['activity_type'])]

    output['agent_id'] = res_l
    output['agent_activity_type'] = res_act_type_l

    output.to_csv(new_file_path, index=False)

    if whether_TSI:
        print(f"{datetime.now().strftime('%H:%M:%S')} {df_name} Step 3 final outputs stored successfully")
    else:
        print(f"{log_name} stored successfully")


# Evaluate the three-step resource inference pipeline (functions)

In [90]:
# Evaluation methods

# the evaluation cell (the mapping approach)
# map each predicted resource to one existing (ground-truth) resource, then compute the accuracy under that mapping
def get_theory_one_information_from_log_1(mas_log):
    """Collect resource/activity statistics from an event log.

    ``mas_log`` is indexed as ``mas_log[trace][event]``; every event must
    provide the keys ``'agent_id'`` and ``'concept:name'``.

    Returns a 6-tuple:
        * ``{resource: {activity: count}}``
        * ``{activity: {resource: count}}``
        * ``{'Case_<trace index>': [activity, ...]}``
        * distinct resources in order of first appearance
        * distinct activities in order of first appearance
        * the resource of every event, in log order
    """
    tasks_by_resource = {}
    resources_by_task = {}
    tasks_by_case = {}
    distinct_resources = []
    distinct_tasks = []
    all_resources = []

    for trace_idx in range(len(mas_log)):
        trace = mas_log[trace_idx]
        trace_tasks = []
        for event_idx in range(len(trace)):
            event = trace[event_idx]
            resource = event['agent_id']
            all_resources.append(resource)
            task = event['concept:name']
            trace_tasks.append(task)

            # a resource/activity is new exactly when it has no counter dict yet
            if resource not in tasks_by_resource:
                distinct_resources.append(resource)
                tasks_by_resource[resource] = {}
            if task not in resources_by_task:
                distinct_tasks.append(task)
                resources_by_task[task] = {}

            per_resource = tasks_by_resource[resource]
            per_resource[task] = per_resource.get(task, 0) + 1

            per_task = resources_by_task[task]
            per_task[resource] = per_task.get(resource, 0) + 1

        tasks_by_case['Case_' + str(trace_idx)] = trace_tasks

    return (tasks_by_resource, resources_by_task, tasks_by_case,
            distinct_resources, distinct_tasks, all_resources)


def evaluation_approach_one(diff_res_list, pred_res_l, whole_res_l, pred_whole_l):
    """Print the resource prediction accuracy under a greedy one-to-one mapping.

    For every (predicted resource, ground-truth resource) pair the number of
    events carrying both labels is counted. Pairs are then visited from the
    highest count to the lowest (ties keep the order of ``pred_res_l`` and,
    within it, ``diff_res_list``); a pair is accepted when neither of its
    resources has been matched yet. The printed accuracy is the sum of the
    accepted counts divided by the number of events.

    Args:
        diff_res_list: Distinct ground-truth resources.
        pred_res_l: Distinct predicted resources.
        whole_res_l: Ground-truth resource of every event.
        pred_whole_l: Predicted resource of every event, aligned by position
            with ``whole_res_l``.

    Raises:
        KeyError: If an event carries a resource that is missing from
            ``pred_res_l`` / ``diff_res_list``.
        IndexError: If ``pred_whole_l`` is shorter than ``whole_res_l``.

    An empty ``whole_res_l`` prints an accuracy of ``0.0`` instead of
    dividing by zero.
    """
    # co-occurrence count for every (predicted, ground-truth) pair; the
    # insertion order fixes how ties are broken by the stable sort below
    pair_counts = {(pred, gd): 0 for pred in pred_res_l for gd in diff_res_list}
    for i in range(len(whole_res_l)):
        pair_counts[(pred_whole_l[i], whole_res_l[i])] += 1

    ranked_pairs = sorted(pair_counts.items(), key=lambda item: item[1], reverse=True)

    matched_pred, matched_gd = set(), set()
    correct_num = 0
    for (pred_res, gd_res), same_num in ranked_pairs:
        if pred_res in matched_pred or gd_res in matched_gd:
            continue
        correct_num += same_num
        matched_pred.add(pred_res)
        matched_gd.add(gd_res)
        # every ground-truth resource is mapped: nothing left to match
        if len(matched_gd) == len(diff_res_list):
            break

    event_num = len(whole_res_l)
    accuracy = correct_num / event_num if event_num else 0.0
    print(f"{' ' * 8} Resource prediction accuracy: {round(accuracy, 4)}")


def evaluation_approach_two(case_ids, whole_resource_l, best_assignments):
    """Print the handover accuracy over consecutive event pairs of a case.

    Every pair of neighbouring events that belong to the same case is
    inspected. The pair counts as correct when the prediction agrees with
    the ground truth on whether the resource stays the same or changes.

    Args:
        case_ids: Case index of every event.
        whole_resource_l: Ground-truth resource of every event.
        best_assignments: Predicted resource of every event.

    When no two neighbouring events share a case, an accuracy of ``0.0`` is
    printed instead of dividing by zero.
    """
    total_num = 0
    pred_acc_num = 0
    for i in range(len(case_ids) - 1):
        if case_ids[i] != case_ids[i + 1]:
            continue
        total_num += 1
        same_in_truth = whole_resource_l[i] == whole_resource_l[i + 1]
        same_in_pred = best_assignments[i] == best_assignments[i + 1]
        # correct when both say "same resource" or both say "handover"
        if same_in_truth == same_in_pred:
            pred_acc_num += 1

    accuracy = pred_acc_num / total_num if total_num else 0.0
    print(f"{' ' * 8} Event pair resource handover accuracy: {round(accuracy, 4)}")


def evaluation_pipeline(df_dic_with_time_test, dataset_name, case_ids, best_assignments):
    """Evaluate the inferred resources of one test log and print both metrics.

    Looks up ``dataset_name`` in ``df_dic_with_time_test`` (values are
    indexable, the DataFrame being element ``0``), renames the case,
    activity and timestamp columns to the ``case:concept:name`` /
    ``concept:name`` / ``time:timestamp`` names, converts the frame with
    ``log_converter.apply`` and runs ``evaluation_approach_one`` and
    ``evaluation_approach_two`` against ``best_assignments``.

    Nothing happens when ``dataset_name`` is not a key of the dictionary.
    """
    if dataset_name not in df_dic_with_time_test:
        return

    column_map = {
        'case_id': 'case:concept:name',
        'activity_type': 'concept:name',
        'timestamp': 'time:timestamp',
    }
    test_df = df_dic_with_time_test[dataset_name][0]
    unclustered_log = log_converter.apply(test_df.rename(columns=column_map))

    # distinct predicted resources, in order of first appearance
    distinct_pred_res = list(dict.fromkeys(best_assignments))

    log_info = get_theory_one_information_from_log_1(unclustered_log)
    distinct_true_res = log_info[3]
    true_res_per_event = log_info[5]

    evaluation_approach_one(distinct_true_res, distinct_pred_res, true_res_per_event, best_assignments)
    evaluation_approach_two(case_ids, true_res_per_event, best_assignments)

# Run the three-step resource inference pipeline and the evaluation

In [91]:
# run the three steps resource inference pipeline

# Driver cell: runs Phase 1 (knowledge extraction and model training on the
# training logs) followed by Phase 2 (Steps 1-3 on every test log) and prints
# a timestamped progress line around each stage.
# NOTE: `package_path` and every helper called below except
# assign_res_for_a_single_log, add_resource_attr_to_csv and
# evaluation_pipeline are not defined in this part of the file; they must
# already exist when this cell runs.
# NOTE: the names bound here (e.g. domain_3, domain_4, df_test_name_l) are
# module-level and are read again by the baseline cell further down.

# df_name_l is not read anywhere in this cell
df_name_l = list(df_dic_with_time_train.keys())
df_test_name_l = list(df_dic_with_time_test.keys())

print(f"{datetime.now().strftime('%H:%M:%S')} Three-step resource inference technique pipeline Start")
print()
# Phase 1: Extraction
print(f"{datetime.now().strftime('%H:%M:%S')} Phase 1: Extraction Start")
print()
# 1. extract Domain 1 from training logs
domain_1 = get_case_hand_knowledge(df_dic_with_time_train)
print(f"{datetime.now().strftime('%H:%M:%S')} Domain 1 extracted successfully")

# 2. extract Domain 2 from training logs
domain_2 = get_case_resource_knowledge(df_dic_with_time_train)
print(f"{datetime.now().strftime('%H:%M:%S')} Domain 2 extracted successfully")

# 3. extract Domain 3 and Domain 4 together
domain_3, domain_4 = get_aver_res_rank_act_and_case_from_train_set(df_dic_with_time_train)
print(f"{datetime.now().strftime('%H:%M:%S')} Domain 3 extracted successfully")
print(f"{datetime.now().strftime('%H:%M:%S')} Domain 4 extracted successfully")

# 4. train logistic regression classification model 1 (LRC1)
lrc_model_1 = train_datasets_store_in_dic(df_dic_with_time_train)
print(f"{datetime.now().strftime('%H:%M:%S')} Logistic regression model 1 trained successfully")

# 5. train logistic regression classification model 2 (LRC2)
lrc_model_2 = second_step_train_on_most_datasets(df_dic_with_time_train)
print(f"{datetime.now().strftime('%H:%M:%S')} Logistic regression model 2 trained successfully")

print()
print(f"{datetime.now().strftime('%H:%M:%S')} Phase 1: Extraction Done")
print()


# Phase 2: Inference
print(f"{datetime.now().strftime('%H:%M:%S')} Phase 2: Inference Start")
print()

# 1. Step 1 --- for all the test logs, we detect their handover and store the results in a dictionary
print(f"{datetime.now().strftime('%H:%M:%S')} Step 1 --- handover detection (for all test logs) Start")
handover_dic = handover_detection_for_test_dic(df_dic_with_time_test, domain_1, lrc_model_1)
print(f"{datetime.now().strftime('%H:%M:%S')} Step 1 --- handover detection (for all test logs) Done")
print()

# 2. Step 2 --- for all the test logs, we label the resources per case, and store the results in a file at a given path
print(f"{datetime.now().strftime('%H:%M:%S')} Step 2 --- resource labeling per case (for all test logs) Start")
create_event_df_after_step_two(df_dic_with_time_train, df_dic_with_time_test, handover_dic, domain_2, lrc_model_2, package_path)
print(f"{datetime.now().strftime('%H:%M:%S')} Step 2 --- resource labeling per case (for all test logs) Done")
print()

# 3. Step 3 --- for each test event log in the test set, we apply simulated annealing to infer resources
print(f"{datetime.now().strftime('%H:%M:%S')} Step 3 --- resource assignment for logs Start")
for i in range(len(df_test_name_l)):
    df_name = df_test_name_l[i]
    # element 0 of each dictionary value is the test DataFrame
    data = df_dic_with_time_test[df_name][0]
    print(f"{datetime.now().strftime('%H:%M:%S')} {df_name} resource assignment for the log Start")
    # reads {package_path}/{df_name}_middle_output.csv, i.e. the file Step 2 is expected to have written
    case_ids, event_groups, target_event_l, target_case_l, case_event_normalize = assign_res_for_a_single_log(df_name, domain_3, domain_4, package_path)
    # best_cost is not used afterwards in this cell
    best_assignments, best_cost = simulated_annealing_with_groups(case_ids, target_event_l, target_case_l, num_resources=num_resources, event_groups=event_groups, case_event_normalize=case_event_normalize, initial_temp=10000, cooling_rate=0.99, max_iter=1000000)
    new_file_path = f"{package_path}/{df_name}_final_TSI_resource_inferred_log.csv"
    # log_name is only printed by add_resource_attr_to_csv when whether_TSI is False,
    # so with whether_TSI = True it has no effect; note it also differs from the
    # file name actually written (new_file_path contains "_TSI_")
    log_name = f"{df_name}_final_resource_inferred_log.csv"
    whether_TSI = True
    add_resource_attr_to_csv(best_assignments, data, new_file_path, df_name, log_name, whether_TSI)
    print(f"{datetime.now().strftime('%H:%M:%S')} {df_name} resource assignment for the log Done")
    print(f"{datetime.now().strftime('%H:%M:%S')} {df_name} resource inferred log evaluation Start")
    # prints the resource prediction accuracy and the handover accuracy
    evaluation_pipeline(df_dic_with_time_test, df_name, case_ids, best_assignments)
    print(f"{datetime.now().strftime('%H:%M:%S')} {df_name} resource inferred log evaluation Done")
print(f"{datetime.now().strftime('%H:%M:%S')} Step 3 --- resource assignment for logs Done")

print()
print(f"{datetime.now().strftime('%H:%M:%S')} Phase 2: Inference Done")
print()
print(f"{datetime.now().strftime('%H:%M:%S')} Three-step resource inference technique pipeline Done")

22:24:51 Three-step resource inference technique pipeline Start

22:24:51 Phase 1: Extraction Start

22:24:57 Domain 1 extracted successfully
22:25:02 Domain 2 extracted successfully
22:25:13 Domain 3 extracted successfully
22:25:13 Domain 4 extracted successfully
22:25:29 Logistic regression model 1 trained successfully
22:25:58 Logistic regression model 2 trained successfully

22:25:58 Phase 1: Extraction Done

22:25:58 Phase 2: Inference Start

22:25:58 Step 1 --- handover detection (for all test logs) Start
22:25:58 bpic_2011 handover detection Start
22:28:38 bpic_2011 handover detection Done
22:28:38 Step 1 --- handover detection (for all test logs) Done

22:28:38 Step 2 --- resource labeling per case (for all test logs) Start
22:28:38 bpic_2011 resource labeling per case Start
22:32:13 bpic_2011 Step 2 middle outputs stored successfully
22:32:13 bpic_2011 resource labeling per case Done
22:32:13 Step 2 --- resource labeling per case (for all test logs) Done

22:32:13 Step 3 --- r

# Generate logs with inferred resources using the following two baseline approaches

In [92]:
# this cell is related to baseline approaches (we choose two baseline approaches)
# 1. randomly assigned
# 2. assigned based on resource-activity number distributions

# 1. randomly assigned

def baseline_randomly_assign(case_ids, num_resources):
    """Return one uniformly drawn resource index per event.

    The result has the same length as ``case_ids``; every entry is an
    integer in ``[0, num_resources - 1]`` drawn with ``random.randint``.
    """
    highest_index = num_resources - 1
    assigned = []
    for _event in case_ids:
        assigned.append(random.randint(0, highest_index))
    return assigned


# Baseline 1 driver: for every test log, assign a uniformly random resource to
# each event, print the two evaluation metrics and store the resulting log.
# Relies on df_test_name_l, df_dic_with_time_test, domain_3, domain_4,
# package_path and num_resources being defined by earlier cells.
for i in range(len(df_test_name_l)):
    df_name = df_test_name_l[i]
    print(f"{df_name} resource inference log with uniformly distribution start generating")
    data = df_dic_with_time_test[df_name][0]
    # only case_ids is used by this baseline; the other returned values are ignored here
    case_ids, event_groups, target_event_l, target_case_l, case_event_normalize = assign_res_for_a_single_log(df_name, domain_3, domain_4, package_path)
    randomly_res_l = baseline_randomly_assign(case_ids, num_resources=num_resources)
    evaluation_pipeline(df_dic_with_time_test, df_name, case_ids, randomly_res_l)
    new_file_path = f"{package_path}/{df_name}_final_uniformly_resource_inferred_log.csv"
    log_name = f"{df_name}_final_uniformly_resource_inferred_log.csv"
    # whether_TSI keeps its default (False), so the "<log_name> stored successfully" message is printed
    add_resource_attr_to_csv(randomly_res_l, data, new_file_path, df_name, log_name)
    print()


# 2. assigned based on resource-activity number distributions
def baseline_randomly_distribution_assign(case_ids, res_event_distri_l, num_resources):
    """Return one weighted-random resource index per event.

    Indices ``0 .. num_resources - 1`` are drawn with ``random.choices``,
    using ``res_event_distri_l`` as the weights, once for every entry of
    ``case_ids``.
    """
    return random.choices(
        list(range(num_resources)),
        weights=res_event_distri_l,
        k=len(case_ids),
    )


# Baseline 2 driver: for every test log, draw each event's resource at random
# with the per-resource target event counts as weights, print the two
# evaluation metrics and store the resulting log.
# Relies on df_test_name_l, df_dic_with_time_test, domain_3, domain_4,
# package_path and num_resources being defined by earlier cells.
for i in range(len(df_test_name_l)):
    df_name = df_test_name_l[i]
    print(f"{df_name} resource inference log based on resource-activity distribution start generating")
    data = df_dic_with_time_test[df_name][0]
    case_ids, event_groups, target_event_l, target_case_l, case_event_normalize = assign_res_for_a_single_log(df_name, domain_3, domain_4, package_path)
    # target_event_l (derived from domain_3) supplies the sampling weights;
    # random.choices needs exactly one weight per resource index
    random_distribution_assign_l = baseline_randomly_distribution_assign(case_ids, target_event_l, num_resources=num_resources)
    evaluation_pipeline(df_dic_with_time_test, df_name, case_ids, random_distribution_assign_l)
    new_file_path = f"{package_path}/{df_name}_final_randomly_distributed_resource_inferred_log.csv"
    log_name = f"{df_name}_final_randomly_distributed_resource_inferred_log.csv"
    # whether_TSI keeps its default (False), so the "<log_name> stored successfully" message is printed
    add_resource_attr_to_csv(random_distribution_assign_l, data, new_file_path, df_name, log_name)
    print()

bpic_2011 resource inference log with uniformly distribution start generating
         Resource prediction accuracy: 0.0521
         Event pair resource handover accuracy: 0.2273
bpic_2011_final_uniformly_resource_inferred_log.csv stored successfully

bpic_2011 resource inference log based on resource-activity distribution start generating
         Resource prediction accuracy: 0.2157
         Event pair resource handover accuracy: 0.2864
bpic_2011_final_randomly_distributed_resource_inferred_log.csv stored successfully

