# Алгоритм разбиения бизнесс процесса на подпроцессы по журналу событий основанный на кластеризации трасс
В данном блокноте представлен алгоритм разбиения бизнесс процесса на подпроцессы по журналу событий и его применение 
на журнале событий:
1. [BPI Challenge 2019](https://data.4tu.nl/articles/_/12715853/1)

## Оглавление:
1. [Подгатовка данных](#11-подгатовка-данных)
2. [Применение алгоритма](#12-применение-алгоритма)


In [1]:
import numpy as np
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
event_log_xes = pm4py.read_xes("../Data/BPI Challenge 2019/BPI Challenge 2019.xes")
event_log_csv = pm4py.convert_to_dataframe(event_log_xes)
event_log_csv.to_csv("../Data/BPI Challenge 2019/BPI Challenge 2019.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 251734/251734 [01:01<00:00, 4067.77it/s]


In [2]:
df = pd.read_csv("../Data/BPI Challenge 2019/BPI Challenge 2019.csv")
df.head(5)

Unnamed: 0,User,org:resource,concept:name,Cumulative net worth (EUR),time:timestamp,case:Spend area text,case:Company,case:Document Type,case:Sub spend area text,case:Purchasing Document,...,case:Vendor,case:Item Type,case:Item Category,case:Spend classification text,case:Source,case:Name,case:GR-Based Inv. Verif.,case:Item,case:concept:name,case:Goods Receipt
0,batch_00,batch_00,SRM: Created,298.0,2018-01-02 12:53:00+00:00,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,...,vendorID_0000,Standard,"3-way match, invoice before GR",NPR,sourceSystemID_0000,vendor_0000,False,1,2000000000_00001,True
1,batch_00,batch_00,SRM: Complete,298.0,2018-01-02 13:53:00+00:00,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,...,vendorID_0000,Standard,"3-way match, invoice before GR",NPR,sourceSystemID_0000,vendor_0000,False,1,2000000000_00001,True
2,batch_00,batch_00,SRM: Awaiting Approval,298.0,2018-01-02 13:53:00+00:00,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,...,vendorID_0000,Standard,"3-way match, invoice before GR",NPR,sourceSystemID_0000,vendor_0000,False,1,2000000000_00001,True
3,batch_00,batch_00,SRM: Document Completed,298.0,2018-01-02 13:53:00+00:00,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,...,vendorID_0000,Standard,"3-way match, invoice before GR",NPR,sourceSystemID_0000,vendor_0000,False,1,2000000000_00001,True
4,batch_00,batch_00,SRM: In Transfer to Execution Syst.,298.0,2018-01-02 13:53:00+00:00,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,...,vendorID_0000,Standard,"3-way match, invoice before GR",NPR,sourceSystemID_0000,vendor_0000,False,1,2000000000_00001,True


In [3]:
print("Размер журнала", df.shape)
print("Активности в журнале", df["concept:name"].unique())

code_case, unique_case = pd.factorize(df["case:concept:name"])
code_activity, unique_activity = pd.factorize(df["concept:name"])

print("Count of cases:", len(unique_case))
print("Count of activities:", len(unique_activity))

df["code_case"] = code_case
df["code_activity"] = code_activity


for_activity_df = df.groupby(df["code_case"])["concept:name"].agg(list).reset_index()
corpus = for_activity_df['concept:name'].to_list()


Размер журнала (1595923, 21)
Активности в журнале ['SRM: Created' 'SRM: Complete' 'SRM: Awaiting Approval'
 'SRM: Document Completed' 'SRM: In Transfer to Execution Syst.'
 'SRM: Ordered' 'SRM: Change was Transmitted' 'Create Purchase Order Item'
 'Vendor creates invoice' 'Record Goods Receipt' 'Record Invoice Receipt'
 'Clear Invoice' 'Record Service Entry Sheet'
 'SRM: Transfer Failed (E.Sys.)' 'Cancel Goods Receipt'
 'Vendor creates debit memo' 'Cancel Invoice Receipt'
 'Change Delivery Indicator' 'Remove Payment Block' 'SRM: Deleted'
 'Change Price' 'Delete Purchase Order Item' 'SRM: Transaction Completed'
 'Change Quantity' 'Change Final Invoice Indicator' 'SRM: Incomplete'
 'SRM: Held' 'Receive Order Confirmation' 'Cancel Subsequent Invoice'
 'Reactivate Purchase Order Item' 'Update Order Confirmation'
 'Block Purchase Order Item' 'Change Approval for Purchase Order'
 'Release Purchase Order' 'Record Subsequent Invoice' 'Set Payment Block'
 'Create Purchase Requisition Item' 'Cha

In [4]:
grouped_df = df.groupby(df["code_case"])["code_activity"].agg(list).reset_index()
grouped_df["len"] = grouped_df["code_activity"].apply(lambda x : len(x))
grouped_df_graph = grouped_df.drop(["code_case", "code_activity"], axis=1)
grouped_df_graph = grouped_df_graph.rename(columns={'len': 'Length of the trace'})

In [5]:
grouped_df["len"].max()

990

In [6]:
mean_length = grouped_df_graph["Length of the trace"].mean()
mode_length = grouped_df_graph["Length of the trace"].median()
print("Средняя длинна слова:", mean_length)
print("Самая часто встречающаяся длинна слова:", mode_length)

Средняя длинна слова: 6.33971970413214
Самая часто встречающаяся длинна слова: 5.0


#### 2. Применение алгоритма

In [10]:
import LogSplitter
import Metrics

model = LogSplitter.LogSplitter(2)
model.fit(corpus)

max_dif, min_dif, average = 0, 1000000000, 0
for i in range(10, 100, 10):
    temp_dif = 0
    for j in range(5):

        model.transform(for_activity_df, name_id='code_case', name_traces='concept:name', trace_parts=i/100)
        temp_dif += abs( model.class_to_is[0] - model.class_to_is[1])
    
    temp_dif /= 5

    print(i, model.len_part, temp_dif)
    max_dif = max(max_dif, temp_dif)
    min_dif = min(min_dif, temp_dif)
    average += temp_dif

print("min_dif:", min_dif)
print("max_dif:", max_dif)
print("average_dif:", average / 9)

10 1 202123.6
20 1 195657.2
30 1 191660.8
40 2 182608.8
50 2 186523.2
60 3 135965.2
70 3 115496.8
80 4 157652.0
90 4 171568.4
min_dif: 115496.8
max_dif: 202123.6
average_dif: 171028.44444444444


In [None]:
for i in range(2, 5):
    model = LogSplitter.LogSplitter(i)
    model.fit(corpus)

    model.transform(for_activity_df, name_id='code_case', name_traces='concept:name', trace_parts=30/100)
    model.save_as_csv(df, "../Test/BPI Challenge 2019/" + str(i) + "/1/")

    model.transform(for_activity_df, name_id='code_case', name_traces='concept:name', trace_parts=70/100)
    model.save_as_csv(df, "../Test/BPI Challenge 2019/" + str(i) + "/2/")

In [12]:
metric = Metrics.Metric()
result = pd.DataFrame(columns=['Metric', '1', '2', '3', '4'])
e_cardoso_average = list()
e_cardoso_w_average = list()
pt_cd_average = list()
pt_cd_w_average = list()

e_cardoso_average.append("Average E-Cardoso")
e_cardoso_w_average.append("Weighted average E-Cardoso")
pt_cd_average.append("Average PT/CD")
pt_cd_w_average.append("Weighted average PT/CD")

t_df = [pd.read_csv("../Data/BPI Challenge 2019/BPI Challenge 2019.csv") for _ in range(1)]
t_df[0]["time:timestamp"] = pd.to_datetime(t_df[0]["time:timestamp"], format='ISO8601')
ans, e_average, e_w_average = metric.e_cardoso(t_df)
ans, pt_average, pt_w_average = metric.pt_cd(t_df)

e_cardoso_average.append(e_average.copy())
e_cardoso_w_average.append(e_w_average.copy())
pt_cd_average.append(pt_average.copy())
pt_cd_w_average.append(pt_w_average.copy())


for i in range(2, 5):
    t_df = [pd.read_csv("../Test/BPI Challenge 2019/" + str(i) + "/1/" + str(j) + ".csv") for j in range(i)]
    for temp in t_df:
        temp["time:timestamp"] = pd.to_datetime(temp["time:timestamp"], format='ISO8601')
    ans, e_average, e_w_average = metric.e_cardoso(t_df)
    ans, pt_average, pt_w_average = metric.pt_cd(t_df)

    e_cardoso_average.append(e_average.copy())
    e_cardoso_w_average.append(e_w_average.copy())
    pt_cd_average.append(pt_average.copy())
    pt_cd_w_average.append(pt_w_average.copy())


In [13]:
result.loc[len(result)] = e_cardoso_average
result.loc[len(result)] = e_cardoso_w_average
result.loc[len(result)] = pt_cd_average
result.loc[len(result)] = pt_cd_w_average

In [14]:
result

Unnamed: 0,Metric,1,2,3,4
0,Average E-Cardoso,76.0,67.5,46.0,67.75
1,Weighted average E-Cardoso,76.0,67.5625,52.368932,73.606218
2,Average PT/CD,2.945131,2.855158,2.944638,2.709685
3,Weighted average PT/CD,2.945131,2.849698,2.900019,2.722831


In [23]:
metric = Metrics.Metric()
result = pd.DataFrame(columns=['Metric', '1', '2', '3', '4'])

e_cardoso_average.append("Average E-Cardoso")
e_cardoso_w_average.append("Weighted average E-Cardoso")
pt_cd_average.append("Average PT/CD")
pt_cd_w_average.append("Weighted average PT/CD")

t_df = [pd.read_csv("../Data/BPI Challenge 2019/BPI Challenge 2019.csv") for _ in range(1)]
t_df[0]["time:timestamp"] = pd.to_datetime(t_df[0]["time:timestamp"], format='ISO8601')
ans, e_average, e_w_average = metric.e_cardoso(t_df)
ans, pt_average, pt_w_average = metric.pt_cd(t_df)

e_cardoso_average.append(e_average.copy())
e_cardoso_w_average.append(e_w_average.copy())
pt_cd_average.append(pt_average.copy())
pt_cd_w_average.append(pt_w_average.copy())


for i in range(2, 5):
    t_df = [pd.read_csv("../Test/BPI Challenge 2019/" + str(i) + "/2/" + str(j) + ".csv") for j in range(i)]
    for temp in t_df:
        temp["time:timestamp"] = pd.to_datetime(temp["time:timestamp"], format='ISO8601')
    ans, e_average, e_w_average = metric.e_cardoso(t_df)
    ans, pt_average, pt_w_average = metric.pt_cd(t_df)

    e_cardoso_average.append(e_average.copy())
    e_cardoso_w_average.append(e_w_average.copy())
    pt_cd_average.append(pt_average.copy())
    pt_cd_w_average.append(pt_w_average.copy())

In [24]:
result.loc[len(result)] = e_cardoso_average[5:]
result.loc[len(result)] = e_cardoso_w_average[5:]
result.loc[len(result)] = pt_cd_average[5:]
result.loc[len(result)] = pt_cd_w_average[5:]

In [25]:
result

Unnamed: 0,Metric,1,2,3,4
0,Average E-Cardoso,76.0,113.0,70.666667,84.25
1,Weighted average E-Cardoso,76.0,113.096774,72.134228,84.699187
2,Average PT/CD,2.945131,2.714289,2.865135,2.745701
3,Weighted average PT/CD,2.945131,2.714244,2.8666,2.741916
