In [24]:
import pm4py
import pandas as pd
import os
import numpy as np
import itertools as it
from pm4py.objects.log.importer.xes import factory as xes_importer
from itertools import combinations
from operator import itemgetter
from pm4py.algo.filtering.log.variants import variants_filter

log = xes_importer.apply("C:/Users/Bianka/Documents/chapter_1/roadtraffic50traces.xes")

In [25]:
def lookUpTable(log):
    variant=variants_filter.get_variants_from_log_trace_idx(log, parameters=None)
    return variant

# get the variants from the dicctionary created with lookUpTable() and omit the IDs
# input: dicctionary with variants as keys and case IDs as values
# stored as a list containing the variants as lists (-> list of lists)
def getVariants(dicctionary):
    variants=list(dicctionary.keys())
    return [variant.split(',') for variant in variants]

## Calculating mappings

In [50]:
def createEventIDs(variants=[]):
    seq = it.count()
    return [[(next(seq),event) for event in variant] for variant in variants]


def common_labels(variant1, variant2):
    var1 = [y[1] for (x, y) in enumerate(variant1)]
    var2 = [y[1] for (x, y) in enumerate(variant2)]
    return list(set(var1).intersection(var2))



def getNumberOfCommonLabels(variant1=[], variant2=[]):
    return len(common_labels(variant1,variant2))



def get_positions_label(string, variant):
    positions = []
    i = 0
    for x,y in enumerate(variant):
        if y[1] == string:
            positions.insert(i, y[0])
            i +=1
    return positions

#Args: variant1, variant2 as a list of tuples from createEventIDs(variants)
#Returns: list of all possible mappings for variant1 and variant2 where each mapping is a set of matched pairs
def possibleMappings(variant1, variant2):
    matches = [(a,c) for (a,b) in variant1 for (c,d) in variant2 if b == d]
    n = getNumberOfCommonLabels(variant1, variant2)
    
    return [set(combi) for combi in it.combinations(matches, n)
                        if len(set(it.chain.from_iterable(combi))) == (2*n)]

#Args: set of candidate labels for refinement, list of all trace variants in event log
#Returns: a list with all event IDs whose label is in the candidate set, we use this later for edge creation in the graph
def positions_of_candidates(candidates, variants):
    positions_of_candidates = []
    for variant in variants:
        labels = set(map(itemgetter(1), variant))
        for label in labels:
            if label in candidates:
                positions = get_positions_label(label, variant)
                positions_of_candidates.extend(positions)
    return positions_of_candidates

In [62]:
orig_variants = lookUpTable(log)
print(orig_variants) #variants as set with ID
variants_withoutID = getVariants(orig_variants) #a list containing lists corresponding to variants (without IDs)
print("Variants: \n", variants_withoutID)
variants = createEventIDs(variants_withoutID)
print("Variants with event IDs: \n", variants)

{'Create Fine,Send Fine': [0, 12, 27, 31, 36, 48, 49], 'Create Fine,Send Fine,Insert Fine Notification,Add penalty,Send for Credit Collection': [1, 4, 7, 9, 10, 11, 13, 15, 16, 17, 19, 21, 22, 26, 30, 32, 35, 42, 43, 44, 47, 50], 'Create Fine,Send Fine,Insert Fine Notification,Add penalty,Payment': [2, 14, 25, 29, 41, 45, 46], 'Create Fine,Send Fine,Insert Fine Notification,Insert Date Appeal to Prefecture,Add penalty,Send Appeal to Prefecture': [3], 'Create Fine,Payment': [5, 6, 18, 20, 23, 24, 33, 34, 37, 38, 39, 40], 'Create Fine,Send Fine,Insert Fine Notification,Add penalty,Payment,Payment': [8, 28]}
Variants: 
 [['Create Fine', 'Send Fine'], ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Add penalty', 'Send for Credit Collection'], ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Add penalty', 'Payment'], ['Create Fine', 'Send Fine', 'Insert Fine Notification', 'Insert Date Appeal to Prefecture', 'Add penalty', 'Send Appeal to Prefecture'], ['Create Fine', 'Pa

In [63]:
var1 = variants[2]
var2 = variants[5]
print("Variant1: \n", var1)
print("Variant2: \n", var2)

Variant1: 
 [(7, 'Create Fine'), (8, 'Send Fine'), (9, 'Insert Fine Notification'), (10, 'Add penalty'), (11, 'Payment')]
Variant2: 
 [(20, 'Create Fine'), (21, 'Send Fine'), (22, 'Insert Fine Notification'), (23, 'Add penalty'), (24, 'Payment'), (25, 'Payment')]


In [64]:
candidates = ["Send Fine", "Payment"]
#positions_of_candidates(candidates, eventIDs)
print("Pos. of candidates ", candidates, "are: \n", positions_of_candidates(candidates,eventIDs))

Pos. of candidates  ['Send Fine', 'Payment'] are: 
 [1, 3, 11, 8, 13, 19, 24, 25, 21]


In [65]:
get_positions_label("Send Fine", var1)

[8]

In [68]:
maps = possibleMappings(var1, var2)
print("Possible mappings between Variant1 and Variant2: \n", maps)

Possible mappings between Variant1 and Variant2: 
 [{(7, 20), (9, 22), (8, 21), (11, 24), (10, 23)}, {(7, 20), (9, 22), (8, 21), (10, 23), (11, 25)}]


## Cost function

In [178]:
def costStructure(variant1, variant2, mapping):
    cost_structure = 0
    combis = list(combinations(mapping, 2)) 
    for (pair1, pair2) in combis:
            distance1 = abs(pair1[0] - pair2[0])
            distance2 = abs(pair1[1] - pair2[1])
            cost_structure += abs(distance1 - distance2)/2
    return cost_structure


def context(variant):
    predecessors_list = []
    successors_list = []
    predecessors = []
    successors = []
    empty = []
    rest = list(map(itemgetter(1), variant[1:]))
    predecessors_list.insert(0,empty)
    successors_list.insert(0,rest)
    for index in range(1,len(variant)):
        pred_before = predecessors_list[index-1]
        succ_before = successors_list[index-1]
        last_label = [variant[index-1][1]]
        current_label = variant[index][1]
        #predecessors of current label are the predecessors of the last label plus last label
        predecessors_list.insert(index, pred_before + last_label)
        s_temp = succ_before.copy()
        s_temp.remove(current_label)
        #successors of current label are the successors of the last label minus current label
        successors_list.insert(index, s_temp) 
    for elem in predecessors_list:
        predecessors.append(set(elem))
    for elem2 in successors_list:
        successors.append(set(elem2))
        
    return predecessors, successors


def costNoMatch(variant1, variant2, mapping):
    mapped = set(common_labels(variant1, variant2)) #set of labels that were mapped
    unmapped1 = set(map(itemgetter(1), variant1)).difference(mapped) #set of unmapped labels in variant1
    unmapped2 = set(map(itemgetter(1), variant2)).difference(mapped) #set of unmapped labels in variant2
    firstId1 = variant1[0][0]
    firstId2 = variant2[0][0]
    pred1, succ1 = context(variant1)
    pred2, succ2 = context(variant2)
    np1 = 0
    ns1 = 0
    np2 = 0
    ns2 = 0
    for unmapped_label1 in unmapped1:
        positions1 = [x-firstId1 for x in get_positions_label(unmapped_label1, variant1)]
        for p1 in positions1:
            np1 += len(pred1[p1])
            ns1 += len(succ1[p1])
    for unmapped_label2 in unmapped2:
        positions2 = [x-firstId2 for x in get_positions_label(unmapped_label2, variant2)]
        for p2 in positions2:
            np1 += len(pred2[p2])
            ns1 += len(succ2[p2])
    sum = np1+np2+ns1+ns2
    return sum


def costMatched(variant1, variant2, mapping):
    firstId1 = variant1[0][0]
    firstId2 = variant2[0][0]
    pred1, succ1 = context(variant1)
    pred2, succ2 = context(variant2)
    sum = 0
    for pair in mapping:
        p1 = pair[0]-firstId1
        #print("Predecessors on pos ", p1, ": \n", pred1[p1])
        #print("Successors on pos ", p1, ": \n",succ1[p1])
        p2 = pair[1]-firstId2
        #print("Predecessors on pos ", p2, ": \n", pred2[p2])
        #print("Successors on pos ", p1, ": \n", succ2[p2])
        sum += len(pred1[p1])+len(pred2[p2])-2*len(pred1[p1].intersection(pred2[p2])) #number of distinct predecessors
        sum += len(succ1[p1])+len(succ2[p2])-2*len(succ1[p1].intersection(succ2[p2])) #number of distinct successors
    return sum  



def costMapping(wm,ws,wn,variant1,variant2,mapping):
    cost_struc = costStructure(variant1, variant2, mapping)
    cost_nomatch = costNoMatch(variant1, variant2, mapping)
    cost_match = costMatched(variant1, variant2, mapping)
    return wm*cost_match + ws*cost_struc + wn*cost_nomatch



def optimalMapping(variants, variant1, variant2, matrixx, wm, ws, wn):
    pos_variant1 = variants.index(variant1)
    pos_variant2 = variants.index(variant2)
    possible_mappings = possibleMappings(variant1, variant2)
    if possible_mappings != []:
        best_mapping = possible_mappings[0]
        cost_best = costMapping(wm,ws,wn,variant1,variant2,best_mapping)
        for mapping in possible_mappings:
            cost_new = costMapping(wm,ws,wn,variant1,variant2,mapping)
            if cost_new < cost_best:
                best_mapping = mapping
                cost_best = cost_new
        matrixx[pos_variant1, pos_variant2] = cost_best #entry ij in matrix updated with best cost
        matrixx[pos_variant2, pos_variant1] = cost_best #entry ji in matrix updated with best cost
        #bestMappings.append((best_mapping,cost_best))
    else:
        matrixx[pos_variant1, pos_variant2] = -42 #entry ij in matrix updated with best cost
        matrixx[pos_variant2, pos_variant1] = -42 #entry ji in matrix updated with best cost
    return best_mapping, cost_best




#Args: variant as a list of tuples, where variant = [(EventID,"Label")...]
#       k as integer for the k-predecessors/successors
def context2(variant,k):
    p, s = [], []
    n = len(variant)
    
    for i in range(n):
        s.append(set([b for (a,b) in variant[i+1:i+k+1]]))
        p.append(set([b for (a,b) in variant[i-n-k:i-n]]))

    return p,s

In [179]:
mapping12 = maps[1]
print("Mapping chosen to calculate costs: \n", mapping12)

Mapping chosen to calculate costs: 
 {(7, 20), (9, 22), (8, 21), (10, 23), (11, 25)}


In [180]:
print("CostStruc:", costStructure(var1,var2,mapping12))
print("CostNoMatch:", costNoMatch(var1,var2,mapping12))
print("CostMatched:", costMatched(var1,var2,mapping12))

CostStruc: 2.0
CostNoMatch: 0
CostMatched: 1


In [181]:
var3 = variants[0]
mapping13 = possibleMappings(var1,var3)[0]
print("Variant3:", var3)
print("Mapping for Variant1 and variant3:", mapping13)
print("CostStuc:", costStructure(var1,var3,mapping13))
print("CostNoMatch:", costNoMatch(var1,var3,mapping13))
print("CostMatched:", costMatched(var1,var3,mapping13))
print("Cost of mapping:", costMapping(1,1,1,var1,var3,mapping13))

Variant3: [(0, 'Create Fine'), (1, 'Send Fine')]
Mapping for Variant1 and variant3: {(8, 1), (7, 0)}
CostStuc: 0.0
CostNoMatch: 12
CostMatched: 6
Cost of mapping: 18.0


## Calculating best mappings for all pairs and saving best costs in a matrix

In [174]:
count = len(variants) 
C = np.zeros((count,count)) 
C

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [182]:
bestMappings = [] #list containing all best mappings

all_pairs = list(combinations(variants, 2))
for pair in all_pairs:
    optimal = optimalMapping(variants, pair[0],pair[1],C,1,1,1)
    best_mapping = optimal[0]
    best_cost = optimal[1]
    bestMappings.append((best_mapping,best_cost))
    
maxCost = np.amax(C)
print("MaxCost used for normalization:", maxCost)
C = C/maxCost
print("Cost Matrix C: \n", C)
print("No of mappings:", len(bestmappings))

print(bestmappings)

MaxCost used for normalization: 33.5
Cost Matrix C: 
 [[0.         0.53731343 0.53731343 0.8358209  0.11940299 0.71641791]
 [0.53731343 0.         0.47761194 0.82089552 0.65671642 0.65671642]
 [0.53731343 0.47761194 0.         0.82089552 0.58208955 0.02985075]
 [0.8358209  0.82089552 0.82089552 0.         0.95522388 1.        ]
 [0.11940299 0.65671642 0.58208955 0.95522388 0.         0.6119403 ]
 [0.71641791 0.65671642 0.02985075 1.         0.6119403  0.        ]]
No of mappings: 15
[({(1, 3), (0, 2)}, 18.0), ({(1, 8), (0, 7)}, 18.0), ({(1, 13), (0, 12)}, 28.0), ({(0, 18)}, 4), ({(1, 21), (0, 20)}, 24.0), ({(2, 7), (5, 10), (4, 9), (3, 8)}, 16.0), ({(4, 14), (2, 12), (5, 16), (3, 13)}, 27.5), ({(2, 18)}, 22), ({(4, 22), (5, 23), (2, 20), (3, 21)}, 22.0), ({(7, 12), (10, 16), (8, 13), (9, 14)}, 27.5), ({(11, 19), (7, 18)}, 19.5), ({(7, 20), (9, 22), (8, 21), (11, 24), (10, 23)}, 1.0), ({(12, 18)}, 32), ({(12, 20), (13, 21), (16, 23), (14, 22)}, 33.5), ({(19, 24), (18, 20)}, 20.5)]
