In [20]:
import numpy as np
import networkx as nx
import itertools as it

def clusterDetection(G, threshold):
    """
    clusters the variants based on a given threshold; to do so, edges with a weight above the threshold are deleted from the given graph respresenting the optimal mappings

    :param threshold: the variant threshold the algorithm should use
    :return: list of subgraphs where each subgraph represents a cluster of variants

    """

    edges = list(G.edges(data=True))
    
    # remove the edges above the threshold (and below 0)
    for node1, node2, weight in edges:
        if weight['weight'] > threshold or weight['weight'] < 0:
            G.remove_edge(node1, node2)

    # get the subgraphs of the graph created this way
    subgraphNodes= nx.k_edge_subgraphs(G, k=1)
    subgraphs=[G.subgraph(nodes) for nodes in subgraphNodes]
    return subgraphs



def horizontalRefinement(candidateLabels, graphList):
    """
    Performs horizontal relabelling of event labels within a cluster; each event that belongs to the candidate labels will get a unique new label per cluster

    :param candidateLabels: s list of lsbels that should be refined
    :param graphList: a list of subgraphs where each subgraph represents a cluster of variants
    :return: s list of refined subgraphs, where the attribute 'newLabel' is changed for each candidate label, such that the event labels are unique per cluster
    """

    counter=1
    for subgraph in graphList:
        for label in candidateLabels:
            for node, dict in list(subgraph.nodes(data=True)):
                if dict['curLabel'] == label:
                    dict['newLabel'] += str(counter)
        counter += 1

    return graphList






In [21]:
G = nx.Graph()
G.add_nodes_from([(1, {'curLabel':'a', 'newLabel':'a'}),(2,{'curLabel':'b', 'newLabel':'b'}), (3,{'curLabel':'c', 'newLabel':'c'}),(4,{'curLabel':'b', 'newLabel':'b'}), (5,{'curLabel':'a', 'newLabel':'a'})])
G.add_edges_from([(1, 3, {'weight': 0}), (4, 5, {'weight': 0}), (2, 4, {'weight': 0.05}), (1,5,{'weight':0.7})])

subgraphs=clusterDetection(G,0.5)
print(subgraphs[0].nodes())
print(subgraphs[1].nodes(), "\n")

graphs=horizontalRefinement(['a','b'], subgraphs)
print(graphs[0].nodes(data=True))
print(graphs[1].nodes(data=True))


[1, 3]
[2, 4, 5] 

[(1, {'curLabel': 'a', 'newLabel': 'a1'}), (3, {'curLabel': 'c', 'newLabel': 'c'})]
[(2, {'curLabel': 'b', 'newLabel': 'b2'}), (4, {'curLabel': 'b', 'newLabel': 'b2'}), (5, {'curLabel': 'a', 'newLabel': 'a2'})]


In [252]:
def createConnectedComponents(graphList, candidateLabels):

    """
    creates the connected components for each imprecise label candidate

    :param graphList: a list of graphs representing each cluster
    :param candidateLabels: a list of labels that should be refined
    :return: list of event names with their corresponding connected components, i.e., a list of tuples (event name, nodes), where nodes is a list containing the IDs of the nodes in that component

    """

    connectedComponents=[]
    for label in candidateLabels:
        components=[]
        for subgraph in graphList:
            #only consider edges between different variants
            edges=[]
            allEdges=list(subgraph.edges(data=True))
            for v,w,weight in allEdges:
                if weight['weight']>-1:
                    edges.append((v,w))
                        
            # create the connected components
            for node, dict in list(subgraph.nodes(data=True)):
                if dict['curLabel'] == label:
                    for component in components:
                        for nodes in component:
                            if (nodes, node) in edges:
                                component.append(node)
                                break
                        else: 
                            continue
                        break

                    else:                         
                        components.append([node])
                    
        connectedComponents.append([label, components])

    return connectedComponents

def getMaxSizes(connectedComponents):
    """
    gives the maximal sizes of the components for each label with respect to the number of events

    :param connectedComponents: list of components that consider of lists [label, event IDs]
    :return: the maximal size of the components for each label
    """

    sizes=[]
    for label,components in connectedComponents:
        maxSize = 0
        for component in components:
            if len(component[0]) > maxSize:
                maxSize=len(component[0])
        sizes.append([label, maxSize])
    return sizes



In [253]:
comps=createConnectedComponents(graphs, ['a','b'])
print(comps)

#sizes=getMaxSizes(comps)
print("\n",sizes)

[['a', [[1], [5]]], ['b', [[3, 4]]]]

 [['a', 1], ['b', 2]]


In [270]:
def inSameVariant(component1, component2, variants):
    """
    Auxiliary function to determine if two connected components corresponding to the same label are in the same variant (according to section 5.4 in the paper)
    
    :param component1: first component given as a list of event IDs
    :param component2: second component given as a list of event IDs
    :param variants: list of variants given as a list of tuples (eventID, event label), i.e., a list of lists of tuples
    :returns: boolean; True if the connected components are in the same variant, False otherwise
    """
    for event1 in component1:
        for event2 in component2:
            for variant in variants:
                contains1=False
                contains2=False
                for eventID,_ in variant:
                    if event1==eventID:
                        contains1=True
                    if event2==eventID:
                        contains2=True
                if contains1 and contains2:
                    return True
        
    return False
            

In [271]:
import itertools as it

def ComponentsInSameVariant(connectedComponents, variants):
    newConnectedComponents=[]

    for component in connectedComponents:
        label=component[0]
        sameVariant=component[1]
        noMatch=[]
        for a, b in it.combinations(sameVariant, 2):
            if inSameVariant(a, b, variants):
                continue
            else: 
                sameVariant.remove(b)
                noMatch.append(b)
        newConnectedComponents.append([label, sameVariant])
        if len(noMatch)>1:
            newConnectedComponents.append(ComponentsInSameVariant([[label,noMatch]], variants))
        if len(noMatch)==1:
            newConnectedComponents.append([label, noMatch])


    return newConnectedComponents

In [272]:
print(comps)

sameVarcomp=ComponentsInSameVariant(comps, [[[1,'a'],[2,'b'],[3,'c']],[[4,'b'],[5,'a']]])
print(sameVarcomp)

[['a', [[1]]], ['b', [[3, 4]]]]
[['a', [[1]]], ['b', [[3, 4]]]]


In [273]:
inSameVariant([1,5],[2,4], [[[1,'a'],[2,'b'],[3,'c']],[[4,'b'],[5,'a']]])

True

In [274]:
def getPosition(event, variants):
    """
    Auxiliary function to determine the position of an event given its event ID    
    
    :param event: event ID of the event we need the position of
    :param variants: list of variants given as a list of tuples (eventID, event label), i.e., a list of lists of tuples
    :returns: integer representing the position of the event in its corresponding variant
    """

    for variant in variants:
        for eventID,_ in variant:
            if event==eventID:
                return event-variant[0][0]+1
        
  

In [275]:
getPosition(5, [[[1,'a'],[2,'b'],[3,'c']],[[4,'b'],[5,'a']]])

2

In [276]:
import numpy as np
def getAveragePositions(connectedComponents, variants):
    averagePositions=[]
    for Labelcomponents in connectedComponents:
        label=Labelcomponents[0]
        components=Labelcomponents[1]
        averages=[]
        for component in components:
            positions = []
            for event in component:
                positions.append(getPosition(event, variants))                    
            averages.append([component, np.average(positions)])
        
        averages = sorted(averages, key= lambda x: x[1])

        averagePositions.append([label, averages])
    return averagePositions


In [277]:
print(comps)
print(getAveragePositions(comps,[[[0, 'a'], [1,'a'],[2,'b'],[3,'c']],[[4,'b']],[[5,'a']]] ))

[['a', [[1]]], ['b', [[3, 4]]]]
[['a', [[[1], 2.0]]], ['b', [[[3, 4], 2.5]]]]


In [298]:
def verticalRefinement(graphList, candidateLabels, variants, threshold):
    
    # first, get the connected components
    connectedComponents=createConnectedComponents(graphList, candidateLabels)
    
    # then, get the connected components in the same variant
    sameVariants=ComponentsInSameVariant(connectedComponents, variants)

    # get the sorted components with their average position
    averagePositions=getAveragePositions(sameVariants, variants)

    # get the maximal sizes for the connected components in the same variant
    maxSizes=getMaxSizes(averagePositions)

    # start the refinement for each set of connected components in the same variant
    counter=0
    lengthComponents=len(averagePositions)
    for i in range(lengthComponents):
        pairs=averagePositions[i]
        label=pairs[0]        
        eventPosition=pairs[1]
        maxSize=maxSizes[i]
        length=len(eventPosition)

        for j in range(length):
            if j==0 or len(eventPosition[j][1])>= threshold*maxSize:
                counter +=1
                
            for subgraph in graphList:
                for node, dict in list(subgraph.nodes(data=True)):
                    for event in eventPosition[j][0]:

                        if node==event:
                            dict['newLabel'] += "."+str(counter)

    return graphList
            
                
            


In [299]:
G = nx.Graph()
G.add_nodes_from([(1, {'curLabel':'a', 'newLabel':'a'}),(3,{'curLabel':'b', 'newLabel':'b'}), (2,{'curLabel':'c', 'newLabel':'c'}),(4,{'curLabel':'b', 'newLabel':'b'}), (5,{'curLabel':'a', 'newLabel':'a'})])
G.add_edges_from([(1, 2, {'weight': 0}), (4, 5, {'weight': 0}), (3, 4, {'weight': 0.05}), (1,5,{'weight':0.7})])

subgraphs=clusterDetection(G,0.5)

graphs=horizontalRefinement(['a','b'], subgraphs)


graphs1=verticalRefinement(subgraphs, ['a','b'], [[(1,'a'),(2,'c')],[(4,'b'),(5,'a')],[(3,'b')]], 0.6)

print(graphs1[0].nodes(data=True))
print(graphs1[1].nodes(data=True))

[(1, {'curLabel': 'a', 'newLabel': 'a1.1'}), (2, {'curLabel': 'c', 'newLabel': 'c'})]
[(3, {'curLabel': 'b', 'newLabel': 'b2.3'}), (4, {'curLabel': 'b', 'newLabel': 'b2.3'}), (5, {'curLabel': 'a', 'newLabel': 'a2.2'})]
