In [5]:
%run load_data.py
import sys
import multiprocessing as mp
import numpy as np
import scipy.sparse.linalg as splin
import scipy.sparse as sparse
import random
import math

np.random.seed(42)
random.seed(42)

In this document, the methods described in the methods section are implemented for the network of verdicts. The data is loaded as a `networkx` directed graph, which makes it relatively easy to work with. The goal is to set up an easy-to-use interface for running k-fold cross-validation on the network with different link prediction algorithms, and to evaluate them with ROC (AUC) and precision.

In [6]:
# Restrict the analysis to the greatest connected component (GCC).
# Connectivity is evaluated on an undirected copy of G, so edge direction is ignored here.
components = list(nx.connected_component_subgraphs(nx.Graph(G)))
lengths = [len(component) for component in components]
# Take the first component of maximal size and project its node set back onto the directed graph.
GCC = G.subgraph(components[lengths.index(max(lengths))].nodes())

In [164]:
def successors_scoped(node, head, G):
    """
    List the successors of a node that are dated no later than the head node.

    arguments:
    node -- node whose successors are listed
    head -- node whose 'date' attribute is the (inclusive) cut-off
    G -- graph containing the nodes

    returns:
    list of the successors of node whose 'date' is <= the 'date' of head
    """
    cutoff = G.node[head]['date']
    scoped = []
    for candidate in G.successors(node):
        if G.node[candidate]['date'] <= cutoff:
            scoped.append(candidate)
    return scoped

def predecessors_scoped(node, head, G):
    """
    List the predecessors of a node that are dated no later than the head node.

    arguments:
    node -- node whose predecessors are listed
    head -- node whose 'date' attribute is the (inclusive) cut-off
    G -- graph containing the nodes

    returns:
    list of the predecessors of node whose 'date' is <= the 'date' of head
    """
    cutoff = G.node[head]['date']
    scoped = []
    for candidate in G.predecessors(node):
        if G.node[candidate]['date'] <= cutoff:
            scoped.append(candidate)
    return scoped

def scoped_neighborhood(node, head_node, G):
    """
    Return the neighborhood of a node at a given timestamp

    The neighborhood is the union of the scoped predecessors and the scoped successors, so the direction
    of the links is ignored and every neighbor appears once.

    arguments:
    node -- node to find the neighborhood for
    head_node -- node containing the timestamp to filter the nodes by
    G -- directed graph containing the node

    returns:
    list of distinct neighbors of node dated on or before head_node (in no particular order)
    """
    # The date lookup is done by the two helpers; no separate lookup is needed here.
    pre = predecessors_scoped(node, head_node, G)
    suc = successors_scoped(node, head_node, G)
    return list(set(pre).union(set(suc)))
    
def scoped_degree(node, head_node, G):
    """
    Count the neighbors of a node that are dated no later than the head node.

    NOTE(review): the count is taken over G.neighbors(node); on a networkx DiGraph that presumably
    yields successors only, which would make this an out-degree -- confirm this is intended.

    arguments:
    node -- node to return the degree for
    head_node -- node whose 'date' attribute is the (inclusive) cut-off
    G -- graph containing the nodes

    returns:
    number of entries of G.neighbors(node) whose 'date' is <= the 'date' of head_node
    """
    cutoff = G.node[head_node]['date']
    return sum(1 for other in G.neighbors(node) if G.node[other]['date'] <= cutoff)

def get_common_neighbors(x,y,G):
    """
    Placeholder: not implemented, always returns None.

    TODO: either implement this or remove it; the common_neighbors function computes the
    common-neighbor score itself.
    """
    return None

def get_common_referrers(x, y, G):
    """
    Return the nodes that refer to y and also refer to at least one node that x refers to.

    All neighborhoods are scoped to the date of x.

    arguments:
    x -- source node of the candidate link
    y -- target node of the candidate link
    G -- digraph containing both nodes

    returns:
    set of nodes (never containing x itself)
    """
    co_referrers = set()
    for cited in successors_scoped(x, x, G):
        co_referrers.update(predecessors_scoped(cited, x, G))
    # x trivially refers to its own successors. Remove the node itself; `set(x)` would instead
    # build a set of the characters of a string id (or fail for a non-iterable id).
    co_referrers.discard(x)
    referrers_to_target = set(predecessors_scoped(y, x, G))
    return co_referrers & referrers_to_target

    
def common_referrers(validation_set, G):
    """
    Score each node pair (x, y) by the number of nodes that refer to y and also refer to at least one
    node that x refers to, i.e. |pred(y) intersection pred(succ(x))| with x itself excluded.
    All neighborhoods are scoped to the date of x.

    arguments:
    validation_set -- list of edges (x, y, attribute dict) to score
    G -- digraph containing the nodes in the edges of the validation set

    returns:
    list of the same edges with the score stored under 'score' in their attribute dict
    (the dicts are modified in place)
    """
    results = []

    for non_edge in validation_set:
        x = non_edge[0]
        y = non_edge[1]
        co_referrers = set()
        for cited in successors_scoped(x, x, G):
            co_referrers.update(predecessors_scoped(cited, x, G))
        # Remove the node itself; `set(x)` would instead build a set of the characters of a
        # string id (or fail for a non-iterable id).
        co_referrers.discard(x)
        referrers_to_target = set(predecessors_scoped(y, x, G))
        non_edge[2]['score'] = len(co_referrers & referrers_to_target)
        results.append(non_edge)

    return results

def common_neighbors(validation_set, G):
    """
    Score each edge by the number of neighbors its two endpoints share.

    Both neighborhoods are scoped to the date of the first endpoint.

    arguments:
    validation_set -- list of edges (x, y, attribute dict) to score
    G -- digraph containing the nodes in the edges of the validation set

    returns:
    list of the same edges with the score stored under 'score' in their attribute dict
    (the dicts are modified in place)
    """
    scored = []
    for candidate in validation_set:
        x, y = candidate[0], candidate[1]
        around_x = set(scoped_neighborhood(x, x, G))
        around_y = set(scoped_neighborhood(y, x, G))
        candidate[2]['score'] = len(around_x & around_y)
        scored.append(candidate)
    return scored

def triadic_closeness(validation_set, G):
    """
    Score each edge with the triadic closeness measure.

    For an edge (x, y), every common neighbor z (neighborhoods scoped to the date of x) contributes
    (F(t + 10) + F(t + 30)) / F(t), where t is the label get_triad gives to (x, z, y) and F is the
    census returned by triadic_distribution(G). When t + 30 is not a census label, F(t + 30) is taken
    as 0.0 and a message is printed.

    arguments:
    validation_set -- list of edges (x, y, attribute dict) to score
    G -- digraph containing the nodes in the edges of the validation set

    returns:
    list of the same edges with the summed score stored under 'score' in their attribute dict
    (the dicts are modified in place)
    """
    census = triadic_distribution(G)
    scored = []
    for candidate in validation_set:
        x, y = candidate[0], candidate[1]
        shared = set(scoped_neighborhood(x, x, G)) & set(scoped_neighborhood(y, x, G))
        contributions = []
        for z in shared:
            triad = get_triad(x, z, y, G)
            if (triad + 30) in census:
                F2 = census[triad + 30]
            else:
                # Happens when x and y are already linked in G, so the label is above 9.
                F2 = 0.0
                print("Reciprocal link found, data is being weird")
            F1 = census[triad + 10]
            # NOTE(review): F1 is a census count, not a triad label; this diagnostic looks like it
            # was meant to test `triad` -- confirm before relying on the message.
            if 20 <= F1 <= 30:
                print("Score between 20-30, multiple edges between: {} and {}".format(x,y))

            contributions.append((1.0*F1 + F2)/census[triad])
        candidate[2]['score'] = sum(contributions)
        scored.append(candidate)
    return scored

def triadic_distribution(G):
    """
    Count how often each triad configuration occurs in a graph.

    Every ordered path u - z - v (links taken in either direction, v != u) is classified with
    get_triad and tallied, so both the open configurations (labels 1-9) and the ones where u and v
    are linked (labels 11-19, 21-29, 31-39) are counted.
    The labels follow the paper by Schall, doi:10.1007/s13278-014-0157-9

    arguments:
    G -- directed graph

    returns:
    Dict mapping each integer label to its count
    """

    # Integer labels as presented in the paper: 1-9, 11-19, 21-29 and 31-39
    labels = [base + offset for base in (0, 10, 20, 30) for offset in range(1, 10)]
    census = dict.fromkeys(labels, 0)
    for u in G:
        around_u = set(G.successors(u)) | set(G.predecessors(u))
        for z in around_u:
            # u itself is excluded so that a path never returns to its starting node
            around_z = (set(G.successors(z)) | set(G.predecessors(z))) - {u}
            for v in around_z:
                census[get_triad(u, z, v, G)] += 1
    return census


# Label offsets (1-9) of the path u - z - v, keyed by
# ((u -> z exists, z -> u exists), (z -> v exists, v -> z exists)).
_TRIAD_PATH_OFFSETS = {
    ((True, True), (True, True)): 1,
    ((True, True), (True, False)): 2,
    ((True, False), (True, True)): 3,
    ((True, False), (True, False)): 4,
    ((True, True), (False, True)): 5,
    ((True, False), (False, True)): 6,
    ((False, True), (True, True)): 7,
    ((False, True), (False, True)): 8,
    ((False, True), (True, False)): 9,
}

# Link patterns with one side of the path missing that are reported as "Error in finding triad";
# every other unlabelled pattern is reported as "No triad found".
_TRIAD_ERROR_SHAPES = frozenset([
    ((False, True), (False, False)),
    ((False, False), (False, True)),
    ((True, False), (False, False)),
])


def get_triad(u,z,v,G):
    """
    Return the integer label of the triad formed by the nodes u, z and v.

    The label is the sum of an offset for the direct link between u and v (0 none, 10 u -> v only,
    20 v -> u only, 30 both directions) and an offset 1-9 for the directions of the links
    u - z and z - v.

    parameters:

    u -- starting node
    z -- connecting node
    v -- ending node
    G -- DiGraph containing the nodes (G[n] must support membership tests for the successors of n)

    returns:
    Integer triad label

    raises:
    Exception -- when u - z - v is not a path, i.e. z is not linked to both u and v
    """

    u_out = G[u]
    v_out = G[v]
    z_out = G[z]

    # Offset contributed by the direct u - v link
    u_to_v = v in u_out
    v_to_u = u in v_out
    if u_to_v and v_to_u:
        base = 30
    elif v_to_u:
        base = 20
    elif u_to_v:
        base = 10
    else:
        base = 0

    # Directions of the two links that make up the path through z
    shape = ((z in u_out, u in z_out), (v in z_out, z in v_out))
    offset = _TRIAD_PATH_OFFSETS.get(shape)
    if offset is not None:
        return base + offset
    if shape in _TRIAD_ERROR_SHAPES:
        raise Exception("Error in finding triad")
    raise Exception("No triad found")
        

def get_closed_triads(x, y, G):
    """
    Return the closed triads generated by adding a link from x to y.
    Triads are named as in the triadic_census algorithm of NetworkX based on
    http://vlado.fmf.uni-lj.si/pub/networks/doc/triads/triads.pdf

    Only the '030C' and '030T' entries are ever incremented; the remaining names stay at 0.

    arguments:
    x -- source node of the link to be added
    y -- target node of the link to be added
    G -- DiGraph containing both nodes

    returns:
    Dict mapping every triad name to the number of triads of that type the new link would close
    """

    # Taken directly from nx.triadic_census source
    TRIAD_NAMES = ('003', '012', '102', '021D', '021U', '021C', '111D', '111U',
                   '030T', '030C', '201', '120D', '120U', '120C', '210', '300')
    census = {name: 0 for name in TRIAD_NAMES}

    x_in = set(G.predecessors(x))
    y_in = set(G.predecessors(y))
    x_out = set(G.successors(x))
    y_out = set(G.successors(y))

    # y refers to a node that refers to x: the new link closes a cycle
    census['030C'] += len(x_in & y_out)
    # y is being referred to by a node that refers to x
    census['030T'] += len(x_in & y_in)
    # x refers to a node that refers to y
    census['030T'] += len(x_out & y_in)
    # x refers to a node that y refers to
    census['030T'] += len(x_out & y_out)

    # The census was previously built and then discarded; hand it back to the caller.
    return census
            
                
def valid_random_non_edges(graph, n):
    """
    Returns randomized, non-existent links between nodes in the graph that are guaranteed to observe causality.

    Each pair (u, v) is drawn so that v comes strictly after u when the nodes are ordered from newest
    to oldest by their 'date' attribute, and so that u and v are not linked in either direction.
    A node is never paired with itself. The same pair may be returned more than once.

    Parameters
    ----------
    graph : NetworkX graph.
        Graph to find non-existent edges. Every node needs a 'date' attribute.
    n : integer
        Number of non-existent edges to find

    Returns
    -------
    non_edges : list
        List of n (u, v) tuples that are not edges in the graph.

    Raises
    ------
    ValueError
        If every node of the graph has been tried and none of them has a valid partner.
    """
    # Nodes ordered newest -> oldest; everything after a node's position is the same age or older.
    nodes_by_age = [node for node, data in
                    sorted(graph.nodes(data=True), key=lambda item: item[1]['date'], reverse=True)]
    position = {node: index for index, node in enumerate(nodes_by_age)}
    all_nodes = list(graph.nodes())
    directed = graph.is_directed()

    # Draw positions rather than nodes so the candidates stay plain node objects.
    candidates = [nodes_by_age[index]
                  for index in np.random.choice(len(nodes_by_age), n, replace=True)]

    result_pairs = []
    dead_ends = set()
    i = 0
    while i < len(candidates):
        u = candidates[i]
        i += 1
        if directed:
            neighbors = set(graph.successors(u)) | set(graph.predecessors(u))
        else:
            neighbors = set(graph.neighbors(u))
        # Start after u's own position: this respects causality and keeps u from being paired
        # with itself.
        non_neighbors = [v for v in nodes_by_age[position[u] + 1:] if v not in neighbors]
        if non_neighbors:
            result_pairs.append((u, random.choice(non_neighbors)))
            continue
        # u (e.g. the oldest node) has no valid partner, so queue a replacement candidate.
        dead_ends.add(u)
        if len(dead_ends) == len(all_nodes):
            # Without this guard the loop would keep queueing replacements forever.
            raise ValueError("no causality-respecting non-edge exists in the graph")
        candidates.append(random.choice(all_nodes))
    return result_pairs
        

def k_fold_validate(G, k, fun, **kwargs):
    """
    K-fold validation of some specified function

    The edges of G are shuffled and cut into k folds of len(edges) // k edges each (left-over edges
    are not used). For every fold the scoring function receives the held-out edges mixed with the
    same number of random non-edges, together with a copy of G from which the held-out edges have
    been removed.

    arguments:
    G -- Graph to perform the function on; every node needs a 'date' attribute
    k -- number of folds (positive integer)
    fun -- function to be evaluated, called as fun(validation_set, G_train, **kwargs)
    kwargs -- arguments to be passed to the evaluated function

    return:
    List of lists of scored predictions, one list per fold

    raises:
    ValueError -- if k is smaller than 1
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    edges = list(G.edges(data=True))
    random.shuffle(edges)

    # Number of true members in each validation set (floor division on purpose)
    N = len(edges) // k

    results = []
    for fold in range(k):
        held_out = edges[fold * N:(fold + 1) * N]
        # Score copies of the attribute dicts: the originals belong to G, so writing scores into
        # them would alter the graph and overwrite the scores of earlier runs.
        true_validation_set = [(u, v, dict(data)) for u, v, data in held_out]

        # Create a training set and remove all true members of the validation set from it
        G_train = G.copy()
        G_train.remove_edges_from(held_out)

        # Fetch random causality-respecting non-edges of the full graph and date them after
        # their source node
        false_validation_set = [
            (u, v, {'date': G.node[u]['date']})
            for u, v in valid_random_non_edges(G, len(held_out))
        ]

        validation_set = true_validation_set + false_validation_set
        # If not shuffled, subsequent sorting algorithms will always rank true edges higher than
        # false edges when they have the same score
        random.shuffle(validation_set)
        results.append(fun(validation_set, G_train, **kwargs))
    return results

def precision(G, results, L):
    """
    Compute the precision at L: the share of the L highest scored predictions that are edges of G.

    The count of true positives is always divided by L, even when a result list holds fewer than
    L predictions.

    arguments:
    G -- graph the results are based on
    results -- list of lists of scored predictions (x, y, dict with a 'score' key)
    L -- number of results to be considered

    return:
    List of precisions for each set of results
    """
    # Rank every result list by descending score (ties keep their input order)
    ranked_folds = []
    for fold in results:
        ranked_folds.append(sorted(fold, key=lambda item: item[2]['score'], reverse=True))

    known_edges = set(G.edges())
    precisions = []
    for ranked in ranked_folds:
        # True positives are the top-L predictions that exist in the edge set
        hits = 0
        for candidate in ranked[:L]:
            if (candidate[0], candidate[1]) in known_edges:
                hits += 1
        precisions.append(1.0 * hits / L)
    return precisions

def AUC(G, results):
    """
    Estimate the AUC of each result set by pairwise comparison.

    The scored edges and non-edges of a result set are shuffled and paired up one to one. With n
    pairs, n' pairs where the edge has the higher score and n'' pairs with equal scores, the AUC is
    (n' + 0.5 * n'') / n. When the two groups differ in size, n is the size of the smaller group.

    arguments:
    G -- graph the results are based on
    results -- list of lists of scored predictions (x, y, dict with a 'score' key)

    return:
    List of AUC values, one per result set; a result set without a single edge / non-edge pair
    yields float('nan')
    """

    edge_set = set(G.edges())
    auc_values = []
    for result_set in results:
        true_scores = []
        false_scores = []
        for (x, y, data) in result_set:
            if (x, y) in edge_set:
                true_scores.append(data['score'])
            else:
                false_scores.append(data['score'])

        random.shuffle(true_scores)
        random.shuffle(false_scores)
        # Only compare as many pairs as both groups can supply
        n = min(len(true_scores), len(false_scores))
        if n == 0:
            # The AUC is undefined without at least one pair
            auc_values.append(float('nan'))
            continue
        n_better = 0.0
        n_same = 0.0
        for true_score, false_score in zip(true_scores[:n], false_scores[:n]):
            if true_score > false_score:
                n_better += 1.0
            elif true_score == false_score:
                n_same += 1.0
        auc_values.append((n_better + 0.5*n_same)/n)
    return auc_values
        

In [165]:
%time tc_resutls = k_fold_validate(GCC, 5, triadic_closeness)

Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal link found, data is being weird
Reciprocal 

In [167]:
# Triadic closeness: precision at L=50 and AUC, averaged over the 5 folds
print("TC:")
print(sum(precision(GCC, tc_resutls, 50)) / 5)
print(sum(AUC(GCC, tc_resutls)) / 5)

TC:
0.988
0.84874622828


To check whether my implementation of the triadic census is correct, I'll verify it with the graph presented by Schall.

In [155]:
# Small hand-built directed graph used to check triadic_distribution
H = nx.DiGraph()
H.add_nodes_from(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
H.add_edges_from([
    ('a', 'b'), ('a', 'c'), ('a', 'd'),
    ('b', 'a'), ('b', 'c'),
    ('c', 'a'), ('c', 'b'), ('c', 'd'),
    ('d', 'e'), ('d', 'f'),
    ('e', 'f'),
    ('f', 'd'), ('f', 'e'), ('f', 'g'), ('f', 'h'),
    ('g', 'f'),
    ('h', 'f'),
])
dist = triadic_distribution(H)
# Show only the labels that occur, most frequent first
filter(lambda label_count: label_count[1] != 0,
       sorted(dist.items(), key=lambda label_count: label_count[1], reverse=True))

[(1, 10),
 (31, 6),
 (2, 2),
 (3, 2),
 (4, 2),
 (5, 2),
 (7, 2),
 (8, 2),
 (12, 2),
 (27, 2),
 (36, 2),
 (11, 1),
 (21, 1),
 (32, 1),
 (33, 1),
 (35, 1),
 (37, 1)]

Seems like it matches!

In [148]:
%time dist = triadic_distribution(GCC)

Wall time: 36.3 s


In [166]:
tc_resutls

[[('10268/03',
   '33492/96',
   {'date': datetime.date(2005, 10, 4), 'score': 0.020110812982609735}),
  ('4451/70',
   '2832/66',
   {'date': datetime.date(1975, 2, 21), 'score': 0.30550061542971657}),
  ('65040/01', '10862/84', {'date': datetime.date(2007, 10, 23), 'score': 0}),
  ('60957/00', '61530/00', {'date': datetime.date(2005, 12, 22), 'score': 0}),
  ('50278/99',
   '25964/94',
   {'date': datetime.date(2006, 1, 17), 'score': 0.26682849672753806}),
  ('6222/10', '10468/04', {'date': datetime.date(2011, 12, 20), 'score': 0}),
  ('11598/85', '11932/86', {'date': datetime.date(1992, 2, 27), 'score': 0}),
  ('20197/03',
   '33958/96',
   {'date': datetime.date(2011, 6, 28), 'score': 0.1283865451181166}),
  ('68610/01',
   '43662/98',
   {'date': datetime.date(2010, 12, 14), 'score': 0.20882979704855556}),
  ('59857/00',
   '30210/96',
   {'date': datetime.date(2008, 4, 22), 'score': 0.2567730902362332}),
  ('54252/07', '21893/93', {'date': datetime.date(2009, 6, 16), 'score': 0})

In [150]:
%time imp = nx.triadic_census(GCC)

Wall time: 38 s


In [160]:
%pdb

Automatic pdb calling has been turned OFF


In [115]:
# Scratch cell: pick one concrete path u - z - v from H and expose the link sets of its nodes as
# notebook globals for manual inspection.
# The picks rely on the order in which H lists its nodes and neighbors.
u = H.nodes()[1]
z = H.neighbors(u)[2]
v = H.neighbors(z)[0]
# Predecessor and successor sets of the three nodes
u_in = set(H.predecessors(u))
v_in = set(H.predecessors(v))
z_in = set(H.predecessors(z))
u_out = set(H.successors(u))
v_out = set(H.successors(v))
z_out = set(H.successors(z))
# NOTE: this rebinds the built-in name `id` in the notebook namespace
id = 0
print u, z, v

c d e


In [146]:
triadic_distribution(H)

{1: 10,
 2: 2,
 3: 2,
 4: 2,
 5: 2,
 6: 0,
 7: 2,
 8: 2,
 9: 0,
 11: 1,
 12: 2,
 13: 0,
 14: 0,
 15: 0,
 16: 0,
 17: 0,
 18: 0,
 19: 0,
 21: 1,
 22: 0,
 23: 0,
 24: 0,
 25: 0,
 26: 0,
 27: 2,
 28: 0,
 29: 0,
 31: 6,
 32: 1,
 33: 1,
 34: 0,
 35: 1,
 36: 2,
 37: 1,
 38: 0,
 39: 0}

In [145]:
%timeit triadic_distribution(H)

10000 loops, best of 3: 179 µs per loop


In [141]:
%timeit nx.triadic_census(H)

10000 loops, best of 3: 86 µs per loop


In [81]:
z in v_out and v not in z_out

False

In [89]:
get_triad('a', 'd', 'c', H)

36

In [168]:
%time cr_results = k_fold_validate(GCC, 5, common_referrers)

Wall time: 3min 57s


In [169]:
# Common referrers: precision at L=50 and AUC, averaged over the 5 folds
print("CR:")
print(sum(precision(GCC, cr_results, 50)) / 5)
print(sum(AUC(GCC, cr_results)) / 5)

CR:
0.996
0.893340963479


In [170]:
%time cn_results = k_fold_validate(GCC, 5, common_neighbors)

Wall time: 2min 25s


In [171]:
# Common neighbors: precision at L=50 and AUC, averaged over the 5 folds.
# The label is "CN"; it previously read "CR", which made this output indistinguishable from the
# common referrers report.
print("CN:")
print(sum(precision(GCC, cn_results, 50)) / 5)
print(sum(AUC(GCC, cn_results)) / 5)

CR:
0.976
0.846592446155


In [None]:
%time nx.triadic_census(GCC)

In [None]:
# Scratch cell: count reciprocal links in GCC.
# c -- number of (node, n) combinations where n is listed by GCC.neighbors(node) and node is
#      listed by GCC.neighbors(n)
# f -- number of those combinations whose two nodes carry different 'date' attributes
# Nothing is displayed; inspect c and f in a later cell.
c = 0
f = 0
for node in GCC.nodes_iter():
    for n in GCC.neighbors(node):
        nei = GCC.neighbors(n)
        if node in nei:
            c+= 1
            if GCC.node[node]['date'] != GCC.node[n]['date']:
                f+=1


In [None]:
n1 = GCC.nodes()[1234]
n2 = GCC.nodes()[4321]

In [None]:
s = set(GCC.predecessors(n1)) | set(GCC.successors(n1)) | set(GCC.predecessors(n2)) | set(GCC.successors(n2)) | set([n1, n2])

In [None]:
n1

In [None]:
    # Scratch copy of the census initialisation: labels 1-9, 11-19 and 21-29 only (the 31-39 range
    # is not included here), each starting at 0.
    TRIAD_NAMES = range(1,10) + range(11, 20) + range(21,30)
    census = {name: 0 for name in TRIAD_NAMES}


In [None]:
census[1]

In [None]:
H.add_edge(n1,n2)

In [None]:
nx.triadic_census(H)

In [None]:
GCC[n1]

In [None]:
set(GCC.predecessors(n1)) | set(GCC.successors(n1))

In [None]:
set(H.predecessors(n1)) | set(H.successors(n1))

In [156]:
%pdb

Automatic pdb calling has been turned ON
