In [2]:
import random
import numpy as np
from importlib import reload
import matplotlib.pyplot as plt
import networkx as nx

# real dataset
data_set = nx.read_edgelist('data/sample-ch2017.txt')
no_node = nx.number_of_nodes(data_set)
print (nx.info(data_set))

Name: 
Type: Graph
Number of nodes: 31135
Number of edges: 642287
Average degree:  41.2582


In [3]:
# number of edges if the graph was complete : f(n) = (n*(n-1))/2
#cg = nx.complete_graph(no_node) # takes too long to execute thus hashed
#print(nx.number_of_nodes(cg))

no_edges_possible = (no_node * (no_node - 1)) / 2
no_edges_possible

484678545.0

In [4]:
# Local bridges (lb) (no common contacts)
lb = list(nx.local_bridges(data_set , with_span = False))
print('Number of Local bridges in 2017: ', len(list(nx.local_bridges(data_set, with_span = False))))

# To create a list of nodes that belong to local node. from [('a','b'), ('c','b')] to ['a','b','c','b']
new_lb = []
for tup in lb:
    for element in tup:
        new_lb.append(element)  
        
#nd is to remove duplicates
nodes_in_lb = list(set(new_lb))

Number of Local bridges in 2017:  96038


In [5]:
# to find second degree contacts of a single node

def find_nodes(graph, node, distance):

     # get all nodes within distance around the query node
     nodes = set(nx.ego_graph(graph, node, radius=distance))

     # remove nodes that are not **at** the specified distance but closer: removes all the first degree contact
     if distance > 1:
      nodes -= set(nx.ego_graph(graph, node, radius=distance-1))

     return list(nodes)

# example implementation
#ba = find_nodes(data_set, nodes_in_lb[4], 2) # reeturns a list of all the nodes tat are 2nd deg
#print((len(ba)))
#ba


In [6]:
# second degree contacts that pass through the LB of a single node
# two parameters that it takes are 1) the graph itself here data_set 2) one specific local bridge (pair of nodes)
# it returns : predictive edges between the [second degree node, the node that is the part of LB]  in that same order
# returned variable is a nested list with the order 2
def seconddegree (data_set, node_pair):
    b = node_pair[0]
    sdc_through_lb = []
    n = list(data_set.neighbors(node_pair[1]))
    pair =[]
    
    for a in range(len(n)):
        lis = []
        lis.append(n[a])
        lis.append(b)
        sdc_through_lb.append(lis)
    return sdc_through_lb

# example implementation
#vr = seconddegree(data_set, lb[0])
#print(vr)

In [7]:
# works !! to get all the second degree edges with their respective nodes ! 
# It applies the above mentioned function to all the pair of nodes that form a local bridge
# returns nested list of order 3 . 
# non existing edges ! predicted false edges
# second node in this return set (pe) is always the node that is the part of a local bridge
def allnodesseconddegree (data_set, lb):
    sdc = []
    for a in range(len(lb)):
        var = seconddegree(data_set, lb[a])
        sdc.append(var)
    return sdc

lb_second_degree_nodes = allnodesseconddegree(data_set, lb)
#lb_second_degree_nodes

In [8]:
# This works !!
#now for the last condition ! No other mutual friend apart from ***
# returns the nested list of all the edges(node pair) that have only one mutual friend that is a part of LB
interseclist = []
satisfy_hypothesis = []

for a in range(len(lb_second_degree_nodes)):
    for b in range(len(lb_second_degree_nodes[a])):
        cc = lb_second_degree_nodes[a]
        
        for c in range(len(cc[b])):
            dd = cc[b]
            
            if dd[0] != dd[1]:
                n1 = set(list(data_set.neighbors(dd[0])))
                n2 = set(list(data_set.neighbors(dd[1])))
                intersection = list(n1.intersection(n2))
                ls = []
                
                if len(intersection) == 1 :
                    ls.append(dd[0])
                    ls.append(dd[1])
                    satisfy_hypothesis.append(ls)

print(len(satisfy_hypothesis))

## Post processing clean up!! 

# remove exact replications : eg [a,b] & [a,b] then only one is kept 
sh_tuples = set(map(tuple, satisfy_hypothesis))  #need to convert the inner lists to tuples so they are hashable
no_duplication = list(map(list, sh_tuples))
print(len(no_duplication))

# works : if . [a,b] and [b, a] are in the list then only one either [a,b] or [b,a] is selected
temp = set(frozenset(x) for x in no_duplication)
lst = [list(x) for x in temp]
print(len(lst))

#to remove duplications: [a,a] should be discarded
all_hypothesis_edges = []
for a in range(len(lst)):
    temp1 = lst[a]
    list_holder = []
    if temp1[0] != temp1[1]:
        list_holder.append(temp1[0])
        list_holder.append(temp1[1])
        all_hypothesis_edges.append(list_holder)
print(len(all_hypothesis_edges))

# all_hypothesis_edges is the final list to use ! all_hypothesis_edges give a list of edges that are predicted to not get connected

7086940
3543470
3343196
3343196


In [9]:
# import 2018 dataset
data_set2 = nx.read_edgelist('data/sample-ch2018.txt')
node = nx.number_of_nodes(data_set2)
print (nx.info(data_set2))
fn = (node * (node - 1)) / 2
print('Number of edges that can still be established:', fn)

Name: 
Type: Graph
Number of nodes: 26276
Number of edges: 672589
Average degree:  51.1942
Number of edges that can still be established: 345200950.0


In [10]:
edges = list(data_set2.edges)

# straight nodes to match them all
into_tuples = []
for a in range(len(all_hypothesis_edges)):
    ls = all_hypothesis_edges[a]
    into_tuples.append(tuple(ls))
    

# straight list overrlap
#using sets to solve time consuming stuff
direct_overlap = list(set(into_tuples) & set(edges))
print(len(direct_overlap))
#print(direct_overlap)


517


In [11]:
# reversing the nodes to match them all
# this reverse command operates on the list itself :: inplace reverse operation!!

print((all_hypothesis_edges[7])) # before reversing the list
for a in range(len(all_hypothesis_edges)):
    all_hypothesis_edges[a].reverse()
print((all_hypothesis_edges[7]))

rev = []
for a in range(len(all_hypothesis_edges)):
    ls = all_hypothesis_edges[a]
    rev.append(tuple(ls))


# reversed list of lsst overlap
#using sets to solve time consuming stuff
reversed_overlap = list(set(rev) & set(edges))
print(len(reversed_overlap))
#print(reversed_overlap)


['24018344', '15563685']
['15563685', '24018344']
521


In [12]:
total_edges_overlap = direct_overlap + reversed_overlap
print(len(total_edges_overlap))
# exports
#np.savetxt("edges_that_overlap.csv", total_edges_overlap, delimiter=",", fmt='%s')

1038


In [14]:
# to check the extra edges that was created in 2018 and did not exist in 2017
edg2017 = list(data_set.edges)
edges2018 = list(data_set2.edges)

direct_intesection = list(set(edg2017) & set(edges2018)) # intersection between the two
print(len(direct_intesection))

# reverse checks
print(edg2017[7])

# to convert all the tuples to list so that they can then be reversed
l = []
for a in range(len(edg2017)):
    ls = edg2017[a]
    l.append(list(ls))

# reversing the edg2017 list
for a in range(len(l)):
    l[a].reverse()

# reverse checks
print("Reversed list:", l[7])

#converting the list back to tuples so that the intersectio can be found
lis =[]
for a in range(len(l)):
    ls = l[a]
    lis.append(tuple(ls))
    
print("Reversed list:",  lis[7])    
   
reversed_intersection = list(set(lis).intersection(set(edges2018)))
print(len(reversed_intersection))

#intesecction of 2017 and 2018 
total_intersection = reversed_intersection + direct_intesection
print('Edges intersection length is:', len(total_intersection))

332631
('7394358', '7845466')
Reversed list: ['7845466', '7394358']
Reversed list: ('7845466', '7394358')
290004
Edges intersection length is: 622635


In [15]:
# difference
difference = list((set(edges2018)).difference(set(total_intersection)))
print(len(difference))


49954
