# Common Neighbour test

In [1]:
import networkx as nx 
import random

In [2]:
# Common neighbours
def CN(x, y, graph):
    """Return the common-neighbours score of nodes ``x`` and ``y``.

    Parameters
    ----------
    x, y : hashable
        Nodes of ``graph``.
    graph : object
        Any object exposing ``neighbors(node)`` that yields the nodes
        adjacent to ``node`` (for example a networkx graph).

    Returns
    -------
    int
        Number of distinct nodes adjacent to both ``x`` and ``y``.
    """
    # Materialise both neighbourhoods as sets once: testing membership
    # directly against ``graph.neighbors(y)`` rescans y's neighbours for
    # every neighbour of x.
    return len(set(graph.neighbors(x)) & set(graph.neighbors(y)))

In [3]:
# Prediction with k the number of links we want to predict
def prediction(graph, k):
    """Predict the ``k`` most likely missing links of ``graph``.

    Every pair of distinct nodes that is not already linked is scored with
    ``CN`` (number of common neighbours); the ``k`` best-scoring pairs are
    returned.

    Parameters
    ----------
    graph : object
        Graph exposing ``nodes`` (iterable of nodes) and ``neighbors(node)``.
    k : int
        Number of links to predict.

    Returns
    -------
    list of [u, v]
        Predicted links with ``u <= v``, ordered by decreasing score.
        Pairs with equal scores appear in random order.
    """
    nodes = list(graph.nodes)
    random.shuffle(nodes)  # so that the prediction is not defined by the order of the nodes
    n = len(nodes)

    scored_links = []  # (similarity, [u, v]) for every non-adjacent pair
    for i in range(n):
        u = nodes[i]
        adjacent = set(graph.neighbors(u))  # set => O(1) "already linked?" test
        for j in range(i + 1, n):
            v = nodes[j]
            if v not in adjacent:  # We don't want to predict edges already existing
                # in the notation of the dataset the first number is always
                # the smaller one of the vertices linked by an edge
                scored_links.append((CN(u, v, graph), [min(u, v), max(u, v)]))

    # Sort on the score only. The sort is stable (also with reverse=True), so
    # tied pairs keep the shuffled enumeration order. Sorting the whole
    # (score, link) tuples would instead break ties by node id and make the
    # shuffle above pointless.
    scored_links.sort(key=lambda scored: scored[0], reverse=True)
    return [link for _, link in scored_links[:k]]

In [4]:
# function giving all kind of results on performance
def results(predictions, actual_edges):
    """Evaluate a ranked list of predicted links against the true links.

    Parameters
    ----------
    predictions : sequence of edges
        Predicted links, best first. Each edge is a sequence of node ids.
    actual_edges : sequence of edges
        The links that should have been predicted, in the same notation.

    Returns
    -------
    tuple
        ``(precisions, MAP, precision, recall, F1)`` where ``precisions[i]``
        is the precision over the first ``i + 1`` predictions, ``MAP`` is the
        mean of the precisions taken at each relevant prediction,
        ``precision`` is the precision over all predictions, ``recall`` is
        the fraction of ``actual_edges`` retrieved and ``F1`` is their
        harmonic mean. Any metric whose denominator would be zero is
        reported as ``0.0`` instead of raising.
    """
    # Tuples are hashable, so membership is O(1) instead of a list scan.
    actual = {tuple(edge) for edge in actual_edges}
    total_actual = len(actual_edges)

    precisions = []
    precision_sum = 0.0
    relevant_retrieved = 0

    for retrieved, edge in enumerate(predictions, start=1):
        if tuple(edge) in actual:
            relevant_retrieved += 1
            precision_sum += relevant_retrieved / retrieved
        precisions.append(relevant_retrieved / retrieved)

    # Guard every division: no hit, no prediction or no target must not crash.
    MAP = precision_sum / relevant_retrieved if relevant_retrieved else 0.0
    precision = precisions[-1] if precisions else 0.0
    recall = relevant_retrieved / total_actual if total_actual else 0.0
    if precision + recall:
        F1 = 2 * recall * precision / (precision + recall)
    else:
        F1 = 0.0

    return (precisions, MAP, precision, recall, F1)

In [5]:
# Importation of the dataset: INF

# Each line of the file becomes a list of its comma-separated fields.
data = []
with open('INF_full.net') as inputfile:
    for line in inputfile:
        data.append(line.strip().split(','))

# We define the number of vertices in the network: it follows a 10-character
# prefix on the first line (presumably "*Vertices " -- confirm against the file).
num_vertices = int(data[0][0][10:])

# We have 3 + the number of vertices lines before the lines dealing with the edges
start_edges_INF = num_vertices + 3
INF = data[start_edges_INF:]  # List of all the edges
random.shuffle(INF)  # Edges in a random order
num_edges = len(INF)

# test size: nominal size of one part (also used as the number of links to predict)
test_size_INF = num_edges // 5

# Contains the 5 parts forming the whole dataset
test_INF = []

start = 0
for part in range(5):  # We create the 5 parts
    # The last part runs to the end of the list. Cutting it at
    # start + test_size_INF would silently drop the num_edges % 5 leftover
    # edges from every training and test set.
    end = num_edges if part == 4 else start + test_size_INF
    test_INF.append(INF[start:end])
    start = end


In [6]:
# Full INF network built from every edge of the dataset; it is only used to
# collect the complete node set shared by all training graphs.
INF_G = nx.MultiGraph()
for record in INF:
    # record[0] holds the edge as text; its first two space-separated fields are the endpoint ids
    endpoints = record[0].split(' ')
    INF_G.add_edge(int(endpoints[0]), int(endpoints[1]))
INF_nodes = INF_G.nodes()

In [7]:
# graphs holds the 5 training networks, one per held-out part
# targets holds, at the same index, the links of that held-out part

graphs = []
targets = []

for held_out in range(5):  # index of the part kept aside as test set
    training_graph = nx.MultiGraph()
    # Start from the full node set so that every training graph has the same nodes
    training_graph.add_nodes_from(INF_nodes)
    held_out_links = []

    for part_index in range(5):
        part = test_INF[part_index]
        if part_index == held_out:
            # The held-out part provides the links to be predicted
            for record in part:
                endpoints = record[0].split(' ')
                held_out_links.append([int(endpoints[0]), int(endpoints[1])])
        else:
            # Every other part contributes its edges to the training network
            for record in part:
                endpoints = record[0].split(' ')
                training_graph.add_edge(int(endpoints[0]), int(endpoints[1]))

    graphs.append(training_graph)
    targets.append(held_out_links)
    

In [8]:
# Precision, averaged over the training/target pairs
precision_total = 0
for training_graph, target_links in zip(graphs, targets):
    predicted_links = prediction(training_graph, test_size_INF)
    # results() returns (precisions, MAP, precision, recall, F1); only the precision is needed
    _, _, fold_precision, _, _ = results(predicted_links, target_links)
    precision_total = precision_total + fold_precision

print(precision_total/len(graphs))

0.37830018083182637


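Expected: 0.3484. Our result is close to the expected value (the exact figure varies between runs because the edges are shuffled randomly before being split).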

In [9]:
# Importation of the dataset: HMT

# Each line of the file becomes a list of its comma-separated fields.
data = []
with open('HMT_full.net') as inputfile:
    for line in inputfile:
        data.append(line.strip().split(','))

# We define the number of vertices in the network: it follows a 10-character
# prefix on the first line (presumably "*Vertices " -- confirm against the file).
num_vertices = int(data[0][0][10:])

# We have 3 + the number of vertices lines before the lines dealing with the edges
start_edges_HMT = num_vertices + 3
HMT = data[start_edges_HMT:]  # List of all the edges
random.shuffle(HMT)  # Edges in a random order
num_edges = len(HMT)

# test size: nominal size of one part (also used as the number of links to predict)
test_size = num_edges // 5

# Contains the 5 parts forming the whole dataset
test_HMT = []

start = 0
for part in range(5):  # We create the 5 parts
    # The last part runs to the end of the list. Cutting it at
    # start + test_size would silently drop the num_edges % 5 leftover
    # edges from every training and test set.
    end = num_edges if part == 4 else start + test_size
    test_HMT.append(HMT[start:end])
    start = end


In [10]:
# Full HMT network built from every edge of the dataset; it is only used to
# collect the complete node set shared by all training graphs.
HMT_G = nx.MultiGraph()
for record in HMT:
    # record[0] holds the edge as text; its first two space-separated fields are the endpoint ids
    endpoints = record[0].split(' ')
    HMT_G.add_edge(int(endpoints[0]), int(endpoints[1]))
HMT_nodes = HMT_G.nodes()

In [11]:
# graphs holds the 5 training networks, one per held-out part
# targets holds, at the same index, the links of that held-out part
graphs = []
targets = []

for held_out in range(5):  # index of the part kept aside as test set
    training_graph = nx.MultiGraph()
    # Start from the full node set so that every training graph has the same nodes
    training_graph.add_nodes_from(HMT_nodes)
    held_out_links = []

    for part_index in range(5):
        part = test_HMT[part_index]
        if part_index == held_out:
            # The held-out part provides the links to be predicted
            for record in part:
                endpoints = record[0].split(' ')
                held_out_links.append([int(endpoints[0]), int(endpoints[1])])
        else:
            # Every other part contributes its edges to the training network
            for record in part:
                endpoints = record[0].split(' ')
                training_graph.add_edge(int(endpoints[0]), int(endpoints[1]))

    graphs.append(training_graph)
    targets.append(held_out_links)
   

In [12]:
# Precision, averaged over the training/target pairs
precision_total = 0
for training_graph, target_links in zip(graphs, targets):
    predicted_links = prediction(training_graph, test_size)
    # results() returns (precisions, MAP, precision, recall, F1); only the precision is needed
    _, _, fold_precision, _, _ = results(predicted_links, target_links)
    precision_total = precision_total + fold_precision

print(precision_total/len(graphs))

0.2696933253156945


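Expected: 0.2453. Our result is close to the expected value (the exact figure varies between runs because the edges are shuffled randomly before being split).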