# Random Predictor for algorithmic methods

In [1]:
import networkx as nx 
import random
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:
def features_directed(graph, actual_edges, multi=False):
    """Build the train/test labels of a directed link-prediction task.

    Every ordered pair ``(u, v)`` of distinct nodes is visited in the
    iteration order of ``graph.nodes``:

    * if ``v`` is reported by ``graph.neighbors(u)``, a positive label (1)
      is added to the training labels;
    * if the pair is a prediction candidate (always when ``multi`` is true,
      otherwise only when ``v`` is not a neighbor of ``u``) it becomes a
      positive test pair when it is listed in ``actual_edges``, and a
      negative (absent edge) otherwise.

    The negatives are shuffled with ``random.shuffle``; the first fifth goes
    to the test set and the rest to the training labels.

    Parameters
    ----------
    graph : object exposing ``nodes`` and ``neighbors(node)``
        The training network.
    actual_edges : iterable of [u, v]
        Held-out edges that have to be predicted.
    multi : bool
        True when an already linked pair may still be predicted.

    Returns
    -------
    (Y_train, Y_test, nodes_test)
        ``Y_train``: training labels, positives first then negatives.
        ``Y_test`` / ``nodes_test``: labels and ``[u, v]`` pairs of the test
        set, held-out edges first then the sampled negatives.
    """
    nodes = list(graph.nodes)
    # Hash-based lookup: the pair loop below is already quadratic in the
    # number of nodes, so a linear scan of the edge list per pair is costly.
    held_out = {tuple(edge) for edge in actual_edges}

    Y_train = []
    Y_test, nodes_test = [], []
    negatives = []

    for i, source in enumerate(nodes):
        reachable = set(graph.neighbors(source))
        for j, target in enumerate(nodes):
            if i == j:
                continue

            linked = target in reachable
            if linked:
                # the training set contains every edge present in the graph
                Y_train.append(1)

            if multi or not linked:
                if (source, target) in held_out:
                    # a removed edge: this is what has to be predicted
                    nodes_test.append([source, target])
                    Y_test.append(1)
                else:
                    negatives.append([source, target])

    # All negatives carry the label 0, so shuffling the pairs is enough.
    # An empty list is fine here (the former zip(*...) unpacking raised
    # ValueError when the graph had no absent edge at all).
    random.shuffle(negatives)
    n_test_negatives = len(negatives) // 5

    Y_train += [0] * (len(negatives) - n_test_negatives)
    Y_test += [0] * n_test_negatives
    nodes_test += negatives[:n_test_negatives]

    return Y_train, Y_test, nodes_test

In [3]:
def features_undirected(graph, actual_edges, multi=False):
    """Build the train/test labels of an undirected link-prediction task.

    Every unordered pair of distinct nodes is visited once, following the
    iteration order of ``graph.nodes``. Pairs are written ``[low, high]``
    with the smaller node first, which is the notation ``actual_edges`` is
    expected to use.

    * if the two nodes are neighbors, a positive label (1) is added to the
      training labels;
    * if the pair is a prediction candidate (always when ``multi`` is true,
      otherwise only when the nodes are not neighbors) it becomes a positive
      test pair when it is listed in ``actual_edges``, and a negative
      (absent edge) otherwise.

    The negatives are shuffled with ``random.shuffle``; the first fifth goes
    to the test set and the rest to the training labels.

    Parameters
    ----------
    graph : object exposing ``nodes`` and ``neighbors(node)``
        The training network.
    actual_edges : iterable of [low, high]
        Held-out edges that have to be predicted.
    multi : bool
        True when an already linked pair may still be predicted.

    Returns
    -------
    (Y_train, Y_test, nodes_test)
        ``Y_train``: training labels, positives first then negatives.
        ``Y_test`` / ``nodes_test``: labels and ``[low, high]`` pairs of the
        test set, held-out edges first then the sampled negatives.
    """
    nodes = list(graph.nodes)
    # Hash-based lookup instead of scanning the edge list for every pair.
    held_out = {tuple(edge) for edge in actual_edges}

    Y_train = []
    Y_test, nodes_test = [], []
    negatives = []

    for i, first in enumerate(nodes):
        adjacent = set(graph.neighbors(first))
        # Only nodes placed after `first`, so each pair is seen exactly once.
        for second in nodes[i + 1:]:
            linked = second in adjacent
            if linked:
                # the training set contains every edge present in the graph
                Y_train.append(1)

            if multi or not linked:
                low, high = min(first, second), max(first, second)
                if (low, high) in held_out:
                    # a removed edge: this is what has to be predicted
                    nodes_test.append([low, high])
                    Y_test.append(1)
                else:
                    negatives.append([low, high])

    # All negatives carry the label 0, so shuffling the pairs is enough.
    # An empty list is fine here (the former zip(*...) unpacking raised
    # ValueError when the graph had no absent edge at all).
    random.shuffle(negatives)
    n_test_negatives = len(negatives) // 5

    Y_train += [0] * (len(negatives) - n_test_negatives)
    Y_test += [0] * n_test_negatives
    nodes_test += negatives[:n_test_negatives]

    return Y_train, Y_test, nodes_test

In [4]:
def random_prediction_undirected(graph, nodes_test, k, multi=False):
    """Draw ``k`` distinct random undirected pairs as predicted links.

    Parameters
    ----------
    graph : object exposing ``nodes``
        Network whose nodes are drawn from when ``multi`` is true.
    nodes_test : list of [low, high]
        Candidate pairs used when ``multi`` is false. The list is shuffled
        in place, as it always was.
    k : int
        Number of pairs to return.
    multi : bool
        When true, any pair of nodes may be predicted (even one that is
        already linked); each pair is written ``[low, high]``. The two
        endpoints are drawn independently, so a pair can join a node to
        itself. When false, pairs are drawn from ``nodes_test`` only.

    Returns
    -------
    list
        ``k`` pairs without repetition, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of distinct pairs that can be
        drawn. The rejection loop would otherwise never terminate.
    """
    nodes = list(graph.nodes)
    n = len(nodes)

    if multi:
        available = n * (n + 1) // 2  # unordered pairs, equal endpoints included
    else:
        available = len({tuple(pair) for pair in nodes_test})
    if k > available:
        raise ValueError(
            "cannot draw %d distinct pairs: only %d available" % (k, available)
        )

    links = []
    drawn = set()  # constant-time "already predicted" check
    random.shuffle(nodes)  # so that the prediction is not defined by the order of the nodes
    random.shuffle(nodes_test)

    if multi:
        while len(links) < k:
            i = random.randint(0, n - 1)
            j = random.randint(0, n - 1)
            pair = (min(nodes[i], nodes[j]), max(nodes[i], nodes[j]))
            if pair not in drawn:
                # never predict the same pair twice
                drawn.add(pair)
                links.append(list(pair))
    else:
        while len(links) < k:
            candidate = nodes_test[random.randint(0, len(nodes_test) - 1)]
            key = tuple(candidate)
            if key not in drawn:
                # never predict the same pair twice
                drawn.add(key)
                links.append(candidate)

    return links

In [5]:
def random_prediction_directed(graph, nodes_test, k, multi=False):
    """Draw ``k`` distinct random directed pairs as predicted links.

    Parameters
    ----------
    graph : object exposing ``nodes``
        Network whose nodes are drawn from when ``multi`` is true.
    nodes_test : list of [source, target]
        Candidate pairs used when ``multi`` is false. The list is shuffled
        in place, as it always was.
    k : int
        Number of pairs to return.
    multi : bool
        When true, any ordered pair of nodes may be predicted (even one that
        is already linked). The two endpoints are drawn independently, so a
        pair can join a node to itself. When false, pairs are drawn from
        ``nodes_test`` only.

    Returns
    -------
    list
        ``k`` pairs without repetition, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of distinct pairs that can be
        drawn. The rejection loop would otherwise never terminate.
    """
    nodes = list(graph.nodes)
    n = len(nodes)

    if multi:
        available = n * n  # ordered pairs, equal endpoints included
    else:
        available = len({tuple(pair) for pair in nodes_test})
    if k > available:
        raise ValueError(
            "cannot draw %d distinct pairs: only %d available" % (k, available)
        )

    links = []
    drawn = set()  # constant-time "already predicted" check
    random.shuffle(nodes)  # so that the prediction is not defined by the order of the nodes
    random.shuffle(nodes_test)

    if multi:
        while len(links) < k:
            i = random.randint(0, n - 1)
            j = random.randint(0, n - 1)
            pair = (nodes[i], nodes[j])
            if pair not in drawn:
                # never predict the same pair twice
                drawn.add(pair)
                links.append(list(pair))
    else:
        while len(links) < k:
            candidate = nodes_test[random.randint(0, len(nodes_test) - 1)]
            key = tuple(candidate)
            if key not in drawn:
                # never predict the same pair twice
                drawn.add(key)
                links.append(candidate)

    return links

In [6]:
# function giving MAP for random predictor

def random_results(predictions, actual_edges, graph):
    """Average precision of an ordered list of predicted links.

    The list is read in order; each time a prediction is found in
    ``actual_edges`` the precision reached at that rank (hits so far divided
    by predictions read so far) is accumulated. The result is that sum
    divided by the number of hits.

    Parameters
    ----------
    predictions : sequence of [u, v]
        Predicted links, in prediction order.
    actual_edges : iterable of [u, v]
        The links that were to be found.
    graph :
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    float
        The average precision, or 0.0 when no prediction is correct.
    """
    # Hash-based lookup instead of scanning the edge list per prediction.
    relevant = {tuple(edge) for edge in actual_edges}

    hits = 0
    precision_sum = 0.0
    for rank, predicted in enumerate(predictions, start=1):
        if tuple(predicted) in relevant:
            hits += 1
            precision_sum += hits / rank

    if hits == 0:  # not even one good prediction
        return 0.0

    return precision_sum / hits

In [7]:
# 5-fold cross validation results for random predictor

def CV_random_results(list_graphs, list_targets, network_type='undirected',
                      multi=False, n_folds=5, n_runs=100):
    """Cross-validated mean average precision of the random predictor.

    For each of the first ``n_folds`` folds, the candidate pairs are built
    once, then ``n_runs`` independent random predictions are scored with
    ``random_results``. The returned value is the mean over all
    ``n_folds * n_runs`` scores.

    Parameters
    ----------
    list_graphs : sequence
        Training network of each fold.
    list_targets : sequence
        Held-out edges of each fold, aligned with ``list_graphs``.
    network_type : str
        ``'directed'`` selects the directed helpers; any other value selects
        the undirected ones.
    multi : bool
        Forwarded to the feature and prediction helpers.
    n_folds : int
        Number of folds to evaluate (default 5, the former fixed value).
    n_runs : int
        Random predictions drawn per fold (default 100, the former fixed
        value).

    Returns
    -------
    float
        Mean of the per-run average precisions.

    Raises
    ------
    ValueError
        If ``n_folds`` or ``n_runs`` is smaller than 1.
    """
    if n_folds < 1 or n_runs < 1:
        raise ValueError("n_folds and n_runs must both be at least 1")

    # NOTE(review): the number of predicted links is taken from the first
    # fold and reused for every fold, even when folds hold different numbers
    # of targets. Kept as is; confirm this is intended.
    test_size = len(list_targets[0])

    if network_type == 'directed':
        build_features = features_directed
        predict = random_prediction_directed
    else:
        build_features = features_undirected
        predict = random_prediction_undirected

    total = 0
    for part in range(n_folds):
        graph = list_graphs[part]
        targets = list_targets[part]
        # Only the candidate pairs are needed; the labels are discarded.
        _, _, nodes_test = build_features(graph, targets, multi)

        for _ in range(n_runs):
            predictions = predict(graph, nodes_test, test_size, multi)
            total += random_results(predictions, targets, graph)

    # The divisor follows the loop bounds instead of a separate constant.
    return total / (n_folds * n_runs)

### First dataset: PROTEINS 

In [8]:
# PROTEINS dataset
# http://konect.uni-koblenz.de/networks/maayan-Stelzl 

# One entry per line of the file. Each entry is the list produced by
# splitting the stripped line on ','; the cells below re-split its first
# element on spaces to get the two endpoints.
with open('out.maayan-Stelzl') as inputfile:
    PROT = [raw_line.strip().split(',') for raw_line in inputfile]
PROT = PROT[1:]       # the first line is skipped; what remains is the edge list
random.shuffle(PROT)  # random order, so that the folds below are random

# Sizes
num_edges_PROT = len(PROT)
num_vertices_PROT = 1706
test_size_PROT = int(num_edges_PROT/5)

# The 5 consecutive, equally sized parts of the shuffled edge list.
# Slices are clamped to the list, so no explicit end-of-list test is needed;
# when the edge count is not a multiple of 5 the trailing edges are in no part.
parts_PROT = [
    PROT[fold * test_size_PROT:(fold + 1) * test_size_PROT]
    for fold in range(5)
]

In [9]:
# Directed network holding every edge of the dataset; its node view is what
# the per-fold training graphs are seeded with.
PROT_G = nx.DiGraph()
for row in PROT:
    endpoints = row[0].split(' ')
    PROT_G.add_edge(int(endpoints[0]), int(endpoints[1]))
PROT_nodes = PROT_G.nodes()

In [10]:
# PROT_graphs[i]  : training network made of every part except part i
# PROT_targets[i] : the edges of part i, i.e. the links to predict

PROT_graphs = []
PROT_targets = []

for held_out in range(5):  # index of the part left out of the training graph
    G = nx.DiGraph()
    G.add_nodes_from(PROT_nodes)  # every node is present, with or without edges
    target_links = []

    for fold, rows in enumerate(parts_PROT):
        for row in rows:
            endpoints = row[0].split(' ')
            if fold == held_out:
                target_links.append([int(endpoints[0]), int(endpoints[1])])
            else:
                G.add_edge(int(endpoints[0]), int(endpoints[1]))

    PROT_graphs.append(G)
    PROT_targets.append(target_links)

In [11]:
# Cross-validated score of the random predictor on PROTEINS (directed helpers)
MAP_R = CV_random_results(
    list_graphs=PROT_graphs,
    list_targets=PROT_targets,
    network_type='directed',
)

In [12]:
MAP_R

0.006636022533431954

### Second dataset: INFECTIOUS

In [13]:
# INFECTIOUS dataset
# http://konect.uni-koblenz.de/networks/sociopatterns-infectious 

# The path uses a forward slash: the former backslash formed the invalid
# escape sequence '\o' (a warning on recent Python versions) and only
# resolved on Windows. Forward slashes are accepted on every platform.
#
# One entry per line of the file. Each entry is the list produced by
# splitting the stripped line on ','; the cells below re-split its first
# element on spaces to get the two endpoints.
with open('sociopatterns-infectious/out.sociopatterns-infectious') as inputfile:
    INF = [raw_line.strip().split(',') for raw_line in inputfile]
INF = INF[2:]        # the first two lines are skipped; what remains is the edge list
random.shuffle(INF)  # random order, so that the folds below are random

# Sizes
num_edges_INF = len(INF)
num_vertices_INF = 410
test_size_INF = int(num_edges_INF/5)

# The 5 consecutive, equally sized parts of the shuffled edge list.
# Slices are clamped to the list, so no explicit end-of-list test is needed;
# when the edge count is not a multiple of 5 the trailing edges are in no part.
parts_INF = [
    INF[fold * test_size_INF:(fold + 1) * test_size_INF]
    for fold in range(5)
]

In [14]:
# Undirected multigraph holding every edge of the dataset; its node view is
# what the per-fold training graphs are seeded with.
INF_G = nx.MultiGraph()
for row in INF:
    endpoints = row[0].split(' ')
    INF_G.add_edge(int(endpoints[0]), int(endpoints[1]))
INF_nodes = INF_G.nodes()

In [15]:
# INF_graphs[i]  : training network made of every part except part i
# INF_targets[i] : the distinct pairs of part i, i.e. the links to predict

INF_graphs = []
INF_targets = []

for held_out in range(5):  # index of the part left out of the training graph
    G = nx.MultiGraph()
    G.add_nodes_from(INF_nodes)  # every node is present, with or without edges
    target_links = []
    seen_targets = set()  # pairs already stored in target_links

    for fold, rows in enumerate(parts_INF):
        for row in rows:
            endpoints = row[0].split(' ')
            if fold != held_out:
                G.add_edge(int(endpoints[0]), int(endpoints[1]))
                continue

            # Undirected notation: smaller node first. A pair is kept only
            # once, because the number of parallel edges is not predicted.
            a, b = int(endpoints[0]), int(endpoints[1])
            pair = (min(a, b), max(a, b))
            if pair not in seen_targets:
                seen_targets.add(pair)
                target_links.append(list(pair))

    INF_graphs.append(G)
    INF_targets.append(target_links)

In [16]:
# Cross-validated score of the random predictor on INFECTIOUS
# (undirected helpers, multigraph mode)
MAP_INF_R = CV_random_results(
    list_graphs=INF_graphs,
    list_targets=INF_targets,
    network_type='undirected',
    multi=True,
)

In [17]:
MAP_INF_R

0.020540572978007626

### Third dataset: ADOLESCENT

In [18]:
# ADOLESCENT dataset
# http://konect.uni-koblenz.de/networks/moreno_health

# The path uses a forward slash: the former backslash formed the invalid
# escape sequence '\o' (a warning on recent Python versions) and only
# resolved on Windows. Forward slashes are accepted on every platform.
#
# One entry per line of the file. Each entry is the list produced by
# splitting the stripped line on ','; the cells below re-split its first
# element on spaces to get the endpoints and the weight.
with open('moreno_health/out.moreno_health_health') as inputfile:
    ADO = [raw_line.strip().split(',') for raw_line in inputfile]
ADO = ADO[2:]        # the first two lines are skipped; what remains is the edge list
random.shuffle(ADO)  # random order, so that the folds below are random

# Sizes
num_edges_ADO = len(ADO)
num_vertices_ADO = 2539
test_size_ADO = int(num_edges_ADO/5)

# The 5 consecutive, equally sized parts of the shuffled edge list.
# Slices are clamped to the list, so no explicit end-of-list test is needed;
# when the edge count is not a multiple of 5 the trailing edges are in no part.
parts_ADO = [
    ADO[fold * test_size_ADO:(fold + 1) * test_size_ADO]
    for fold in range(5)
]

In [19]:
# Weighted directed network holding every edge of the dataset; its node view
# is what the per-fold training graphs are seeded with.
ADO_G = nx.DiGraph()
for row in ADO:
    fields = row[0].split(' ')  # source, target, weight
    ADO_G.add_edge(int(fields[0]), int(fields[1]), weight=float(fields[2]))
ADO_nodes = ADO_G.nodes()

In [20]:
# ADO_graphs[i]  : training network made of every part except part i
# ADO_targets[i] : the edges of part i, i.e. the links to predict

ADO_graphs = []
ADO_targets = []

for held_out in range(5):  # index of the part left out of the training graph
    G = nx.DiGraph()
    G.add_nodes_from(ADO_nodes)  # every node is present, with or without edges
    target_links = []

    for fold, rows in enumerate(parts_ADO):
        for row in rows:
            fields = row[0].split(' ')  # source, target, weight
            if fold == held_out:
                # the weight is not part of what has to be predicted
                target_links.append([int(fields[0]), int(fields[1])])
            else:
                G.add_edge(int(fields[0]), int(fields[1]), weight=float(fields[2]))

    ADO_graphs.append(G)
    ADO_targets.append(target_links)

In [21]:
# Cross-validated score of the random predictor on ADOLESCENT (directed helpers)
MAP_ADO_R = CV_random_results(
    list_graphs=ADO_graphs,
    list_targets=ADO_targets,
    network_type='directed',
)

In [22]:
MAP_ADO_R

0.004748975372132157

### Fourth dataset: MISERABLES

In [23]:
# MISERABLES dataset
# http://konect.uni-koblenz.de/networks/moreno_lesmis

# The path uses a forward slash: the former backslash formed the invalid
# escape sequence '\o' (a warning on recent Python versions) and only
# resolved on Windows. Forward slashes are accepted on every platform.
#
# One entry per line of the file. Each entry is the list produced by
# splitting the stripped line on ','; the cells below re-split its first
# element on spaces to get the endpoints and the weight.
with open('moreno_lesmis/out.moreno_lesmis_lesmis') as inputfile:
    MIS = [raw_line.strip().split(',') for raw_line in inputfile]
MIS = MIS[2:]        # the first two lines are skipped; what remains is the edge list
random.shuffle(MIS)  # random order, so that the folds below are random

# Sizes
num_edges_MIS = len(MIS)
num_vertices_MIS = 77
test_size_MIS = int(num_edges_MIS/5)

# The 5 consecutive, equally sized parts of the shuffled edge list.
# Slices are clamped to the list, so no explicit end-of-list test is needed;
# when the edge count is not a multiple of 5 the trailing edges are in no part.
parts_MIS = [
    MIS[fold * test_size_MIS:(fold + 1) * test_size_MIS]
    for fold in range(5)
]

In [24]:
# Weighted undirected network holding every edge of the dataset; its node
# view is what the per-fold training graphs are seeded with.
MIS_G = nx.Graph()
for row in MIS:
    fields = row[0].split(' ')  # first node, second node, weight
    MIS_G.add_edge(int(fields[0]), int(fields[1]), weight=float(fields[2]))
MIS_nodes = MIS_G.nodes()

In [25]:
# MIS_graphs[i]  : training network made of every part except part i
# MIS_targets[i] : the edges of part i, i.e. the links to predict

MIS_graphs = []
MIS_targets = []

for held_out in range(5):  # index of the part left out of the training graph
    G = nx.Graph()
    G.add_nodes_from(MIS_nodes)  # every node is present, with or without edges
    target_links = []

    for fold, rows in enumerate(parts_MIS):
        for row in rows:
            fields = row[0].split(' ')  # first node, second node, weight
            if fold == held_out:
                # Undirected notation: the smaller node is written first
                a, b = int(fields[0]), int(fields[1])
                target_links.append([min(a, b), max(a, b)])
            else:
                G.add_edge(int(fields[0]), int(fields[1]), weight=float(fields[2]))

    MIS_graphs.append(G)
    MIS_targets.append(target_links)

In [26]:
# Cross-validated score of the random predictor on MISERABLES
# (undirected helpers, simple graph: the function's defaults, spelled out)
MAP_MISS_R = CV_random_results(
    list_graphs=MIS_graphs,
    list_targets=MIS_targets,
    network_type='undirected',
    multi=False,
)

In [27]:
MAP_MISS_R

0.15298700158840625