In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import itertools
from scipy.optimize import curve_fit

random.seed(4)
# Reducing the size of the network

df_edges = pd.read_csv("com-Amazon.csv", delimiter = " ")
OG = nx.from_pandas_edgelist(df_edges, source="From", target="To", create_using= nx.DiGraph) #the original co-purchasing network
edges = random.sample(list(OG.nodes()), int(OG.number_of_nodes() * 0.26)) # will use only 25% of the graph
G=OG.subgraph(edges)
largest_cc = max(nx.weakly_connected_components(G), key=len) #find the largest weakly component
G1=G.subgraph(largest_cc) # the final graph with the largest weakly connected component
G2= G1.to_undirected()
G2_node_list= [] # add node that have the degree > 2
for node,degree in G1.degree():
    if degree > 2:
        G2_node_list.append(node)
G2= G1.subgraph(G2_node_list) #creating the graph base on the degree > 2
        
removed_edges = random.sample(list(G2.edges()), int(G2.number_of_edges() * 0.30)) #we will removed 20% of the edges
G_train = G2.copy()
G_train.remove_edges_from(removed_edges)
G_test = G2.copy()
#G_test.add_edges_from(removed_edges)

G_train=G_train.to_undirected()
print(G_train)


Graph with 5465 nodes and 4991 edges


In [None]:
# jacard
n_star= np.arange(10,110,10)

# Calculating for Jacard value
def JA_cal(graph): 
    jacard = nx.jaccard_coefficient(graph)
    jacard_pred = []
    for u, v, p in jacard:
        jacard_pred.append([u, v, p])
    jacard_pred = pd.DataFrame(jacard_pred, columns=['u', 'v', 'p']) 
    jacard_pred = jacard_pred.sort_values(by='p', ascending=False)
    pred= jacard_pred
    return pred

# Calculating for Adamic Adar value

def AA_cal(graph):
    adamic = nx.adamic_adar_index(graph)
    adamic_pred = []
    for u, v, p in adamic:
        adamic_pred.append([u, v, p])
    adamic_pred = pd.DataFrame(adamic_pred, columns=['u', 'v', 'p'])
    adamic_pred = adamic_pred.sort_values(by='p', ascending=False)
    pred= adamic_pred
    return pred

# Calculating for Preferential attachment value

def PA_cal(graph):
    pref = nx.preferential_attachment(graph)
    PA_prec= []
    for u, v, p in pref:
        PA_prec.append([u, v, p])
    PA_prec = pd.DataFrame(PA_prec, columns=['u', 'v', 'p'])
    PA_prec = PA_prec.sort_values(by='p', ascending=False)
    pred= PA_prec
    return pred

# Calculating for Resource Allocation value

def RA_cal(graph):
    res = nx.resource_allocation_index(graph)
    RA_prec= []
    for u, v, p in res:
        RA_prec.append([u, v, p])
    RA_prec = pd.DataFrame(RA_prec, columns=['u', 'v', 'p'])
    RA_prec = RA_prec.sort_values(by='p', ascending=False)
    pred= RA_prec
    return pred

# Check to see if all of the link prediction is accurately predicting the value
def pred_accuracy(predEdges,removed_edges,k):
    correctly_nodes = [value for value in predEdges if value in removed_edges]
    number_of_correct= len(correctly_nodes)
    rate= number_of_correct/k
    return (number_of_correct,rate)
    
def FinalValue(graph, link):
    Ks = np.arange(10,310,10) # starting from 10, start incrementing by 10 -> 10,20,30,...
    if link == "JA":
        pred= JA_cal(graph)
    elif link == 'AA':
        pred= AA_cal(graph)
    elif link == 'PA':
        pred= PA_cal(graph)
    elif link == 'RA':
        pred= RA_cal(graph)

    
    print("For {} ".format(link))
    Final_Value= []
    for k in Ks:     
        predEdges= []
        score= []
        for i in range(k):
            predEdges.append((int(pred.iloc[i]['u']),int(pred.iloc[i]['v'])))
            score.append(((pred.iloc[i]['p'])))
        result = pred_accuracy(predEdges,removed_edges,k)
        Final_Value.append(result)
        print("for {}, the number of correct edges being predicted {} and the rate of it is {}".format(k,result[0],result[1]))
    return Final_Value
FinalValue(G_train, link= "JA")
FinalValue(G_train, link= "AA")
FinalValue(G_train, link= "PA")
FinalValue(G_train, link= "RA")






For JA 
for 10, the number of correct edges being predicted 1 and the rate of it is 0.1
for 20, the number of correct edges being predicted 2 and the rate of it is 0.1
for 30, the number of correct edges being predicted 2 and the rate of it is 0.06666666666666667
for 40, the number of correct edges being predicted 3 and the rate of it is 0.075
for 50, the number of correct edges being predicted 4 and the rate of it is 0.08
for 60, the number of correct edges being predicted 7 and the rate of it is 0.11666666666666667
for 70, the number of correct edges being predicted 7 and the rate of it is 0.1
for 80, the number of correct edges being predicted 8 and the rate of it is 0.1
for 90, the number of correct edges being predicted 10 and the rate of it is 0.1111111111111111
for 100, the number of correct edges being predicted 11 and the rate of it is 0.11
for 110, the number of correct edges being predicted 11 and the rate of it is 0.1
for 120, the number of correct edges being predicted 12 

In [25]:
Ks = np.arange(5,205,5)
for k in Ks:
    print(k)

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200


In [None]:
n_star= np.arange(10,110,10)
adamic = nx.adamic_adar_index(G_train)
adamic_pred = []
for u, v, p in adamic:
    adamic_pred.append([u, v, p])

adamic_pred = pd.DataFrame(adamic_adar_index, columns=['u', 'v', 'adamic'])

for n in n_star:
    adamic_pred_n = adamic_pred.sort_values(by='adamic', ascending=False).head(n) #sorting 
    adamic_pred_new= []
    for i in range(len(adamic_pred_n)):
        if adamic_pred_n.iloc[i]['adamic'] >= 0.50: # number of predicion 
            comparing= (int(adamic_pred_n.iloc[i]['u']),int(adamic_pred_n.iloc[i]['v']),(adamic_pred_n.iloc[i]['adamic']))
            adamic_pred_new.append(comparing)

    
    for u, v, p in adamic_pred_new:
        success= 0
        failure= 0
        true=[]
        if ((u,v)) in removed_edges:
          true.append(p)
          print(u,v)
          success+=1
        else:
          failure+=1
    print(true)
    print("The number of success given the top {}* trial is {}".format(n,success))


In [10]:
# adamic adar

    adamic = nx.adamic_adar_index(graph)
    adamic_pred= []
    for u, v, p in adamic:
        adamic_pred.append([u,v,p])
    adamic_pred = pd.DataFrame(adamic_pred, columns=['u', 'v', 'adamic'])
    adamic_pred = adamic_pred.sort_values(by='adamic', ascending=False).head(n)
    adamic_pred['pred'] = 0
    adamic_pred.loc[adamic_pred['adamic'] > 0.15, 'pred'] = 1


    pred=[]
    for i in range(len(adamic_pred)): 
        if adamic_pred['pred'][i] == 1: # number of predicion 
            comparing= (int(adamic_pred.iloc[i]['u']),int(adamic_pred.iloc[i]['v']),(adamic_pred.iloc[i]['adamic']))
            pred.append(comparing)
    success=0
    failure=0
    true=[]
    for u, v, j in pred:
        if ((u,v)) in G_test.edges():
          true.append(j)
          success+=1
        else:
          failure+=1
    return success

adamic(G_train,10)

0