In [121]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from ge.classify import read_node_label, Classifier
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from ge import DeepWalk
import networkx as nx
from sklearn.manifold import TSNE
from ge import LINE
import itertools
from sklearn.model_selection import train_test_split

In [122]:
def get_proximity_score(G, edges,feature):
    jc_list = [[] for i in itertools.repeat(None, len(edges))]
    pa_list = [[] for i in itertools.repeat(None, len(edges))]
    adar_list = [[] for i in itertools.repeat(None, len(edges))]
    jc = nx.jaccard_coefficient(G, edges)
    pa = nx.preferential_attachment(G, edges)
    cn = [len(list(nx.common_neighbors(G, edge[0], edge[1]))) for edge in edges]
    for i, data in enumerate(jc):
        jc_list[i]=data[2]
    for i, data in enumerate(pa):
        pa_list[i]=data[2]
    
    if feature == 'jc':
        return jc_list
    elif feature == 'pa':
        return pa_list
    elif feature == 'cn':
        return list(cn)
    else:
        return jc_list,pa_list,list(cn) 
    return jc_list,pa_list,list(cn)

In [123]:
def get_accuracy(prediction, label):
    return sum(1 for x, y in zip(prediction, label) if x == y) / len(label)

In [124]:
ori_df=pd.read_csv('facebook_edges.csv', header=0)

In [125]:
ori_G = nx.from_pandas_edgelist(ori_df, 'id_1', 'id_2')

In [155]:
train_df=pd.read_csv('train_pos_data.csv',header=0)

In [156]:
train_df

Unnamed: 0,id_1,id_2
0,12924,21054
1,16427,2768
2,1969,2239
3,10187,17328
4,4382,6128
...,...,...
85496,365,13286
85497,1803,3862
85498,1387,19483
85499,3743,6741


In [157]:
train_G = nx.from_pandas_edgelist(train_df, 'id_1', 'id_2')

In [158]:
print(nx.info(train_G))

Name: 
Type: Graph
Number of nodes: 20214
Number of edges: 85501
Average degree:   8.4596


In [159]:
for i in range(len(list(ori_G.nodes))):
    train_G.add_node(list(ori_G.nodes)[i])

In [160]:
print(nx.info(train_G))

Name: 
Type: Graph
Number of nodes: 22470
Number of edges: 85501
Average degree:   7.6102


In [161]:
pos_edges = [(data.id_1, data.id_2) for data in train_df.itertuples()]

In [162]:
train_df['jc'],train_df['pa'],train_df['cn'] = get_proximity_score(train_G, pos_edges,'all')

In [163]:
train_df

Unnamed: 0,id_1,id_2,jc,pa,cn
0,12924,21054,0.000000,232,0
1,16427,2768,0.036585,616,3
2,1969,2239,0.062500,594,3
3,10187,17328,0.000000,162,0
4,4382,6128,0.170732,495,7
...,...,...,...,...,...
85496,365,13286,0.021277,215,1
85497,1803,3862,0.282609,814,13
85498,1387,19483,0.011236,3584,3
85499,3743,6741,0.238095,165,5


In [164]:
test_pos_df=pd.read_csv('test_pos_data.csv',header=0)
test_pos_edges=[(data.id_1, data.id_2) for data in test_pos_df.itertuples()]

In [165]:
test_pos_df['jc'],test_pos_df['pa'],test_pos_df['cn'] = get_proximity_score(train_G, test_pos_edges,'all')

In [166]:
file = open("negative_edges.p","rb")
negative_edges = pickle.load(file)
file.close()

In [167]:
df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])
neg_edges = list(negative_edges)

In [168]:
df_neg['jc'],df_neg['pa'],df_neg['cn'] = get_proximity_score(train_G, neg_edges,'all')

In [140]:
X_train_neg,X_test_neg = train_test_split(df_neg, test_size=0.5)

In [141]:
y_train_pos=np.ones(len(train_df))
y_train_neg=np.zeros(len(X_train_neg))
y_test_pos=np.ones(len(test_pos_df))
y_test_neg=np.zeros(len(X_test_neg))

In [142]:
X_train = np.concatenate((train_df,X_train_neg))
y_train = np.concatenate((y_train_pos,y_train_neg))
X_test = np.concatenate((test_pos_df,X_test_neg))
y_test = np.concatenate((y_test_pos,y_test_neg)) 

In [145]:
clf1 = RandomForestClassifier(n_estimators=400)
clf1.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [146]:
#jc,pa,cn
predict_Y = clf1.predict(X_test)
print(get_accuracy(predict_Y, y_test))

0.8566449515210348


In [154]:
#jc
predict_Y = clf1.predict(X_test)
print(get_accuracy(predict_Y, y_test))

0.8448205284148723


In [196]:
#pa
predict_Y = clf1.predict(X_test)
print(get_accuracy(predict_Y, y_test))

0.8041718810306312


In [221]:
#cn
predict_Y = clf1.predict(X_test)
print(get_accuracy(predict_Y, y_test))

0.8443702412837277


In [55]:
def n2v_embedding(train_G):  #https://github.com/eliorc/node2vec
    node2vec  = Node2Vec(train_G, dimensions=128, walk_length=80, num_walks=10, workers=4,p=0.25,q=0.25)
    model = node2vec .fit(window=10, min_count=1, batch_words=4)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    return edges_embs

In [56]:
edges_embs = n2v_embedding(train_G)

Computing transition probabilities: 100%|██████████████████████████████████████| 22470/22470 [00:15<00:00, 1441.17it/s]


In [57]:
pickle.dump(edges_embs ,open("edges_embs_paper2.p",'wb'))

In [147]:
file = open("edges_embs_paper2.p","rb")
embeddings = pickle.load(file)
file.close()

In [59]:
embeddings

<node2vec.edges.HadamardEmbedder at 0x1b01ba95700>

In [154]:
def n2v_combine_embedding(data):
    i=0
    X = []
    for edge in data:
        X.append(np.concatenate((data[i],embeddings[(str(int(edge[0])), str(int(edge[1])))])))
   # print(embeddings[str(int(data[0]))])
        i+=1
    return X

In [150]:
train_df=pd.read_csv('train_pos_data.csv',header=0)
test_pos_df=pd.read_csv('test_pos_data.csv',header=0)
df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])

In [169]:
train_df = train_df.values
test_pos_df=test_pos_df.values
df_neg=df_neg.values

In [170]:
train_df=n2v_combine_embedding(train_df)
test_pos_df=n2v_combine_embedding(test_pos_df)
df_neg=n2v_combine_embedding(df_neg)

In [176]:
X_train_neg,X_test_neg = train_test_split(df_neg, test_size=0.5)

In [177]:
y_train_pos=np.ones(len(train_df))
y_train_neg=np.zeros(len(X_train_neg))
y_test_pos=np.ones(len(test_pos_df))
y_test_neg=np.zeros(len(X_test_neg))

In [178]:
X_train = np.concatenate((train_df,X_train_neg))
y_train = np.concatenate((y_train_pos,y_train_neg))
X_test = np.concatenate((test_pos_df,X_test_neg))
y_test = np.concatenate((y_test_pos,y_test_neg)) 

In [179]:
clf2 = RandomForestClassifier(n_estimators=400)
clf2.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [68]:
#node2vec
predict_Y2 = clf2.predict(X_test)
print(get_accuracy(predict_Y2, y_test))

0.9115507420965837


In [180]:
#jc,pa,cn,node2vec
predict_Y2 = clf2.predict(X_test)
print(get_accuracy(predict_Y2, y_test))

0.9074513748377212


In [47]:
def combine_embedding(data,embeddings):
    i=0
    X = []
    for node1,node2 in data:
        X.append(np.concatenate((data[i],embeddings[int(node1)],embeddings[int(node2)])))
   # print(embeddings[str(int(data[0]))])
        i+=1
    return X

In [181]:
train_df=pd.read_csv('train_pos_data.csv',header=0)
test_pos_df=pd.read_csv('test_pos_data.csv',header=0)
df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])

In [182]:
train_df = train_df.values
test_pos_df=test_pos_df.values
df_neg=df_neg.values

In [183]:
model = LINE(train_G, embedding_size=128, order='all')

In [184]:
model.train(batch_size=10240, epochs=50, verbose=2)

Epoch 1/50
51/51 - 5s - loss: 1.3863 - first_order_loss: 0.6931 - second_order_loss: 0.6931
Epoch 2/50
51/51 - 5s - loss: 1.3851 - first_order_loss: 0.6927 - second_order_loss: 0.6924
Epoch 3/50
51/51 - 5s - loss: 1.3814 - first_order_loss: 0.6920 - second_order_loss: 0.6894
Epoch 4/50
51/51 - 5s - loss: 1.3610 - first_order_loss: 0.6914 - second_order_loss: 0.6696
Epoch 5/50
51/51 - 6s - loss: 1.2899 - first_order_loss: 0.6899 - second_order_loss: 0.6000
Epoch 6/50
51/51 - 6s - loss: 1.1730 - first_order_loss: 0.6886 - second_order_loss: 0.4844
Epoch 7/50
51/51 - 6s - loss: 1.1010 - first_order_loss: 0.6852 - second_order_loss: 0.4158
Epoch 8/50
51/51 - 6s - loss: 1.0391 - first_order_loss: 0.6826 - second_order_loss: 0.3565
Epoch 9/50
51/51 - 5s - loss: 1.0275 - first_order_loss: 0.6762 - second_order_loss: 0.3513
Epoch 10/50
51/51 - 5s - loss: 0.9885 - first_order_loss: 0.6732 - second_order_loss: 0.3153
Epoch 11/50
51/51 - 6s - loss: 0.9885 - first_order_loss: 0.6645 - second_order

<tensorflow.python.keras.callbacks.History at 0x1b0a6e00c70>

In [185]:
LINE_embeddings = model.get_embeddings()

In [186]:
train_df=combine_embedding(train_df,LINE_embeddings)
test_pos_df=combine_embedding(test_pos_df,LINE_embeddings)
df_neg=combine_embedding(df_neg,LINE_embeddings)

In [187]:
X_train_neg,X_test_neg = train_test_split(df_neg, test_size=0.5)

In [188]:
y_train_pos=np.ones(len(train_df))
y_train_neg=np.zeros(len(X_train_neg))
y_test_pos=np.ones(len(test_pos_df))
y_test_neg=np.zeros(len(X_test_neg))

In [189]:
X_train = np.concatenate((train_df,X_train_neg))
y_train = np.concatenate((y_train_pos,y_train_neg))
X_test = np.concatenate((test_pos_df,X_test_neg))
y_test = np.concatenate((y_test_pos,y_test_neg)) 

In [190]:
clf3 = RandomForestClassifier(n_estimators=400)
clf3.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [191]:
predict_Y3 = clf3.predict(X_test)
print(get_accuracy(predict_Y3, y_test))

0.8403702880668062


In [94]:
train_df=pd.read_csv('train_pos_data.csv',header=0)
train_df=train_df.astype(str)
ori_df = ori_df.astype(str)
test_pos_df=pd.read_csv('test_pos_data.csv',header=0)
df_neg = pd.DataFrame(list(negative_edges), columns=['id_1', 'id_2'])

In [95]:
ori_G = nx.from_pandas_edgelist(ori_df, 'id_1', 'id_2')
train_G = nx.from_pandas_edgelist(train_df, 'id_1', 'id_2')
for i in range(len(list(ori_G.nodes))):
    train_G.add_node(list(ori_G.nodes)[i])

In [96]:
train_df = train_df.values
test_pos_df=test_pos_df.values
df_neg=df_neg.values

In [97]:
model = DeepWalk(train_G, walk_length=10, num_walks=80, workers=4)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   13.5s remaining:   13.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   15.8s finished


In [98]:
model.train(window_size=5, iter=3)

Learning embedding vectors...
Learning embedding vectors done!


<gensim.models.word2vec.Word2Vec at 0x1b0f1352d60>

In [99]:
DeepWalk_embeddings = model.get_embeddings()

In [100]:
pickle.dump(edges_embs ,open("DeepWalk_embeddings.p",'wb'))

In [103]:
def deep_walk_combine_embedding(data,embeddings):
    i=0
    X = []
    for node1,node2 in data:
        X.append(np.concatenate((data[i],embeddings[str(int(node1))],embeddings[str(int(node2))])))
   # print(embeddings[str(int(data[0]))])
        i+=1
    return X

In [104]:
train_df=deep_walk_combine_embedding(train_df,DeepWalk_embeddings)
test_pos_df=deep_walk_combine_embedding(test_pos_df,DeepWalk_embeddings)
df_neg=deep_walk_combine_embedding(df_neg,DeepWalk_embeddings)

In [105]:
X_train_neg,X_test_neg = train_test_split(df_neg, test_size=0.5)

In [106]:
y_train_pos=np.ones(len(train_df))
y_train_neg=np.zeros(len(X_train_neg))
y_test_pos=np.ones(len(test_pos_df))
y_test_neg=np.zeros(len(X_test_neg))

In [107]:
X_train = np.concatenate((train_df,X_train_neg))
y_train = np.concatenate((y_train_pos,y_train_neg))
X_test = np.concatenate((test_pos_df,X_test_neg))
y_test = np.concatenate((y_test_pos,y_test_neg)) 

In [108]:
clf4 = RandomForestClassifier(n_estimators=400)
clf4.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [109]:
predict_Y4 = clf4.predict(X_test)
print(get_accuracy(predict_Y4, y_test))

0.9065040174968714


In [192]:
groups = ['Jaccard Coefficient', 'Pref. Attachment', 'Common Neighbors','Node2vec','LINE','DeepWalk','jc,pa,cn']
ironmen = [0.8448205284148723, 0.8041718810306312, 0.8443702412837277, 0.9115507420965837, 0.8412650144442755, 0.9065040174968714,0.8566449515210348]

ironmen_dict = {
                "Algorithm": groups,
                "auc": ironmen
}

# 建立 data frame
ironmen_df = pd.DataFrame(ironmen_dict)

In [193]:
ironmen_df

Unnamed: 0,Algorithm,auc
0,Jaccard Coefficient,0.844821
1,Pref. Attachment,0.804172
2,Common Neighbors,0.84437
3,Node2vec,0.911551
4,LINE,0.841265
5,DeepWalk,0.906504
6,"jc,pa,cn",0.856645
