In [1]:
import numpy as np
from collections import defaultdict
from itertools import combinations
import networkx as nx
from networkx.algorithms.link_prediction import *
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
import pandas as pd
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time

In [2]:
# Load the stored positive and negative sample arrays and convert them to plain Python lists.
# Later cells index each entry as edge[0] / edge[1], i.e. they are treated as node pairs.
# NOTE(review): this cell reads from "data/" while other cells read from "Data/" —
# confirm both spellings resolve on a case-sensitive filesystem.
pos_samples = np.load("data/pos_samples.npy").tolist()
neg_samples = np.load("data/neg_samples.npy").tolist()

In [3]:
# Attach label 1 to every positive pair and label 0 to every negative pair,
# keeping positives first so the row order matches the per-pair feature lists built later.
labelled_pairs = [[pair[0], pair[1], 1] for pair in pos_samples]
labelled_pairs += [[pair[0], pair[1], 0] for pair in neg_samples]

# Columns 0-1 hold the node pair, column 2 holds the label.
train_set = np.array(labelled_pairs)
train_X, train_y = train_set[:, :2], train_set[:, 2]

In [4]:
# Count how many input lines each unordered node pair co-occurs on.
# Every line of the training file is a whitespace-separated list of integer node ids;
# all 2-combinations of the ids on one line are counted as one co-occurrence each.
edge_ind_count = defaultdict(int)
with open("./Data/train.txt") as f:
    total_line = 0
    for line in f:
        total_line += 1
        # split() with no argument tolerates trailing/double spaces and blank lines,
        # where split(' ') would produce '' tokens and make int() raise ValueError.
        nodes = [int(token) for token in line.split()]
        for comb in combinations(nodes, 2):
            # Sort so that (a, b) and (b, a) share one key.
            edge_ind_count[tuple(sorted(comb))] += 1

In [5]:
# Build the undirected co-occurrence graph from the counted node pairs.
graph = nx.Graph()

# nx.Graph stores at most one edge per node pair, so adding each pair once yields exactly
# the graph the previous "add it `count` times" loop produced. The counts themselves are
# not stored on the edges (unchanged from before).
graph.add_edges_from(edge_ind_count)

# Drop (u, u) edges. The self-loop view is lazy, so materialise it before mutating the graph.
graph.remove_edges_from(list(nx.selfloop_edges(graph)))

In [6]:
# Neighbourhood-based link-prediction scores for every training pair.
# Each predictor yields (u, v, score) triples in the order of train_X; only the score is kept.
rai_f = [triple[2] for triple in resource_allocation_index(graph, train_X)]
jc_f = [triple[2] for triple in jaccard_coefficient(graph, train_X)]
aai_f = [triple[2] for triple in adamic_adar_index(graph, train_X)]
pa_f = [triple[2] for triple in preferential_attachment(graph, train_X)]

In [7]:
# Detect communities and store each node's community index in its "community" node
# attribute, which the community-aware link predictors read.
community_list = greedy_modularity_communities(graph)

# The communities returned by greedy_modularity_communities are disjoint and cover the graph,
# so walking each community's members once assigns every node exactly one index. This avoids
# scanning the whole community list for every node.
for community_id, members in enumerate(community_list):
    for node in members:
        graph.nodes[node]["community"] = community_id

In [8]:
# Community-aware link-prediction scores for every training pair; these predictors read the
# "community" node attribute. Only the trailing score of each yielded triple is kept.
csh_f = [score for *_, score in cn_soundarajan_hopcroft(graph, train_X)]
rsh_f = [score for *_, score in ra_index_soundarajan_hopcroft(graph, train_X)]
wic_f = [score for *_, score in within_inter_cluster(graph, train_X)]

In [9]:
# Spot check: display the common neighbours of nodes 0 and 356 in ascending order.
sorted(nx.common_neighbors(graph,0,356))

[1236, 1655, 1797, 2414, 2568, 3649, 3760]

In [10]:
# Node-level structural metrics, computed once for the whole graph and looked up per pair below.
dc_g = nx.degree_centrality(graph)
c_g = nx.eigenvector_centrality(graph)
load_g = nx.load_centrality(graph)
close_g = nx.closeness_centrality(graph)
cluster_g = nx.algorithms.cluster.clustering(graph)
kcore_g = nx.core_number(graph)
avg_neigh_g = nx.average_neighbor_degree(graph)

# One list per feature column, filled in the row order of train_X.
# Suffix 1 / 2 refers to the first / second node of the pair.
sp_f, cn_f = [], []
dc_f1, dc_f2 = [], []
c_f1, c_f2 = [], []
cluster_f1, cluster_f2 = [], []
avg_neigh_f1, avg_neigh_f2 = [], []
kcore_f1, kcore_f2 = [], []
load_f1, load_f2 = [], []
close_f1, close_f2 = [], []

for i in train_X:
    cn = len(list(nx.common_neighbors(graph, i[0], i[1])))

    # Only the path lookup is guarded. Previously a bare `except` wrapped every lookup and
    # reset just `sp`, so a pair with no connecting path silently reused the node metrics
    # of the previous row (or hit a NameError on the first row).
    # NOTE(review): if the positive pairs are edges of `graph`, sp is always 1 for them —
    # confirm whether those edges should be held out before computing this feature.
    try:
        sp = nx.shortest_path_length(graph, i[0], i[1])
    except nx.NetworkXNoPath:
        sp = 0  # existing sentinel for "no path between the two nodes"

    # Plain dict lookups: they cannot raise, and yield None for a node missing from a metric.
    dc1, dc2 = dc_g.get(i[0]), dc_g.get(i[1])
    c1, c2 = c_g.get(i[0]), c_g.get(i[1])
    cluster1, cluster2 = cluster_g.get(i[0]), cluster_g.get(i[1])
    avg_neigh1, avg_neigh2 = avg_neigh_g.get(i[0]), avg_neigh_g.get(i[1])
    kcore1, kcore2 = kcore_g.get(i[0]), kcore_g.get(i[1])
    load1, load2 = load_g.get(i[0]), load_g.get(i[1])
    close1, close2 = close_g.get(i[0]), close_g.get(i[1])

    sp_f.append(sp)
    cn_f.append(cn)
    dc_f1.append(dc1)
    dc_f2.append(dc2)
    c_f1.append(c1)
    c_f2.append(c2)
    cluster_f1.append(cluster1)
    cluster_f2.append(cluster2)
    avg_neigh_f1.append(avg_neigh1)
    avg_neigh_f2.append(avg_neigh2)
    kcore_f1.append(kcore1)
    kcore_f2.append(kcore2)
    load_f1.append(load1)
    load_f2.append(load2)
    close_f1.append(close1)
    close_f2.append(close2)

In [11]:
# Assemble the per-pair feature lists into the training feature frame, one column per feature.
# "_source" / "_sink" columns hold the metric of the first / second node of each pair.
topo_train_X = pd.DataFrame(
    {
        "rai": rai_f,
        "jc": jc_f,
        "aai": aai_f,
        "pa": pa_f,
        "csh": csh_f,
        "rsh": rsh_f,
        "wic": wic_f,
        "cn_f": cn_f,
        "sp_f": sp_f,
        "dc_source": dc_f1,
        "dc_sink": dc_f2,
        "c_source": c_f1,
        "c_sink": c_f2,
        "cluster_source": cluster_f1,
        "cluster_sink": cluster_f2,
        "avg_neigh_source": avg_neigh_f1,
        "avg_neigh_sink": avg_neigh_f2,
        "kcore_source": kcore_f1,
        "kcore_sink": kcore_f2,
        "load_source": load_f1,
        "load_sink": load_f2,
        "close_source": close_f1,
        "close_sink": close_f2,
    }
)

In [13]:
# Load the public test pairs. The feature loop below unpacks each row as (id, source, sink),
# so the file is expected to have exactly three columns — TODO confirm against the CSV header.
test_set = pd.read_csv("Data/test-public.csv")

In [14]:
# Feature columns for the test frame, in the same order and with the same names as the
# training frame. The shortest-path column was previously called "sp" here but "sp_f" in
# training, which made the two frames disagree on feature names.
feature_columns = [
    "rai", "jc", "aai", "pa", "csh", "rsh", "wic", "cn_f", "sp_f",
    "dc_source", "dc_sink", "c_source", "c_sink", "cluster_source", "cluster_sink",
    "avg_neigh_source", "avg_neigh_sink", "kcore_source", "kcore_sink",
    "load_source", "load_sink", "close_source", "close_sink",
]

# Pair-level predictors, in the column order "rai" .. "wic".
pair_predictors = (
    resource_allocation_index,
    jaccard_coefficient,
    adamic_adar_index,
    preferential_attachment,
    cn_soundarajan_hopcroft,
    ra_index_soundarajan_hopcroft,
    within_inter_cluster,
)

# Node-level metric lookups (computed in the training-feature cell), in column order.
node_metrics = (dc_g, c_g, cluster_g, avg_neigh_g, kcore_g, load_g, close_g)

topo_test_X = []
missing_indices = []  # ids of test rows whose source or sink never appears in the graph
all_training_nodes = list(graph.nodes())

for i in range(len(test_set)):
    idd, source, sink = test_set.iloc[i]

    # Membership is tested on the graph itself (hash lookup) instead of scanning the node list.
    if source not in graph or sink not in graph:
        # Unknown node: emit an all-zero feature row and remember the id.
        topo_test_X.append([0] * len(feature_columns))
        missing_indices.append(idd)
        continue

    pair = [[source, sink]]
    # Each predictor yields a single (u, v, score) triple for the single pair.
    features = [next(iter(predictor(graph, pair)))[2] for predictor in pair_predictors]

    cn = len(list(nx.common_neighbors(graph, source, sink)))

    # Only the path lookup is guarded. Previously a bare `except` wrapped every lookup and
    # reset just `sp`, so a pair with no connecting path silently reused the node metrics
    # of an earlier row.
    try:
        sp = nx.shortest_path_length(graph, source, sink)
    except nx.NetworkXNoPath:
        sp = 0  # existing sentinel for "no path between the two nodes"

    features += [cn, sp]
    for metric in node_metrics:
        features += [metric.get(source), metric.get(sink)]

    topo_test_X.append(features)

topo_test_X = pd.DataFrame(topo_test_X, columns=feature_columns)

In [15]:
# Training table = feature columns plus the 0/1 label column; topo_train_X itself is left untouched.
topo_train = topo_train_X.assign(label=train_y)

In [16]:
# Persist the labelled training features and the test features as CSV, without the row index.
# NOTE(review): output goes to "data/" while the raw inputs are read from "Data/" —
# confirm the intended directory on case-sensitive filesystems.
topo_train.to_csv("data/topo_train22.csv", index=False)
topo_test_X.to_csv("data/topo_test_X.csv", index=False)