In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the labelled node pairs. The file is space-delimited and has no header
# row, so the three columns are named explicitly right after reading.
train = pd.read_csv('data/training_set.txt', delimiter=' ', header=None)
train.columns = ['id1', 'id2', 'link']
train.head()

Unnamed: 0,id1,id2,link
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


In [3]:
# Load the unlabelled node pairs: same space-delimited, header-less layout as
# the training file, but with only the two identifier columns.
test = pd.read_csv('data/testing_set.txt', delimiter=' ', header=None)
test.columns = ['id1', 'id2']
test.head()

Unnamed: 0,id1,id2
0,9807076,9807139
1,109162,1182
2,9702187,9510135
3,111048,110115
4,9910176,9410073


In [4]:
# Collect every distinct node identifier that occurs in either file.
# NOTE(review): the position of an identifier in this list becomes its integer
# index in the cells below, and that order is whatever set iteration yields.
# Keep this expression unchanged if indices that were saved or hard-coded
# elsewhere must stay valid.
nodes = list(set(train['id1']) | set(train['id2']) | set(test['id1']) | set(test['id2']))
num_nodes = len(nodes)
len(nodes), nodes[:10]

(27770,
 [9306112,
  9306114,
  9306115,
  9306116,
  9306117,
  9306118,
  9306119,
  9306120,
  9306122,
  9306123])

In [5]:
# Two-way lookup between a node's position in `nodes` (its integer index) and
# the original identifier stored in the data files.
id2name = dict(enumerate(nodes))
name2id = {name: idx for idx, name in enumerate(nodes)}

In [6]:
# Replace the original identifiers by their integer index, in place, in both
# tables (train first, then test; `id1` before `id2`).
# NOTE: this rewrites the columns it reads, so the cell must run exactly once
# per freshly loaded `train` / `test`.
for _frame in (train, test):
    for _col in ('id1', 'id2'):
        _frame.loc[:, _col] = _frame.loc[:, _col].apply(lambda name: name2id[name])

# Keep only the pairs labelled 1.
links = train[train['link'] == 1]
links.head()

Unnamed: 0,id1,id2,link
0,13835,11288,1
1,3699,8230,1
5,5102,4794,1
6,12494,13847,1
8,2691,1070,1


In [7]:
# Dense num_nodes x num_nodes matrix of int16 zeros; every pair in `links`
# sets a 1 in both (id1, id2) and (id2, id1), which makes the matrix symmetric.
_rows = links['id1'].to_numpy()
_cols = links['id2'].to_numpy()
adjacency = np.zeros(shape=(num_nodes, num_nodes), dtype='int16')
adjacency[_rows, _cols] = 1
adjacency[_cols, _rows] = 1

In [8]:
# Check correctness of the adjacency matrix on one node: the number of rows of
# `links` that mention index 2691 on either side must equal the sum of row 2691.
(links['id1'] == 2691).sum() + (links['id2'] == 2691).sum() == adjacency[2691].sum()

True

In [9]:
from scipy.sparse import csr_matrix, lil_matrix

# Convert the dense 0/1 matrix to a sparse float matrix for the matrix
# products computed below.
# CSR is used rather than LIL: LIL is meant for incremental construction and
# SciPy recommends converting to CSR/CSC before arithmetic, so starting in CSR
# avoids a format conversion on every `.dot` call. Slicing and `.todense()`
# behave the same way. `lil_matrix` stays imported for any cell that uses it.
adjacency = csr_matrix(adjacency).astype('float')
adjacency[:10, :10].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [10]:
def similarities(adj_mat, steps=2, fade=0.8):
    """Return a damped sum of matrix powers of ``adj_mat``.

    With ``A = adj_mat`` the result is::

        fade * A^2 + fade^2 * A^3 + ... + fade^(steps-1) * A^steps

    where every power is obtained through repeated ``.dot`` products.

    Parameters
    ----------
    adj_mat : matrix-like
        Square matrix exposing ``.dot`` (the notebook passes a SciPy sparse
        matrix). It is not modified.
    steps : int, default 2
        Highest power included in the sum; must be at least 2.
    fade : float, default 0.8
        Damping factor; each additional power is weighted by one more
        factor of ``fade``.

    Returns
    -------
    matrix-like
        A new matrix of the same shape as ``adj_mat.dot(adj_mat)``.

    Raises
    ------
    AssertionError
        If ``steps`` is smaller than 2.
    """
    assert steps >= 2
    power = adj_mat.dot(adj_mat)      # A^2
    weight = fade
    total = power.copy() * weight     # first term: fade * A^2
    for _ in range(2, steps):
        power = power.dot(adj_mat)    # next power of A
        weight *= fade
        total += weight * power
    return total

In [11]:
# Highest matrix power passed to `similarities`; the same number is reused in
# the `l_<n>` column name and in the names of the CSV files written below.
path_l = 3
sim = similarities(adjacency, steps=path_l)
# Peek at the top-left 10 x 10 corner only.
sim[:10, :10].todense()

matrix([[ 1.6 ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  1.6 ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  2.88,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  1.6 ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  , 40.8 ,  0.  ,  0.64,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  7.84,  0.  ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  ,  0.64,  0.  , 20.8 ,  0.  ,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  9.44,  0.  ,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  , 23.84,
          0.  ],
        [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
          2.88]])

In [12]:
# Expected to be (num_nodes, num_nodes): one row and one column per node.
sim.shape

(27770, 27770)

In [13]:
# Score every training pair with a single vectorised sparse lookup
# (`sim[rows, cols]` returns one value per (row, col) pair).
# This replaces a row-wise `train.apply(..., axis=1)`, which was slow and broke
# when the cell was re-run: once `train` holds a float column, each row is
# handed over as floats, and float indices may be rejected by the sparse matrix.
train['l_{}'.format(path_l)] = np.asarray(
    sim[train['id1'].values, train['id2'].values]
).ravel()
train.to_csv('data/train_with_pl{}.csv'.format(path_l), index=False)
train.head()

Unnamed: 0,id1,id2,link,l_3
0,13835,11288,1,11.68
1,3699,8230,1,1066.88
2,2961,12023,0,0.0
3,18033,9585,0,15.36
4,385,16529,0,0.64


In [None]:
#train['pred'] = train.apply(lambda row: int(row['l_{}'.format(path_l)]>=threshold))
# wrong_pred = train[train.loc[:, 'link']!=train.loc[:, 'pred']]
# wrong_pred.head()
#train.head()

In [14]:
# Two-column array (label, score), with rows reordered by decreasing score.
link_sim = train[['link', 'l_{}'.format(path_l)]].to_numpy()
indice = link_sim[:, 1].argsort()[::-1]
link_sim = link_sim[indice]
link_sim

array([[1.000000e+00, 3.171728e+04],
       [1.000000e+00, 2.842784e+04],
       [1.000000e+00, 2.764896e+04],
       ...,
       [0.000000e+00, 0.000000e+00],
       [0.000000e+00, 0.000000e+00],
       [0.000000e+00, 0.000000e+00]])

In [15]:
# Sweep the rows of `link_sim` (sorted by decreasing score). After each row,
# compute precision / recall / F1 as if every row seen so far were predicted 1,
# and remember the score of the row that gave the highest F1 as `threshold`.
p_best, r_best, f1_best = 0, 0, 0
threshold = 0
num_ones = len(links)   # total number of positive pairs, the recall denominator
tp = 0                  # positives seen so far
for cnt in range(len(train)):
    if link_sim[cnt, 0] == 1:
        tp += 1
    if tp == 0:
        # No positive seen yet: precision and recall are both 0, so F1 would
        # be 0/0 (ZeroDivisionError) and could not beat the current best.
        continue
    p = tp / (cnt + 1)
    r = tp / num_ones
    f_1 = 2 * p * r / (p + r)
    if f_1 > f1_best:
        p_best, r_best = p, r
        f1_best, threshold = f_1, link_sim[cnt, 1]
print('p: {}\nr: {}\nF1: {}\nThreshold: {}'.format(p_best, r_best, f1_best, threshold))

p: 0.9460280077119338
r: 0.9780562766687554
F1: 0.9617755712244317
Threshold: 11.680000000000003


In [16]:
# Score every test pair with a single vectorised sparse lookup
# (`sim[rows, cols]` returns one value per (row, col) pair).
# This replaces a row-wise `test.apply(..., axis=1)`, which was slow and broke
# when the cell was re-run: once `test` holds a float column, each row is
# handed over as floats, and float indices may be rejected by the sparse matrix.
test['l_{}'.format(path_l)] = np.asarray(
    sim[test['id1'].values, test['id2'].values]
).ravel()
test.head()

Unnamed: 0,id1,id2,l_3
0,23525,23586,1.28
1,24287,314,1150.08
2,1056,13847,2609.76
3,25182,24717,518.56
4,17426,21417,9.6


In [26]:
# The rows whose score is exactly the threshold may be labelled 0 or 1;
# predict their label by a majority vote over the training rows at that score.
at_threshold = train[train['l_{}'.format(path_l)] == threshold]
# `.mean()` of the 0/1 labels is the share of positives. If no row sits at the
# threshold it yields NaN, the comparison below is False and the prediction
# falls back to 0, where `sum(...) / len(...)` raised ZeroDivisionError.
link_at_threshold = at_threshold['link'].mean()
pred_at_threshold = 1 if link_at_threshold > 0.5 else 0

In [27]:
def predictRow(row):
    """Return the predicted label (0 or 1) for one row of scores.

    Reads the column ``'l_<path_l>'`` of ``row`` and compares it with the
    notebook-level ``threshold``: above it gives 1, below it gives 0, and a
    score exactly equal to it gives the notebook-level ``pred_at_threshold``.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['l_<path_l>']`` (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``).
    """
    score = row['l_{}'.format(path_l)]
    if score == threshold:
        return pred_at_threshold
    return 1 if score > threshold else 0

In [28]:
# Label every test pair with `predictRow`, name the index `id`, and write the
# whole table (index included) to disk.
test['category'] = test.apply(predictRow, axis=1)
test.index.name = 'id'
test.to_csv('data/test_with_pl{}.csv'.format(path_l))
test.head()

Unnamed: 0_level_0,id1,id2,l_3,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,23525,23586,1.28,0
1,24287,314,1150.08,1
2,1056,13847,2609.76,1
3,25182,24717,518.56,1
4,17426,21417,9.6,0


In [29]:
# Keep only the predicted `category` column (the `id` index is written along
# with it) — presumably the file that gets submitted.
sub = test[['category']]
# NOTE(review): the `0.8` in the file name is hard-coded; it looks like the
# default `fade` of `similarities` — confirm if `fade` is ever changed.
sub.to_csv('data/graph_{}_0.8.csv'.format(path_l))
sub.head()

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
0,0
1,1
2,1
3,1
4,0


## check year differences

In [None]:
# Load the per-node metadata. The file has no header row, so the six columns
# are named explicitly right after reading.
node_info = pd.read_csv('data/node_information.csv', header=None)
node_info.columns = ['id', 'year', 'title', 'authors', 'journal', 'abstract']

In [None]:
# Metadata of the two nodes of the first positive training pair (internal
# indices 13835 and 11288, see `links.head()`), whose score equals the selected
# threshold. The first index was previously typed as 13825, which is not the
# node paired with 11288 in that row.
# NOTE: these are positions in `nodes`, so they are only valid for the node
# ordering of the run that produced the tables above.
print(node_info[node_info['id']==id2name[13835]])
print(node_info[node_info['id']==id2name[11288]])

In [None]:
# Print the metadata row of four more nodes, given by their internal indices
# (1056 and 13847 form row 2 of the test table shown above).
for _node_idx in (1056, 13847, 4656, 1031):
    print(node_info[node_info['id'] == id2name[_node_idx]])