In [1]:
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Training pairs: one space-separated "id1 id2 link" record per line, no header row.
train = pd.read_csv(
    'data/training_set.txt',
    sep=' ',
    header=None,
    names=['id1', 'id2', 'link'],
)
train.head()

Unnamed: 0,id1,id2,link
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


In [3]:
# Pairs to be classified: same layout as the training file, minus the label column.
test = pd.read_csv(
    'data/testing_set.txt',
    sep=' ',
    header=None,
    names=['id1', 'id2'],
)
test.head()

Unnamed: 0,id1,id2
0,9807076,9807139
1,109162,1182
2,9702187,9510135
3,111048,110115
4,9910176,9410073


In [4]:
# Every node id that appears in either endpoint column of either file.
node_set = set(train['id1'])
for ids in (train['id2'], test['id1'], test['id2']):
    node_set = node_set | set(ids)
nodes = tuple(node_set)
num_nodes = len(nodes)
num_nodes, nodes[10:21]

(27770,
 (9306125,
  9306129,
  9306130,
  9306131,
  9306132,
  9306134,
  9306135,
  9306136,
  9306137,
  9306139,
  9306140))

In [None]:
# id2name = {i:n for i, n in enumerate(nodes)}
# name2id = {n:i for i, n in id2name.items()}

In [None]:
# train.loc[:, 'id1'] = train.loc[:, 'id1'].apply(lambda x: name2id[x])
# train.loc[:, 'id2'] = train.loc[:, 'id2'].apply(lambda x: name2id[x])
# test.loc[:, 'id1'] = test.loc[:, 'id1'].apply(lambda x: name2id[x])
# test.loc[:, 'id2'] = test.loc[:, 'id2'].apply(lambda x: name2id[x])

In [5]:
# Keep only the positive training pairs and collect the nodes they touch.
is_link = train['link'] == 1
links = train[is_link]
nodes_ = tuple(set(links['id1']).union(set(links['id2'])))
len(nodes_), len(links)

(27684, 335130)

In [6]:
# One (id1, id2) tuple per positive training pair.
edgelist = list(map(tuple, links[['id1', 'id2']].values))
len(edgelist)

335130

In [7]:
# Undirected simple graph of the known links; repeated or reversed pairs
# collapse into a single edge.
G = nx.Graph()
G.add_edges_from(edgelist)
G.number_of_nodes(), G.number_of_edges()

(27684, 334690)

In [8]:
# Nodes that never occur in a positive training pair: add them as isolated
# vertices so every id in `nodes` is present in the graph.
orph_nodes = set(nodes).difference(set(nodes_))
G.add_nodes_from(orph_nodes)
G.number_of_nodes(), G.number_of_edges()

(27770, 334690)

In [9]:
# Sanity check: Jaccard coefficient for three hand-picked pairs.
sample_pairs = [
    (9510123, 9502114),
    (9312155, 9506142),
    (109162, 1182),
]
preds = nx.jaccard_coefficient(G, sample_pairs)
for u, v, p in preds:
    print(u, v, p)

9510123 9502114 0.058823529411764705
9312155 9506142 0.0
109162 1182 0.07430340557275542


In [10]:
def predScore(row, method, graph=None):
    """Score the single pair ``(row['id1'], row['id2'])`` with ``method``.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['id1']`` and ``row['id2']`` (e.g. a DataFrame row).
    method : callable
        Called as ``method(graph, [(id1, id2)])`` and expected to return an
        iterable of ``(u, v, score)`` triples.
    graph : optional
        Graph handed to ``method``. Defaults to the notebook-level ``G`` so
        existing two-argument calls keep working.

    Returns
    -------
    The score of the first triple produced by ``method``, or ``None`` when
    ``method`` produces nothing.
    """
    if graph is None:
        # Looked up at call time, so the function can be defined before G exists.
        graph = G
    pred = method(graph, [(row['id1'], row['id2'])])
    return next((p for _, _, p in pred), None)
# train['jc'] = train.apply(lambda row: predScore(row, nx.jaccard_coefficient), axis=1)

In [11]:
# (id1, id2) tuples in row order, in the form the link-prediction functions take.
train_pairs = list(map(tuple, train[['id1', 'id2']].values))
test_pairs = list(map(tuple, test[['id1', 'id2']].values))

In [12]:
# Neighbourhood-based similarity measures, keyed by the column name they fill.
methods = {
    'jc': nx.jaccard_coefficient,
    'rai': nx.resource_allocation_index,
    'aai': nx.adamic_adar_index,
    'pa': nx.preferential_attachment,
}

# For each measure, score every train and test pair on G and store the result
# as a float column named after the measure.
for m_name, method in methods.items():
    for frame, pairs in ((train, train_pairs), (test, test_pairs)):
        scored = method(G, pairs)  # yields (u, v, score) in the order of `pairs`
        frame[m_name] = np.array([score for _, _, score in scored], dtype=float)

In [13]:
# Preview `train` after the loop above added one score column per entry in `methods`.
train.head()

Unnamed: 0,id1,id2,link,pa,rai,aai,jc
0,9510123,9502114,1,72.0,0.142857,0.513898,0.058824
1,9707075,9604178,1,11613.0,0.226401,4.320366,0.097087
2,9312155,9506142,0,5.0,0.0,0.0,0.0
3,9911255,302165,0,280.0,0.0,0.0,0.0
4,9701033,209076,0,168.0,0.0,0.0,0.0


In [14]:
# Preview `test` after the loop above added one score column per entry in `methods`.
test.head()

Unnamed: 0,id1,id2,pa,rai,aai,jc
0,9807076,9807139,1062.0,0.0,0.0,0.0
1,109162,1182,13590.0,0.311535,5.377973,0.074303
2,9702187,9510135,164797.0,1.342594,15.053612,0.065338
3,111048,110115,3315.0,0.298419,4.899424,0.221053
4,9910176,9410073,1050.0,0.0,0.0,0.0


In [20]:
# Column layout of the benchmark table: one row per similarity measure.
measures = [
    'method',
    'Precision',
    'Recall',
    'F1',
    'Threshold',
]
# Start from an empty table with just the headers.
bm = pd.DataFrame(columns=measures)
bm.head()

Unnamed: 0,method,Precision,Recall,F1,Threshold


In [23]:
def predictRow(row, m_name, ts, pred_ts):
    """Turn one row's similarity score into a binary link prediction.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[m_name]`` (e.g. a DataFrame row).
    m_name : hashable
        Key of the score to read from ``row``.
    ts : number
        Decision threshold.
    pred_ts : int
        Label to return when the score is exactly equal to ``ts``.

    Returns
    -------
    int
        1 if the score is above ``ts``, ``pred_ts`` if it equals ``ts``,
        0 otherwise.
    """
    score = row[m_name]
    if score > ts:
        return 1
    # Compare against the `ts` argument; reading a module-level `threshold`
    # here would make the result depend on whatever the kernel last assigned.
    if score == ts:
        return pred_ts
    return 0

In [27]:
# For each similarity measure: find the score cut-off that maximises F1 on the
# training pairs, record it in `bm`, and write 0/1 predictions to both frames.
bm_rows = []
for m_name in methods.keys():
    # Rank the training pairs from highest to lowest score.
    link_score = train[['link', m_name]].values
    idx_sort = np.argsort(link_score[:, 1])[::-1]
    link_score = link_score[idx_sort]

    # Precision / recall / F1 obtained by labelling the top-k ranked pairs as
    # links, for every k at once.
    # NOTE(review): k may fall inside a run of tied scores, so the figures
    # stored in `bm` can differ slightly from what the `>` / `==` rule below
    # achieves on `train`.
    is_link = link_score[:, 0] == 1
    num_ones = int(is_link.sum())
    tp = np.cumsum(is_link)
    p = tp / np.arange(1, len(link_score) + 1)
    r = tp / num_ones if num_ones else np.zeros(len(link_score))
    denom = p + r
    f_1 = np.zeros_like(denom)
    valid = denom > 0  # F1 is left at 0 wherever precision and recall are both 0
    f_1[valid] = 2 * p[valid] * r[valid] / denom[valid]

    p_best, r_best, f1_best = 0, 0, 0
    threshold = 0
    if f_1.size:
        # np.argmax returns the first maximum, i.e. the smallest k reaching the best F1.
        best = int(np.argmax(f_1))
        if f_1[best] > 0:
            p_best, r_best, f1_best = p[best], r[best], f_1[best]
            threshold = link_score[best, 1]

    # Pairs scoring exactly the threshold get the majority label among them.
    links_at_threshold = train.loc[train[m_name] == threshold, 'link']
    link_at_threshold = links_at_threshold.mean() if len(links_at_threshold) else 0
    pred_at_threshold = 1 if link_at_threshold > 0.5 else 0

    bm_rows.append([m_name, p_best, r_best, f1_best, threshold])

    # predict: 1 above the threshold, the tie label at it, 0 below
    for frame in (train, test):
        scores = frame[m_name]
        frame['pred_' + m_name] = np.select(
            [scores > threshold, scores == threshold],
            [1, pred_at_threshold],
            default=0,
        )

# Rebuild the table in one go so re-running this cell does not duplicate rows.
bm = pd.DataFrame(bm_rows, columns=measures)

In [28]:
# Show, per method, the best precision / recall / F1 found and the threshold that gave it.
bm

Unnamed: 0,method,Precision,Recall,F1,Threshold
0,pa,0.761253,0.836872,0.797274,372.0
1,rai,0.981924,0.944326,0.962758,0.002774
2,aai,0.979439,0.944669,0.96174,0.18021
3,jc,0.968756,0.949891,0.959231,0.000415


In [29]:
# Preview `train` with the `pred_<method>` columns added by the thresholding cell.
train.head()

Unnamed: 0,id1,id2,link,pa,rai,aai,jc,pred_pa,pred_rai,pred_aai,pred_jc
0,9510123,9502114,1,72.0,0.142857,0.513898,0.058824,0,1,1,1
1,9707075,9604178,1,11613.0,0.226401,4.320366,0.097087,1,1,1,1
2,9312155,9506142,0,5.0,0.0,0.0,0.0,0,0,0,0
3,9911255,302165,0,280.0,0.0,0.0,0.0,0,0,0,0
4,9701033,209076,0,168.0,0.0,0.0,0.0,0,0,0,0


In [30]:
# Preview `test` with the `pred_<method>` columns added by the thresholding cell.
test.head()

Unnamed: 0,id1,id2,pa,rai,aai,jc,pred_pa,pred_rai,pred_aai,pred_jc
0,9807076,9807139,1062.0,0.0,0.0,0.0,1,0,0,0
1,109162,1182,13590.0,0.311535,5.377973,0.074303,1,1,1,1
2,9702187,9510135,164797.0,1.342594,15.053612,0.065338,1,1,1,1
3,111048,110115,3315.0,0.298419,4.899424,0.221053,1,1,1,1
4,9910176,9410073,1050.0,0.0,0.0,0.0,1,0,0,0


In [31]:
# Persist the scored frames and the benchmark table to the working directory.
# NOTE(review): 'test_by_nx' has no '.csv' extension, unlike the other two
# outputs — confirm whether anything reads that exact name before renaming it.
for frame, path in (
    (train, 'train_by_nx.csv'),
    (test, 'test_by_nx'),
    (bm, 'bm.csv'),
):
    frame.to_csv(path)

In [50]:
# Score each method's `pred_<method>` column against the true labels in `train`.
# NOTE(review): despite the `cv_` prefix, nothing is held out here — the
# thresholds were tuned on these same rows.
cv_measures = ['method', 'accuracy', 'precision', 'recall', 'f1']
cv_rows = []
len_train = len(train)
is_true_link = train['link'] == 1
n_true = int(is_true_link.sum())
for m_name in methods.keys():
    pred = train['pred_' + m_name]
    accuracy = int((train['link'] == pred).sum()) / len_train

    tp = int((is_true_link & (pred == 1)).sum())
    n_pred = int(pred.sum())
    # Guard the denominators: a method that predicts no links (or a frame with
    # no positive pairs) scores 0 instead of producing NaN/inf.
    precision = tp / n_pred if n_pred else 0.0
    recall = tp / n_true if n_true else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    cv_rows.append([m_name, accuracy, precision, recall, f1])

# Build the table once; this also keeps the cell idempotent on re-run.
cv_bm = pd.DataFrame(cv_rows, columns=cv_measures)

In [51]:
# Show accuracy / precision / recall / F1 of each method's predictions on `train`.
cv_bm

Unnamed: 0,method,accuracy,precision,recall,f1
0,pa,0.768302,0.761337,0.836762,0.79727
1,rai,0.960222,0.981924,0.944326,0.962758
2,aai,0.959075,0.979442,0.944663,0.961738
3,jc,0.956037,0.968756,0.949891,0.959231


In [52]:
# Save the evaluation table to disk.
cv_bm.to_csv('cv_bm.csv')

In [53]:
# Write one submission file per method: the row index (labelled 'id') plus a
# single 'category' column holding that method's 0/1 prediction.
test.index.name = 'id'
for m_name in methods:
    pred_col = 'pred_' + m_name
    sub = test[[pred_col]].rename(columns={pred_col: 'category'})
    sub.to_csv('data/sub_{}.csv'.format(m_name))

## Playing

In [None]:
# Count the test rows in which at least one endpoint belongs to `neph_nodes`.
# NOTE(review): `neph_nodes` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel — possibly `orph_nodes` was meant; confirm.
missing1 = test[test['id1'].isin(neph_nodes)].index
missing2 = test[test['id2'].isin(neph_nodes)].index
len(set(missing1) | set(missing2))

In [32]:
# NOTE(review): `bbb` is not created anywhere in the visible notebook; this and the
# following scratch cells rely on leftover kernel state.
bbb

Unnamed: 0,x,y
0,1,why


In [33]:
# Overwrite the entry at row position 0, column position 1 with 1 (in place).
bbb.iloc[0, 1] = 1

In [34]:
# Show `bbb` after the in-place assignment.
bbb

Unnamed: 0,x,y
0,1,1


In [35]:
# Add the row [0, 1] to the end of `bbb`. `DataFrame.append` was removed in
# pandas 2.0, so use `pd.concat`, which returns the same renumbered result.
bbb = pd.concat([bbb, pd.DataFrame([[0, 1]], columns=bbb.columns)], ignore_index=True)

In [36]:
# Show `bbb` with the extra row.
bbb

Unnamed: 0,x,y
0,1,1
1,0,1


In [49]:
# Select the rows where both `x` and `y` equal 1.
# NOTE(review): the recorded output of this cell is a KeyError ('[1 0] not in index'),
# i.e. the combined mask was treated as column labels rather than a boolean filter —
# presumably a dtype issue in `bbb`; confirm before reusing this pattern.
bbb[(bbb['x']==1) & (bbb['y']==1)]

KeyError: '[1 0] not in index'

In [48]:
# Number of rows in which the `x` and `y` entries are equal.
n_equal = 0
for same in bbb['x'] == bbb['y']:
    n_equal = n_equal + same
n_equal

1