In [2]:
import numpy as np
import pandas as pd

In [4]:
# Load the document-vector matrix from a space-delimited text file and show
# its shape as a sanity check. Later cells index rows of this matrix by position.
docs_vec = np.loadtxt('gensim/docs_vec.txt', delimiter=' ')
docs_vec.shape

(27770, 4200)

In [22]:
# Inspect the element type of the loaded matrix.
docs_vec.dtype

dtype('float64')

In [8]:
# The node file ships without a header row, so the column names are supplied
# while reading rather than patched onto the frame afterwards.
node_info = pd.read_csv(
    'data/node_information.csv',
    header=None,
    names=['id', 'year', 'title', 'authors', 'journal', 'abstract']
)
node_info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [9]:
# Space-separated, header-less file of labelled pairs: two ids and a link flag.
train = pd.read_csv(
    'data/training_set.txt',
    sep=' ',
    header=None,
    names=['id1', 'id2', 'link']
)
train.head()

Unnamed: 0,id1,id2,link
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


In [10]:
# Space-separated, header-less file of unlabelled pairs: two ids per line.
test = pd.read_csv(
    'data/testing_set.txt',
    sep=' ',
    header=None,
    names=['id1', 'id2']
)
test.head()

Unnamed: 0,id1,id2
0,9807076,9807139
1,109162,1182
2,9702187,9510135
3,111048,110115
4,9910176,9410073


In [11]:
# Map each value of the 'id' column to its 0-based position in node_info,
# so that ids can be translated into row numbers.
id2rno = {paper_id: position for position, paper_id in enumerate(node_info['id'])}
id2rno

{9306112: 12159,
 9306114: 12160,
 9306115: 12161,
 9306116: 12162,
 9306117: 12163,
 9306118: 12164,
 9306119: 12165,
 9306120: 12166,
 9306122: 12167,
 9306123: 12168,
 9306125: 12169,
 9306129: 12170,
 9306130: 12171,
 9306131: 12172,
 9306132: 12173,
 9306134: 12174,
 9306135: 12175,
 9306136: 12176,
 9306137: 12177,
 9306139: 12178,
 9306140: 12179,
 9306141: 12180,
 9306142: 12181,
 9306144: 12182,
 9306145: 12183,
 9306146: 12184,
 9306147: 12185,
 9306148: 12186,
 9306149: 12187,
 9306150: 12188,
 9306151: 12189,
 9306152: 12190,
 9306153: 12191,
 9306154: 12192,
 9306155: 12193,
 9306156: 12194,
 9306157: 12195,
 9306161: 12196,
 9306162: 12197,
 9306163: 12198,
 9306164: 12199,
 9302042: 11546,
 9807081: 23779,
 9903101: 25509,
 9903102: 25510,
 9903103: 25511,
 9903105: 25513,
 9503001: 15494,
 9503002: 15495,
 9503003: 15496,
 9503005: 15497,
 9503008: 15498,
 9503009: 15499,
 9503010: 15500,
 9503011: 15501,
 9503012: 15502,
 9503013: 15503,
 9503014: 15504,
 9503015: 1550

In [12]:
# Peek at the first training id with a single label-based lookup. Chained
# indexing (row Series first, then the key) would upcast the value to float
# as soon as the frame contains any float column.
train.loc[0, 'id1']

9510123

In [13]:
# Translate both id columns of the training pairs into row numbers via id2rno.
# Mapping over the column itself avoids building a Series for every row (as
# row-wise apply does); the explicit lookup still raises KeyError for an
# unknown id instead of silently producing NaN.
train['rno1'] = train['id1'].map(lambda paper_id: id2rno[paper_id])
train['rno2'] = train['id2'].map(lambda paper_id: id2rno[paper_id])
train.head()

Unnamed: 0,id1,id2,link,rno1,rno2
0,9510123,9502114,1,16827,15446
1,9707075,9604178,1,21154,18059
2,9312155,9506142,0,13074,16171
3,9911255,302165,0,27486,9702
4,9701033,209076,0,19856,8212


In [25]:
# Translate both id columns of the test pairs into row numbers via id2rno.
# Mapping over the column itself avoids building a Series for every row (as
# row-wise apply does); the explicit lookup still raises KeyError for an
# unknown id instead of silently producing NaN.
test['rno1'] = test['id1'].map(lambda paper_id: id2rno[paper_id])
test['rno2'] = test['id2'].map(lambda paper_id: id2rno[paper_id])
test.head()

Unnamed: 0,id1,id2,rno1,rno2
0,9807076,9807139,23774,23835
1,109162,1182,5227,172
2,9702187,9510135,20185,16838
3,111048,110115,5621,5397
4,9910176,9410073,27159,14643


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
def getSim(rn1, rn2):
    """Return the cosine similarity between two rows of ``docs_vec``.

    Parameters
    ----------
    rn1, rn2 : int or integral float
        Row positions into the module-level ``docs_vec`` matrix.

    Returns
    -------
    The single similarity value extracted from the 1x1 result matrix.

    Raises
    ------
    ValueError
        If an index cannot be converted to ``int`` (e.g. NaN).
    """
    # Row-wise DataFrame.apply hands over floats as soon as the frame holds any
    # float column (e.g. when the similarity cell is re-run); slice bounds must
    # be integers, so coerce before indexing.
    rn1 = int(rn1)
    rn2 = int(rn2)
    # Slices (rather than plain indexing) keep each operand 2-D, which is the
    # input layout cosine_similarity expects.
    first = docs_vec[rn1:(rn1 + 1), :]
    second = docs_vec[rn2:(rn2 + 1), :]
    return cosine_similarity(first, second)[0][0]

In [26]:
# Compute the similarity for every training pair. Walking the two integer
# columns directly (instead of row-wise apply) keeps the row numbers integral
# even when this cell is re-run after the float 'sim' column already exists,
# and skips the per-row Series construction.
train['sim'] = list(map(getSim, train['rno1'], train['rno2']))
train.head()

Unnamed: 0,id1,id2,link,rno1,rno2,sim
0,9510123,9502114,1,16827,15446,0.064373
1,9707075,9604178,1,21154,18059,0.021211
2,9312155,9506142,0,13074,16171,0.017202
3,9911255,302165,0,27486,9702,0.012634
4,9701033,209076,0,19856,8212,0.059588


In [27]:
# Compute the similarity for every test pair. Walking the two integer columns
# directly (instead of row-wise apply) keeps the row numbers integral even
# when this cell is re-run after the float 'sim' column already exists, and
# skips the per-row Series construction.
test['sim'] = list(map(getSim, test['rno1'], test['rno2']))
test.head()

Unnamed: 0,id1,id2,rno1,rno2,sim
0,9807076,9807139,23774,23835,0.07187
1,109162,1182,5227,172,0.16304
2,9702187,9510135,20185,16838,0.138004
3,111048,110115,5621,5397,0.101857
4,9910176,9410073,27159,14643,0.091231


In [28]:
# Persist the augmented train/test frames as CSV files in the working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra unnamed first column — confirm downstream readers expect
# that (otherwise pass index=False).
train.to_csv('train_treated.csv')
test.to_csv('test_treated.csv')