In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

from gensim.models.word2vec import Word2Vec
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer

### Load abstracts

In [2]:
# The file is read with header=None (its first line is data, not labels),
# so the six column labels are attached explicitly afterwards.
node_columns = ['id', 'year', 'title', 'authors', 'journal', 'abstract']
node_info = pd.read_csv('../data/node_information.csv', header=None)
node_info.columns = node_columns
node_info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [3]:
# NOTE(review): `stemmer` is not used by `tokenizer` or by any other visible
# cell; it is kept only so that the notebook namespace is unchanged.
stemmer = PorterStemmer()


def tokenizer(line):
    """Split a raw text into cleaned, whitespace-delimited tokens.

    Steps:
      1. split on whitespace;
      2. drop tokens of length <= 1 and tokens found in ``STOPWORDS``;
      3. remove every digit character from the remaining tokens;
      4. apply the filter of step 2 again, because removing digits can
         shorten a token or turn it into a stop word.

    No stemming and no case folding is performed.

    Parameters
    ----------
    line : str
        Text to tokenize. Any non-string value (for instance the float NaN
        that pandas yields for an empty CSV field) is treated as empty text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order; ``[]`` for empty or
        non-string input.
    """
    # Guard against missing values: a float NaN has no .split().
    if not isinstance(line, str):
        return []

    def _keep(token):
        return len(token) > 1 and token not in STOPWORDS

    # str.split() with no argument already ignores leading/trailing whitespace.
    candidates = [token for token in line.split() if _keep(token)]
    stripped = [''.join(ch for ch in token if not ch.isdigit()) for token in candidates]
    return [token for token in stripped if _keep(token)]

In [4]:
# Tokenize every abstract in row order, so texts[i] belongs to row i of node_info.
texts = [tokenizer(abstract) for abstract in node_info['abstract']]

In [5]:
# Corpus-wide occurrence count of every token.
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

# Keep only tokens that occur more than twice in the whole corpus.
# The number of documents is unchanged; individual documents may shrink.
texts = [[tok for tok in doc if frequency[tok] > 2] for doc in texts]
len(texts)

27770

### Word2Vec on abstracts

In [6]:
# Embedding dimensionality handed to Word2Vec; the centroid matrix built
# further down uses the same width.
my_q = 300
# min_count handed to Word2Vec.
mcount = 1
# Create the model and build its vocabulary from the tokenized abstracts.
w2v = Word2Vec(size=my_q, min_count=mcount)
w2v.build_vocab(texts)
# Pull in the pretrained GoogleNews vectors (binary word2vec file).
# NOTE(review): no train() call appears in this notebook, so words that are
# absent from the GoogleNews file presumably keep their initial vectors —
# confirm against the gensim documentation of intersect_word2vec_format.
w2v.intersect_word2vec_format('../../GoogleNews-vectors-negative300.bin', binary=True)
# Persist the model via gensim's save().
# NOTE(review): despite the '.bin' suffix this is presumably gensim's native
# format rather than the word2vec binary format — confirm before loading it
# with another tool.
w2v.save('w2v_abstract.bin')

### Word2Vec centroid similarity & Word Mover's Distance (WMD)

In [7]:
# One row per document: the mean of the word vectors of its tokens.
# Rows start at zero so that a document left without any token (possible
# after the frequency filter) keeps a zero centroid instead of making the
# mean computation raise on an empty list.
centroids = np.zeros(shape=(len(texts), my_q))
for idx, doc in enumerate(texts):
    if not doc:
        continue
    # Vectors are read through `w2v.wv`, as the WMD helper below does:
    # indexing the model object itself (`w2v[token]`) is deprecated in
    # gensim 3.x and no longer available in gensim 4.
    centroid = np.mean([w2v.wv[token] for token in doc], axis=0)
    centroids[idx, :] = centroid

  app.launch_new_instance()


In [8]:
# Load the training pairs together with their previously computed features.
# The rno1 / rno2 columns are used further down as integer positions into
# `centroids` and `texts`.
train = pd.read_csv('../data/train_treated_with_journal.csv')
train.head()

Unnamed: 0,id1,id2,link,rno1,rno2,sim,cn,aai,year1,year2,year_diff,common_authors,title_overlap,journal_overlap
0,9510123,9502114,1,16827,15446,0.064373,1,0.513898,1995,1995,0,0.0,0.285714,1.0
1,9707075,9604178,1,21154,18059,0.021211,20,4.320366,1997,1996,1,0.0,0.25,0.0
2,9312155,9506142,0,13074,16171,0.017202,0,0.0,1993,1995,-2,0.0,0.0,0.0
3,9911255,302165,0,27486,9702,0.012634,0,0.0,1999,2003,-4,0.0,0.0,0.0
4,9701033,209076,0,19856,8212,0.059588,0,0.0,1997,2002,-5,0.0,0.0,0.0


In [9]:
# Load the test pairs together with their previously computed features.
# The rno1 / rno2 columns are used further down as integer positions into
# `centroids` and `texts`.
test = pd.read_csv('../data/test_treated_with_journal.csv')
test.head()

Unnamed: 0,id1,id2,rno1,rno2,sim,cn,aai,year1,year2,year_diff,common_authors,title_overlap,journal_overlap
0,9807076,9807139,23774,23835,0.07187,0,0.0,1998,1998,0,0.0,0.0,0.5
1,109162,1182,5227,172,0.16304,24,5.377973,2001,2000,1,0.0,0.444444,0.0
2,9702187,9510135,20185,16838,0.138004,59,15.053612,1997,1995,2,0.0,0.285714,1.0
3,111048,110115,5621,5397,0.101857,21,4.899424,2001,2001,0,0.0,0.153846,1.0
4,9910176,9410073,27159,14643,0.091231,0,0.0,1999,1994,5,0.0,0.0,0.0


In [10]:
def computeCentroidSim(rno1, rno2):
    """Return the cosine similarity between the centroids of two documents.

    `rno1` and `rno2` are row positions into the global `centroids` array.
    Both are passed through int() first, so float-valued positions (such as
    those produced by a row-wise DataFrame.apply) are accepted.
    """
    first, second = int(rno1), int(rno2)
    # Slicing (instead of plain indexing) keeps each operand two-dimensional,
    # which is the input shape cosine_similarity works on.
    row_a = centroids[first:first + 1, :]
    row_b = centroids[second:second + 1, :]
    similarity = cosine_similarity(row_a, row_b)
    # The result is a 1x1 matrix; hand back its single entry.
    return similarity[0][0]

In [11]:
def computeWMD(rno1, rno2):
    """Return the Word Mover's Distance between two tokenized documents.

    `rno1` and `rno2` are positions into the global `texts` list; both are
    passed through int() before the lookup. The distance is computed by
    `w2v.wv.wmdistance` on the two token lists.
    """
    first, second = int(rno1), int(rno2)
    doc_a, doc_b = texts[first], texts[second]
    return w2v.wv.wmdistance(doc_a, doc_b)

In [12]:
# Add the two word2vec-based pair features, one value per row, in row order.
train['centroid_sim'] = [
    computeCentroidSim(rno1, rno2)
    for rno1, rno2 in zip(train['rno1'], train['rno2'])
]
train['wmd'] = [
    computeWMD(rno1, rno2)
    for rno1, rno2 in zip(train['rno1'], train['rno2'])
]
train.head()

Unnamed: 0,id1,id2,link,rno1,rno2,sim,cn,aai,year1,year2,year_diff,common_authors,title_overlap,journal_overlap,centroid_sim,wmd
0,9510123,9502114,1,16827,15446,0.064373,1,0.513898,1995,1995,0,0.0,0.285714,1.0,0.71625,2.992504
1,9707075,9604178,1,21154,18059,0.021211,20,4.320366,1997,1996,1,0.0,0.25,0.0,0.657709,2.335481
2,9312155,9506142,0,13074,16171,0.017202,0,0.0,1993,1995,-2,0.0,0.0,0.0,0.804457,2.87029
3,9911255,302165,0,27486,9702,0.012634,0,0.0,1999,2003,-4,0.0,0.0,0.0,0.779212,2.767666
4,9701033,209076,0,19856,8212,0.059588,0,0.0,1997,2002,-5,0.0,0.0,0.0,0.800134,2.67069


In [13]:
# Add the two word2vec-based pair features, one value per row, in row order.
test['centroid_sim'] = [
    computeCentroidSim(rno1, rno2)
    for rno1, rno2 in zip(test['rno1'], test['rno2'])
]
test['wmd'] = [
    computeWMD(rno1, rno2)
    for rno1, rno2 in zip(test['rno1'], test['rno2'])
]
test.head()

Unnamed: 0,id1,id2,rno1,rno2,sim,cn,aai,year1,year2,year_diff,common_authors,title_overlap,journal_overlap,centroid_sim,wmd
0,9807076,9807139,23774,23835,0.07187,0,0.0,1998,1998,0,0.0,0.0,0.5,0.744686,2.544625
1,109162,1182,5227,172,0.16304,24,5.377973,2001,2000,1,0.0,0.444444,0.0,0.742522,2.517749
2,9702187,9510135,20185,16838,0.138004,59,15.053612,1997,1995,2,0.0,0.285714,1.0,0.773785,2.528663
3,111048,110115,5621,5397,0.101857,21,4.899424,2001,2001,0,0.0,0.153846,1.0,0.754036,2.519363
4,9910176,9410073,27159,14643,0.091231,0,0.0,1999,1994,5,0.0,0.0,0.0,0.702717,2.408418


In [14]:
# Write both enriched tables to disk; the DataFrame index is not written.
for frame, out_path in (
    (train, '../data/train_treated_with_journal_w2v.csv'),
    (test, '../data/test_treated_with_journal_w2v.csv'),
):
    frame.to_csv(out_path, index=False)