# I. Prepare data

In [1]:
from tqdm import tqdm
from lxml import etree
import os, nltk
from keras.callbacks import EarlyStopping, TensorBoard

Using TensorFlow backend.


In [2]:
topic_tree = etree.parse('data/topics2016.xml')

def get_topic(i):# returns the summary string of the ith topic
    summary = topic_tree.xpath('//topic[@number="%d"]/summary/text()'%i)
    return str(summary).lower()

# build a mapping of article name (PMCID) to its file path

PMC_PATH = '/local/XW/DATA/TREC/PMCs/'
pmcid2fpath = {}

for subdir1 in os.listdir(PMC_PATH):
    for subdir2 in os.listdir(os.path.join(PMC_PATH, subdir1)):
        diry = os.path.join(PMC_PATH, subdir1, subdir2)
#         print diry, len(os.listdir(diry))
        for fn in os.listdir(diry):
            pmcid = fn[:-5]
            fpath = os.path.join(diry, fn)
            pmcid2fpath[pmcid] = fpath

def get_article_abstract(pmcid):
    fpath = pmcid2fpath[pmcid]
    tree = etree.parse(fpath)
    abstract = tree.xpath('//abstract')[0]
    ret = u''+abstract.xpath('string(.)')
    return ret.lower()

In [3]:
query = get_topic(1)

In [4]:
corpus = []
pmcid_2relevance = [{} for i in xrange(31)] # list of dict mapping pmcid to relevance
with open('data/qrels.txt') as f:
    for line in tqdm(f, total=37707): 
        topicid, _, pmcid, relevance = line.split()
        topicid = int(topicid)
        if topicid>3: break
        try:
            corpus.append(get_article_abstract(pmcid))
            pmcid_2relevance[topicid][pmcid] = int(relevance)
        except: pass

 11%|█         | 4176/37707 [00:04<00:39, 840.53it/s]


In [5]:
len(corpus)

3871

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

def get_idf(wd):
    if wd not in vectorizer.vocabulary_: return 1.0
    return vectorizer.idf_[ vectorizer.vocabulary_[wd] ]

In [7]:
vocab = set(vectorizer.vocabulary_.keys())

In [8]:
import os, sys, time
import numpy as np
from numpy.linalg import norm
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) # to be reproducible

In [9]:
W2V_FPATH = '/local/XW/DATA/WORD_EMBEDDINGS/biomed-w2v-200.txt'
GLOVE_FPATH = '/local/XW/DATA/WORD_EMBEDDINGS/glove.6B.200d.txt'

In [10]:
word2vec = {} # maps word ---> embedding vector
with open(W2V_FPATH) as f:
    for line in tqdm(f, total=5443657): #5443657 400000
        vals = line.split()
        word = vals[0]
        if word in vocab:
            vec = np.asarray(vals[1:], dtype='float')
            word2vec[word] = vec
print 'found %d word vectors.' % len(word2vec)

100%|██████████| 5443657/5443657 [01:02<00:00, 87451.54it/s]

found 22981 word vectors.





# II. Define the deep relevance model

In [11]:
# define a function for visualization of model
import pydot
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
def viz_model(model):
    return SVG(model_to_dot(model).create(prog='dot', format='svg'))

### construct the relevance IR model

In [12]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, InputLayer, Flatten, Input, Merge, merge, Reshape
import keras.backend as K
import tensorflow as tf

N = len(query.split())

In [15]:
# 2 main components of the structure: feed forward network and gating
feed_forward = Sequential(
    [Dense(input_dim=30, output_dim=5, activation='tanh'),
     Dense(output_dim=1, activation='tanh')], 
    name='feed_forward_nw')

# ***note: have to wrap ops into Lambda layers !!***
# cf: https://groups.google.com/forum/#!topic/keras-users/fbRS-FkZw_Q
from keras.layers.core import Lambda

input_idf = Input(shape=(N,), name='input_idf')
def scale(x): 
    w = K.variable(1, name='w_g')
    return K.mul(x,w)
def scale_output_shape(input_shape): return input_shape

scaled = Lambda(scale, scale_output_shape, name='softmax_scale')(input_idf)
gs_out = Activation('softmax', name='softmax')(scaled)
gating = Model(input=input_idf, output=gs_out, name='gating')

# first input: hist vectors
input_hists = Input(shape=(N,30), name='input_hists')

def slicei(x, i): return x[:,i,:]
def slicei_output_shape(input_shape): return (input_shape[0], input_shape[2])
zs = [ feed_forward( Lambda(lambda x:slicei(x,i), slicei_output_shape, name='slice%d'%i)(input_hists) )\
          for i in xrange(N) ]

def concat(x): return K.concatenate(x) 
def concat_output_shape(input_shape): return (input_shape[0][0], N)
zs = Lambda(concat, concat_output_shape, name='concat_zs')(zs)

# second input: idf scores of each query term 
input_idf = Input(shape=(N,), name='input_idf')
gs = gating(input_idf)

def innerprod(x): return K.sum( K.mul(x[0],x[1]), axis=1)
def innerprod_output_shape(input_shape): return (input_shape[0][0],1)
scores = Lambda(innerprod, innerprod_output_shape, name='innerprod_zs_gs')([zs, gs])

scoring_model = Model(input=[input_idf, input_hists], output=[scores], name='scoring_model')

# third input -- the negative hists vector 
input_hists_neg = Input(shape=(N,30), name='input_hists_neg')

zs_neg = [ feed_forward( Lambda(lambda x:slicei(x,i), slicei_output_shape, name='slice%d_neg'%i)(input_hists_neg) )\
          for i in xrange(N) ]

zs_neg = Lambda(concat, concat_output_shape, name='concat_zs_neg')(zs_neg)

scores_neg = Lambda(innerprod, innerprod_output_shape, name='innerprod_zs_gs_neg')([zs_neg, gs])

two_score_model = Model(input=[input_idf, input_hists, input_hists_neg], 
                        output=[scores, scores_neg], 
                        name='two_score_model')

def diff(x): return tf.sub(x[0], x[1]) #x[0]-x[1]
def diff_output_shape(input_shape): return input_shape[0]
posneg_score_diff = Lambda(diff, diff_output_shape, name='posneg_score_diff')([scores, scores_neg])
ranking_model = Model(input=[input_idf, input_hists,  input_hists_neg]
                      , output=[posneg_score_diff]
                      , name='ranking_model')

# define my loss function: hinge of score_pos - score_neg
def pairwise_hinge(y_true, y_pred): # y_pred = score_pos - score_neg, **y_true doesn't matter here**
    return K.mean( K.maximum(1. - y_pred, y_true*0.0) )  

ranking_model.compile(optimizer='adagrad', loss=pairwise_hinge)

# III. train model (for topic-1)

## helper functions

In [20]:
randvec = np.random.randn(200)
def get_histvec(q_wd, doc):
    qvec = word2vec.get(q_wd, randvec)
#     dvecs = np.vstack( [word2vec.get(wd, randvec) for wd in nltk.word_tokenize(doc)] )
    words_doc = filter(lambda wd:wd in word2vec, nltk.word_tokenize(doc))
    dvecs = np.vstack( [ word2vec[wd] for wd in words_doc ] )
    cossims = np.dot(dvecs, qvec) / norm(qvec) / norm(dvecs, axis=1)
    hist, _ = np.histogram( cossims[cossims<1.0], bins=29, range=(-1,1) )
    ones = len(cossims) - sum(hist)
    ret = np.array( list(hist) + [ones] )
    return ret # np.reshape(ret, (-1, 30))

In [21]:
def get_query_doc_feature(query, pmcid):
    doc = get_article_abstract(pmcid)
    return np.array([ get_histvec(wd, doc) for wd in query.split()])

## prepare positive/negative pairs

In [22]:
relevance = pmcid_2relevance[1]

In [23]:
pos_ids, neg_ids = [], []
for pmcid in relevance.keys():
    if relevance[pmcid]==0: neg_ids.append(pmcid)
    else: pos_ids.append(pmcid)
print len(pos_ids), len(neg_ids)

125 1209


In [24]:
hists_pos = np.array( [get_query_doc_feature(query, posid) for posid in tqdm(pos_ids)] )
hists_neg = np.array( [get_query_doc_feature(query, negid) for negid in tqdm(neg_ids)] )

100%|██████████| 125/125 [00:04<00:00, 25.59it/s]
100%|██████████| 1209/1209 [00:46<00:00, 26.02it/s]


## train model using `fit_generator`

In [25]:
VALDATION_SPLIT = 0.2
BATCH_SZ = 128
NB_EPOCH = 20

In [26]:
idx_pairs = []
for pidx in xrange(len(hists_pos)):
    for nidx in xrange(len(hists_neg)):
        idx_pairs.append( (pidx, nidx) )
idx_pairs = np.array(idx_pairs)
val_sz = int(len(idx_pairs)*VALDATION_SPLIT)
idx_pairs_train, idx_pairs_val = idx_pairs[val_sz:], idx_pairs[:val_sz]

In [27]:
def batch_generator(idx_pairs, batch_size=BATCH_SZ): 
    np.random.shuffle(idx_pairs)
    batches_pre_epoch = len(idx_pairs) //batch_size
    samples_per_epoch = batches_pre_epoch * batch_size # make samples_per_epoch a multiple of batch size
    counter = 0
    _idf = np.array([get_idf(wd) for wd in query.split()])
    idfs_batch = np.vstack([_idf]*batch_size)
    y_true_batch_dummy = np.ones((batch_size))
    while 1:
        idx_batch = idx_pairs[batch_size*counter: min(samples_per_epoch, batch_size*(counter+1))]
        pos_batch = hists_pos[idx_batch[:,0]]
        neg_batch = hists_neg[idx_batch[:,1]]
        counter += 1
        if (counter >= batches_pre_epoch):
            np.random.shuffle(idx_pairs)
            counter=0
        yield [idfs_batch, pos_batch, neg_batch], y_true_batch_dummy

In [28]:
# self-defined metrics
def ranking_acc(y_true, y_pred):
    y_pred = y_pred > 0 
    return K.mean(y_pred)

In [29]:
ranking_model.compile(optimizer='adagrad', loss=pairwise_hinge, metrics=[ranking_acc])

In [30]:
logdir = './log/relevance_matching'
_callbacks = [EarlyStopping(monitor='val_loss', patience=2),
              TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=False) # 
             ]
# 30-5-1 tanh
ranking_model.fit_generator( batch_generator(idx_pairs_train), 
                    samples_per_epoch = len(idx_pairs_train)//BATCH_SZ*BATCH_SZ,
                    nb_epoch=NB_EPOCH,
                    validation_data=batch_generator(idx_pairs_val),
                    nb_val_samples=len(idx_pairs_val)//BATCH_SZ*BATCH_SZ, 
                    callbacks = _callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


<keras.callbacks.History at 0x7f90f08c6d10>

------------------

## below are some testing stuff

In [31]:
zip(pos_ids[:10], neg_ids[:10])

[('3429740', '3921765'),
 ('2999735', '4085271'),
 ('3526517', '4532844'),
 ('1750992', '2769307'),
 ('3503351', '3809984'),
 ('4130891', '4685986'),
 ('4471306', '2996340'),
 ('3286730', '4395018'),
 ('4659952', '28994'),
 ('4724023', '4716452')]

In [32]:
pos_sample = get_query_doc_feature(query, '3429740')
neg_sample = get_query_doc_feature(query, '3921765')
pair_sample = np.array([pos_sample, neg_sample])

_idf = np.array([get_idf(wd) for wd in query.split()])
idf_sample = np.vstack([_idf]*2)

print idf_sample.shape, pair_sample.shape

(2, 11) (2, 11, 30)


### test `scoring_model`

In [33]:
scoring_model.predict([idf_sample,pair_sample])

array([ 0.3721953 ,  0.44491634], dtype=float32)

In [34]:
a = feed_forward.predict(pos_sample)
print a

[[-0.43848625]
 [ 0.47157994]
 [-0.98957956]
 [-0.99888313]
 [ 0.41266179]
 [-0.82911295]
 [-0.9381572 ]
 [ 0.99194616]
 [ 0.98716438]
 [ 0.98718643]
 [-0.43848625]]


In [35]:
b = gating.predict(idf_sample)[0]
print b

[ 0.00482637  0.20997439  0.0174815   0.02042374  0.0538551   0.20764136
  0.00537621  0.12458481  0.34606892  0.00494123  0.00482637]


In [36]:
b.dot(a)

array([ 0.37219518], dtype=float32)

In [37]:
c = feed_forward.predict(neg_sample)
print c
print b.dot(c)

[[ 0.87186098]
 [ 0.7142241 ]
 [-0.96034747]
 [ 0.94236648]
 [-0.3230921 ]
 [ 0.99326599]
 [-0.93802834]
 [ 0.99049699]
 [-0.08088312]
 [ 0.98430783]
 [ 0.87186098]]
[ 0.44491631]


==> the scoring model works all right

### test ranking_model

In [38]:
ranking_model.predict( [idf_sample, pair_sample, np.array([neg_sample, pos_sample]) ])

array([-0.07272103,  0.07272103], dtype=float32)

In [39]:
-0.57720977 - -0.09897145

-0.47823831999999994

In [40]:
def predict_score(pmcid):
    _idf = np.array([get_idf(wd) for wd in query.split()])
    _idf = np.vstack([_idf])
    _hist = get_query_doc_feature(query, pmcid).reshape(1,11,30)
    return scoring_model.predict([_idf, _hist])[0]

### see some results

In [41]:
zip( map(predict_score, pos_ids[:10]), map(predict_score, neg_ids[:10]))

[(0.37219524, 0.44491634),
 (-0.071498334, -0.15975329),
 (-0.032746524, -0.80821633),
 (0.81949872, 0.30661809),
 (0.1990094, -0.18697436),
 (0.92787206, -0.3595199),
 (0.50025171, -0.54916126),
 (0.48231193, 0.13923873),
 (0.7508564, -0.60553944),
 (0.79059291, -0.93590719)]

In [42]:
def predict_score_diff( (pmcid_pos, pmcid_neg) ):
    _idf = np.array([get_idf(wd) for wd in query.split()])
    _idf = np.vstack([_idf])
    hist_pos = get_query_doc_feature(query, pmcid_pos).reshape((1,11,30))
    hist_neg = get_query_doc_feature(query, pmcid_neg).reshape((1,11,30))
    return ranking_model.predict([_idf, hist_pos, hist_neg])[0]

### test the scoring model (metrics=AP )

In [83]:
def AP(pos_scores, neg_scores):
    Q = len(pos_scores)
    pos_tags = [1] * len(pos_scores)
    neg_tags = [0] * len(neg_scores)
    all_tagged = zip(pos_scores, pos_tags) + zip(neg_scores, neg_tags)
    ranked_list = sorted(all_tagged, reverse=True)
    print ranked_list[:20]
    ranked_tag = zip(*ranked_list)[1]
    print ranked_tag[:20]
    precision_at_i = []
    corr, total = 0.0, 0
    while corr<Q:
        if ranked_tag[total]==1: 
            corr += 1
            precision_at_i.append(corr*1.0 / (total+1) )
        total += 1
    print precision_at_i[:20]
    return np.mean(precision_at_i)

In [44]:
_idfs = np.vstack([_idf]*len(hists_pos))
pos_scores = scoring_model.predict( [ _idfs, hists_pos])
_idfs = np.vstack([_idf]*len(hists_neg))
neg_scores = scoring_model.predict( [ _idfs, hists_neg])

In [54]:
print pos_scores.mean(), neg_scores.mean()
print pos_scores.max(), neg_scores.max()
print pos_scores.min(), neg_scores.min()

0.459192 -0.286091
0.984027 0.916831
-0.624351 -0.987414


In [84]:
AP(pos_scores, neg_scores)

[(0.98402655, 1), (0.9368118, 1), (0.92787206, 1), (0.92328721, 1), (0.91716737, 1), (0.91683054, 0), (0.91507411, 0), (0.90818655, 0), (0.89776915, 1), (0.89175165, 0), (0.86966276, 0), (0.86896461, 1), (0.86194515, 1), (0.86036122, 0), (0.85995638, 1), (0.85855299, 0), (0.85561204, 0), (0.85223681, 1), (0.84764636, 0), (0.8459121, 1)]
(1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1)
[1.0, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666, 0.5833333333333334, 0.6153846153846154, 0.6, 0.5555555555555556, 0.55, 0.5714285714285714, 0.5909090909090909, 0.6086956521739131, 0.5769230769230769, 0.5714285714285714, 0.5666666666666667, 0.5806451612903226, 0.5277777777777778, 0.5405405405405406]


0.46100751843546001