In [1]:
from tqdm import tqdm
from lxml import etree
import nltk

import os, sys, time
import numpy as np
from numpy.linalg import norm
import pandas as pd
from tqdm import tqdm
import cPickle as pk
np.random.seed(1) 

In [2]:
W2V_FPATH = '/local/XW/DATA/WORD_EMBEDDINGS/W2V_BIO/wikipedia-pubmed-and-PMC-w2v.bin'
# GLOVE_FPATH = '/local/XW/DATA/WORD_EMBEDDINGS/glove.6B.200d.txt'
WD_PLACEHOLDER = '</s>'
PMC_PATH = '/local/XW/DATA/TREC/PMCs/'

In [3]:
with open('data/IRdata.pk') as f:
    data_pickle = pk.load(f)

In [4]:
data_pickle.keys()

['querypmc2histvec',
 'pos_ids',
 'pmcid_2relevance',
 'hists_pos',
 'hists_neg',
 'word2vec',
 'neg_ids',
 'corpus']

# I. Prepare data (run once, then pickle to file)

Things to be pickled: 

* `pmcid2fpath`    
    mapping pmcid to the corresponding file path
* `corpus: dict[str, str]`   
    mapping pmcid to it's content, the content is a BOW representataion (a Counter), it is pickled into another file called `corpus.pk`
* `QUERIES: dict[int, list<str>]`     
    mapping qid to it's query represented as a list of words
* `QUERIES_padded`   
    same as `QUERIES`, but all queries are padded to the same length (=`MAX_QLEN`) with `WD_PLACEHOLDER`
* `IDF: dict[str, float]`   
    mapping a word to its idf
* `relevance: dict[(int,str), int]`   
    mapping (qid,docid) pairs to relevance (0,1,2)
* `pos_ids, neg_ids`    
    mapping `qid` to its list of [pos/neg docids]
* `candidates: dict[int, list<str>]`  
    mapping qid to list of its candidate docids (that appeared in the qrel)
* `qid_docid2histvec: dict[(int,str), array]`    
    mapping from (qid, docid) to the corresponding histvec
* `instances: dict[int, list<(str,str)>]`    
    mapping qid to list, instances[qid] = list of (pos_docid, neg_docid) pairs for qid,

### helper function

In [6]:
topic_tree = etree.parse('data/topics2016.xml')

def get_topic(i):# returns the summary string of the ith topic
    summary = topic_tree.xpath('//topic[@number="%d"]/summary/text()'%i)[0]
    return str(summary).lower().strip()

# build a mapping of article name (PMCID) to its file path
pmcid2fpath = {}

for subdir1 in os.listdir(PMC_PATH):
    for subdir2 in os.listdir(os.path.join(PMC_PATH, subdir1)):
        diry = os.path.join(PMC_PATH, subdir1, subdir2)
#         print diry, len(os.listdir(diry))
        for fn in os.listdir(diry):
            pmcid = fn[:-5]
            fpath = os.path.join(diry, fn)
            pmcid2fpath[pmcid] = fpath

def get_article_abstract(pmcid): # get article title and abstract
    fpath = pmcid2fpath[pmcid]
    tree = etree.parse(fpath)
    ret = u'' + tree.xpath('string(//article-title)') + '\n'
    abstracts = tree.xpath('//abstract')
#     abstracts = tree.xpath('//p')
    ret += u' '.join( [abstract.xpath('string(.)') for abstract in abstracts] )
    if len(ret.split())<20: 
        raise Exception(u'abstraction too short: '+pmcid + ret)
    return ret.lower()

In [7]:
get_article_abstract('2362203')

u'evaluation of a follow-up programme after curative resection for colorectal cancer\nfrequent liver imaging can detect liver metastases from colorectal cancer at an asymptomatic stage. \xa9 1999 cancer research campaign'

In [8]:
pmcid2fpath['1036271']

'/local/XW/DATA/TREC/PMCs/pmc-02/17/1036271.nxml'

### word2vec

### queries (stopwords removed)

In [None]:
from nltk.corpus import stopwords
stopwds = set(stopwords.words('english'))

In [17]:
QUERIES = {} # dict[int, list<str>] mapping qid to it's query represented as a list of words
for qid in xrange(1,31):
    q = [wd for wd in q.split() if (wd not in stopwds) and (wd in word2vec)]
    QUERIES[qid] = get_topic(qid).split()

In [18]:
MAX_QLEN = max( map(len, QUERIES.values()) ) 
print MAX_QLEN

91


### relevance and corpus

In [9]:
relevance = {} # dict[(int,str), int] mapping (qid,docid) pairs to its relevance (0,1,2)
candidates = {} # dict[int, list<str>] mapping qid to list of its candidate docids (that appeared in the qrel)
corpus = {} # dict[str, str] mapping pmcid to it's content

with open('data/qrels.txt') as f:
    for line in tqdm(f, total=37707): 
        qid, _, pmcid, rel = line.split()
        qid = int(qid)
        try:
            if pmcid not in corpus: 
                corpus[pmcid] = get_article_abstract(pmcid)
            relevance[(qid,pmcid)] = int(rel)
        except: 
            pass

100%|██████████| 37707/37707 [11:25<00:00, 55.05it/s]


In [10]:
print '%d articles are retrieved' % len(corpus)

26255 articles are retrieved


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)
vocab = vectorizer.vocabulary_ # mapping word to its internal index

In [12]:
def get_idf(wd):
    if wd ==WD_PLACEHOLDER: return -10.0
    return vectorizer.idf_[ vectorizer.vocabulary_[wd] ]

In [None]:
word2vec = {} # maps word ---> embedding vector
with open(W2V_FPATH) as f:
    for line in tqdm(f, total=5443657): # 5443657 400000
        vals = line.split()
        word = vals[0]
        if word in vocab:
            vec = np.asarray(vals[1:], dtype='float')
            word2vec[word] = vec
print 'found %d word vectors.' % len(word2vec)

In [None]:
from nltk.corpus import stopwords
stopwds = set(stopwords.words('english'))

In [None]:
get_topic(17)

In [None]:
_queries = [get_topic(i) for i in xrange(1,31)] 
QUERIES = []
for q in _queries:
    q2 =  # filter out stopword and words not in w2v
    QUERIES.append(q2)

In [None]:
print map(len, QUERIES)

In [None]:
N = max(map(len, QUERIES)) # = max query length
print N

In [None]:
# padding queries to the same length N
WD_PLACEHOLDER = '</s>'
def pad_query(q, SZ=N):
    return q + [WD_PLACEHOLDER]*(SZ-len(q))
QUERIES = map(pad_query, QUERIES)

## helper functions

In [None]:
def get_histvec(q_wd, doc):
    if q_wd == WD_PLACEHOLDER: 
        return np.zeros(30)
    qvec = word2vec[q_wd]
#     dvecs = np.vstack( [word2vec.get(wd, randvec) for wd in nltk.word_tokenize(doc)] )
    dvecs = np.vstack( [ word2vec[wd] for wd in nltk.word_tokenize(doc) if wd in word2vec] )
    cossims = np.dot(dvecs, qvec) / norm(qvec) / norm(dvecs, axis=1)
    hist, _ = np.histogram( cossims[cossims<1.0], bins=29, range=(-1,1) )
    ones = len(cossims) - sum(hist)
    ret = np.array( list(hist) + [ones] )
    ret = np.log(ret+1)
    return ret # np.reshape(ret, (-1, 30))

In [None]:
def get_query_doc_feature(query, pmcid): # query: list of words
    doc = get_article_abstract(pmcid)
    return np.array([ get_histvec(wd, doc) for wd in query])

### prepare data

In [None]:
T = 30 # nubmer of used topics

In [None]:
pos_ids, neg_ids = [], [] # pos_ids[q] is a list (positive pmcids for query `q`)
hists_pos, hists_neg = [], [] # hists_pos[q] is a list (positive hists for query `q`)
                              # hists_pos[q][i] is an array of size N*30 (the ith hists-feature array for query q)
for topic in xrange(1,T+1):
    query = QUERIES[topic-1]
    pos_ids_q, neg_ids_q = [], []
    hists_pos_q, hists_neg_q = [], []
    relevance = pmcid_2relevance[topic]
    for pmcid in tqdm(relevance.keys()):
        if relevance[pmcid]==0: 
            neg_ids_q.append(pmcid)
            hists_neg_q.append(get_query_doc_feature(query,pmcid))
        else: 
            pos_ids_q.append(pmcid)
            hists_pos_q.append(get_query_doc_feature(query,pmcid))
    hists_pos_q, hists_neg_q = map(np.array, [hists_pos_q, hists_neg_q])
    hists_pos.append(hists_pos_q); hists_neg.append(hists_neg_q)
    pos_ids.append(pos_ids_q); neg_ids.append(neg_ids_q)
print len(pos_ids), len(neg_ids)

In [None]:
print map(len, hists_pos)
print map(len, hists_neg)

### prepare generator

In [None]:
VALDATION_SPLIT = 0.2
BATCH_SZ = 128
NB_EPOCH = 50

In [None]:
idx_pairs = [] # list of list, where idx_pairs[q] is a list of tuples of the form: `(q, pos_idx, neg_idx)` for query q
for q in xrange(T):
    hists_pos_q, hists_neg_q = hists_pos[q], hists_neg[q]
    idx_pairs_q = []
    for pidx in xrange(len(hists_pos_q)): # here the idx are just row index in hists array
        for nidx in xrange(len(hists_neg_q)):
            idx_pairs_q.append( (q, pidx, nidx) )
    idx_pairs.append(idx_pairs_q)

n_val_queries = int(T * VALDATION_SPLIT)
idx_pairs_train = reduce( lambda x,y: x+y, idx_pairs[:-n_val_queries] )
idx_pairs_val   = reduce( lambda x,y: x+y, idx_pairs[-n_val_queries:] )

idx_pairs_train, idx_pairs_val = map(np.array, [idx_pairs_train, idx_pairs_val])

print idx_pairs_train.shape, idx_pairs_val.shape

In [None]:
IDFs = [ np.array ([ get_idf(wd) for wd in query]) for query in QUERIES]
IDFs = np.array(IDFs)

In [None]:
def batch_generator(idx_pairs, batch_size=BATCH_SZ): 
    np.random.shuffle(idx_pairs)
    batches_pre_epoch = len(idx_pairs) // batch_size
    samples_per_epoch = batches_pre_epoch * batch_size # make samples_per_epoch a multiple of batch size
    counter = 0
    y_true_batch_dummy = np.ones((batch_size))
    while 1:
        idx_batch = idx_pairs[batch_size*counter: min(samples_per_epoch, batch_size*(counter+1))]
        idfs_batch, pos_batch, neg_batch = [], [], []
        for q, pidx, nidx in idx_batch:
            idfs_batch.append(IDFs[q])
            pos_batch.append(hists_pos[q][pidx])
            neg_batch.append(hists_neg[q][nidx])
        idfs_batch, pos_batch, neg_batch = map(np.array, [idfs_batch, pos_batch, neg_batch])
#         print idfs_batch.shape, pos_batch.shape, neg_batch.shape
        counter += 1
        if (counter >= batches_pre_epoch):
            np.random.shuffle(idx_pairs)
            counter=0
        
        yield [idfs_batch, pos_batch, neg_batch], y_true_batch_dummy

In [None]:
[idfs, pos, neg], ytrue = batch_generator(idx_pairs_train).next()

In [None]:
pos[0][:10]

In [None]:
IDFs[0]

# II. Define the deep relevance model

In [None]:
# define a function for visualization of model
import pydot
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
def viz_model(model):
    return SVG(model_to_dot(model).create(prog='dot', format='svg'))

### construct the relevance IR model

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, InputLayer, Flatten, Input, Merge, merge, Reshape
import keras.backend as K
from keras.callbacks import EarlyStopping, TensorBoard
import tensorflow as tf

In [None]:
# 2 main components of the structure: feed forward network and gating
feed_forward = Sequential(
    [Dense(input_dim=30, output_dim=10, activation='relu', bias=False),
     Dense(output_dim=5, activation='relu', bias=False),
     Dense(output_dim=1, activation='tanh', bias=False)], 
    name='feed_forward_nw')

# ***note: have to wrap ops into Lambda layers !!***
# cf: https://groups.google.com/forum/#!topic/keras-users/fbRS-FkZw_Q
from keras.layers.core import Lambda

input_idf = Input(shape=(N,), name='input_idf')

w = K.variable(1, name='w_g')
def scale(x): 
    return tf.mul(x,w)
def scale_output_shape(input_shape): return input_shape

scaled = Lambda(scale, scale_output_shape, name='softmax_scale')(input_idf)
gs_out = Activation('softmax', name='softmax')(scaled)
gating = Model(input=input_idf, output=gs_out, name='gating')

# first input: hist vectors
input_hists = Input(shape=(N,30), name='input_hists')

def slicei(x, i): return x[:,i,:]
def slicei_output_shape(input_shape): return (input_shape[0], input_shape[2])
zs = [ feed_forward( Lambda(lambda x:slicei(x,i), slicei_output_shape, name='slice%d'%i)(input_hists) )\
          for i in xrange(N) ]

def concat(x): return K.concatenate(x) 
def concat_output_shape(input_shape): return (input_shape[0][0], N)
zs = Lambda(concat, concat_output_shape, name='concat_zs')(zs)

# second input: idf scores of each query term 
input_idf = Input(shape=(N,), name='input_idf')
gs = gating(input_idf)

def innerprod(x): return K.sum( tf.mul(x[0],x[1]), axis=1)
def innerprod_output_shape(input_shape): return (input_shape[0][0],1)
scores = Lambda(innerprod, innerprod_output_shape, name='innerprod_zs_gs')([zs, gs])

scoring_model = Model(input=[input_idf, input_hists], output=[scores], name='scoring_model')

# third input -- the negative hists vector 
input_hists_neg = Input(shape=(N,30), name='input_hists_neg')

zs_neg = [ feed_forward( Lambda(lambda x:slicei(x,i), slicei_output_shape, name='slice%d_neg'%i)(input_hists_neg) )\
          for i in xrange(N) ]

zs_neg = Lambda(concat, concat_output_shape, name='concat_zs_neg')(zs_neg)

scores_neg = Lambda(innerprod, innerprod_output_shape, name='innerprod_zs_gs_neg')([zs_neg, gs])

two_score_model = Model(input=[input_idf, input_hists, input_hists_neg], 
                        output=[scores, scores_neg], 
                        name='two_score_model')

def diff(x): return tf.sub(x[0], x[1]) #x[0]-x[1]
def diff_output_shape(input_shape): return input_shape[0]
posneg_score_diff = Lambda(diff, diff_output_shape, name='posneg_score_diff')([scores, scores_neg])
ranking_model = Model(input=[input_idf, input_hists,  input_hists_neg]
                      , output=[posneg_score_diff]
                      , name='ranking_model')

# define my loss function: hinge of score_pos - score_neg
def pairwise_hinge(y_true, y_pred): # y_pred = score_pos - score_neg, **y_true doesn't matter here**
    return K.mean( K.maximum(1. - y_pred, y_true*0.0) )  

# self-defined metrics
def ranking_acc(y_true, y_pred):
    y_pred = y_pred > 0 
    return K.mean(y_pred)

ranking_model.compile(optimizer='adagrad', loss=pairwise_hinge, metrics=[ranking_acc])

In [None]:
initial_weights = ranking_model.get_weights()

In [None]:
gating.predict(IDFs[0].reshape(-1,N))

In [None]:
viz_model(ranking_model)

In [None]:
viz_model(scoring_model)

# III. train model

In [None]:
querypmc2histvec = {} # mapping from (query_idx, pmcid) to the corresponding input vectors (idf_input, hist_input) tuple

In [None]:
for q in tqdm(xrange(30)):
    query = QUERIES[q]
    for pmcid in pmcid_2relevance[q+1].keys():
        _idf = IDFs[q].reshape(-1,N)
        _hist = get_query_doc_feature(query, pmcid).reshape(1,N,30)
        querypmc2histvec[(q, pmcid)] = (_idf, _hist)

In [None]:
data_to_pickle = {
    'querypmc2histvec': querypmc2histvec,
    'hists_pos': hists_pos,
    'hists_neg': hists_neg,
    'pos_ids': pos_ids,
    'neg_ids': neg_ids,
    'pmcid_2relevance': pmcid_2relevance,
    'corpus': corpus,
    'word2vec': word2vec
}
with open('data/IRdata.pk', 'wb') as f:
    pk.dump(data_to_pickle, f, pk.HIGHEST_PROTOCOL)

## train model using `fit_generator`

In [None]:
logdir = './logs/relevance_matching'
_callbacks = [ EarlyStopping(monitor='val_loss', patience=2),
               TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=False) ]

In [None]:
# 30-5-1 tanh
ranking_model.fit_generator( batch_generator(idx_pairs_train), 
                    samples_per_epoch = len(idx_pairs_train)//BATCH_SZ*BATCH_SZ,
                    nb_epoch=NB_EPOCH,
                    validation_data=batch_generator(idx_pairs_val),
                    nb_val_samples=len(idx_pairs_val)//BATCH_SZ*BATCH_SZ, 
                    callbacks = _callbacks)

In [None]:
gating.predict(IDFs[0].reshape(-1,N))

In [None]:
def shuffle_weights(model, weights=None):
    """Randomly permute the weights in `model`, or the given `weights`.
    This is a fast approximation of re-initializing the weights of a model.
    Assumes weights are distributed independently of the dimensions of the weight tensors
      (i.e., the weights have the same distribution along each dimension).
    :param Model model: Modify the weights of the given model.
    :param list(ndarray) weights: The model's weights will be replaced by a random permutation of these weights.
      If `None`, permute the model's current weights.
    """
    if weights is None:
        weights = model.get_weights()
    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
    model.set_weights(weights)

In [None]:
def TREC_output(topic_id, pmcids, run_name = 'my_run', fpath = None):
    query = QUERIES[topic_id]
    res = [] # list of (score, pmcid) tuples
    for pmcid in tqdm(pmcids):
        input_idf, input_hist = querypmc2histvec[(topic_id,pmcid)] # get_query_doc_feature(query, pmcid).reshape(1,N,30)
        score = scoring_model.predict([input_idf, input_hist])[0]
        res.append( (score, pmcid) )
    res = sorted(res, reverse=True)
#     print res[:10]
    fout = sys.stdout if fpath==None else open(fpath, 'a')
    for rank, (score, pmcid) in enumerate(res[:1000],1):
        print >>fout, '%d  Q0  %s  %d  %f  %s' % (topic_id+1, pmcid, rank, score, run_name)

In [None]:
def LOO(fpath):
    for q in xrange(30):
        print '### loo for topic %d ###' % (q+1)
        shuffle_weights(ranking_model, initial_weights)
        idx_pairs_train = reduce( lambda x,y: x+y, idx_pairs[:q]+idx_pairs[q+1:] )
        idx_pairs_val   = idx_pairs[q]
        ranking_model.fit_generator( batch_generator(idx_pairs_train), 
                    samples_per_epoch = len(idx_pairs_train)//BATCH_SZ*BATCH_SZ,
                    nb_epoch=20,
                    validation_data=batch_generator(idx_pairs_val),
                    nb_val_samples=len(idx_pairs_val)//BATCH_SZ*BATCH_SZ, 
                    callbacks = _callbacks)
        sys.stderr.flush()
        TREC_output(q, pmcid_2relevance[q+1].keys(), fpath=fpath)

In [None]:
fpath = '/local/SOFT/Trec_eval/my_run_loo.txt'
open(fpath, 'w').close() # clear previous content of the file
LOO(fpath)

------------------

## below are some testing stuff

In [None]:
zip(pos_ids[0][:10], neg_ids[0][:10])

In [None]:
query = QUERIES[0]
pos_sample = get_query_doc_feature(query, '3429740')
neg_sample = get_query_doc_feature(query, '3921765')
pair_sample = np.array([pos_sample, neg_sample])
_idf = np.array([get_idf(wd) for wd in query])
idf_sample = np.vstack([_idf]*2)

print idf_sample.shape, pair_sample.shape

In [None]:
pos_sample[-3:]

In [None]:
feed_forward.predict(pos_sample[-1].reshape(-1,30))

### test `scoring_model`

In [None]:
scoring_model.predict([idf_sample,pair_sample])

In [None]:
print QUERIES[0]

In [None]:
a = feed_forward.predict(pos_sample)
print a

In [None]:
b = gating.predict(idf_sample)[0]
print b

In [None]:
b.dot(a)

In [None]:
c = feed_forward.predict(neg_sample)
# print c
print b.dot(c)

==> the scoring model works all right

### test ranking_model

In [None]:
ranking_model.predict( [idf_sample, pair_sample, np.array([neg_sample, pos_sample]) ])

In [None]:
0.08474284 - -0.15973815

In [None]:
def predict_score(pmcid):
    query = QUERIES[0]
    _idf = np.array([get_idf(wd) for wd in query])
    _idf = np.vstack([_idf])
    _hist = get_query_doc_feature(query, pmcid).reshape(1,N,30)
    return scoring_model.predict([_idf, _hist])[0]

In [None]:
 query = QUERIES[0]
_idf = np.array([get_idf(wd) for wd in query])
_idf = np.vstack([_idf])
# pos_sample = get_query_doc_feature(query, '3429740')
scoring_model.predict( [_idf, pos_sample.reshape(1,N,30)])

In [None]:
predict_score('3429740')

### see some results

In [None]:
zip( map(predict_score, pos_ids[0][:10]), map(predict_score, neg_ids[0][:10]))

In [None]:
def predict_score_diff( (pmcid_pos, pmcid_neg) ):
    query = QUERIES[0]
    _idf = np.array([get_idf(wd) for wd in query])
    _idf = np.vstack([_idf])
    hist_pos = get_query_doc_feature(query, pmcid_pos).reshape((1,11,30))
    hist_neg = get_query_doc_feature(query, pmcid_neg).reshape((1,11,30))
    return ranking_model.predict([_idf, hist_pos, hist_neg])[0]

### test the scoring model (metrics=AP )

In [None]:
def AP(pos_scores, neg_scores):
    Q = len(pos_scores)
    pos_tags = [1] * len(pos_scores)
    neg_tags = [0] * len(neg_scores)
    all_tagged = zip(pos_scores, pos_tags) + zip(neg_scores, neg_tags)
    ranked_list = sorted(all_tagged, reverse=True)
#     print ranked_list[:20]
    ranked_tag = zip(*ranked_list)[1]
#     print ranked_tag[:20]
    precision_at_i = []
    corr, total = 0.0, 0
    while corr<Q:
        if ranked_tag[total]==1: 
            corr += 1
            precision_at_i.append(corr*1.0 / (total+1) )
        total += 1
#     print precision_at_i[:20]
    return np.mean(precision_at_i)

In [None]:
def AP_of_topic(q):
    query = QUERIES[q]
    _idf = np.array([get_idf(wd) for wd in query])
    _idfs = np.vstack([_idf]*len(hists_pos[q]))
    pos_scores = scoring_model.predict( [ _idfs, hists_pos[q]])
    _idfs = np.vstack([_idf]*len(hists_neg[q]))
    neg_scores = scoring_model.predict( [ _idfs, hists_neg[q]])
#     print 'mean:', pos_scores.mean(), neg_scores.mean()
#     print 'max:',pos_scores.max(), neg_scores.max()
#     print 'min:',pos_scores.min(), neg_scores.min()
    return AP(pos_scores, neg_scores)

In [None]:
for i in xrange(T):
    print i+1, AP_of_topic(i) 

In [None]:
for q in xrange(30):
    TREC_output(q, pmcid_2relevance[q+1].keys(), fpath='/local/SOFT/Trec_eval/my_run_validation.txt')

To evaluate:

trec_eval -q -c -M1000 official_qrels submitted_results