# Information Retrieval Generative Advarsarial Network

Here is presented the implementation of the following parper https://arxiv.org/abs/1705.10513 with help of their source code https://github.com/geek-ai/irgan

Implementation is done in tensorflow using layers API

Created by: Tiago Almeida 13/02/2018


In [1]:
##### imports
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import utils as ut # auxiliar file to help in data visualization
from os.path import join

### not mine, authors
from eval_irgan.precision import precision_at_k
from eval_irgan.ndcg import ndcg_at_k

import random

#tensorflow version when notebook was created - 1.4.0
tf.__version__

'1.4.0'

### IRGAN parameters

From the github

In [2]:
#vector size of features of the query-document pair
FEATURE_SIZE = 46
HIDDEN_SIZE = 46
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01
D_LEARNING_RATE = 0.001
G_LEARNING_RATE = 0.001
TEMPERATURE = 0.2
LAMBDA = 0.5

work_directory = join("data","MQ2008-semi")

DIS_TRAIN_FILE = join(work_directory,"run-train-gan.txt")

### Dataset MQ2008-semi

I'm using the same dataset as the authors https://drive.google.com/drive/folders/0B-dulzPp3MmCM01kYlhhNGQ0djA

In [3]:




train_data = join(work_directory,"train.txt")
test_data = join(work_directory,"test.txt")
large_norm = join(work_directory,"Large_norm.txt")

#get features of all query-document par from test and train set
query_url_feature, query_url_index, query_index_url = ut.load_all_query_url_feature(large_norm, FEATURE_SIZE)

#query-document pairs with positive relevance from train set
query_pos_train = ut.get_query_pos(train_data)

#query-document pairs with positive relevance from test set
query_pos_test = ut.get_query_pos(test_data)



### Generator



In [4]:

X = tf.placeholder(tf.float32, shape=[None, FEATURE_SIZE], name="input_generator")

def generator(x,name):
    
    with tf.variable_scope("generator",reuse=tf.AUTO_REUSE):
        
        fc1 = tf.layers.dense(x, HIDDEN_SIZE, activation=tf.nn.tanh, name = name+'_hidden1')
        
        #output 
        return tf.reshape(tf.layers.dense(fc1, 1, activation=None, name = name+'_output'), [-1]) / TEMPERATURE

    

### Discriminator pointwise

In [5]:
Y = tf.placeholder(tf.float32, shape=[None], name="pred_data_label")

def discriminator(x,name):
    
    with tf.variable_scope("discriminator",reuse=tf.AUTO_REUSE):
        
        fc1 = tf.layers.dense(x, HIDDEN_SIZE, activation=tf.nn.tanh, name = name+'_hidden1')
        
        #output 
        return tf.reshape(tf.layers.dense(fc1, 1, activation=None, name = name+'_output'), [-1])

### Loss functions

In [6]:
g_reward = tf.placeholder(tf.float32, shape=[None], name='reward')
sample_index = tf.placeholder(tf.int32, shape=[None], name='sample_index')
important_sampling = tf.placeholder(tf.float32, shape=[None], name='important_sampling')

gen_scores = generator(X,"g")
dis_pred = discriminator(X,"d")

generator_variables = [var for var in tf.trainable_variables() if 'g_' in var.name]
discriminator_variables = [var for var in tf.trainable_variables() if 'd_' in var.name]

with tf.name_scope("generator_loss"):
    gen_scores_prob = tf.nn.softmax(tf.reshape(gen_scores, [1, -1]))
    
    gan_prob = tf.gather(tf.reshape(gen_scores_prob, [-1]), sample_index)

    g_weight_decay = tf.reduce_sum([tf.nn.l2_loss(var) for var in generator_variables])
    
    g_loss = -tf.reduce_mean(tf.log(gan_prob) * g_reward * important_sampling) + WEIGHT_DECAY * g_weight_decay
        
with tf.name_scope('discriminator_loss'):
    d_weight_decay = tf.reduce_sum([tf.nn.l2_loss(var) for var in discriminator_variables])
    
    d_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=dis_pred, labels=Y)) + WEIGHT_DECAY * d_weight_decay

### Generate (select) samples

In [7]:
import time
def generate_for_d(sess, filename):
    input_pos = []
    input_neg = []
    
    print('negative sampling for d using g ...')
    for query in query_pos_train:
        
        
        pos_list = query_pos_train[query] #positive documents
        all_list = query_index_url[query] #all documents
        candidate_list = all_list

        #get features for possible selected documents
        candidate_list_feature = [query_url_feature[query][url] for url in candidate_list]
        candidate_list_feature = np.asarray(candidate_list_feature)

        # softmax for candidate
        prob_candidates = tf.nn.softmax(gen_scores - tf.reduce_max(gen_scores))
        prob = sess.run(prob_candidates, feed_dict={X: candidate_list_feature})

        #exp_rating = np.exp(candidate_list_score - np.max(candidate_list_score))
        #prob = exp_rating / np.sum(exp_rating)
        
        # from all candidate choose same numbers of positive ones
        neg_list = np.random.choice(candidate_list, size=[len(pos_list)], p=prob)
        
        for i in range(len(pos_list)):
            input_pos.append(query_url_feature[query][pos_list[i]])
            input_neg.append(query_url_feature[query][neg_list[i]])
    

    #save in disk
    return input_pos, input_neg
    
    return data
    print("num elements",len(data))
    with open(filename, 'w') as fout:
        for (q, pos, neg) in data:
            fout.write(','.join([str(f) for f in query_url_feature[q][pos]])
                       + '\t'
                       + ','.join([str(f) for f in query_url_feature[q][neg]]) + '\n')
            fout.flush()

### Train

In [8]:

with tf.name_scope("generator_train"):
    g_optimizer = tf.train.GradientDescentOptimizer(G_LEARNING_RATE)
    g_train_op = g_optimizer.minimize(g_loss, var_list=generator_variables)
    
with tf.name_scope("discriminator_train"):
    d_optimizer = tf.train.GradientDescentOptimizer(D_LEARNING_RATE)
    d_train_op = d_optimizer.minimize(d_loss, var_list=discriminator_variables)

    d_reward = (tf.sigmoid(dis_pred) - 0.5) * 2


## Start graph computations and algorithm

sess = tf.Session()
sess.run(tf.global_variables_initializer())

p_best_val = 0.0
ndcg_best_val = 0.0

generate_samples_time = 0


for epoch in range(30):
    if epoch >= 0:
        # G generate negative for D, then train D
        print('Training D ...')
        print("Epoch:",epoch)
        for d_epoch in range(100):

            if d_epoch % 30 == 0:
                generate_samples_time = time.time()
                selected_samples_pos, selected_samples_neg = generate_for_d(sess, DIS_TRAIN_FILE)
                print("time used in generation",time.time()-generate_samples_time)
                train_size = len(selected_samples)

            index = 1
            while True:
                if index > train_size:
                    break
                if index + BATCH_SIZE <= train_size + 1:
                    input_pos = selected_samples_pos[index:index+BATCH_SIZE]
                    input_neg = selected_samples_neg[index:index+BATCH_SIZE]#ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                else:
                    input_pos = selected_samples_pos[index:index+train_size - index + 1] #ut.get_batch_data(DIS_TRAIN_FILE, index, train_size - index + 1)
                    input_neg = selected_samples_neg[index:index+train_size - index + 1]
                index += BATCH_SIZE

                pred_data = []
                pred_data.extend(input_pos)
                pred_data.extend(input_neg)
                pred_data = np.asarray(pred_data)

                pred_data_label = [1.0] * len(input_pos)
                pred_data_label.extend([0.0] * len(input_neg))
                pred_data_label = np.asarray(pred_data_label)

                _ = sess.run(d_train_op,
                             feed_dict={X: pred_data,
                                        Y: pred_data_label})
    # Train G
    print('Training G ...')
    for g_epoch in range(30):
        if g_epoch%10==0:
            print("Epoch generator:",g_epoch)
        for query in query_pos_train.keys():
            pos_list = query_pos_train[query]
            pos_set = set(pos_list)
            all_list = query_index_url[query]

            all_list_feature = [query_url_feature[query][url] for url in all_list]
            all_list_feature = np.asarray(all_list_feature)
            all_list_score = sess.run(gen_scores, {X: all_list_feature})

            # softmax for all
            exp_rating = np.exp(all_list_score - np.max(all_list_score))
            prob = exp_rating / np.sum(exp_rating)

            prob_IS = prob * (1.0 - LAMBDA)

            for i in range(len(all_list)):
                if all_list[i] in pos_set:
                    prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))

            choose_index = np.random.choice(np.arange(len(all_list)), [5 * len(pos_list)], p=prob_IS)
            choose_list = np.array(all_list)[choose_index]
            choose_feature = [query_url_feature[query][url] for url in choose_list]
            choose_IS = np.array(prob)[choose_index] / np.array(prob_IS)[choose_index]

            choose_index = np.asarray(choose_index)
            choose_feature = np.asarray(choose_feature)
            choose_IS = np.asarray(choose_IS)

            choose_reward = sess.run(d_reward, feed_dict={X: choose_feature})

            _ = sess.run(g_train_op,
                         feed_dict={X: all_list_feature,
                                    sample_index: choose_index,
                                    g_reward: choose_reward,
                                    important_sampling: choose_IS})


        p_5 = precision_at_k(sess,X, gen_scores, query_pos_test, query_pos_train, query_url_feature, k=5)
        ndcg_5 = ndcg_at_k(sess,X, gen_scores, query_pos_test, query_pos_train, query_url_feature, k=5)

        if p_5 > p_best_val:
            p_best_val = p_5
            ndcg_best_val = ndcg_5
            print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
        elif p_5 == p_best_val:
            if ndcg_5 > ndcg_best_val:
                ndcg_best_val = ndcg_5
                print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training D ...
Epoch: 0
negative sampling for d using g ...
time used in generation 48.096176862716675


NameError: name 'selected_samples' is not defined