In [None]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense
import tensorflow.contrib.keras as keras
from keras.preprocessing.sequence import pad_sequences
import os
import numpy as np
import pandas as pd
from pandas import DataFrame
import time
import random
from PIL import Image
from trec_eval import trec_eval
import nltk
import copy

In [None]:
# Make only GPU 0 visible to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Session configuration passed to tf.Session in the training cell.
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand.
config.gpu_options.allow_growth = True

In [None]:
# Seed the global Python and NumPy RNGs; negative_samples and get_batches
# draw from the `random` module, so this makes their output repeatable.
random.seed(1)
np.random.seed(1)

# Useful Functions

In [None]:
def read_comments(file_name):
    """Read a comment file and tokenise every line on whitespace.

    Args:
        file_name: path to a text file holding one comment per line.

    Returns:
        A list with one entry per line (in file order); each entry is the
        list of whitespace-separated tokens of that line. A blank line
        yields an empty list.
    """
    with open(file_name,'r') as f:
        # str.split() without arguments already ignores the line terminator,
        # so no manual slicing is needed. Slicing with [:-1] would cut the
        # last character of a final line that has no trailing newline.
        return [line.split() for line in f]

In [None]:
def build_vocab(file_name,min_num):
    """Build word<->id lookup tables from a tab-separated vocabulary file.

    Each line is read as ``<id>\\t<word>\\t<count>``. Lines are consumed in
    file order and reading stops at the first line whose count is below
    ``min_num`` (presumably the file is sorted by descending count -- confirm).

    Four special tokens are added: ``<PAD>`` gets id 0 and ``<UNK>``,
    ``<GO>``, ``<EOS>`` get ``n+1``, ``n+2``, ``n+3`` where ``n`` is the
    number of kept words. NOTE(review): this only avoids id clashes if the
    kept file ids are exactly 1..n -- verify against the vocabulary file.

    Args:
        file_name: path to the vocabulary file.
        min_num: minimum count a word needs to be kept.

    Returns:
        (word_to_int, int_to_word) dictionaries.
    """
    word_to_int = {}
    int_to_word = {}
    words_num = 0
    with open(file_name,'r') as f:
        for line in f:
            # rstrip('\n') instead of line[:-1]: the latter drops the last
            # digit of the count when the final line has no trailing newline.
            fields = line.rstrip('\n').split('\t')
            if int(fields[2]) < min_num:
                break
            word_id = int(fields[0])
            word_to_int[fields[1]] = word_id
            int_to_word[word_id] = fields[1]
            words_num += 1
    special_tokens = (('<PAD>',0),('<UNK>',words_num+1),('<GO>',words_num+2),('<EOS>',words_num+3))
    for token,token_id in special_tokens:
        word_to_int[token] = token_id
        int_to_word[token_id] = token
    return word_to_int,int_to_word

In [None]:
def convert_comments(comments,word_to_int,int_to_word):
    """Map tokenised comments to id sequences framed by <GO> and <EOS>.

    Words missing from ``word_to_int`` are replaced by the ``<UNK>`` id.
    ``int_to_word`` is accepted for signature compatibility but not used.

    Returns:
        A list of id lists, one per input comment, in the same order.
    """
    def encode(tokens):
        body = []
        for token in tokens:
            token_id = word_to_int.get(token)
            body.append(token_id if token_id is not None else word_to_int['<UNK>'])
        return [word_to_int['<GO>']] + body + [word_to_int['<EOS>']]

    return [encode(tokens) for tokens in comments]

In [None]:
def negative_samples(num_samples,toplist,downlist,combinationlist):
    """Draw random (top, down) pairs that are not known combinations.

    A pair is rejected when the concatenation ``top+down`` is contained in
    ``combinationlist``. Sampling repeats until ``num_samples`` pairs have
    been accepted, so it does not terminate if every possible pair is a
    known combination.

    Returns:
        A list of ``(top, down, -1)`` tuples; -1 marks "no comment".
    """
    negatives = []
    while len(negatives) < num_samples:
        (top,) = random.sample(toplist,1)
        (down,) = random.sample(downlist,1)
        if top+down in combinationlist:
            continue
        negatives.append((top,down,-1))
    return negatives

In [None]:
def pad_batch(batch,pad_int):
    """Right-pad every sequence in ``batch`` with ``pad_int`` to the longest length.

    Returns whatever ``pad_sequences`` returns for post-padding with
    ``maxlen`` set to the longest sequence in the batch.
    """
    longest = max(len(comment) for comment in batch)
    return pad_sequences(batch,maxlen=longest,value=pad_int,padding='post')

In [None]:
def batch_to_input(batch,comments,imglist,topidlist,downidlist,pad_int):
    """Turn a list of (top, down, comment_id) instances into model feed arrays.

    Instances with ``comment_id == -1`` are negatives: they get label [1,0]
    and weight 0, and their text is ``comments[-1]`` (the last entry of
    ``comments``). All other instances get label [0,1] and weight 1.

    Returns, in order: top images, down images, top ids, down ids, labels,
    decoder input ids (padded sequence without its last column), decoder
    target ids (padded sequence without its first column), the list of
    per-instance lengths (``len(comment)-1``), the maximum of those lengths,
    and the weights.
    """
    top_imgs,down_imgs = [],[]
    top_ids,down_ids = [],[]
    labels,weights = [],[]
    sequences,lengths = [],[]
    for instance in batch:
        top_name = instance[0]
        down_name = instance[1]
        commentid = instance[2]
        top_imgs.append(imglist[top_name])
        down_imgs.append(imglist[down_name])
        top_ids.append(topidlist[top_name])
        down_ids.append(downidlist[down_name])
        is_negative = commentid == -1
        labels.append([1,0] if is_negative else [0,1])
        weights.append(0 if is_negative else 1)
        tokens = comments[commentid]
        sequences.append(tokens)
        lengths.append(len(tokens)-1)
    padded = pad_batch(sequences,pad_int)
    # Shift by one position: the decoder reads tokens [0..n-1] and predicts [1..n].
    decoder_input = padded[:,:-1]
    decoder_target = padded[:,1:]
    longest = np.max(lengths)
    return (np.array(top_imgs),np.array(down_imgs),np.array(top_ids),np.array(down_ids),
            np.array(labels),decoder_input,decoder_target,lengths,longest,np.array(weights))

In [None]:
def get_batches(data,batch_size,comments,toplist,downlist,combinationlist,imglist,topidlist,downidlist,pad_int):
    """Yield shuffled training batches of positives plus sampled negatives.

    A shallow copy of ``data`` is extended with as many negative samples as
    there are positives, shuffled, and cut into consecutive slices of at
    most ``batch_size`` instances. Each slice is converted with
    ``batch_to_input`` before being yielded. ``data`` itself is not modified.

    Yields:
        The tuple returned by ``batch_to_input`` for each batch.
    """
    datacopy = copy.copy(data)
    datacopy.extend(negative_samples(len(datacopy),toplist,downlist,combinationlist))
    random.shuffle(datacopy)
    # Stepping by batch_size never produces an empty slice. The former
    # range(len//batch_size+1) yielded an empty last batch whenever the
    # length was an exact multiple of batch_size, which made pad_batch
    # fail on max() of an empty list.
    for start_i in range(0,len(datacopy),batch_size):
        batch = datacopy[start_i:start_i+batch_size]
        yield batch_to_input(batch,comments,imglist,topidlist,downidlist,pad_int)

In [None]:
def build_evaluation_batch(fixitem,itemlist,state,imglist,topidlist,downidlist):
    """Pair one fixed item with every candidate in ``itemlist`` for scoring.

    ``state == 0``: ``fixitem`` is the top and the candidates are downs.
    ``state == 1``: ``fixitem`` is the down and the candidates are tops.
    Any other ``state`` produces empty arrays.

    Returns:
        (top images, down images, top ids, down ids) as NumPy arrays.
    """
    if state == 0:#top,downs
        pairs = [(fixitem,item) for item in itemlist]
    elif state == 1:#down,tops
        pairs = [(item,fixitem) for item in itemlist]
    else:
        pairs = []
    img1 = [imglist[top] for top,_ in pairs]
    img2 = [imglist[down] for _,down in pairs]
    img1id = [topidlist[top] for top,_ in pairs]
    img2id = [downidlist[down] for _,down in pairs]
    return np.array(img1),np.array(img2),np.array(img1id),np.array(img2id)

In [None]:
def id_seq_to_word_seq(id_seq,id_vocab,eos):
    """Convert an id sequence to a space-joined word string.

    The sequence is cut after the first occurrence of ``eos`` (the ``eos``
    token itself is kept). If ``eos`` never occurs, the whole sequence is used.
    """
    cut = len(id_seq)
    for position,token_id in enumerate(id_seq):
        if token_id == eos:
            cut = position
            break
    return ' '.join(id_vocab[token_id] for token_id in id_seq[:cut+1])

In [None]:
def accuracy(label,prediction):
    """Fraction of rows whose arg-max column agrees between ``label`` and ``prediction``."""
    expected = label.argmax(axis=1)
    predicted = prediction.argmax(axis=1)
    hits = (expected == predicted).sum()
    return hits/float(len(label))

In [None]:
def prepare_evaluation(data_path,comments,int_to_word,word_to_int):
    """Parse an evaluation file into the structures used for ranking and BLEU.

    Each line is read as ``<query>\\t<candidate>\\t<label>[\\t<id>|<id>|...]``.
    The fourth column is only read when the label is 1; its ``|``-separated
    entries index into ``comments``.

    Args:
        data_path: path to the tab-separated evaluation file.
        comments: id sequences indexed by comment id.
        int_to_word: id -> word table used to render reference comments.
        word_to_int: word -> id table; only ``'<EOS>'`` is looked up.

    Returns:
        data: query -> list of its candidates, in file order.
        orderlist: queries in order of first appearance.
        model_comments: (query, candidate) -> list of reference token lists
            (first and last token of every rendered comment are dropped).
        labellist: query position in ``orderlist`` -> {candidate: 0 or 1}.
    """
    data = {}
    orderlist = []
    model_comments = {}
    labellist = {}
    query_index = {}
    with open(data_path,'r') as f:
        for raw_line in f:
            # rstrip('\n') instead of [:-1] so a final line without a
            # trailing newline does not lose its last character.
            fields = raw_line.rstrip('\n').split('\t')
            query,candidate = fields[0],fields[1]
            if query not in query_index:
                query_index[query] = len(orderlist)
                labellist[query_index[query]] = {}
                data[query] = []
                orderlist.append(query)
            data[query].append(candidate)
            # File the label under this row's own query. Using "the most
            # recently created query" mislabels rows when the lines of one
            # query are not contiguous in the file.
            labels = labellist[query_index[query]]
            if int(fields[2]) == 1:
                model_comments[(query,candidate)] = [id_seq_to_word_seq(comments[int(comment)],int_to_word,word_to_int['<EOS>']).split()[1:-1] for comment in fields[3].split('|')]
                labels[candidate] = 1
            else:
                labels[candidate] = 0
    return data,orderlist,model_comments,labellist

In [None]:
def trec_evaluation(qrel_file_path,trec_file_path,trec):
    """Write the rankings in ``trec`` as a run file and score it with trec_eval.

    ``trec[i][j]`` is read as (document id string, score) for rank ``j+1``
    of query ``i``. One line per entry is written to ``trec_file_path``,
    then ``trec_eval(qrel_file_path, trec_file_path)`` is called; its result
    is printed and returned.
    """
    with open(trec_file_path,'w') as f:
        for query_i in range(len(trec)):
            ranking = trec[query_i]
            for rank_i in range(len(ranking)):
                entry = ranking[rank_i]
                f.write(str(query_i)+' '+'Q0 '+entry[0]+' '+str(rank_i+1)+' '+str(entry[1])+' '+'Exp'+'\n')
    result = trec_eval(qrel_file_path,trec_file_path)
    print(result)
    return result

In [None]:
def bleu_evalaution(model_comments,system_comments,beamsearch):
    """Compute the mean unigram sentence-BLEU of generated comments.

    Args:
        model_comments: combination -> list of reference token lists.
        system_comments: combination -> generated comment; with
            ``beamsearch`` true the value is a list of candidate comments.
        beamsearch: when true, the best-scoring candidate is used for each
            combination and its index is recorded.

    Returns:
        (mean BLEU, select) where ``select`` maps combination -> index of
        the best candidate (left empty when ``beamsearch`` is false).
        The mean is also printed.
    """
    def unigram_bleu(references,hypothesis):
        return nltk.translate.bleu_score.sentence_bleu(references,hypothesis,weights=[1.0])

    select = {}
    bleus = []
    for combination,generated in system_comments.items():
        references = model_comments[combination]
        if beamsearch:
            scores = np.array([unigram_bleu(references,candidate) for candidate in generated])
            bleus.append(scores.max())
            select[combination] = scores.argmax()#we only select the best for evaluation
        else:
            bleus.append(unigram_bleu(references,generated))
    mean_bleu = np.array(bleus).mean()
    print(mean_bleu)
    return mean_bleu,select

In [None]:
def auc_evaluation(labellist,trec):
    """Compute the mean per-positive AUC over all ranked queries.

    For every positive candidate in a query's ranking, the score is the
    fraction of that query's negatives that are ranked below it. The
    returned value is the mean of these scores over all positives of all
    queries; it is also printed.

    Args:
        labellist: query index -> {candidate: 0 or 1}.
        trec: query index -> ranked list whose entries have the candidate
            at position 0.

    Returns:
        The mean AUC (NaN when no query contributes a score).
    """
    record = []
    for query_number in range(len(trec)):
        negative = 0
        negatives_above = []
        for combination in trec[query_number]:
            if labellist[query_number][combination[0]] == 1:
                negatives_above.append(negative)
            else:
                negative += 1
        if negative == 0:
            # AUC is undefined without negatives; skip the query instead of
            # dividing by zero.
            continue
        record.extend([(negative-val)/float(negative) for val in negatives_above])
    auc = np.array(record).mean()
    print(auc)
    return auc

# Prepare Datasets

In [None]:
# Comment corpus, passed to read_comments below.
comments_path = 'dataset/text.dat'
# Vocabulary file, passed to build_vocab below.
vocab_path = 'dataset/vocab.dat'
# Minimum count a word needs to stay in the vocabulary (see build_vocab).
min_num = 5

In [None]:
comments = read_comments(comments_path)
# Extra empty comment at the end: negative samples carry comment id -1,
# which batch_to_input resolves to this last entry via comments[-1].
comments.append([])
comments

In [None]:
word_to_int,int_to_word = build_vocab(vocab_path,min_num)
# Counts the four special tokens added by build_vocab as well.
vocab_size = len(word_to_int)
print(vocab_size)

In [None]:
# NOTE(review): this rebinds `comments` from token lists to id lists, so the
# cell is not idempotent -- re-running it alone would look up ints in
# word_to_int and map every entry to <UNK>. Re-run the loading cell first.
comments = convert_comments(comments,word_to_int,int_to_word)
comments

In [None]:
with open('dataset/toplist.dat','r') as f:#in toplist, the first col is img_name of top, the second col is comments_index
    content = f.readlines()
# Image names of all tops, in file order.
toplist = [row.split('\t')[0] for row in content]
# Name -> integer id, assigned in order of appearance.
topidlist = {}
for top_name in toplist:
    topidlist[top_name] = len(topidlist)
toplist

In [None]:
# Display the top image name -> id mapping for inspection.
topidlist

In [None]:
with open('dataset/downlist.dat','r') as f:#in downlist, the first col is img_name of down(i.e. bottom), the second col is comments_index
    content = f.readlines()
# Image names of all downs (bottoms), in file order.
downlist = [row.split('\t')[0] for row in content]
# Name -> integer id, assigned in order of appearance.
downidlist = {}
for down_name in downlist:
    downidlist[down_name] = len(downidlist)
downlist

In [None]:
# Display the down image name -> id mapping for inspection.
downidlist

In [None]:
with open('dataset/combinationlist.dat','r') as f:#in combinationlist, the first col is img_name of top, the second col is img_name of down(i.e. bottom), the third col is comments_index
    content = f.readlines()
# Known (top, down) pairs, stored as the concatenated string top+down --
# the same key format negative_samples tests against.
combinationlist = {cols[0]+cols[1] for cols in (row[:-1].split('\t') for row in content)}
combinationlist

In [None]:
def load_image(img_idx):
    """Load ``img/<img_idx>.jpg`` as an RGB array with values divided by 255.

    The file is opened in a context manager so its handle is released as
    soon as the pixels have been copied into the array.
    """
    with Image.open('img/'+img_idx+'.jpg') as img:
        # Force three channels so greyscale or CMYK files still fit the
        # [None,224,224,3] image placeholders; a no-op for RGB images.
        pixels = np.array(img.convert('RGB'))
    return pixels/255.0

# Image name -> pixel array for every top and down image.
imglist = {}
for img_idx in toplist+downlist:
    imglist[img_idx] = load_image(img_idx)
# Show only the count; echoing the dict would print every pixel array.
len(imglist)

# Build Model

In [None]:
def get_input():
    """Create every placeholder the graph is fed with.

    Returns, in order: the two image batches ([None,224,224,3] float32),
    the two image-id vectors (int32), the two-column label matrix, decoder
    input and target id matrices (int32), per-example sequence lengths, the
    scalar maximum sequence length, the scalar batch size, learning rate,
    dropout keep probability, the two loss ratios ``ratio_c``/``ratio_g``,
    the per-example weight vector and the boolean ``flag``.
    """
    img1 = tf.placeholder(tf.float32,[None,224,224,3],'img1')
    img2 = tf.placeholder(tf.float32,[None,224,224,3],'img2')
    img1id = tf.placeholder(tf.int32,[None,],'img1id')
    img2id = tf.placeholder(tf.int32,[None,],'img2id')
    label = tf.placeholder(tf.float32,[None,2],'label')
    sequence_input = tf.placeholder(tf.int32,[None,None],name='sequence_input')
    sequence_output = tf.placeholder(tf.int32,[None,None],name='sequence_output')
    sequence_length = tf.placeholder(tf.int32,[None,],name='sequence_length')
    max_sequence_length = tf.placeholder(tf.int32,[],name='max_sequence_length')
    batch_size = tf.placeholder(tf.int32,[],name='batch_size')
    learning_rate = tf.placeholder(tf.float32,[],name='learning_rate')
    keep_prob = tf.placeholder(tf.float32,[],name='keep_prob')
    # Multipliers applied to the classifier and generator loss terms in loss().
    ratio_c = tf.placeholder(tf.float32,[],name='ratio_c')
    ratio_g = tf.placeholder(tf.float32,[],name='ratio_g')
    # Per-example multiplier on the generator loss (0 for negatives, see batch_to_input).
    weight = tf.placeholder(tf.float32,[None,],name='weight')
    # Selects between the live tensors and their stop_gradient copies via tf.where.
    flag = tf.placeholder(tf.bool,name='flag')
    return img1,img2,img1id,img2id,label,sequence_input,sequence_output,sequence_length,max_sequence_length,batch_size,learning_rate,keep_prob,ratio_c,ratio_g,weight,flag

In [None]:
def extractor(img):
    """Two-layer convolutional feature extractor for one image batch.

    Two stacked 3x3 convolutions with 32 filters each are max-pooled
    (16x16) separately and concatenated along the channel axis.

    Returns:
        (concat, globalpool): the concatenated pooled feature map and its
        global average over the spatial axes.

    NOTE(review): the caller invokes this a second time inside
    ``tf.variable_scope('extractor',reuse=True)``. Whether the Keras layers
    built here actually share weights under that scope depends on the
    TF/Keras version -- confirm that both images use the same filters.
    """
    conv1 = keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1,1),padding='same',activation='relu',data_format='channels_last',kernel_initializer='glorot_normal')(img)
    conv2 = keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1,1),padding='same',activation='relu',data_format='channels_last',kernel_initializer='glorot_normal')(conv1)
    # Both convolution outputs are pooled, so shallow and deeper features are kept.
    pool1 = keras.layers.MaxPool2D(pool_size=(16,16),padding='same')(conv1)
    #print(pool1)
    pool2 = keras.layers.MaxPool2D(pool_size=(16,16),padding='same')(conv2)
    #print(pool2)
    concat = keras.layers.Concatenate(axis=-1)([pool1,pool2])
    #print(concat)
    globalpool = keras.layers.GlobalAveragePooling2D()(concat)
    #print(globalpool)
    return concat,globalpool

In [None]:
def image_to_image_attention(conv,globalpool):#conv=[batch_size,14,14,64], globalpool=[batch_size,64]
    """Attend over the spatial positions of ``conv`` using ``globalpool`` as the query.

    Implements additive attention: e = v^T tanh(W1 s + W2 h), a = softmax(e),
    where s is ``globalpool``, h are the flattened positions of ``conv`` and
    W1, W2, v are ``weights1``, ``weights2``, ``weights3``.

    Returns:
        (features, attn_conv): the flattened feature map and its
        attention-weighted sum over positions.
    """
    weights1 = tf.get_variable('weights1',shape=[64,64],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    weights2 = tf.get_variable('weights2',shape=[64,64],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    weights3 = tf.get_variable('weights3',shape=[64,1],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    attn_from = tf.matmul(globalpool,weights1)#attn_from=[batch_size,64]
    features = keras.layers.Reshape([-1,64])(conv)#features=[batch_size,196,64]
    attn_to = tf.matmul(tf.reshape(features,[-1,64]),weights2)#tf.reshape(features,[-1,64])=[batch_size*196,64], attn_to=[batch_size*196,64]
    attn_from  = tf.expand_dims(attn_from,1)#attn_from=[batch_size,1,64]
    attn_to = tf.reshape(attn_to,tf.shape(features))#attn_to=[batch_size,196,64]
    attn_logit = tf.add(attn_from,attn_to)#attn_logit=[batch_size,196,64] (query broadcast over positions)
    attn_logit = tf.reshape(attn_logit,[-1,64])#attn_logit=[batch_size*196,64]
    attn_logit = tf.tanh(attn_logit)
    attn_weight = tf.matmul(attn_logit,weights3)#attn_weight=[batch_size*196,1]
    attn_weight = tf.reshape(attn_weight,shape=[tf.shape(conv)[0],tf.shape(conv)[1]*tf.shape(conv)[2]])#attn_weight=[batch_size,196]
    attn_weight = tf.nn.softmax(attn_weight,name='attention_img2img')
    attn_weight = tf.expand_dims(attn_weight,-1)#attn_weight=[batch_size,196,1]
    attn_conv = tf.multiply(features,attn_weight)#attn_conv=[batch_size,196,64]
    attn_conv = tf.reduce_sum(attn_conv,axis=1)#attn_conv=[batch_size,64]
    return features,attn_conv#e=v^Ttanh(W1s+W2h), a=softmax(e)

In [None]:
def img2vec(conv):
    """Project the input through a 300-unit ReLU dense layer and return the result."""
    extractor_output = keras.layers.Dense(300,activation='relu',kernel_initializer='glorot_normal')(conv)
    return extractor_output

In [None]:
def img_embedding(img1id,img2id):
    """Look up a learned embedding for each top id and each down id.

    Reads the module-level ``toplist``, ``downlist`` and ``embedding_size``
    to size the two embedding matrices, so those must be defined before
    this function is called.

    Returns:
        (img1_embedding, img2_embedding) for the top ids and down ids.
    """
    top_embedding_matrix = tf.get_variable('top_embedding_matrix',shape=[len(toplist),embedding_size],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    down_embedding_matrix = tf.get_variable('down_embedding_matrix',shape=[len(downlist),embedding_size],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    img1_embedding = tf.nn.embedding_lookup(top_embedding_matrix,img1id)
    img2_embedding = tf.nn.embedding_lookup(down_embedding_matrix,img2id)
    return img1_embedding,img2_embedding

In [None]:
def classifier(extractor_output,keep_prob):
    """Two-class softmax head: Dense(256, relu) -> dropout -> Dense(2, softmax).

    Args:
        extractor_output: feature tensor to classify.
        keep_prob: dropout keep probability applied after the hidden layer.

    Returns:
        The softmax output with two columns.
    """
    dense = keras.layers.Dense(256,activation='relu',kernel_initializer='glorot_normal')(extractor_output)
    dropout = tf.nn.dropout(dense,keep_prob)
    classifier_output = keras.layers.Dense(2,activation='softmax',kernel_initializer='glorot_normal')(dropout)
    return classifier_output

In [None]:
def get_gru_cell(keep_prob):
    """Return a 512-unit GRU cell wrapped with dropout on input, output and state."""
    gru_cell = tf.contrib.rnn.GRUCell(512,kernel_initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    dropout_gru_cell = tf.contrib.rnn.DropoutWrapper(gru_cell,input_keep_prob=keep_prob,output_keep_prob=keep_prob,state_keep_prob=keep_prob)
    return dropout_gru_cell

In [None]:
def generator(sequence_input,initial_state,encoder_output,batch_size,sequence_length,max_sequence_length,vocab_size,embedding_size,keep_prob):
    """Build the attention GRU decoder in three variants over one cell.

    The same ``generator_cell``, embedding matrix and output projection are
    used by:
      1. a training decoder fed with ``sequence_input`` (TrainingHelper),
      2. a greedy decoder (GreedyEmbeddingHelper) that also records its
         alignment history as the tensor named ``attention_matrix``,
      3. a beam-search decoder with a hard-coded beam width of 3.
    The second and third variants are built under
    ``tf.variable_scope('generator',reuse=True)``. All variants attend over
    ``encoder_output`` with Luong attention and start from ``initial_state``.

    Reads the module-level ``word_to_int`` for the ``<GO>``/``<EOS>`` ids.

    Returns:
        (training_generator_output, predicting_generator_output,
        beamsearch_generator_output) as returned by ``dynamic_decode``.
    """
    embedding_matrix = tf.get_variable('embedding_matrix',shape=[vocab_size,embedding_size],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    generator_embed_sequence = tf.nn.embedding_lookup(embedding_matrix,sequence_input)
    generator_cell = tf.contrib.rnn.MultiRNNCell([get_gru_cell(keep_prob) for _ in range(1)])
    # Projects decoder outputs to vocabulary logits; shared by all three decoders.
    output_layer = Dense(vocab_size,kernel_initializer=tf.contrib.layers.xavier_initializer(uniform=False))
    # --- 1. training decoder ---
    with tf.variable_scope('generator'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(generator_embed_sequence,sequence_length=sequence_length,time_major=False)
        #attention
        training_LuongAttention = tf.contrib.seq2seq.LuongAttention(num_units=512,memory=encoder_output,memory_sequence_length=None)
        training_attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell=generator_cell,attention_mechanism=training_LuongAttention,attention_layer_size=512,alignment_history=False,output_attention=True)
        # Start from the wrapper's zero state but with the cell state replaced by initial_state.
        training_attn_state = training_attn_cell.zero_state(batch_size,tf.float32).clone(cell_state=initial_state)
        #attention
        training_decoder = tf.contrib.seq2seq.BasicDecoder(training_attn_cell,helper=training_helper,initial_state=training_attn_state,output_layer=output_layer)
        training_generator_output,training_generator_state,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_sequence_length)
    # --- 2. greedy decoder ---
    with tf.variable_scope('generator',reuse=True):
        start_tokens = tf.tile(tf.constant([word_to_int['<GO>']],dtype=tf.int32),[batch_size])
        #attention
        predicting_LuongAttention = tf.contrib.seq2seq.LuongAttention(num_units=512,memory=encoder_output,memory_sequence_length=None)
        predicting_attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell=generator_cell,attention_mechanism=predicting_LuongAttention,attention_layer_size=512,alignment_history=True,output_attention=True)
        predicting_attn_state = predicting_attn_cell.zero_state(batch_size,tf.float32).clone(cell_state=initial_state)
        #attention
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_matrix,start_tokens,word_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(predicting_attn_cell,predicting_helper,predicting_attn_state,output_layer)
        predicting_generator_output,predicting_generator_state,_ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_sequence_length)
        # Not returned; the explicit name presumably lets other code fetch it from the graph -- confirm.
        attention_matrix = tf.identity(predicting_generator_state.alignment_history.stack(),name='attention_matrix')
        #print(attention_matrix)
    # --- 3. beam-search decoder (beam width 3) ---
    with tf.variable_scope('generator',reuse=True):
        start_tokens = tf.tile(tf.constant([word_to_int['<GO>']],dtype=tf.int32),[batch_size])
        # State and memory are tiled by the beam width (3) to match the batch_size*3 zero state below.
        beamsearch_initial_state = tf.contrib.seq2seq.tile_batch(initial_state,multiplier=3)
        #attention
        beamsearch_encoder_output = tf.contrib.seq2seq.tile_batch(encoder_output,multiplier=3)
        beamsearch_LuongAttention = tf.contrib.seq2seq.LuongAttention(num_units=512,memory=beamsearch_encoder_output,memory_sequence_length=None)
        beamsearch_attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell=generator_cell ,attention_mechanism=beamsearch_LuongAttention,attention_layer_size=512,alignment_history=False,output_attention=True)
        beamsearch_attn_state = beamsearch_attn_cell.zero_state(batch_size*3,tf.float32).clone(cell_state=beamsearch_initial_state)
        #attention
        beamsearch_predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(beamsearch_attn_cell,embedding=embedding_matrix,start_tokens=start_tokens,end_token=word_to_int['<EOS>'],initial_state=beamsearch_attn_state,beam_width=3,output_layer=output_layer,length_penalty_weight=0.6)
        beamsearch_generator_output,beamsearch_generator_state,_ = tf.contrib.seq2seq.dynamic_decode(beamsearch_predicting_decoder,output_time_major=False,impute_finished=False,maximum_iterations=max_sequence_length)
    return training_generator_output,predicting_generator_output,beamsearch_generator_output

In [None]:
def loss(classifier_output,label,training_generator_output,sequence_output,sequence_length,max_sequence_length,ratio_c,ratio_g,weight,flag):
    """Combine classifier, generator and L2 regularisation losses into one scalar.

    * classifier term: mean categorical cross-entropy, named
      ``classifier_loss``; replaced by its stop_gradient copy when ``flag``
      is false. Scaled by ``ratio_c``.
    * generator term: masked sequence cross-entropy summed over time,
      multiplied by the per-example ``weight`` and averaged over the batch,
      named ``generator_loss``. Scaled by ``ratio_g``.
    * regularisation: 0.0001 times the summed L2 loss over all trainable
      variables when ``flag`` is true, otherwise only over variables whose
      name contains ``'generator'``.

    Returns:
        The summed loss tensor, named ``loss``.
    """
    classifier_loss = tf.reduce_mean(tf.contrib.keras.losses.categorical_crossentropy(label,classifier_output),name='classifier_loss')
    classifier_loss_freeze = tf.stop_gradient(classifier_loss)
    classifier_loss = tf.where(flag,classifier_loss,classifier_loss_freeze)
    training_logits = tf.identity(training_generator_output.rnn_output,name='training_logits')
    # 1.0 for positions inside each sequence, 0.0 for padding.
    masks = tf.sequence_mask(sequence_length,max_sequence_length,dtype=tf.float32,name='mask')
    generator_loss = tf.contrib.seq2seq.sequence_loss(training_logits,sequence_output,masks,average_across_timesteps=False,average_across_batch=False)
    generator_loss = tf.reduce_sum(generator_loss,axis=1)
    # Per-example weight; batch_to_input sets it to 0 for negative samples.
    generator_loss = tf.multiply(weight,generator_loss)
    generator_loss = tf.reduce_mean(generator_loss,name='generator_loss')
    classifier_loss = tf.multiply(ratio_c,classifier_loss)
    generator_loss = tf.multiply(ratio_g,generator_loss)
    tv = tf.trainable_variables()
    reg_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in tv])
    reg_loss_gen = tf.reduce_sum([tf.nn.l2_loss(v) for v in tv if ('generator' in v.name)])
    reg_loss = tf.where(flag,reg_loss,reg_loss_gen)
    loss = tf.add_n([classifier_loss,generator_loss,0.0001*reg_loss],name='loss')
    return loss

In [None]:
def optimizer(loss,learning_rate):
    """Return an Adam training op with element-wise gradient clipping.

    Every gradient is clipped to [-5, 5]; variables whose gradient is
    ``None`` are left out of the update.
    """
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss)
    capped_gradients = [(tf.clip_by_value(grad,-5.,5.),var) for grad,var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    return train_op

In [None]:
def prediction(classifier_output):
    """Return ``classifier_output`` as an identity tensor named ``prediction``."""
    prediction = tf.identity(classifier_output,name='prediction')
    return prediction

In [None]:
def generation(predicting_generator_output,beamsearch_generator_output):
    """Expose the decoded id tensors under fixed names.

    Returns:
        (greedysearch_sequence, beamsearch_sequence): the greedy decoder's
        ``sample_id`` and the beam-search decoder's ``predicted_ids``,
        named ``greedysearch_sequence`` and ``beamsearch_sequence``.
    """
    greedysearch_sequence = tf.identity(predicting_generator_output.sample_id,name='greedysearch_sequence')
    beamsearch_sequence = tf.identity(beamsearch_generator_output.predicted_ids,name='beamsearch_sequence')
    return greedysearch_sequence,beamsearch_sequence

In [None]:
# Size of the word embeddings and of the per-image id embeddings.
embedding_size = 300
train_graph = tf.Graph()
with train_graph.as_default():
    tf.set_random_seed(1)
    with tf.name_scope('inputs'):
        img1,img2,img1id,img2id,label,sequence_input,sequence_output,sequence_length,max_sequence_length,batch_size,learning_rate,keep_prob,ratio_c,ratio_g,weight,flag = get_input()
    with tf.name_scope('extractor'):
        with tf.variable_scope('extractor'):
            conv_img1,globalpool_img1 = extractor(img1)
        with tf.variable_scope('extractor',reuse=True):
            conv_img2,globalpool_img2 = extractor(img2)
        # Cross attention: each image's feature map is attended with the
        # other image's global descriptor as the query.
        with tf.variable_scope('image_to_image_attention'):
            features_img1,attn_conv_img1 = image_to_image_attention(conv_img1,globalpool_img2)
        with tf.variable_scope('image_to_image_attention',reuse=True):
            features_img2,attn_conv_img2 = image_to_image_attention(conv_img2,globalpool_img1)
        with tf.variable_scope('img2vec'):
            extractor_output_img1 = img2vec(attn_conv_img1)
        with tf.variable_scope('img2vec',reuse=True):
            extractor_output_img2 = img2vec(attn_conv_img2)
        with tf.variable_scope('img_embedding'):
            img1_embedding,img2_embedding = img_embedding(img1id,img2id)
        extractor_output = tf.concat([extractor_output_img1,extractor_output_img2,img1_embedding,img2_embedding],axis=1)
        encoder_output = tf.concat([features_img1,features_img2],axis=1)
        # With flag False the stop_gradient copies are selected, so no
        # gradient flows back into the extractor part of the graph.
        encoder_output_freeze = tf.stop_gradient(encoder_output)
        extractor_output_freeze = tf.stop_gradient(extractor_output)
        encoder_output = tf.where(flag,encoder_output,encoder_output_freeze)
        extractor_output = tf.where(flag,extractor_output,extractor_output_freeze)
    with tf.name_scope('classifier'):
        classifier_output = classifier(extractor_output,keep_prob)
    with tf.name_scope('prediction'):
        # NOTE(review): rebinds the module-level name `prediction` from the
        # function to a tensor, so this cell fails if it is run a second
        # time without re-running the cell that defines prediction().
        prediction = prediction(classifier_output)
    with tf.name_scope('generator'):
        dense_output = keras.layers.Dense(512,activation='tanh',kernel_initializer='glorot_normal')(extractor_output)
        # One-element tuple: the decoder cell is a MultiRNNCell with a single layer.
        initial_state = (dense_output,)
        training_generator_output,predicting_generator_output,beamsearch_generator_output = generator(sequence_input,initial_state,encoder_output,batch_size,sequence_length,max_sequence_length,vocab_size,embedding_size,keep_prob)
    with tf.name_scope('generation'):
        greedysearch_sequence,beamsearch_sequence = generation(predicting_generator_output,beamsearch_generator_output)
    with tf.name_scope('loss'):
        # NOTE(review): same rebinding hazard as `prediction` above -- after
        # this line `loss` is a tensor, no longer the loss() function.
        loss = loss(classifier_output,label,training_generator_output,sequence_output,sequence_length,max_sequence_length,ratio_c,ratio_g,weight,flag)
    with tf.name_scope('optimizer'):
        train_op = optimizer(loss,learning_rate)

# Train Model

In [None]:
with open('dataset/traindata.dat','r') as f:#in traindata, the first col is img_name of top, the second col is img_name of down(i.e. bottom), the third col is comment_index
    content = f.readlines()
# One (top name, down name, comment index) tuple per line of the file.
traindata = []
for line in content:
    # rstrip('\n') instead of line[:-1]: slicing drops the last digit of the
    # comment index when the final line has no trailing newline.
    line = line.rstrip('\n').split('\t')
    traindata.append((line[0],line[1],int(line[2])))
traindata

In [None]:
# File paths for the dev "tops" split; presumably the qrel/run pair handed
# to trec_evaluation -- confirm against the evaluation cells.
tops_qrel_file_path = 'evaluation/devdata_tops_qrel.dat'
tops_trec_file_path = 'evaluation/devdata_tops_trec.dat'
#downs_qrel_file_path = 'evaluation/devdata_downs_qrel.dat'
#downs_trec_file_path = 'evaluation/devdata_downs_trec.dat'

In [None]:
data_path = 'dataset/devdata_tops.dat'
# Candidates per query, query order, reference comments and 0/1 labels for the dev "tops" split.
dev_tops_data,tops_orderlist,model_tops_comments,tops_labellist = prepare_evaluation(data_path,comments,int_to_word,word_to_int)

In [None]:
#data_path = 'dataset/devdata_downs.dat'
#dev_downs_data,downs_orderlist,model_downs_comments,downs_labellist = prepare_evaluation(data_path,comments,int_to_word,word_to_int)

In [None]:
# Display the reference comments keyed by (query, candidate) for inspection.
model_tops_comments

In [None]:
#model_downs_comments

In [None]:
# Values fed to the graph's placeholders by the training loop.
lr = 0.001      # learning_rate
rat_c = 1.0     # ratio_c: multiplier on the classifier loss
rat_g = 1.0     # ratio_g: multiplier on the generator loss
epochs = 5      # number of passes over the training data
rate = 1.0      # keep_prob; 1.0 keeps every unit, i.e. dropout is off

In [None]:
# Per-epoch histories of the running mean classifier / generator loss,
# appended to by the training loop.
cla_cost_list = []
gen_cost_list = []
# Not written in the training cell below; presumably filled by later
# evaluation cells -- confirm.
bleus_tops = []
auc_tops = []
trec_evals_tops = []
#bleus_downs = []
#trec_evals_downs = []
#auc_downs = []

In [None]:
# Train for `epochs` epochs; after every epoch save a checkpoint and validate on the
# dev split with tops as queries (BLEU on generated comments, AUC and trec_eval on the ranking).
beamsearch = True
checkpoint = 'checkpoint/trained_model.ckpt'
with tf.Session(graph=train_graph,config=config) as sess:
    # The writer is only used to dump the graph definition, so close it right away:
    # that flushes the event file and releases the handle (it was never closed before).
    writer = tf.summary.FileWriter('checkpoint/',sess.graph)
    writer.close()
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    print(time.localtime())
    classifier_loss = train_graph.get_tensor_by_name('loss/classifier_loss:0')
    generator_loss = train_graph.get_tensor_by_name('loss/generator_loss:0')
    for epoch in range(epochs):
        b_s = 64#batch_size
        train_cla_cost = 0
        train_gen_cost = 0
        temp_cla_cost_list = []
        temp_gen_cost_list = []
        step = 0
        for x_i1,x_i2,x_id1,x_id2,y_l,x_s_i,x_s_o,seq_len,max_seq_len,wei in get_batches(traindata,b_s,comments,toplist,downlist,combinationlist,imglist,topidlist,downidlist,word_to_int['<PAD>']):
            feed = {img1:x_i1,img2:x_i2,img1id:x_id1,img2id:x_id2,label:y_l,
                    sequence_input:x_s_i,sequence_output:x_s_o,
                    sequence_length:seq_len,max_sequence_length:max_seq_len,
                    batch_size:len(x_i1),learning_rate:lr,keep_prob:rate,
                    ratio_c:rat_c,ratio_g:rat_g,weight:wei,flag:True}
            _,cost1,cost2 = sess.run([train_op,classifier_loss,generator_loss],feed)
            train_cla_cost += cost1
            train_gen_cost += cost2
            step += 1
            if step%1000 == 0:
                # record and print the running mean losses every 1000 steps
                temp_cla_cost_list.append(train_cla_cost/step)
                temp_gen_cost_list.append(train_gen_cost/step)
                print(str(train_cla_cost/step)+'&'+str(train_gen_cost/step)+' '+'pass!')
        temp_cla_cost_list.append(train_cla_cost/step)
        temp_gen_cost_list.append(train_gen_cost/step)
        cla_cost_list.append(temp_cla_cost_list)
        gen_cost_list.append(temp_gen_cost_list)
        print('Epoch {}/{} - Training Loss: {:.3f}&{:.3f}'.format(epoch+1,epochs,train_cla_cost/step,train_gen_cost/step))
        # global_step makes the saver append '-<epoch>' to the checkpoint prefix.
        saver.save(sess,checkpoint,global_step=epoch+1)
        print('Model Trained and Saved')
        print(time.localtime())
        #validation
        b_s = 64
        max_seq_len = 30
        beam_width = 3
        system_tops_comments = {}
        tops_trec = {}
        step = 0
        for query_number,top in enumerate(tops_orderlist):
            downsoftop = dev_tops_data[top]
            probabilitylist = {}
            # Step through the candidates in chunks of b_s. Unlike range(len//b_s+1), this never
            # produces an empty trailing batch when the count is an exact multiple of b_s.
            for start_i in range(0,len(downsoftop),b_s):
                downs = downsoftop[start_i:start_i+b_s]
                x_i1,x_i2,x_id1,x_id2 = build_evaluation_batch(top,downs,0,imglist,topidlist,downidlist)
                seq_len = [max_seq_len]*len(x_i1)
                feed = {img1:x_i1,img2:x_i2,img1id:x_id1,img2id:x_id2,
                        sequence_length:seq_len,max_sequence_length:max_seq_len,
                        batch_size:len(x_i1),keep_prob:1.0,flag:True}
                prob,gred_seq,beam_seq = sess.run([prediction,greedysearch_sequence,beamsearch_sequence],feed)
                for j,down in enumerate(downs):
                    # column 1 of the prediction is used as the ranking score
                    # (presumably the probability of a matching pair - confirm against the classifier)
                    probabilitylist[down] = prob[j][1]
                    # only pairs that have a reference comment are decoded for BLEU
                    if model_tops_comments.get((top,down)) is not None:
                        if beamsearch:
                            system_tops_comments[(top,down)] = [(id_seq_to_word_seq(beam_seq[j][:,index],int_to_word,word_to_int['<EOS>'])).split()[:-1] for index in range(beam_width)]
                        else:
                            system_tops_comments[(top,down)] = (id_seq_to_word_seq(gred_seq[j],int_to_word,word_to_int['<EOS>'])).split()[:-1]
                step += 1
                if step%1000 == 0:
                    print('pass!')
            # candidates of this query, best score first
            tops_trec[query_number] = sorted(probabilitylist.items(),key=lambda item:item[1],reverse=True)
            del probabilitylist,downsoftop
        bleu,_ = bleu_evalaution(model_tops_comments,system_tops_comments,beamsearch)
        bleus_tops.append(bleu)
        del system_tops_comments
        auc_tops.append(auc_evaluation(tops_labellist,tops_trec))
        trec_evals_tops.append(trec_evaluation(tops_qrel_file_path,tops_trec_file_path,tops_trec))
        del tops_trec
        # Disabled: the same validation with downs as queries, kept as an inert string literal.
        # If it is re-enabled, use the chunked range(0,len(...),b_s) batching from the loop above.
        '''
        system_downs_comments = {}
        downs_trec = {}
        query_number = 0
        step = 0
        for down in downs_orderlist:
            topsofdown = dev_downs_data[down]
            probabilitylist = {}
            for batch_i in range(len(topsofdown)//b_s+1):
                start_i = batch_i*b_s
                tops = topsofdown[start_i:start_i+b_s]
                x_i1,x_i2,x_id1,x_id2 = build_evaluation_batch(down,tops,1,imglist,topidlist,downidlist)
                seq_len = [30]*len(x_i1)
                prob,gred_seq,beam_seq = sess.run([prediction,greedysearch_sequence,beamsearch_sequence],{img1:x_i1,img2:x_i2,img1id:x_id1,img2id:x_id2,sequence_length:seq_len,max_sequence_length:max_seq_len,batch_size:len(x_i1),keep_prob:1.0,flag:True})
                j = 0
                for top in tops:
                    probabilitylist[top] = prob[j][1]
                    if model_downs_comments.get((down,top)) != None:
                        if beamsearch:
                            system_downs_comments[(down,top)] = [(id_seq_to_word_seq(beam_seq[j][:,index],int_to_word,word_to_int['<EOS>'])).split()[:-1] for index in range(3)]
                        else:
                            system_downs_comments[(down,top)] = (id_seq_to_word_seq(gred_seq[j],int_to_word,word_to_int['<EOS>'])).split()[:-1]
                    j += 1
                step += 1
                if step%1000 == 0:
                    print('pass!')
            downs_trec[query_number] = sorted(probabilitylist.items(),key=lambda item:item[1],reverse=True)
            del probabilitylist,topsofdown
            query_number += 1
        bleu,_ = bleu_evalaution(model_downs_comments,system_downs_comments,beamsearch)
        bleus_downs.append(bleu)
        del system_downs_comments
        auc_downs.append(auc_evaluation(downs_labellist,downs_trec))
        trec_evals_downs.append(trec_evaluation(downs_qrel_file_path,downs_trec_file_path,downs_trec))
        del downs_trec
        '''
        #validation
        print(time.localtime())

# Evaluate Model

In [None]:
# qrel / trec files handed to trec_evaluation on the test split, for both query directions.
tops_qrel_file_path,tops_trec_file_path = (
    'evaluation/testdata_tops_qrel.dat',
    'evaluation/testdata_tops_trec.dat',
)
downs_qrel_file_path,downs_trec_file_path = (
    'evaluation/testdata_downs_qrel.dat',
    'evaluation/testdata_downs_trec.dat',
)

In [None]:
# testdata_tops columns (tab-separated):
#   1. img_name of top
#   2. img_name of down (i.e. bottom)
#   3. rel (1 relevant, 0 irrelevant)
#   4. comments_index (-1 is a special comment_index for an irrelevant combination)
data_path = 'dataset/testdata_tops.dat'
(test_tops_data,
 tops_orderlist,
 model_tops_comments,
 tops_labellist) = prepare_evaluation(data_path,comments,int_to_word,word_to_int)

In [None]:
# testdata_downs columns (tab-separated):
#   1. img_name of down (i.e. bottom)
#   2. img_name of top
#   3. rel (1 relevant, 0 irrelevant)
#   4. comments_index (-1 is a special comment_index for an irrelevant combination)
data_path = 'dataset/testdata_downs.dat'
(test_downs_data,
 downs_orderlist,
 model_downs_comments,
 downs_labellist) = prepare_evaluation(data_path,comments,int_to_word,word_to_int)

In [None]:
# Show a handful of reference entries instead of dumping the whole mapping into the cell output.
# NOTE(review): assumes model_tops_comments is a dict keyed by (top, down) - it is queried with .get((top,down)) below.
list(model_tops_comments.items())[:5]

In [None]:
# Show a handful of reference entries instead of dumping the whole mapping into the cell output.
# NOTE(review): assumes model_downs_comments is a dict keyed by (down, top) - it is queried with .get((down,top)) below.
list(model_downs_comments.items())[:5]

In [None]:
# Restore the trained model into a fresh graph and evaluate it on the test split in both
# directions (tops as queries, then downs as queries): BLEU on the generated comments,
# AUC and trec_eval on the rankings.
beamsearch = True
print(time.localtime())
# NOTE(review): the training cell saves with global_step, so the files on disk are named
# 'trained_model.ckpt-<epoch>'. This prefix presumably has to be pointed at (or the files
# renamed to) the epoch chosen on the dev set - confirm before running.
checkpoint = 'checkpoint/trained_model.ckpt'
test_graph = tf.Graph()
with tf.Session(graph=test_graph,config=config) as sess:
    loader = tf.train.import_meta_graph(checkpoint+'.meta')
    loader.restore(sess,checkpoint)
    # Look the tensors up by name in the restored graph; these rebind the names used while training.
    img1 = test_graph.get_tensor_by_name('inputs/img1:0')
    img2 = test_graph.get_tensor_by_name('inputs/img2:0')
    img1id = test_graph.get_tensor_by_name('inputs/img1id:0')
    img2id = test_graph.get_tensor_by_name('inputs/img2id:0')
    sequence_length = test_graph.get_tensor_by_name('inputs/sequence_length:0')
    max_sequence_length = test_graph.get_tensor_by_name('inputs/max_sequence_length:0')
    batch_size = test_graph.get_tensor_by_name('inputs/batch_size:0')
    keep_prob = test_graph.get_tensor_by_name('inputs/keep_prob:0')
    flag = test_graph.get_tensor_by_name('inputs/flag:0')
    prediction = test_graph.get_tensor_by_name('prediction/prediction:0')
    greedysearch_sequence = test_graph.get_tensor_by_name('generation/greedysearch_sequence:0')
    beamsearch_sequence = test_graph.get_tensor_by_name('generation/beamsearch_sequence:0')
    b_s = 64
    max_seq_len = 30
    beam_width = 3

    def evaluate_queries(orderlist,candidates_by_query,reference_comments,side):
        """Score every (query, candidate) pair and decode comments for the pairs that have a reference.

        orderlist            -- queries, in the order used to number them for trec
        candidates_by_query  -- maps a query to the list of its candidates
        reference_comments   -- mapping keyed by (query, candidate); only pairs present here are decoded
        side                 -- third argument of build_evaluation_batch: 0 when the queries are tops,
                                1 when they are downs

        Returns (system_comments, trec): the decoded comments keyed by (query, candidate), and for
        each query number the (candidate, score) pairs sorted by descending score.
        """
        system_comments = {}
        trec = {}
        step = 0
        for query_number,query in enumerate(orderlist):
            candidates = candidates_by_query[query]
            probabilitylist = {}
            # Step through the candidates in chunks of b_s. Unlike range(len//b_s+1), this never
            # produces an empty trailing batch when the count is an exact multiple of b_s.
            for start_i in range(0,len(candidates),b_s):
                batch = candidates[start_i:start_i+b_s]
                x_i1,x_i2,x_id1,x_id2 = build_evaluation_batch(query,batch,side,imglist,topidlist,downidlist)
                seq_len = [max_seq_len]*len(x_i1)
                feed = {img1:x_i1,img2:x_i2,img1id:x_id1,img2id:x_id2,
                        sequence_length:seq_len,max_sequence_length:max_seq_len,
                        batch_size:len(x_i1),keep_prob:1.0,flag:True}
                prob,gred_seq,beam_seq = sess.run([prediction,greedysearch_sequence,beamsearch_sequence],feed)
                for j,candidate in enumerate(batch):
                    # column 1 of the prediction is used as the ranking score
                    # (presumably the probability of a matching pair - confirm against the classifier)
                    probabilitylist[candidate] = prob[j][1]
                    if reference_comments.get((query,candidate)) is not None:
                        if beamsearch:
                            system_comments[(query,candidate)] = [(id_seq_to_word_seq(beam_seq[j][:,index],int_to_word,word_to_int['<EOS>'])).split()[:-1] for index in range(beam_width)]
                        else:
                            system_comments[(query,candidate)] = (id_seq_to_word_seq(gred_seq[j],int_to_word,word_to_int['<EOS>'])).split()[:-1]
                step += 1
                if step%1000 == 0:
                    print('pass!')
            trec[query_number] = sorted(probabilitylist.items(),key=lambda item:item[1],reverse=True)
        return system_comments,trec

    # tops as queries
    system_tops_comments,tops_trec = evaluate_queries(tops_orderlist,test_tops_data,model_tops_comments,0)
    _,select_tops = bleu_evalaution(model_tops_comments,system_tops_comments,beamsearch)
    auc_evaluation(tops_labellist,tops_trec)
    trec_evaluation(tops_qrel_file_path,tops_trec_file_path,tops_trec)
    del tops_trec
    # downs as queries
    system_downs_comments,downs_trec = evaluate_queries(downs_orderlist,test_downs_data,model_downs_comments,1)
    _,select_downs = bleu_evalaution(model_downs_comments,system_downs_comments,beamsearch)
    auc_evaluation(downs_labellist,downs_trec)
    trec_evaluation(downs_qrel_file_path,downs_trec_file_path,downs_trec)
    del downs_trec
print(time.localtime())

In [None]:
# Write one generated comment per (top, down) pair as: top <TAB> down <TAB> comment.
# With beam search each entry holds several candidate comments and select_tops (returned by
# bleu_evalaution) gives the index of the one to keep; with greedy search the entry is the
# comment itself. The greedy branch used to iterate system_downs_comments by mistake.
with open('system_comments/system_tops_comments.dat','w') as f:
    for combination,generated in system_tops_comments.items():
        words = generated[select_tops[combination]] if beamsearch else generated
        f.write(combination[0]+'\t'+combination[1]+'\t'+' '.join(words)+'\n')

In [None]:
# Write one generated comment per (down, top) pair as: down <TAB> top <TAB> comment.
# Beam search entries are lists of candidate comments indexed through select_downs;
# greedy entries are already a single list of words.
with open('system_comments/system_downs_comments.dat','w') as f:
    for combination,generated in system_downs_comments.items():
        if beamsearch:
            words = generated[select_downs[combination]]
        else:
            words = generated
        f.write('\t'.join([combination[0],combination[1],' '.join(words)])+'\n')