# generate_vocab

In [1]:
import os
import sys
import pprint


# Caption/token file read by count_vocab: one tab-separated
# "<image id>  <description>" record per line.
input_description_file = "./data/image_caption_data/results_20130124.token"
# Destination of the generated vocabulary listing (one "word<TAB>count" per line).
output_vocab_file = "./data/image_caption_data/vocab.txt"


def count_vocab(input_token_filename):
    """Collect word statistics from an English caption file and build a vocabulary.

    Every non-blank line must consist of an image id, a tab, and the
    description text; the description is split on whitespace into words.

    As a side effect the longest sentence length is printed, followed by a
    histogram mapping sentence length (in words) to the number of sentences.

    Args:
        input_token_filename: path of the UTF-8 encoded token file.

    Returns:
        dict mapping each word to its number of occurrences.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    max_length_of_sentences = 0
    length_dict = {}   # sentence length -> number of sentences
    vocab_dict = {}    # word -> number of occurrences

    # Iterate over the file lazily instead of loading every line up front.
    with open(input_token_filename, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank (e.g. trailing) lines carry no caption; skipping them
                # avoids an obscure tuple-unpacking error.
                continue
            # Split on the first tab only, so a tab inside the description
            # cannot break the two-way unpacking.
            image_id, description = line.split('\t', 1)
            # str.split() without arguments already ignores surrounding spaces.
            words = description.split()
            num_words = len(words)
            max_length_of_sentences = max(max_length_of_sentences, num_words)
            length_dict[num_words] = length_dict.get(num_words, 0) + 1

            for word in words:
                vocab_dict[word] = vocab_dict.get(word, 0) + 1

    print(max_length_of_sentences)
    # pprint renders lists, dicts and tuples in an easy-to-read layout
    pprint.pprint(length_dict)
    return vocab_dict

vocab_dict = count_vocab(input_description_file)
# Choosing the maximum sentence length: picking the longest length (82) would
# fill most sentences with meaningless padding, while picking a value that is
# too small loses too much information -- it has to be a trade-off.

82
{2: 14,
 3: 52,
 4: 297,
 5: 1109,
 6: 3593,
 7: 7895,
 8: 11070,
 9: 13165,
 10: 14821,
 11: 15427,
 12: 14481,
 13: 12919,
 14: 11394,
 15: 9952,
 16: 8121,
 17: 6628,
 18: 5352,
 19: 4353,
 20: 3510,
 21: 2763,
 22: 2363,
 23: 1779,
 24: 1388,
 25: 1169,
 26: 1004,
 27: 778,
 28: 647,
 29: 496,
 30: 397,
 31: 362,
 32: 271,
 33: 228,
 34: 165,
 35: 165,
 36: 132,
 37: 93,
 38: 102,
 39: 61,
 40: 61,
 41: 54,
 42: 39,
 43: 40,
 44: 26,
 45: 23,
 46: 21,
 47: 12,
 48: 8,
 49: 17,
 50: 7,
 51: 18,
 52: 13,
 53: 8,
 54: 4,
 55: 9,
 56: 6,
 57: 5,
 58: 2,
 59: 5,
 60: 2,
 61: 1,
 62: 1,
 63: 1,
 64: 2,
 65: 1,
 66: 1,
 67: 2,
 68: 1,
 69: 2,
 70: 1,
 71: 1,
 72: 1,
 73: 1,
 75: 1,
 79: 1,
 82: 1}


In [2]:
# Rank the words from most to least frequent.
sorted_vocab_dict = sorted(vocab_dict.items(),
                           key=lambda entry: entry[1], reverse=True)
with open(output_vocab_file, 'w', encoding='UTF-8') as vocab_out:
    # First row: placeholder entry for words that are not in the vocabulary.
    vocab_out.write("<UNK>\t100000\n")
    for token, freq in sorted_vocab_dict:
        vocab_out.write('%s\t%d\n' % (token, freq))

# feature_extraction

In [3]:
import os
import sys
import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
import pprint
import pickle
import numpy as np

  from ._conv import register_converters as _register_converters


In [4]:
# Pre-trained model file
model_file = "./data/image_caption_data/checkpoint_inception_v3/inception_v3_graph_def.pb"
# Image description (caption) file
input_description_file = "./data/image_caption_data/results_20130124.token"
# Directory holding the images
input_img_dir = "./data/image_caption_data/flickr30k_images/"
# Output of the feature extraction (a time-consuming step; already computed)
output_folder = "./data/image_caption_data/feature_extraction_inception_v3"

# Features of all images are extracted batch by batch and written out as
# several small files, which makes reading and writing easier
batch_size = 1000

# In a distributed environment os may not work while gfile does
# (and it is best to stick to one API)
if not gfile.Exists(output_folder):
    gfile.MakeDirs(output_folder)


def parse_token_file(token_file):
    """Parse the token file into a mapping from image name to descriptions.

    Every non-blank line is expected to hold ``<img_name>#<index>``, a tab,
    and the description text.

    Args:
        token_file: path of the token file; it is opened through ``gfile``.

    Returns:
        dict mapping each image name to the list of its descriptions, in the
        order they appear in the file.

    Raises:
        ValueError: if a non-blank line lacks the tab or the '#' separator.
    """
    img_name_to_tokens = {}
    with gfile.GFile(token_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        # Strip '\r' as well as '\n' so Windows line endings do not leave a
        # stray carriage return at the end of each description.
        line = line.strip('\r\n')
        if not line.strip():
            # Blank (e.g. trailing) lines carry no record.
            continue
        # Split on the first tab / last '#' only, so extra separators inside
        # the description or the image name do not break the unpacking.
        img_id, description = line.split('\t', 1)
        img_name, _ = img_id.rsplit('#', 1)
        img_name_to_tokens.setdefault(img_name, []).append(description)
    return img_name_to_tokens


# Parse the captions once; the keys of the mapping are the image file names.
img_name_to_tokens = parse_token_file(input_description_file)
# NOTE: this is a dict keys view, not a list -- it must be wrapped in list()
# before it can be sliced.
all_img_names = img_name_to_tokens.keys()

In [5]:
# Number of distinct images that have at least one caption.
logging.info("num of all images: %d" % len(all_img_names))
# Iterating a dict yields its keys, so this shows the first ten image names.
pprint.pprint(list(img_name_to_tokens)[:10])
# Captions recorded for one sample image.
pprint.pprint(img_name_to_tokens['2778832101.jpg'])

INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
['A man in jeans is reclining on a green metal bench along a busy sidewalk and '
 'crowded street .',
 'A white male with a blue sweater and gray pants laying on a sidewalk bench .',
 'A man in a blue shirt and gray pants is sleeping on a sidewalk bench .',
 'A person is sleeping on a bench , next to cars .',
 'A man sleeping on a bench in a city area .']


In [6]:
# Number of full batches, plus one more when a partially filled batch remains.
num_batches = len(all_img_names) // batch_size
if len(all_img_names) % batch_size:
    num_batches += 1


def load_pretrained_inception_v3(model_file):
    """Import the graph stored in ``model_file`` (read via gfile) into the default graph.

    The graph is imported with an empty name prefix, so its nodes keep the
    names stored in the file.
    """
    with gfile.FastGFile(model_file, "rb") as model_fh:
        serialized_graph = model_fh.read()
        # Start from an empty GraphDef ...
        graph_def = tf.GraphDef()
        # ... and fill it with the pre-trained graph definition.
        graph_def.ParseFromString(serialized_graph)
        # Import into the default graph (opening a session uses the default graph).
        tf.import_graph_def(graph_def, name="")

        
# Populate the default graph with the pre-trained network; the session in the
# next cell looks tensors up in this graph.
load_pretrained_inception_v3(model_file)

In [7]:
# Materialise the image names once: slicing needs a list, and rebuilding it
# for every batch is wasted work.
img_name_list = list(all_img_names)

with tf.Session() as sess:
    # Use the output of the second-to-last layer as the image feature.
    second_to_last_tensor = sess.graph.get_tensor_by_name("pool_3:0")
    for i in range(num_batches):
        batch_img_names = img_name_list[i * batch_size:(i + 1) * batch_size]
        batch_features = []
        for img_name in batch_img_names:
            img_path = os.path.join(input_img_dir, img_name)
            logging.info("processing img %s" % img_name)
            if not gfile.Exists(img_path):
                raise Exception("%s doesn't exists" % img_path)
            # Read inside a context manager so the handle is closed right away
            # instead of leaking one open file per image.
            with gfile.GFile(img_path, "rb") as img_file:
                img_data = img_file.read()
            feature_vector = sess.run(
                second_to_last_tensor,
                feed_dict={"DecodeJpeg/contents:0": img_data})
            batch_features.append(feature_vector)
        # Stack the per-image features into a single array for this batch.
        batch_features = np.vstack(batch_features)
        output_filename = os.path.join(output_folder,
                                       "image_features-%d.pickle" % i)
        logging.info("writing to file %s" % output_filename)
        # pickle produces bytes, so the output file is opened in binary mode.
        with gfile.GFile(output_filename, 'wb') as f:
            # Names and features are stored together so they stay aligned
            # one-to-one.
            pickle.dump((batch_img_names, batch_features), f)

# image_caption_train

In [3]:
import os
import sys
import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
import pprint
import pickle
import numpy as np
import math
# import tensorflow.contrib.eager as tfe
# tfe.enable_eager_execution()

# Image features extracted in step two (feature_extraction)
input_img_feature_dir = "./data/image_caption_data/feature_extraction_inception_v3"
# Image description (caption) file
input_description_file = "./data/image_caption_data/results_20130124.token"
# Output directory
output_dir = "./data/image_caption_data/local_run"
# Vocabulary file
input_vocab_file = "./data/image_caption_data/vocab.txt"

In [4]:
# Create the output directory if it does not exist yet.
if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

def get_default_params():
    """Return the default training hyper-parameters as an ``HParams`` object."""
    defaults = dict(
        num_vocab_word_threshold=3,   # minimum word count passed to Vocab
        num_embedding_nodes=32,
        num_timesteps=10,             # caption length used for training
        num_lstm_nodes=[64, 64],      # one entry per recurrent layer
        num_lstm_layers=2,
        num_fc_nodes=32,
        batch_size=50,
        cell_type='lstm',
        clip_lstm_grads=1.0,
        learning_rate=0.001,
        keep_prob=0.8,
        log_frequent=100,
        save_frequent=1000,
    )
    return tf.contrib.training.HParams(**defaults)

# Hyper-parameters
hps = get_default_params()

## 词表处理

In [5]:
class Vocab(object):
    """Word handling class that stores the mapping between words and ids.

    The vocabulary file holds one ``word<TAB>count`` entry per line. Ids are
    assigned in file order to the entries that are kept: words whose count is
    below ``word_num_threshold`` are dropped, except ``<UNK>`` which is always
    kept. The id of ``<UNK>`` is exposed as ``unk`` and the id of ``.`` as
    ``eos``; either stays -1 when the file has no such entry.
    """
    def __init__(self, filename, word_num_threshold):
        self._id_to_word = {}
        self._word_to_id = {}
        # -1 means "no such entry was found in the vocabulary file".
        self._unk = -1
        self._eos = -1
        self._word_num_threshold = word_num_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Load the vocabulary file and fill both lookup tables.

        Raises:
            Exception: if the same word appears twice among the kept entries.
            ValueError: if a non-blank line is not ``word<TAB>count``.
        """
        with gfile.GFile(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            line = line.strip('\r\n')
            if not line.strip():
                # Blank (e.g. trailing) lines carry no entry.
                continue
            word, occurence = line.split('\t')
            occurence = int(occurence)
            if word != '<UNK>' and occurence < self._word_num_threshold:
                continue
            idx = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = idx
            elif word == '.':
                # The full stop doubles as the end-of-sentence token.
                self._eos = idx
            if idx in self._id_to_word or word in self._word_to_id:
                raise Exception('duplicate words in vocab file')
            self._word_to_id[word] = idx
            self._id_to_word[idx] = word

    @property
    def unk(self):
        """Id of the ``<UNK>`` entry (-1 if the file had none)."""
        return self._unk

    @property
    def eos(self):
        """Id of the ``.`` entry (-1 if the file had none)."""
        return self._eos

    def word_to_id(self, word):
        """Return the id of ``word``, or the ``<UNK>`` id for unknown words."""
        return self._word_to_id.get(word, self.unk)

    def id_to_word(self, cur_id):
        """Return the word for ``cur_id``, or ``'<UNK>'`` for unknown ids."""
        return self._id_to_word.get(cur_id, '<UNK>')

    def size(self):
        """Return the number of words kept in the vocabulary."""
        return len(self._word_to_id)

    def encode(self, sentence):
        """Convert a sentence into a list of word ids.

        The sentence is split on runs of whitespace, so repeated or trailing
        spaces do not produce empty tokens (which would otherwise be encoded
        as ``<UNK>``).
        """
        return [self.word_to_id(cur_word) for cur_word in sentence.split()]

    def decode(self, sentence_id):
        """Convert a sequence of word ids into a space-separated string."""
        return ' '.join(self.id_to_word(word_id) for word_id in sentence_id)

In [6]:
def parse_token_file(token_file):
    """Parse the token file into ``{img_name: [description, ...]}``.

    Every non-blank line is expected to hold ``<img_name>#<index>``, a tab,
    and the description text.

    Args:
        token_file: path of the token file; it is opened through ``gfile``.

    Returns:
        dict mapping each image name to the list of its descriptions, in the
        order they appear in the file.

    Raises:
        ValueError: if a non-blank line lacks the tab or the '#' separator.
    """
    img_name_to_tokens = {}
    with gfile.GFile(token_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip('\r\n')
        if not line.strip():
            # Blank (e.g. trailing) lines carry no record.
            continue
        # Split on the first tab / last '#' only, so extra separators inside
        # the description or the image name do not break the unpacking.
        img_id, description = line.split('\t', 1)
        img_name, _ = img_id.rsplit('#', 1)
        img_name_to_tokens.setdefault(img_name, []).append(description)
    return img_name_to_tokens

def convert_token_to_id(img_name_to_tokens, vocab):
    """Encode every description of every image into a list of token ids.

    Returns a dict with the same keys as ``img_name_to_tokens``; each value is
    the list of ``vocab.encode(description)`` results, in the original order.
    """
    return {
        img_name: [vocab.encode(description) for description in descriptions]
        for img_name, descriptions in img_name_to_tokens.items()
    }

In [7]:
# Build the vocabulary, dropping words rarer than the configured threshold.
vocab = Vocab(input_vocab_file, hps.num_vocab_word_threshold)
vocab_size = vocab.size()
logging.info("vocab_size: %d" % vocab_size)

# Captions as raw text and as id sequences, both keyed by image name.
img_name_to_tokens = parse_token_file(input_description_file)
img_name_to_token_ids = convert_token_to_id(img_name_to_tokens, vocab)

# Show the same summary for the text form first, then for the id form.
for caption_table in (img_name_to_tokens, img_name_to_token_ids):
    logging.info("num of all images: %d" % len(caption_table))
    pprint.pprint(list(caption_table.keys())[0:10])
    pprint.pprint(caption_table['2778832101.jpg'])

INFO:tensorflow:vocab_size: 10875
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
['A man in jeans is reclining on a green metal bench along a busy sidewalk and '
 'crowded street .',
 'A white male with a blue sweater and gray pants laying on a sidewalk bench .',
 'A man in a blue shirt and gray pants is sleeping on a sidewalk bench .',
 'A person is sleeping on a bench , next to cars .',
 'A man sleeping on a bench in a city area .']
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
[[3, 9, 4, 132, 8, 3532, 6, 1, 48, 337, 146, 139, 1, 244, 93, 7, 380, 36, 2],
 [3, 20, 179, 11, 1, 26, 284, 7, 120, 128, 297, 6, 1, 93, 146, 2],
 [3, 9, 4

## 训练batch数据生成

In [8]:
class ImageCaptionData(object):
    """Serves training batches of image features with their caption ids.

    Image features are loaded from every file found in ``img_feature_dir``
    (pickled ``(image names, features)`` pairs); captions are looked up by
    image name in ``img_name_to_token_ids``. Only the first caption of each
    image is used.
    """
    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,
                 vocab,
                 deterministic = False):
        """Load all feature files and optionally shuffle the samples.

        Args:
            img_name_to_token_ids: dict mapping image name to a list of
                caption id lists. It is only read, never modified.
            img_feature_dir: directory containing the pickled feature files.
            num_timesteps: fixed caption length of the produced batches.
            vocab: object whose ``eos`` attribute is the padding token id.
            deterministic: when True the samples are never shuffled.
        """
        self._vocab = vocab
        self._all_img_feature_filepaths = []
        for filename in gfile.ListDirectory(img_feature_dir):
            self._all_img_feature_filepaths.append(os.path.join(img_feature_dir, filename))
        pprint.pprint(self._all_img_feature_filepaths)

        self._img_name_to_token_ids = img_name_to_token_ids
        self._num_timesteps = num_timesteps  # target decoding length
        self._indicator = 0                  # start offset of the next batch
        self._deterministic = deterministic  # True disables random shuffling
        self._img_feature_filenames = []
        self._img_feature_data = []
        self._load_img_feature_pickle()
        # Shuffle exactly once here; the loader itself no longer shuffles.
        if not self._deterministic:
            self._random_shuffle()

    def _load_img_feature_pickle(self):
        """Read every feature file and merge the contents into two arrays."""
        for filepath in self._all_img_feature_filepaths:
            logging.info("loading %s" % filepath)
            with gfile.GFile(filepath, 'rb') as f:
                # NOTE: unpickling can execute arbitrary code; only load
                # feature files that come from a trusted source.
                filenames, features = pickle.load(f, encoding='iso-8859-1')
                self._img_feature_filenames += filenames
                self._img_feature_data.append(features)
        self._img_feature_data = np.vstack(self._img_feature_data)
        origin_shape = self._img_feature_data.shape
        # Flatten each image's feature to one vector: keep the first and the
        # fourth axis (this requires the two middle axes to have size 1).
        self._img_feature_data = np.reshape(
            self._img_feature_data, (origin_shape[0], origin_shape[3]))
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)
        print(self._img_feature_data.shape)
        print(self._img_feature_filenames.shape)

    def size(self):
        """Return the number of loaded images."""
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        """Return the length of one image feature vector."""
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        """Apply one random permutation to names and features alike."""
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    def _img_desc(self, filenames):
        """Build fixed-length caption ids and loss weights for ``filenames``.

        Captions longer than ``num_timesteps`` are truncated; shorter ones are
        padded with the vocabulary's ``eos`` id. The weight is 1 for real
        tokens and 0 for padding.

        Returns:
            (batch_sentence_ids, batch_weights): two arrays with one row per
            file name and ``num_timesteps`` columns.
        """
        batch_sentence_ids = []
        batch_weights = []
        for filename in filenames:
            token_ids_set = self._img_name_to_token_ids[filename]
            # Only the first caption is used (a random one could be picked
            # instead). Slicing makes a copy, so the padding below never
            # touches the list stored in the shared caption table; padding
            # that list in place would make later epochs see an already
            # padded caption and give its padding a weight of 1.
            chosen_token_ids = token_ids_set[0][0:self._num_timesteps]
            chosen_token_length = len(chosen_token_ids)
            remaining_length = self._num_timesteps - chosen_token_length
            # Keep the training data at a fixed length.
            weight = [1] * chosen_token_length + [0] * remaining_length
            chosen_token_ids = chosen_token_ids + [self._vocab.eos] * remaining_length
            batch_sentence_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_sentence_ids, batch_weights

    def next(self, batch_size):
        """Return the next batch, restarting from the beginning when needed.

        When fewer than ``batch_size`` samples remain, they are skipped, the
        data is reshuffled (unless deterministic) and the batch is taken from
        the start.

        Returns:
            (batch_img_features, batch_sentence_ids, batch_weights,
            batch_img_names)

        Raises:
            AssertionError: if ``batch_size`` exceeds the number of samples.
        """
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)

        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names

In [9]:
# Load every extracted feature file and pair it with the encoded captions.
caption_data = ImageCaptionData(img_name_to_token_ids, input_img_feature_dir, hps.num_timesteps, vocab)
img_feature_dim = caption_data.img_feature_size()
caption_data_size = caption_data.size()
logging.info("img_feature_dim: %d" % img_feature_dim)
logging.info("caption_data_size: %d" % caption_data_size)

# Draw a small batch and print its four parts in order:
# features, sentence ids, weights, image names.
sample_batch = caption_data.next(5)
batch_img_features, batch_sentence_ids, batch_weights, batch_img_names = sample_batch
for batch_part in sample_batch:
    pprint.pprint(batch_part)

['./data/image_caption_data/feature_extraction_inception_v3\\image_features-0.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-1.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-10.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-11.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-12.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-13.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-14.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-15.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-16.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-17.pickle',
 './data/image_caption_data/feature_extraction_inception_v3\\image_features-18.pickle',
 './data/image_caption_data/featur

## 定义网络结构

In [10]:
# from AdaBound import AdaBoundOptimizer
# tf.reset_default_graph()

def create_rnn_cell(hidden_dim, cell_type):
    """Create one recurrent cell with ``hidden_dim`` units.

    ``cell_type`` must be 'lstm' or 'gru'; any other value raises an
    ``Exception``.
    """
    if cell_type not in ('lstm', 'gru'):
        raise Exception("%s has not been supported" % cell_type)
    if cell_type == 'gru':
        return tf.contrib.rnn.GRUCell(hidden_dim)
    return tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)

        
def dropout(cell, keep_prob):
    """Wrap ``cell`` so that dropout with ``keep_prob`` is applied to its outputs."""
    wrapped_cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return wrapped_cell


def get_train_model(hps, vocab_size, img_feature_dim):
    """Build the training graph of the image-caption model.

    The image feature is projected to the embedding size and used as the
    first input step; the embedded caption tokens (all but the last one) form
    the remaining steps. A stacked recurrent network followed by two dense
    layers produces per-step logits over the vocabulary.

    Args:
        hps: hyper-parameters; reads num_timesteps, batch_size,
            num_embedding_nodes, num_lstm_nodes, num_lstm_layers, cell_type,
            num_fc_nodes, clip_lstm_grads and learning_rate.
        vocab_size: number of rows of the embedding table and size of the
            logits.
        img_feature_dim: length of one image feature vector.

    Returns:
        A 3-tuple of
        (img_feature, sentence, mask, keep_prob) -- the input placeholders,
        (loss, accuracy, train_op) -- masked mean loss, masked accuracy and
        the optimisation op,
        global_step -- the step counter variable.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    img_feature  = tf.placeholder(tf.float32, (batch_size, img_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    mask = tf.placeholder(tf.float32, (batch_size, num_timesteps))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)
    
    # prediction process:
    #    sentence: [a, b, c, ...]
    #    input: [img, a, b, ...]
    #    img_feature: [0.12, 0.123, ...]
    #    predict: img_feature -> embedding_img -> lstm -> a
    #             a -> embedding_word -> lstm -> b
    
    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)
        # Take the tokens up to the second-to-last one, because the first
        # prediction step is fed the image instead of a token.
        # [batchsize, num_timesteps-1, num_embedding_nodes]
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:num_timesteps-1])

    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed', initializer=img_feature_embed_init):
        # Use the image feature as the input of the first step.
        # [batchsize, num_embedding_nodes]
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        # [batchsize, 1, num_embedding_nodes]
        embed_img = tf.expand_dims(embed_img, 1)
        # [batchsize, num_timesteps, num_embedding_nodes]
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)

    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_node[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
                                           embed_inputs,
                                           initial_state=initial_state)

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        # Merge the batch and time axes so the dense layers see one row per step.
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
        fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        fc1_dropout = tf.nn.relu(fc1_dropout)
        logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')

    with tf.variable_scope('loss'):
        # Flatten labels and mask so they line up with the rows of `logits`.
        sentence_flatten = tf.reshape(sentence, [-1])
        mask_flatten = tf.reshape(mask, [-1])
        mask_sum = tf.reduce_sum(mask_flatten)
        
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=sentence_flatten)
        # Multiplying by the mask zeroes the loss wherever the mask is 0.
        weighted_softmax_loss = tf.multiply(softmax_loss,
                                            tf.cast(mask_flatten, tf.float32))
        
        prediction = tf.argmax(logits, 1, output_type = tf.int32)
        correct_prediction = tf.equal(prediction, sentence_flatten)
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32),
            mask_flatten)
        
        # Reference metrics, both normalised by the sum of the mask.
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum
        loss = tf.reduce_sum(weighted_softmax_loss) / mask_sum
        tf.summary.scalar('loss', loss)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        # Clip the gradients by their global norm before they are applied.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return ((img_feature, sentence, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)

# Build the training graph and unpack its placeholders and metric ops.
placeholders, metrics, global_step = get_train_model(hps, vocab_size, img_feature_dim)
img_feature, sentence, mask, keep_prob = placeholders
loss, accuracy, train_op = metrics

# One op that merges every summary registered while the graph was built.
summary_op = tf.summary.merge_all()

init_op = tf.global_variables_initializer()
# Saver configured with max_to_keep=10.
saver = tf.train.Saver(max_to_keep=10)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
INFO:tensorflow:variable name: embedding/embeddings:0
INFO:tensorflow:variable name: image_feature_embed/dense/kernel:0
INFO:tensorflow:variable name: image_feature_embed/dense/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/fc1/kernel:0
INFO:tensorflow:variable name: fc/fc1/bias:0
INFO:tensorflow:variable name: fc/logits/kernel:0
INFO:tensorflow:variable name: fc/logits/bias:0
INFO:tensorflow:Summary name embedding/embeddings:0_grad is illegal; using embedding/embeddings_0_grad instead.
INFO:tensorflow:Summary name ima

## training

In [11]:
training_steps = 10000

with tf.Session() as sess:
    sess.run(init_op)
    writer = tf.summary.FileWriter(output_dir, sess.graph)
    try:
        for i in range(training_steps):
            batch_img_features, batch_sentence_ids, batch_weights, _ = caption_data.next(hps.batch_size)
            input_vals = (batch_img_features, batch_sentence_ids, batch_weights, hps.keep_prob)

            # Pair each placeholder with the value fed to it for this step.
            feed_dict = dict(zip(placeholders, input_vals))
            fetches = [global_step, loss, accuracy, train_op]

            should_log = (i + 1) % hps.log_frequent == 0
            should_save = (i + 1) % hps.save_frequent == 0

            # The summaries are only evaluated on the steps that get logged.
            if should_log:
                fetches.append(summary_op)

            outputs = sess.run(fetches, feed_dict)
            global_step_val, loss_val, accuracy_val = outputs[0:3]

            if should_log:
                # Index 4 is the summary appended to the four regular fetches.
                summary_str = outputs[4]
                writer.add_summary(summary_str, global_step_val)
                logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
                             % (global_step_val, loss_val, accuracy_val))
            if should_save:
                logging.info("Step: %d, image caption model saved" % (global_step_val))
                saver.save(sess, os.path.join(output_dir, "image_caption"), global_step=global_step_val)
    finally:
        # Flush buffered summaries and release the event file, even when the
        # training is interrupted or fails.
        writer.close()

INFO:tensorflow:Step:   100, loss: 6.098, accuracy: 0.102
INFO:tensorflow:Step:   200, loss: 5.526, accuracy: 0.138
INFO:tensorflow:Step:   300, loss: 5.283, accuracy: 0.174
INFO:tensorflow:Step:   400, loss: 4.881, accuracy: 0.211
INFO:tensorflow:Step:   500, loss: 4.879, accuracy: 0.190
INFO:tensorflow:Step:   600, loss: 4.888, accuracy: 0.218
INFO:tensorflow:Step:   700, loss: 4.454, accuracy: 0.272
INFO:tensorflow:Step:   800, loss: 4.789, accuracy: 0.234
INFO:tensorflow:Step:   900, loss: 4.708, accuracy: 0.218
INFO:tensorflow:Step:  1000, loss: 4.764, accuracy: 0.216
INFO:tensorflow:Step: 1000, image caption model saved
INFO:tensorflow:Step:  1100, loss: 4.440, accuracy: 0.250
INFO:tensorflow:Step:  1200, loss: 4.557, accuracy: 0.226
INFO:tensorflow:Step:  1300, loss: 4.277, accuracy: 0.266
INFO:tensorflow:Step:  1400, loss: 4.754, accuracy: 0.218
INFO:tensorflow:Step:  1500, loss: 4.429, accuracy: 0.230
INFO:tensorflow:Step:  1600, loss: 4.411, accuracy: 0.258
INFO:tensorflow:St

# Eval

In [12]:
# Start from an empty default graph before the evaluation graph is built below.
tf.reset_default_graph()

def get_default_params():
    """Return the hyper-parameters used for evaluation (batch size of 1)."""
    defaults = dict(
        num_vocab_word_threshold=3,
        num_embedding_nodes=32,
        num_timesteps=10,
        num_lstm_nodes=[64, 64],
        num_lstm_layers=2,
        num_fc_nodes=32,
        batch_size=1,   # one image at a time
        cell_type='lstm',
        clip_lstm_grads=1.0,
        learning_rate=0.001,
        keep_prob=0.8,
        log_frequent=100,
        save_frequent=1000,
    )
    return tf.contrib.training.HParams(**defaults)

# Hyper-parameters (evaluation variant defined just above)
hps = get_default_params()

def create_rnn_cell(hidden_dim, cell_type):
    """Create one recurrent cell with ``hidden_dim`` units.

    ``cell_type`` must be 'lstm' or 'gru'; any other value raises an
    ``Exception``.
    """
    if cell_type not in ('lstm', 'gru'):
        raise Exception("%s has not been supported" % cell_type)
    if cell_type == 'gru':
        return tf.contrib.rnn.GRUCell(hidden_dim)
    return tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)

def dropout(cell, keep_prob):
    """Wrap ``cell`` so that dropout with ``keep_prob`` is applied to its outputs."""
    wrapped_cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return wrapped_cell

def eval_get_embedding_for_img(hps, img_feature_dim):
    """Build the image-embedding part of the evaluation graph.

    Returns:
        (img_feature, embed_img): the float placeholder of shape
        (1, img_feature_dim) and its dense projection to
        ``hps.num_embedding_nodes`` units with an extra axis inserted at
        position 1.
    """
    img_feature = tf.placeholder(tf.float32, (1, img_feature_dim))
    scope_initializer = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed', initializer=scope_initializer):
        projected_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        embed_img = tf.expand_dims(projected_img, 1)
    return img_feature, embed_img

def eval_embedding_lookup(hps, vocab_size):
    """Build the word-embedding lookup of the evaluation graph.

    Returns:
        (word, embed_word): the int32 placeholder of shape (1, 1) holding a
        word id, and the matching row(s) of the embedding table.
    """
    word = tf.placeholder(tf.int32, (1, 1))
    # The embedding table lives in the 'embedding' variable scope.
    table_shape = [vocab_size, hps.num_embedding_nodes]
    table_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=table_initializer):
        embeddings = tf.get_variable('embeddings', table_shape, tf.float32)
        embed_word = tf.nn.embedding_lookup(embeddings, word)
    return word, embed_word

def eval_lstm_single_step(hps, vocab_size):
    """Build a graph that advances the recurrent network by a single step.

    The recurrent state is passed in and out as one flat tensor so the caller
    can feed the state returned by one step into the next one. The state is
    packed as ``LSTMStateTuple`` objects, so LSTM cells are assumed.

    Args:
        hps: hyper-parameters; reads num_embedding_nodes, num_lstm_layers,
            num_lstm_nodes, cell_type and num_fc_nodes.
        vocab_size: size of the output logits.

    Returns:
        (embed_input, rnn_output, logits, input_state, output_state,
        num_hidden_states) where embed_input is the (1, 1, embedding) input
        placeholder, input_state the (1, num_hidden_states) state placeholder,
        output_state the flattened state after the step, and
        num_hidden_states the total width of the flattened state.
    """
    embed_input = tf.placeholder(tf.float32, (1, 1, hps.num_embedding_nodes))
    # Every layer contributes two state tensors of the layer's size.
    state_sizes = []
    for i in range(hps.num_lstm_layers):
        state_sizes.extend([hps.num_lstm_nodes[i]] * 2)

    num_hidden_states = sum(state_sizes)
    input_state = tf.placeholder(tf.float32, (1, num_hidden_states))
    unpack_init_state = tf.split(input_state, state_sizes, axis=1)
    # Re-pack consecutive pairs of slices into one state tuple per layer.
    input_tuple_state = tuple(
        tf.nn.rnn_cell.LSTMStateTuple(unpack_init_state[i], unpack_init_state[i + 1])
        for i in range(0, len(unpack_init_state), 2))

    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            # keep_prob is fixed to 1.0 here; the wrapper is still applied as
            # in the training graph.
            cell = dropout(cell, 1.0)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        rnn_output, output_tuple_state = tf.nn.dynamic_rnn(
            cell,
            embed_input,
            initial_state=input_tuple_state)
        # Flatten the per-layer state tuples back into a single tensor, in the
        # same order in which input_state was split.
        output_state = []
        for state in output_tuple_state:
            output_state.append(state[0])
            output_state.append(state[1])
        output_state = tf.concat(output_state, axis=1, name="output_state")

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        rnn_output_2d = tf.reshape(rnn_output, [-1, hps.num_lstm_nodes[-1]])
        fc1 = tf.layers.dense(rnn_output_2d, hps.num_fc_nodes, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, 1.0)
        fc1_dropout = tf.nn.relu(fc1_dropout)
        logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')

    return embed_input, rnn_output, logits, input_state, output_state, num_hidden_states


# Build the three parts of the evaluation graph: image embedding, word
# embedding lookup and a single recurrent step with the output layers.
img_feature, embed_img = eval_get_embedding_for_img(hps, img_feature_dim)
word, embed_word = eval_embedding_lookup(hps, vocab_size)
embed_input, rnn_output, logits, input_state, output_state, num_hidden_states = eval_lstm_single_step(hps, vocab_size)

summary_op = tf.summary.merge_all()

init_op = tf.global_variables_initializer()
# Saver over the evaluation graph's variables; used below to restore a checkpoint.
saver = tf.train.Saver(max_to_keep=10)

In [13]:
test_examples = 1

with tf.Session() as sess:
    sess.run(init_op)
    logging.info("[*] Reading checkpoint ...")
    ckpt = tf.train.get_checkpoint_state(output_dir)
    # Without a usable checkpoint there is nothing to evaluate.
    if not (ckpt and ckpt.model_checkpoint_path):
        raise Exception("[*] Failed load checkpoint")
    ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
    saver.restore(sess, os.path.join(output_dir, ckpt_name))
    logging.info("[*] Success Read Checkpoint From %s" % (ckpt_name))

    for i in range(test_examples):
        single_img_features, single_sentence_ids, single_weights, single_img_name = caption_data.next(hps.batch_size)
        print(single_img_name)

        # Reference captions of the sampled image, as text and as ids.
        pprint.pprint(img_name_to_tokens[single_img_name[0]])
        pprint.pprint(img_name_to_token_ids[single_img_name[0]])

        # The first input step is the embedded image; the state starts at zero.
        embed_img_val = sess.run(embed_img, feed_dict={img_feature: single_img_features})
        state_val = np.zeros((1, num_hidden_states))
        embed_input_val = embed_img_val
        generated_sequence = []

        for j in range(hps.num_timesteps):
            step_feed = {embed_input: embed_input_val, input_state: state_val}
            logits_val, state_val = sess.run([logits, output_state],
                                             feed_dict=step_feed)
            # Greedy decoding: keep the highest-scoring word and feed its
            # embedding back in as the next input.
            predicted_word_id = np.argmax(logits_val[0])
            generated_sequence.append(predicted_word_id)
            embed_input_val = sess.run(embed_word,
                                       feed_dict={word: [[predicted_word_id]]})
        pprint.pprint("generated words: ")
        pprint.pprint(generated_sequence)
        pprint.pprint(vocab.decode(generated_sequence))

INFO:tensorflow:[*] Reading checkpoint ...
INFO:tensorflow:Restoring parameters from ./data/image_caption_data/local_run\image_caption-10000
INFO:tensorflow:[*] Success Read Checkpoint From image_caption-10000
['2667078298.jpg']
['A girl in a short v-neck blue dress and high heel sandals is carrying a '
 'bouquet of calla lilies down and aisle with a man in a tuxedo .',
 'A woman in a blue dress and a man in a suit walk down the aisle together at '
 'a wedding ceremony .',
 'A bridesmaid in a knee-length blue dress walks down the aisle with a '
 'grooms-man .',
 'A woman with flowers in her hand and a man walk on a walkway arm in arm .',
 'A bridesmaid and groomsmen walk down the aisle .']
[[3,
  30,
  4,
  1,
  464,
  6627,
  26,
  117,
  7,
  292,
  6683,
  762,
  8,
  141,
  1,
  1556,
  10,
  0,
  9989,
  37,
  7,
  1874,
  11,
  1,
  9,
  4,
  1,
  2304,
  2],
 [3,
  13,
  4,
  1,
  26,
  117,
  7,
  1,
  9,
  4,
  1,
  191,
  150,
  37,
  5,
  1874,
  140,
  17,
  1,
  602,
  154

# 其他模型

[Show, Attend and Tell](https://github.com/yunjey/show-attend-and-tell)

[Show, Attend and Tell (TensorFlow)](https://github.com/jazzsaxmafia/show_attend_and_tell.tensorflow)