In [None]:
#图像生成文字
'''
载入词表，构造两种映射关系
word to id
id to word

将image_description的字典转换为{'图像名': [[1,2,3,45467,123...],[24,56,...]]}

载入图像特征
构建batch，随机挑选出一个描述


构建计算图

'''

In [19]:
import math
import os
import pickle
import pprint
import random
import sys

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1 import gfile
from tensorflow.compat.v1 import logging

tf.disable_v2_behavior()

# Caption file: parsed by parse_token_file as one "<image>#<index>\t<caption>" record per line.
input_descroption_file = '../../datasets/image_caption_data/results_20130124.token'
# Directory of pickled image features; every file in it is loaded by ImageCaptionData.
input_img_feature_dir = '../../datasets/image_caption_data/feature_extraction_inception_v3/'
# Vocabulary file: read by Vocab as one "<word>\t<frequency>" record per line.
input_vocab_file = '../../datasets/image_caption_data/vocab.txt'
# Output directory for this run; it is only created here, nothing visible in this notebook writes to it yet.
output_dir = '../../datasets/image_caption_data/local_run'

# Create the output directory on first use.
if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

In [2]:
class get_default_params():
    """Plain attribute container holding the default hyper-parameters.

    :param name: label stored on the instance as ``name``.
    """

    def __init__(self,name):
        self.name = name
        # (attribute, default value) pairs, assigned in this order.
        defaults = (
            ('num_vocab_threshold', 3),    # minimum word frequency passed to Vocab
            ('num_embedding_node', 32),    # width of the word / image embedding
            ('num_timesteps', 10),         # caption length after truncation / padding
            ('num_lstm_nodes', [64, 64]),  # hidden size of each recurrent layer
            ('num_lstm_layer', 2),         # number of stacked recurrent layers
            ('num_fc_nodes', 32),          # width of the dense layer before the logits
            ('batch_size', 100),
            ('cell_type', 'lstm'),         # 'lstm' or 'gru', see creat_rnn_cell
            ('learning_rate', 0.1),        # passed to the Adam optimizer
            ('keep_prob', 0.8),            # dropout keep probability fed during training
            ('log_frequent', 500),         # presumably the logging interval in steps -- TODO confirm
            ('save_frequent', 5000),       # presumably the checkpoint interval in steps -- TODO confirm
        )
        for attribute, value in defaults:
            setattr(self, attribute, value)

# Number of iterations executed by the training loop at the end of the notebook.
train_steps = 10000
# Hyper-parameter set shared by the vocabulary, the data provider and the model.
hps = get_default_params('hps1')
  

In [3]:
class Vocab(object):
    """Two-way mapping between words and integer ids, loaded from a vocabulary file.

    Each line of the file is ``<word>\\t<frequency>``. Words whose frequency is
    below ``word_num_threshold`` are skipped; the remaining words receive
    consecutive ids in file order. The id of ``'<UNK>'`` is exposed as ``unk``
    and the id of ``'.'`` as ``eos``; both stay at -1 when the word is absent.
    """

    def __init__(self,filename,word_num_threshold):
        self._word_num_threshold = word_num_threshold
        self._unk = -1
        self._eos = -1
        self._word_to_id = {}
        self._id_to_word = {}
        self.read_file(filename)

    def read_file(self,filename):
        """Populate both mappings from ``filename``.

        :raises Exception: when a word (or id) is encountered twice.
        """
        with gfile.GFile(filename,'r') as f:
            records = f.readlines()
        for record in records:
            word, count = record.strip('\r\n').split('\t')
            if int(count) < self._word_num_threshold:
                continue
            # The next free id is the number of words accepted so far.
            next_id = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = next_id
            elif word == '.':
                self._eos = next_id
            already_known = next_id in self._id_to_word or word in self._word_to_id
            if already_known:
                raise Exception('duplicate words in vocab file')
            self._id_to_word[next_id] = word
            self._word_to_id[word] = next_id

    @property
    def unk(self):
        """Id of the ``'<UNK>'`` word (-1 if the file did not contain it)."""
        return self._unk

    @property
    def eos(self):
        """Id of the ``'.'`` word (-1 if the file did not contain it)."""
        return self._eos

    def word_to_id(self,word):
        """Return the id of ``word``, or ``unk`` for a word not in the vocabulary."""
        if word in self._word_to_id:
            return self._word_to_id[word]
        return self.unk

    def id_to_word(self,id):
        """Return the word stored under ``id``, or ``'<UNK>'`` for an unknown id."""
        if id in self._id_to_word:
            return self._id_to_word[id]
        return '<UNK>'

    def size(self):
        """Number of words kept in the vocabulary."""
        return len(self._word_to_id)

    def encode(self,sentence):
        """Split ``sentence`` on single spaces and map every token to its id."""
        return list(map(self.word_to_id, sentence.split(' ')))

    def decode(self,sentence_id):
        """Map every id in ``sentence_id`` to its word and join them with spaces."""
        return ' '.join(map(self.id_to_word, sentence_id))


# Build the vocabulary, skipping words whose frequency is below hps.num_vocab_threshold.
vocab = Vocab(input_vocab_file,hps.num_vocab_threshold)
vocab_size = vocab.size()
logging.info('vocab_size is:%d' %vocab_size)
# Smoke test of both directions: words missing from the vocabulary encode to vocab.unk.
pprint.pprint(vocab.encode('i hava a dream'))
pprint.pprint(vocab.decode([5,20,3,4]))


INFO:tensorflow:vocab_size is:10875
[3838, 0, 1, 0]
'the white A in'


In [4]:
def parse_token_file(token_file):
    """Group the captions of a token file by image name.

    Every line is expected to look like ``<image name>#<index>\\t<caption>``.

    :param token_file: path of the caption file.
    :return: dict mapping each image name to the list of its captions,
             in file order.
    """
    with gfile.GFile(token_file,'r') as f:
        records = f.readlines()
    captions_by_image = {}
    for record in records:
        image_id, caption = record.strip('\r\n').split('\t')
        # Drop the "#<index>" suffix so all captions of an image share one key.
        image_name, _ = image_id.split('#')
        captions_by_image.setdefault(image_name, []).append(caption)
    return captions_by_image


In [5]:
def convert_token_to_id(img_name_to_token,vocab):
    """Encode every caption of every image into a list of word ids.

    :param img_name_to_token: dict mapping an image name to its list of captions.
    :param vocab: object whose ``encode(caption)`` returns the list of word ids.
    :return: dict with the same keys, each mapped to the list of encoded captions.
    """
    return {
        image_name: [vocab.encode(caption) for caption in captions]
        for image_name, captions in img_name_to_token.items()
    }

# Parse the caption file, then convert every caption into a list of word ids.
img_name_to_token = parse_token_file(input_descroption_file)
img_name_to_token_id = convert_token_to_id(img_name_to_token,vocab)
# Optional sanity checks (disabled): number of images and the captions / ids of one image.
#logging.info(len(img_name_to_token))
#pprint.pprint(img_name_to_token['2778832101.jpg'])
#logging.info(len(img_name_to_token_id))
#pprint.pprint(img_name_to_token_id['2778832101.jpg'])

In [6]:
class ImageCaptionData(object):
    '''
    Data provider: serves batches of (image feature, caption ids, caption mask).
    '''
    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,
                 vocab,
                 deterministic=False):
        '''
        :param img_name_to_token_ids: dict mapping an image name to its list of
                                      encoded captions (lists of word ids)
        :param img_feature_dir: directory containing the pickled image features
        :param num_timesteps: length every caption is truncated / padded to
        :param vocab: vocabulary; its ``eos`` id is used as the padding id
        :param deterministic: when True the data is never shuffled
        '''
        self._vocab = vocab
        # Full paths of every feature file found in the directory.
        self._all_img_feature_filepaths = [
            os.path.join(img_feature_dir, filename)
            for filename in gfile.ListDirectory(img_feature_dir)]

        self._img_name_to_token_ids = img_name_to_token_ids
        self._num_timesteps = num_timesteps
        self._indicator = 0  # start offset of the next batch
        self._deterministic = deterministic
        self._img_feature_filenames = []  # image name of every feature row
        self._img_feature_data = []  # image features, one row per image
        self._load_img_feature_pickle()
        # Shuffle exactly once after loading (the loader itself does not shuffle).
        if not self._deterministic:
            self._random_shuffle()

    def _load_img_feature_pickle(self):
        '''
        Read every feature file and stack the features into one 2-D array.

        Each pickle is expected to hold a ``(filenames, features)`` pair where
        ``features`` has four dimensions; dimensions 1 and 2 are dropped by
        the reshape below.
        :return:
        '''
        for filepath in self._all_img_feature_filepaths:
            with gfile.GFile(filepath, 'rb') as f:
                # NOTE(security): pickle.load executes arbitrary code embedded
                # in the file -- only load feature files you generated yourself.
                filenames, features = pickle.load(f, encoding='iso-8859-1')
            self._img_feature_filenames.extend(filenames)
            self._img_feature_data.append(features)
        # e.g. [(1000, 1, 1, 2048), (1000, 1, 1, 2048)] is stacked to (2000, 1, 1, 2048)
        self._img_feature_data = np.vstack(self._img_feature_data)
        origin_shape = self._img_feature_data.shape
        # Keep the first and last dimension: (N, 1, 1, D) -> (N, D)
        self._img_feature_data = np.reshape(
            self._img_feature_data, (origin_shape[0], origin_shape[3]))
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)
        print(self._img_feature_data.shape) # (31783, 2048)
        print(self._img_feature_filenames.shape) # (31783,)

    def size(self):
        # Number of images
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        # Dimension of one image feature vector
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        # Apply the same permutation to names and features so rows stay aligned.
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    def _img_desc(self, filenames):
        '''
        Pick one random caption per image and bring it to ``num_timesteps`` ids.

        Captions that are too long are truncated; short ones are padded with
        the vocabulary's ``eos`` id. The weight row holds 1 for real tokens and
        0 for padding.

        The chosen caption is copied before padding: extending it in place
        would modify the list stored in the caption dictionary, so the next
        time the same caption is drawn its padding would be treated as real
        tokens (an all-ones weight row).

        :param filenames: image names of the batch
        :return: (batch_sentence_ids, batch_weights) as numpy arrays
        '''
        batch_sentence_ids = []
        batch_weights = []
        for filename in filenames:
            token_ids_set = self._img_name_to_token_ids[filename]
            chosen_token_ids = random.choice(token_ids_set)
            # Slicing + list() yields a private copy, truncated if necessary.
            token_ids = list(chosen_token_ids[:self._num_timesteps])
            real_length = len(token_ids)
            padding_length = self._num_timesteps - real_length
            batch_sentence_ids.append(
                token_ids + [self._vocab.eos] * padding_length)
            batch_weights.append([1] * real_length + [0] * padding_length)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_sentence_ids, batch_weights

    def next(self, batch_size):
        '''
        Return the next ``batch_size`` samples.

        When fewer than ``batch_size`` samples remain, the remainder is
        skipped, the data is reshuffled (unless deterministic) and reading
        restarts from the beginning.

        :param batch_size: number of samples to return; must not exceed size()
        :return: (image features, caption ids, caption weights, image names)
        '''
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]

        # batch_sentence_ids: captions as ids, e.g. [100, 101, 102, 0, 0, 0]
        # batch_weights: matching mask,       e.g. [1, 1, 1, 0, 0, 0]
        # The mask is multiplied into the loss so padded positions do not contribute.
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)

        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names
# Build the data provider (shuffled, since deterministic defaults to False) and report its dimensions.
caption_data = ImageCaptionData(img_name_to_token_id, input_img_feature_dir, hps.num_timesteps, vocab)
img_feature_dim = caption_data.img_feature_size()
logging.info('dim:%d' %img_feature_dim)
logging.info('data_size:%d' %caption_data.size())


(31783, 2048)
(31783,)
INFO:tensorflow:dim:2048
INFO:tensorflow:data_size:31783


In [7]:
def creat_rnn_cell(hidden_dim,cell_type):
    """Create one recurrent cell.

    :param hidden_dim: number of units of the cell.
    :param cell_type: ``'lstm'`` for a BasicLSTMCell or ``'gru'`` for a GRUCell.
    :return: the new cell.
    :raises ValueError: for any other ``cell_type`` (previously ``None`` was
                        returned silently and the failure surfaced later).
    """
    if cell_type == 'lstm':
        # Python's boolean literal is ``True``; lowercase ``true`` is a NameError.
        return tf.nn.rnn_cell.BasicLSTMCell(hidden_dim,state_is_tuple=True)
    if cell_type == 'gru':
        return tf.nn.rnn_cell.GRUCell(hidden_dim)
    raise ValueError('unsupported cell_type: %r' % (cell_type,))

def drop_out(cell,keep_drop):
    """Wrap ``cell`` in a DropoutWrapper.

    :param cell: recurrent cell to wrap.
    :param keep_drop: keep probability, passed as the wrapper's second
                      positional argument (presumably ``input_keep_prob`` --
                      confirm against the TensorFlow signature).
    :return: the wrapped cell.
    """
    wrapped_cell = tf.nn.rnn_cell.DropoutWrapper(cell, keep_drop)
    return wrapped_cell

In [None]:
def creat_train_model(hps,vocab_size,img_feature_dim):
    """Build the training graph of the captioning model.

    The projected image feature is fed to the recurrent network at step 0 and
    the embeddings of words 0 .. num_timesteps-2 at steps 1 .. num_timesteps-1,
    so the output of step t is trained to predict word t of the caption.

    :param hps: hyper-parameter object (see get_default_params).
    :param vocab_size: number of words, i.e. the width of the logits.
    :param img_feature_dim: dimension of one image feature vector.
    :return: ``((image_feature, sentence, mask, keep_prob),
               (loss, acc, train_op),
               global_step)``
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    image_feature = tf.placeholder(tf.float32,[batch_size,img_feature_dim])
    sentence = tf.placeholder(tf.int64,[batch_size,num_timesteps])
    mask = tf.placeholder(tf.float32,[batch_size,num_timesteps])
    keep_prob = tf.placeholder(tf.float32,name = 'keep_prob')

    global_step = tf.Variable(tf.zeros([],tf.int64),name='global_step',trainable=False)

    init_embedding = tf.random_uniform_initializer(-1.0,1.0)
    with tf.variable_scope('embedding',initializer=init_embedding):
        matrix_embedding = tf.get_variable('embedding',[vocab_size,hps.num_embedding_node],tf.float32)
        # embed_look: [batch_size, num_timesteps-1, num_embedding_node].
        # Words 0..T-2 are the inputs; using words 1..T-1 would feed the very
        # word that is the label of the same step.
        embed_look = tf.nn.embedding_lookup(matrix_embedding,sentence[:,0:num_timesteps-1])

        init_image_feature = tf.uniform_unit_scaling_initializer(factor=1.0)
        with tf.variable_scope('init_image_feature',initializer=init_image_feature):
            # Project the image feature to the embedding width and use it as
            # the input of the first time step.
            embed_img = tf.layers.dense(image_feature,hps.num_embedding_node)
            embed_img = tf.expand_dims(embed_img,1)
            embed_input = tf.concat([embed_img,embed_look],axis=1)

    # The hyper-parameter attribute is ``num_embedding_node`` (singular).
    scale = 1.0 / math.sqrt(hps.num_embedding_node + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm',initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layer):
            cell = creat_rnn_cell(hps.num_lstm_nodes[i],hps.cell_type)
            cell = drop_out(cell,keep_prob)
            cells.append(cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        init_state = cell.zero_state(hps.batch_size,tf.float32)
        # rnn_output: [batch_size, num_timesteps, num_lstm_nodes[-1]]
        rnn_output, _ = tf.nn.dynamic_rnn(cell,embed_input,initial_state=init_state)

    init_fc = tf.uniform_unit_scaling_initializer(factor = 1.0)
    with tf.variable_scope('fc',initializer=init_fc):
        # Flatten batch and time: [batch_size*num_timesteps, num_lstm_nodes[-1]].
        # Rows are ordered image by image, and by time step within an image:
        #   image 1 step 1, image 1 step 2, ..., image 1 step T,
        #   image 2 step 1, ..., image batch_size step T
        rnn_output_reshape = tf.reshape(rnn_output,[-1,hps.num_lstm_nodes[-1]])
        # [batch_size*num_timesteps, num_fc_nodes]
        fc = tf.layers.dense(rnn_output_reshape,hps.num_fc_nodes,name='fc1')
        fc_prob = tf.nn.dropout(fc,keep_prob,name='fc_prob')
        fc_relu = tf.nn.relu(fc_prob)
        # logits: [batch_size*num_timesteps, vocab_size]; each row holds the
        # unnormalised scores over the whole vocabulary for one (image, step)
        # pair, in the same row order as rnn_output_reshape.
        logits = tf.layers.dense(fc_relu,vocab_size,name='logits')

    # Loss: ground-truth ids and mask are flattened in the same
    # (image, step) order as the logits rows.
    sentence_flatten = tf.reshape(sentence,[-1])
    mask_flatten = tf.reshape(mask,[-1])
    mask_sum = tf.reduce_sum(mask_flatten)

    softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sentence_flatten,logits=logits)
    # Zero out the loss of padded positions.
    weighted_softmax_loss = tf.multiply(softmax_loss,mask_flatten)

    # argmax yields int64 by default, the same dtype as the labels, so the two
    # can be compared directly.
    predict = tf.argmax(logits,1)
    correct_predict = tf.equal(predict,sentence_flatten)
    correct_predict_with_mask = tf.multiply(tf.cast(correct_predict,tf.float32),mask_flatten)
    # Both metrics are averaged over the real (unmasked) tokens only.
    acc = tf.reduce_sum(correct_predict_with_mask)/mask_sum
    loss = tf.reduce_sum(weighted_softmax_loss)/mask_sum

    with tf.name_scope('train'):
        tvars = tf.trainable_variables()
        # Clip the global gradient norm to 1.0 before applying the update.
        grads,_ = tf.clip_by_global_norm(tf.gradients(loss,tvars),1.0)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads,tvars),global_step = global_step)

    return ((image_feature,sentence,mask,keep_prob),
            (loss,acc,train_op),
            (global_step)
            )
            
# Build the graph and unpack its handles.
placeholders, metrics, global_step = creat_train_model(hps, vocab_size, img_feature_dim)
img_feature, sentence, mask, keep_prob = placeholders
loss, accuracy, train_op = metrics


# global_variables_initializer replaces the deprecated initialize_all_variables.
global_init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(global_init)
    for i in range(train_steps):
        batch_img_features, batch_sentence_ids, batch_weights, _ = caption_data.next(hps.batch_size)
        # Values are zipped with the placeholders, so the order must match
        # (image_feature, sentence, mask, keep_prob).
        input_vals = (batch_img_features,batch_sentence_ids,batch_weights,hps.keep_prob)
        feed_dict = dict(zip(placeholders,input_vals))
        fetches = [global_step,loss,accuracy,train_op]
        output = sess.run(fetches,feed_dict)
        global_step_val, loss_val, accuracy_val = output[0:3]
        # Report progress every hps.log_frequent steps.
        if global_step_val % hps.log_frequent == 0:
            logging.info('step: %5d, loss: %3.3f, accuracy: %3.3f'
                         % (global_step_val, loss_val, accuracy_val))
        
        
        
        
        
    

    
    
        
        

    
    
    

[1 2 3 4 5 6]


[[1. 0. 1.]
 [0. 1. 0.]]
