# 搭建VGGNet

In [1]:
import os
import math
import numpy as np
import tensorflow as tf
from PIL import Image
import time

# Per-channel mean pixel values taken from the original VGG code; they must be
# subtracted from every input image. The values are stored in B, G, R order:
# VGGNet.build subtracts index 0 from the blue channel, index 1 from the green
# channel and index 2 from the red channel.
VGG_MEAN = [103.939, 116.779, 123.68]

class VGGNet:
    """VGG-16 graph builder that takes its parameters from a pre-trained model.

    ``data_dict`` maps a layer name (for example ``'conv1_2'``) to an indexable
    entry whose item 0 holds the layer weights and item 1 the layer bias.
    Every parameter is placed in the graph with ``tf.constant``.
    """

    def __init__(self, data_dict):
        self.data_dict = data_dict

    def get_conv_filter(self, name):
        """Return the convolution kernel of layer ``name`` as a constant."""
        kernel = self.data_dict[name][0]  # item 0 = weights, item 1 = bias
        return tf.constant(kernel, name='conv')

    def get_fc_weight(self, name):
        """Return the weight matrix of fully connected layer ``name``."""
        weights = self.data_dict[name][0]
        return tf.constant(weights, name='fc')

    def get_bias(self, name):
        """Return the bias of layer ``name`` as a constant."""
        bias = self.data_dict[name][1]
        return tf.constant(bias, name='bias')

    def conv_layer(self, x, name):
        """Build one convolution layer: conv2d -> bias add -> ReLU."""
        with tf.name_scope(name):
            kernel = self.get_conv_filter(name)
            bias = self.get_bias(name)
            # tf.nn.conv2d is the low-level API; the third argument is the
            # stride along each of the four input dimensions.
            conv = tf.nn.conv2d(x, kernel, [1, 1, 1, 1], padding='SAME')
            return tf.nn.relu(tf.nn.bias_add(conv, bias))

    def pooling_layer(self, x, name):
        """Build one max-pooling layer with a 2x2 window and stride 2."""
        window = [1, 2, 2, 1]  # used both as kernel size and as strides
        return tf.nn.max_pool(x,
                              ksize=window,
                              strides=window,
                              padding='SAME',
                              name=name)

    def fc_layer(self, x, name, activation=tf.nn.relu):
        """Build one fully connected layer.

        ``activation`` is applied to ``x @ W + b``; pass ``None`` to get the
        raw affine output.
        """
        with tf.name_scope(name):
            weights = self.get_fc_weight(name)
            bias = self.get_bias(name)
            affine = tf.nn.bias_add(tf.matmul(x, weights), bias)
            return affine if activation is None else activation(affine)

    def flatten_layer(self, x, name):
        """Flatten a 4-D feature map so it can feed a fully connected layer."""
        with tf.name_scope(name):
            # Multiply the sizes of every dimension after the first one.
            flat_size = 1
            for extent in x.get_shape().as_list()[1:]:
                flat_size *= extent
            # -1 lets reshape work out the leading (batch) dimension.
            return tf.reshape(x, [-1, flat_size])

    def build(self, x_rgb):
        """Build VGG16 network structure.

        Parameters:
        - x_rgb: 4-D RGB image tensor, [1, 224, 224, 3] in the VGG setup. The
          last three dimensions are asserted to be [224, 224, 3].

        Every intermediate tensor is stored on the instance (``conv1_1`` ...
        ``pool5``, ``flatten5``, ``fc6``, ``fc7``, ``fc8``, ``prob``). The
        elapsed build time is printed.
        """
        started = time.time()
        print('building model ...')

        # Split along the channel axis into three single-channel tensors,
        # subtract the per-channel mean, and re-assemble them in B, G, R order.
        red, green, blue = tf.split(x_rgb, [1, 1, 1], axis=3)
        centered = [channel - mean
                    for channel, mean in zip((blue, green, red), VGG_MEAN)]
        x_bgr = tf.concat(centered, axis=3)

        # Guard against a wrongly shaped input before stacking the layers.
        assert x_bgr.get_shape().as_list()[1:] == [224, 224, 3]

        conv, pool, fc = self.conv_layer, self.pooling_layer, self.fc_layer

        # Group 1
        self.conv1_1 = conv(x_bgr, 'conv1_1')
        self.conv1_2 = conv(self.conv1_1, 'conv1_2')
        self.pool1 = pool(self.conv1_2, 'pool1')

        # Group 2
        self.conv2_1 = conv(self.pool1, 'conv2_1')
        self.conv2_2 = conv(self.conv2_1, 'conv2_2')
        self.pool2 = pool(self.conv2_2, 'pool2')

        # Group 3
        self.conv3_1 = conv(self.pool2, 'conv3_1')
        self.conv3_2 = conv(self.conv3_1, 'conv3_2')
        self.conv3_3 = conv(self.conv3_2, 'conv3_3')
        self.pool3 = pool(self.conv3_3, 'pool3')

        # Group 4
        self.conv4_1 = conv(self.pool3, 'conv4_1')
        self.conv4_2 = conv(self.conv4_1, 'conv4_2')
        self.conv4_3 = conv(self.conv4_2, 'conv4_3')
        self.pool4 = pool(self.conv4_3, 'pool4')

        # Group 5
        self.conv5_1 = conv(self.pool4, 'conv5_1')
        self.conv5_2 = conv(self.conv5_1, 'conv5_2')
        self.conv5_3 = conv(self.conv5_2, 'conv5_3')
        self.pool5 = pool(self.conv5_3, 'pool5')

        # Author's note: most of the build time goes into the fully connected
        # layers below; comment them out to see the difference.
        self.flatten5 = self.flatten_layer(self.pool5, 'flatten')
        self.fc6 = fc(self.flatten5, 'fc6')
        self.fc7 = fc(self.fc6, 'fc7')
        # fc8 has no activation because softmax is applied right after it.
        self.fc8 = fc(self.fc7, 'fc8', activation=None)
        self.prob = tf.nn.softmax(self.fc8, name='prob')

        elapsed = time.time() - started
        print('building model finished: %4ds' % elapsed)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# This cell only measures how long it takes to build the model graph.
vgg16_npy_path = 'vgg16.npy'
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so the
# parameter file must come from a trusted source.
# The path variable is used here (instead of repeating the literal) so that
# changing vgg16_npy_path actually changes which file is loaded.
data_dict = np.load(vgg16_npy_path, encoding='latin1', allow_pickle=True).item()
vgg16_for_result = VGGNet(data_dict)
content = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])
vgg16_for_result.build(content)


building model ...
building model finished:    6s


# 加载vgg16(参数)，建模

In [3]:
vgg16_npy_path = 'vgg16.npy'  # pre-trained VGG-16 parameter file
content_img_path = 'gugong.jpg'  # content image path; can be changed
style_img_path = 'xingkong.jpeg'  # style image path; can be changed as well

num_steps = 100  # number of optimisation steps
learning_rate = 10  # learning rate handed to the optimiser

# Weight of the content loss; 0 rebuilds the image from style features only.
lambda_c = 0.1
# Weight of the style loss; 0 rebuilds the image from content features only.
# Author's note: the loss values printed during training show why this weight
# is so much larger than lambda_c.
lambda_s = 500

output_dir = './run_style_transfer'  # folder that receives the result images

# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Initialise the image.
def initial_result(shape, mean, stddev):
    """Return a ``tf.Variable`` initialised from a truncated normal distribution.

    Args:
    - shape: dimensions of the tensor to generate.
    - mean: mean of the distribution.
    - stddev: standard deviation of the distribution.
    """
    return tf.Variable(tf.truncated_normal(shape, mean=mean, stddev=stddev))

# Read image data.
def read_img(img_name):
    """Load an image file as an int32 array with a leading batch axis.

    The file is opened in a ``with`` block so its handle is released as soon
    as the pixels have been copied, and the image is converted to RGB so that
    grayscale, palette, RGBA or CMYK files also yield exactly three channels.

    Args:
    - img_name: path of the image file.

    Returns:
        ``np.ndarray`` of dtype ``int32`` and shape ``(1, height, width, 3)``.
        The image is not resized here; callers feeding the 224x224
        placeholders must supply a 224x224 image.
    """
    with Image.open(img_name) as img:
        pixels = np.asarray(img.convert('RGB'), dtype=np.int32)  # (H, W, 3)
    return pixels[np.newaxis, ...]  # (1, H, W, 3)

def gram_matrix(x):
    """Calculate the normalised Gram matrix of a feature map.

    Args:
    - x: features extracted from VGG Net, a 4-D tensor whose static shape is
      fully known. The last dimension is the channel dimension.

    Returns:
        Tensor of shape [batch, ch, ch]; entry (i, j) is the dot product of
        channel i and channel j over all spatial positions, divided by the
        product of the two spatial sizes and the channel count.
    """
    batch, size_a, size_b, channels = x.get_shape().as_list()
    # Merge both spatial dimensions so each channel becomes one column.
    flat = tf.reshape(x, [batch, size_b * size_a, channels])
    # adjoint_a transposes the first operand:
    # [ch, positions] x [positions, ch] -> [ch, ch] for every batch entry.
    channel_products = tf.matmul(flat, flat, adjoint_a=True)
    # Divide by a constant so the values do not become too large.
    scale = tf.constant(channels * size_a * size_b, tf.float32)
    return channel_products / scale
    

# The result image is a variable; its starting value is drawn around 127.5
# with a standard deviation of 20.
result = initial_result((1, 224, 224, 3), 127.5, 20)

content_val = read_img(content_img_path)
style_val = read_img(style_img_path)

# Placeholders through which the two input images are fed (TF 1.x API).
content = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])
style = tf.placeholder(tf.float32, shape=[1, 224, 224, 3])

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so the
# parameter file must come from a trusted source.
data_dict = np.load(vgg16_npy_path, encoding='latin1', allow_pickle=True).item()
# One VGG network per image: content, style and result.
vgg_for_content = VGGNet(data_dict)
vgg_for_style = VGGNet(data_dict)
vgg_for_result = VGGNet(data_dict)

vgg_for_content.build(content)  # the content placeholder is its input
vgg_for_style.build(style)
vgg_for_result.build(result)

# Which layers to compare is a hyper-parameter; several layers tend to work
# better, so try enabling other entries to see the effect.
# Each selection is written once and applied to both networks of a pair, so
# the content/result and style/result feature lists can never get out of sync.
content_layer_names = [
    'conv1_2',
    # 'conv2_2',
    # 'conv3_3',
    # 'conv4_3',
    # 'conv5_3',
]
style_layer_names = [
    # 'conv1_2',
    # 'conv2_2',
    # 'conv3_3',
    'conv4_3',
    # 'conv5_3',
]

# Content features of the content image and of the result image.
content_features = [
    getattr(vgg_for_content, layer) for layer in content_layer_names]
result_content_features = [
    getattr(vgg_for_result, layer) for layer in content_layer_names]

# Style features of the style image and of the result image.
style_features = [
    getattr(vgg_for_style, layer) for layer in style_layer_names]
# Gram matrices of the style image: pairwise similarity between channels.
style_gram = [gram_matrix(feature) for feature in style_features]
result_style_features = [
    getattr(vgg_for_result, layer) for layer in style_layer_names]
# Gram matrices of the result image.
result_style_gram = \
    [gram_matrix(feature) for feature in result_style_features]

# Several layers may be selected, so the loss is computed per layer and summed.
content_loss = tf.zeros(1, tf.float32)
for c, c_ in zip(content_features, result_content_features):
    # Mean squared difference over every axis except the first.
    content_loss += tf.reduce_mean((c - c_) ** 2, [1, 2, 3])

# The style loss is the mean squared difference between Gram matrices.
style_loss = tf.zeros(1, tf.float32)
for s, s_ in zip(style_gram, result_style_gram):
    style_loss += tf.reduce_mean((s - s_) ** 2, [1, 2])

# The final loss is the weighted sum of the content loss and the style loss.
loss = content_loss * lambda_c + style_loss * lambda_s
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

Instructions for updating:
Colocations handled automatically by placer.
building model ...
building model finished:    5s
building model ...
building model finished:    2s
building model ...
building model finished:    2s
Instructions for updating:
Use tf.cast instead.


# 训练（无监督）

In [5]:
init_op = tf.global_variables_initializer()

# The same two images are fed and the same tensors fetched at every step, so
# both collections are built once, outside the loop.
train_fetches = [loss, content_loss, style_loss, train_op]
train_feed = {
    content: content_val,  # content image
    style: style_val,  # style image
}

with tf.Session() as sess:
    sess.run(init_op)
    for step in range(num_steps):
        step_no = step + 1  # 1-based counter used in the log and file name

        loss_value, content_loss_value, style_loss_value, _ = sess.run(
            train_fetches, feed_dict=train_feed)
        # Report the total, content and style loss of this step.
        print('step: %d, loss_value: %8.4f, content_loss: %8.4f, style_loss: %8.4f'
              % (step_no,
                 loss_value[0],
                 content_loss_value[0],
                 style_loss_value[0]))

        # Save the current result image after every single step.
        result_img_path = os.path.join(output_dir, 'result-%05d.jpg' % step_no)
        # The variable has shape (1, 224, 224, 3); [0] drops the batch axis.
        result_val = sess.run(result)[0]
        result_val = np.clip(result_val, 0, 255)  # force values into 0..255
        img_arr = result_val.astype(np.uint8)
        img = Image.fromarray(img_arr)  # turn the ndarray into an image
        img.save(result_img_path)

step: 1, loss_value: 14290.1348, content_loss: 60762.7617, style_loss:  16.4277
step: 2, loss_value: 11897.3613, content_loss: 46404.1367, style_loss:  14.5139
step: 3, loss_value: 9192.0273, content_loss: 37857.9609, style_loss:  10.8125
step: 4, loss_value: 7623.8311, content_loss: 33294.2852, style_loss:   8.5888
step: 5, loss_value: 6845.5195, content_loss: 30470.9531, style_loss:   7.5968
step: 6, loss_value: 6176.9346, content_loss: 28812.4590, style_loss:   6.5914
step: 7, loss_value: 5325.7437, content_loss: 27791.3223, style_loss:   5.0932
step: 8, loss_value: 5141.1670, content_loss: 27275.6855, style_loss:   4.8272
step: 9, loss_value: 4622.7500, content_loss: 26944.9023, style_loss:   3.8565
step: 10, loss_value: 4503.7935, content_loss: 26751.0195, style_loss:   3.6574
step: 11, loss_value: 4262.6870, content_loss: 26599.7695, style_loss:   3.2054
step: 12, loss_value: 4172.7349, content_loss: 26377.6895, style_loss:   3.0699
step: 13, loss_value: 3995.2620, content_loss: 