In [1]:
import tensorflow as tf
import pickle
from ssd_utils import BBoxUtility

  return f(*args, **kwds)


In [2]:
# Load the DeepFashion annotation dictionary. The `with` block guarantees the
# file handle is closed (the bare `open(...)` call leaked it).
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
with open('./deepfashion.pkl', 'rb') as pkl_file:
    d = pickle.load(pkl_file)
# Sort the keys so the train/validation split below is deterministic.
keys = sorted(d.keys())
len(keys)

289222

In [3]:
# Split the sorted keys: the leading 10% become the training set and the
# remainder the validation set.
# NOTE(review): a 10/90 train/validation split is unusual -- confirm that 0.1
# (rather than 0.9) is the intended training fraction.
n_train = int(round(len(keys) * 0.1))
train_keys, val_keys = keys[:n_train], keys[n_train:]
n_val = len(val_keys)
print("train num: {}".format(n_train))
print("validation num: {}".format(n_val))

train num: 28922
validation num: 260300


In [4]:
# Model configuration consumed by the cells below.
num_classes = 3
input_shape = (300, 300, 3)

# One entry per feature layer (six in total); only the first entry is positive.
normalizations = [20] + [-1] * 5

# (size, next size) pair for each of the six feature layers.
anchor_sizes = [
    (21, 45),
    (45, 99),
    (99, 153),
    (153, 207),
    (207, 261),
    (261, 315),
]

# Extra aspect ratios for each of the six feature layers.
anchor_ratios = [
    [2, 1/2],
    [2, 1/2, 3, 1/3],
    [2, 1/2, 3, 1/3],
    [2, 1/2, 3, 1/3],
    [2, 1/2],
    [2, 1/2],
]

In [5]:
# Load the precomputed SSD300 prior boxes; the `with` block closes the file
# handle that the bare `open(...)` call used to leak.
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
with open('prior_boxes_ssd300.pkl', 'rb') as prior_file:
    priors = pickle.load(prior_file)

In [6]:
# Display the shape of the loaded prior-box array.
priors.shape

(7308, 8)

In [7]:
# Show the first prior as a plain Python list to inspect its eight fields.
priors[0].tolist()

[0.0,
 0.0,
 0.06315789371728897,
 0.06315789371728897,
 0.10000000149011612,
 0.10000000149011612,
 0.20000000298023224,
 0.20000000298023224]

Each prior row stores the box corners followed by its four variances: `[xmin, ymin, xmax, ymax, var_xc, var_yc, var_w, var_h]`.

In [8]:
# `priors` was already loaded from 'prior_boxes_ssd300.pkl' in an earlier cell,
# so reuse it instead of re-reading the pickle (the second load was redundant
# and left another file handle unclosed).
bbox_util = BBoxUtility(num_classes, priors)

In [9]:
def weight_variable(shape):
    """Return a `tf.Variable` of the given shape for use as layer weights.

    The initial value is drawn from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    initial_value = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial_value)


def bias_variable(shape):
    """Return a `tf.Variable` of the given shape for use as a layer bias.

    Every element starts at the constant value 0.1.
    """
    initial_value = tf.constant(0.1, shape=shape)
    return tf.Variable(initial_value)


def conv2d(x, filt, name, strides=None, padding="SAME"):
    """Thin wrapper around `tf.nn.conv2d`.

    x:       batch of images, shaped [batch_size, in_height, in_width, in_channels].
    filt:    filter tensor, shaped [filter_height, filter_width, in_channels, out_channels].
    name:    name passed to the convolution op.
    strides: stride list forwarded to `tf.nn.conv2d`; [1, 1, 1, 1] when omitted.
    padding: padding mode forwarded to `tf.nn.conv2d`.
    """
    effective_strides = [1, 1, 1, 1] if strides is None else strides
    return tf.nn.conv2d(x, filt, strides=effective_strides, padding=padding, name=name)


def max_pool(x, name, size=2):
    """Max-pool a batch of images with a `size` x `size` window.

    x:    batch of images, shaped [batch_size, in_height, in_width, in_channels].
    name: name passed to the pooling op.
    size: height and width of the pooling window. The kernel is
          [1, size, size, 1]: the batch and channel entries are normally 1 for
          pooling, so size=2 gives ksize=[1, 2, 2, 1].

    The stride is always [1, 2, 2, 1] (independent of `size`) and padding is 'SAME'.
    """
    window = [1, size, size, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME', name=name)


def avg_pool(x, name, size=2):
    """Average-pool a batch of images with a `size` x `size` window.

    This helper was previously also named `max_pool`, which silently replaced
    the real max-pooling helper defined just above it with average pooling.
    It now has its own name so both pooling flavours are available.

    x:    batch of images, shaped [batch_size, in_height, in_width, in_channels].
    name: name passed to the pooling op.
    size: height and width of the pooling window; the kernel is
          [1, size, size, 1] (batch and channel entries are normally 1).

    The stride is always [1, 2, 2, 1] (independent of `size`) and padding is 'SAME'.
    """
    return tf.nn.avg_pool(x, ksize=[1, size, size, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)


def conv_layer(x, out_size, name, filter_size=3, strides=None, padding="SAME",act="relu"):
    """Build a convolution layer (weights, bias, optional ReLU) with summaries.

    x:           input tensor; its last dimension is used as the input channel count.
    out_size:    number of output channels.
    name:        suffix used in the layer's name scopes and op name.
    filter_size: height and width of the square filter.
    strides:     forwarded to `conv2d` (None means its default stride).
    padding:     forwarded to `conv2d`.
    act:         "relu" to apply `tf.nn.relu`, "ident" to return the biased
                 convolution output unchanged.

    Returns the activated output tensor.
    Raises ValueError if `act` is not "relu" or "ident".
    """
    # Reject unknown activations before any graph ops are created; previously
    # this fell through both branches and failed with an UnboundLocalError.
    if act not in ("relu", "ident"):
        raise ValueError(
            'act must be "relu" or "ident", got {!r}'.format(act))

    in_channel = x.get_shape().as_list()[-1]
    with tf.name_scope("conv_{}".format(name)):
        with tf.name_scope('weights'):
            W = weight_variable([filter_size, filter_size, in_channel, out_size])
            variable_summaries(W)
        with tf.name_scope('biases'):
            b = bias_variable([out_size])
            variable_summaries(b)
        h_conv = conv2d(x, W, "conv2d_{}".format(name), strides=strides, padding=padding)
        h = tf.nn.bias_add(h_conv, b)
        variable_summaries(h)

    if act == "ident":
        return h
    with tf.name_scope("conv_relu_{}".format(name)):
        return tf.nn.relu(h)


def fc_layer(x_flatten, out_size, name, act="relu"):
    """Build a fully connected layer (weights, bias, optional ReLU).

    x_flatten: 2-D input tensor; its second dimension is used as the input width.
    out_size:  number of output units.
    name:      suffix used in the layer's name scopes.
    act:       "relu" to apply `tf.nn.relu`, "ident" to return the affine
               output unchanged.

    Returns the activated output tensor.
    Raises ValueError if `act` is not "relu" or "ident".
    """
    # Reject unknown activations before any graph ops are created; previously
    # this fell through both branches and failed with an UnboundLocalError.
    if act not in ("relu", "ident"):
        raise ValueError(
            'act must be "relu" or "ident", got {!r}'.format(act))

    dim = x_flatten.get_shape().as_list()[1]
    with tf.name_scope("fc_{}".format(name)):
        with tf.name_scope('weights'):
            W = weight_variable([dim, out_size])
        with tf.name_scope('biases'):
            b = bias_variable([out_size])
        h = tf.nn.bias_add(tf.matmul(x_flatten, W), b)

    if act == "ident":
        return h
    with tf.name_scope("fc_relu_{}".format(name)):
        return tf.nn.relu(h)


def dropout(x, keep_prob, name):
    """Apply `tf.nn.dropout` to `x` inside a "dropout_<name>" name scope.

    keep_prob is forwarded unchanged as the second argument of `tf.nn.dropout`.
    """
    scope_name = "dropout_{}".format(name)
    with tf.name_scope(scope_name):
        dropped = tf.nn.dropout(x, keep_prob)
    return dropped
    

def flatten(x, name):
    """Reshape `x` to 2-D, keeping the first axis and merging all the others.

    The merged width is the product of every static dimension after the first,
    so those dimensions must be known when the graph is built. The reshape op
    is created inside a name scope called `name`.
    """
    trailing_dims = x.get_shape().as_list()[1:]
    flat_width = 1
    for extent in trailing_dims:
        flat_width = flat_width * extent
    with tf.name_scope(name):
        flat = tf.reshape(x, [-1, flat_width])
    return flat


def get_channel(x):
    """Return the size of the last dimension of `x`'s static shape."""
    return x.get_shape().as_list()[-1]

def vgg_block(x, out_sizes, name):
    """Stack 3x3 convolution layers and finish with a pooling step.

    x:         batch of images (or the previous block's output).
    out_sizes: list with the output channel count of each convolution layer;
               the layers are chained in list order. If the list is empty the
               input is pooled directly (previously this raised
               UnboundLocalError).
    name:      block name, used to build the layer and scope names.

    Returns the pooled output tensor.
    """
    h = x
    for idx, size in enumerate(out_sizes):
        # Each layer consumes the previous layer's output.
        h = conv_layer(h, size, "{}_{}".format(name, idx), filter_size=3)

    with tf.name_scope("pool_{}".format(name)):
        h_pool = max_pool(h, name)
        variable_summaries(h_pool)
    return h_pool


def variable_summaries(var):
    """Attach mean, stddev, max, min and histogram summaries to `var`.

    All summary ops are created under a 'summaries' name scope; the standard
    deviation is computed in a nested 'stddev' scope.
    """
    with tf.name_scope('summaries'):
        avg = tf.reduce_mean(var)
        tf.summary.scalar('mean', avg)
        with tf.name_scope('stddev'):
            squared_dev = tf.square(var - avg)
            spread = tf.sqrt(tf.reduce_mean(squared_dev))
        tf.summary.scalar('stddev', spread)
        for tag, reducer in (('max', tf.reduce_max), ('min', tf.reduce_min)):
            tf.summary.scalar(tag, reducer(var))
        tf.summary.histogram('histogram', var)



In [14]:
def _static_or_dynamic_shape(tensor):
    """Return `tensor`'s shape as a list, using static ints where they are known.

    Dimensions that are unknown at graph-construction time (such as a `None`
    batch size) are replaced by the matching scalar slice of `tf.shape(tensor)`.
    """
    static = tensor.get_shape().as_list()
    dynamic = tf.shape(tensor)
    return [dim if dim is not None else dynamic[i] for i, dim in enumerate(static)]


def ssd_multibox_layer(x, num_classes, sizes, ratios=[1], normalization=-1, bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.

    x:                feature map tensor with a statically known channel count.
    num_classes:      number of classes predicted per anchor.
    sizes:            anchor sizes for this layer; only its length is used here.
    ratios:           extra anchor aspect ratios; only its length is used here
                      (the default list is never mutated).
    normalization:    when > 0, the input is L2-normalized across channels and
                      multiplied by a per-channel scale variable initialised to
                      this value; otherwise the input is used as is.
    bn_normalization: currently unused; kept for interface compatibility.

    Returns (cls_pred, loc_pred): the class predictions reshaped to
    [..., num_anchors, num_classes] and the box predictions reshaped to
    [..., num_anchors, 4], where `...` are the leading dimensions of the
    convolution outputs.
    """
    if normalization > 0:
        # `tf.nn.l2_normalize` has no `scaling` argument, so the learnable
        # scale is applied explicitly after normalizing over the channel axis.
        channels = x.get_shape().as_list()[-1]
        with tf.name_scope("l2_norm"):
            scale = tf.Variable(tf.constant(float(normalization), shape=[channels]))
            x_norm = tf.nn.l2_normalize(x, -1) * scale
    else:
        x_norm = x

    # Number of anchors per feature-map cell.
    num_anchors = len(sizes) + len(ratios)

    # Location: four box offsets per anchor.
    num_loc_pred = num_anchors * 4
    loc_conv = conv_layer(x_norm, num_loc_pred, "location", filter_size=3, act="ident")
    loc_pred = tf.reshape(
        loc_conv, _static_or_dynamic_shape(loc_conv)[:-1] + [num_anchors, 4])

    # Class prediction: one score per class per anchor. The scope name must not
    # contain spaces, which TensorFlow rejects as an invalid scope name.
    num_cls_pred = num_anchors * num_classes
    cls_conv = conv_layer(x_norm, num_cls_pred, "class_pred", filter_size=3, act="ident")
    cls_pred = tf.reshape(
        cls_conv, _static_or_dynamic_shape(cls_conv)[:-1] + [num_anchors, num_classes])
    return cls_pred, loc_pred


In [10]:
# First two entries of `input_shape`, swapped.
# NOTE(review): presumably (width, height) given a (height, width, channels)
# input_shape -- confirm against the consumer of img_size.
img_size = input_shape[1], input_shape[0]

In [12]:
with tf.Graph().as_default() as g1:
    keep_prob = tf.placeholder(tf.float32)
    with tf.name_scope('input'):
        x = tf.placeholder(
            tf.float32, shape=[None, input_shape[0],input_shape[1], input_shape[2]], name='x-input')
        y_ = tf.placeholder(tf.float32, shape=[None, 10], name='y-input')

    # Backbone: five conv blocks, each followed by pooling.
    h_pool1 = vgg_block(x, [64, 64], "1")
    h_pool2 = vgg_block(h_pool1, [128, 128], "2")
    h_pool3 = vgg_block(h_pool2, [256, 256, 256], "3")
    h_pool4 = vgg_block(h_pool3, [512, 512, 512], "4")  # feature_layer
    h_pool5 = vgg_block(h_pool4, [512, 512, 512], "5")

    # SSD block
    h_conv6 = conv_layer(h_pool5, 1024, "6", filter_size=3)
    h_conv6_drop = dropout(h_conv6, keep_prob, "6_drop")
    h_conv7 = conv_layer(h_conv6_drop, 1024, "7", filter_size=1)
    h_conv7_drop = dropout(h_conv7, keep_prob, "7_drop") # feature_layer

    # Strides are [batch, height, width, channels]; tf.nn.conv2d requires the
    # batch and channel strides to be 1, so spatial stride 2 is [1, 2, 2, 1]
    # (the previous [2]*4 also strode over batch and channels).
    h_conv8_0 = conv_layer(h_conv7_drop, 256, "8_0", filter_size=1)
    h_conv8_1 = conv_layer(h_conv8_0, 512, "8_1", filter_size=3, strides=[1, 2, 2, 1]) # feature_layer
    h_conv9_0 = conv_layer(h_conv8_1, 128, "9_0", filter_size=1)
    h_conv9_1 = conv_layer(h_conv9_0, 256, "9_1", filter_size=3, strides=[1, 2, 2, 1]) # feature_layer
    # Layers 10 and 11 now carry their own names instead of the copy-pasted
    # "9_0"/"9_1", so their scopes are distinguishable in TensorBoard.
    h_conv10_0 = conv_layer(h_conv9_1, 128, "10_0", filter_size=1)
    h_conv10_1 = conv_layer(h_conv10_0, 256, "10_1", filter_size=3) # feature_layer
    h_conv11_0 = conv_layer(h_conv10_1, 128, "11_0", filter_size=1)
    h_conv11_1 = conv_layer(h_conv11_0, 256, "11_1", filter_size=3) # feature_layer

    # NOTE(review): the 10-way fully connected head, loss and accuracy below
    # operate on h_pool5 and do not use the SSD feature layers above --
    # presumably leftover classification scaffolding; confirm before training.
    h_pool_flat = flatten(h_pool5, "pool_flat")
    h_fc1 = fc_layer(h_pool_flat, 1024, "1", act="relu")
    h_fc1_drop = dropout(h_fc1, keep_prob, "fc1")
    h_fc2 = fc_layer(h_fc1_drop, 1024, "2", act="relu")
    h_fc2_drop = dropout(h_fc2, keep_prob, "fc2")
    y_conv = fc_layer(h_fc2_drop , 10, "3", act="ident")

    with tf.name_scope("loss"):
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
        tf.summary.scalar('cross_entropy', loss)

    with tf.name_scope("optimizer"):
        opt = tf.train.AdamOptimizer(1e-4).minimize(loss)

    with tf.name_scope("accuracy"):
        correct = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        tf.summary.scalar('accuracy', accuracy)
        

In [13]:
# Render the graph built above inline in the notebook.
# NOTE(review): `show_graph` is presumably provided by a project-local
# `tensorboard.py` helper rather than the official `tensorboard` package --
# confirm which module is imported here. Consider moving this import to the
# imports cell at the top of the notebook.
import tensorboard as tb 
tb.show_graph(g1)

In [None]:
with tf.Session(graph=g1) as sess:
    sess.run(tf.global_variables_initializer())