In [1]:
import numpy as np
import tensorflow as tf

# Number of loop iterations per cell used by parse_arch_to_seq and
# parse_seq_to_arch (both iterate `range(B)`, consuming two (input, op)
# pairs per iteration) -- presumably the number of nodes in a cell.
B = 5

# The bare string below is only a reference table, not used by any code: it
# appears to list the token ids of the sequence encoding (inputs 0-5 are
# shifted to 1-6, operation tokens occupy 7-12). Because it is an expression
# statement, the notebook echoes its repr as the cell output.
'''
<sos> 0
0     1
1     2
2     3
3     4
4     5
5     6
identity 7
sep conv 8
max pool 9
avg pool 10
3x3      11
5x5      12
'''

'\n<sos> 0\n0     1\n1     2\n2     3\n3     4\n4     5\n5     6\nidentity 7\nsep conv 8\nmax pool 9\navg pool 10\n3x3      11\n5x5      12\n'

arch 是字符串类型，由空白字符分隔的整数组成，描述一个完整的架构。它包含两个 cell：前半部分是 normal cell（conv），后半部分是 reduction cell；build_dag 先把它转换成整数列表，再从中间一分为二。

In [1]:
def build_dag(arch):
    """Split a whitespace-separated architecture string into its two cells.

    Args:
        arch: String of integers separated by whitespace, or None. The
            expected layout is [index, op, ...] (index in [0, 5], op in
            [0, 10] according to the original author's note).

    Returns:
        A tuple (conv_dag, reduc_dag) of integer lists: the first half and
        the second half of the parsed tokens. With an odd token count the
        second list is one element longer. Returns (None, None) when
        `arch` is None.
    """
    if arch is None:
        return None, None
    tokens = [int(piece) for piece in arch.strip().split()]
    midpoint = len(tokens) // 2
    return tokens[:midpoint], tokens[midpoint:]

根据点数和操作数，随机生成一个架构

注意np.random.randint与random.randint不同，不包含上界

In [3]:
def generate_arch(n, num_nodes, num_ops=7):
    """Randomly sample `n` architectures, each made of two cells.

    Every cell is a flat list with four entries per node:
    [input1, op1, input2, op2]. For the k-th node (k starting at 2) each
    input index is drawn from [0, k) and each op from [0, num_ops), using
    np.random.randint (upper bound exclusive).

    Args:
        n: Number of architectures to sample.
        num_nodes: Number of nodes per cell.
        num_ops: Number of distinct operations to sample from.

    Returns:
        A list of `n` items, each `[conv_cell, reduc_cell]`.
    """
    def _sample_cell():
        genes = []
        for node_id in range(2, num_nodes + 2):
            # Two branches per node; draw order is (input, op) per branch.
            for _ in range(2):
                genes.append(np.random.randint(0, node_id))
                genes.append(np.random.randint(0, num_ops))
        return genes

    sampled = []
    for _ in range(n):
        conv_cell = _sample_cell()
        reduc_cell = _sample_cell()
        sampled.append([conv_cell, reduc_cell])
    return sampled

给定一个参数的集合，计算参数个数

np.prod表示计算后面一个list里元素的乘积

In [2]:
def count_model_params(tf_variables):
    """Return the total number of scalar entries across `tf_variables`.

    Args:
        tf_variables: Iterable of objects whose `get_shape()` yields
            dimensions exposing a `.value` attribute (TF1-style shapes).

    Returns:
        The sum over all variables of the product of their dimension
        sizes; 0 when the iterable is empty.
    """
    return sum(
        np.prod([axis.value for axis in variable.get_shape()])
        for variable in tf_variables
    )

根据输入，获取训练所需要的train_op, learning_rate, grad_norm, opt, grad_norms

In [4]:
def get_train_op(loss,
                tf_variables,
                train_step,
                clip_mode=None,
                grad_bound=None,
                l2_reg=1e-4,
                lr_warmup_val=None,
                lr_warmup_steps=100,
                lr_init=0.1,
                lr_dec_start=0,
                lr_dec_every=10000,
                lr_dec_rate=0.1,
                lr_dec_min=None,
                lr_cosine=False,
                lr_max=None,
                lr_min=None,
                lr_T_0=None,
                lr_T_mul=None,
                num_train_batches=None,
                optim_algo=None,
                sync_replicas=False,
                num_aggregate=None,
                num_replicas=None,
                get_grad_norms=False,
                moving_average=None):
    '''
    Build the training op, learning-rate tensor and gradient norms for `loss`.

    Args:
        loss: Scalar loss tensor. When `l2_reg > 0`, `l2_reg * sum(var ** 2)`
            over `tf_variables` is added to it.
        tf_variables: Variables to differentiate with respect to and update.
        train_step: Global-step variable; drives the schedules and is passed
            to `apply_gradients` as `global_step`.
        clip_mode: "global" (clip by global norm), "norm" (clip each gradient
            by its own norm) or None (no clipping).
        grad_bound: Clipping threshold; required when `clip_mode` is set.
        l2_reg: L2 regularisation coefficient; values <= 0 disable it.
        lr_warmup_val: If not None, the learning rate used while
            `train_step < lr_warmup_steps`.
        lr_warmup_steps: Length of the warm-up phase in steps.
        lr_init, lr_dec_start, lr_dec_every, lr_dec_rate: Arguments of the
            staircase `tf.train.exponential_decay` schedule, applied to
            `max(train_step - lr_dec_start, 0)`. Used when `lr_cosine` is False.
        lr_dec_min: Optional lower bound for the exponential schedule.
        lr_cosine: Use a cosine schedule with restarts instead.
        lr_max, lr_min, lr_T_0, lr_T_mul, num_train_batches: Required when
            `lr_cosine` is True. The rate moves between `lr_max` and `lr_min`
            over `T_i` epochs (initially `lr_T_0`); when the period is
            exhausted `T_i` is multiplied by `lr_T_mul`.
        optim_algo: "momentum", "sgd" or "adam".
        sync_replicas: Wrap the optimizer in `tf.train.SyncReplicasOptimizer`
            (requires `num_aggregate` and `num_replicas`).
        get_grad_norms: Also return the per-variable gradient norms.
        moving_average: If not None, wrap the optimizer in
            `tf.contrib.opt.MovingAverageOptimizer` with this decay.

    Returns:
        `(train_op, learning_rate, grad_norm, opt)`, plus a trailing
        `grad_norms` dict (variable name -> gradient L2 norm) when
        `get_grad_norms` is True.

    Raises:
        AssertionError: A required companion argument is missing.
        NotImplementedError: Unknown `clip_mode`.
        ValueError: Unknown `optim_algo`.
    '''
    if l2_reg > 0:
        l2_losses = []
        for var in tf_variables:
            l2_losses.append(tf.reduce_sum(var ** 2))
        l2_loss = tf.add_n(l2_losses)
        loss += l2_reg * l2_loss

    grads = tf.gradients(loss, tf_variables)
    # Computed before clipping, so this reports the raw global gradient norm.
    grad_norm = tf.global_norm(grads)

    grad_norms = {}
    for v,g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        if isinstance(g, tf.IndexedSlices):
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values ** 2))
        else:
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g ** 2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if g is None:
                    # Variables without a gradient stay unclipped; the other
                    # norm helpers used above also tolerate None entries.
                    clipped.append(None)
                    continue
                if isinstance(g, tf.IndexedSlices):
                    c_g = tf.clip_by_norm(g.values, grad_bound)
                    c_g = tf.IndexedSlices(c_g, g.indices, g.dense_shape)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # Append the clipped gradient (the original appended `g`,
                # which silently disabled per-gradient clipping).
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError("Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, "Need num_train_batches to use lr_cosine"

        curr_epoch = tf.cast(train_step // num_train_batches, tf.int32)

        last_reset = tf.get_variable("last_reset", initializer=0, dtype=tf.int32, trainable=False)
        T_i = tf.get_variable("T_i", initializer=lr_T_0, dtype=tf.int32, trainable=False)
        T_curr = curr_epoch - last_reset

        def _cosine_lr():
            # Half-cosine from lr_max (T_curr == 0) down to lr_min (T_curr == T_i).
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            return lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))

        def _update():
            # Period exhausted: restart the schedule and lengthen the period.
            update_last_reset = tf.assign(last_reset, curr_epoch, use_locking = True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                lr = _cosine_lr()
            return lr

        def _no_update():
            # Both tf.cond branches must return a tensor; the original
            # computed `lr` here but never returned it.
            return _cosine_lr()

        learning_rate = tf.cond(tf.greater_equal(T_curr, T_i), _update, _no_update)

    else:
        learning_rate = tf.train.exponential_decay(lr_init, tf.maximum(train_step - lr_dec_start,0), lr_dec_every, lr_dec_rate, staircase=True)

        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps), lambda: lr_warmup_val, lambda: learning_rate)

    if optim_algo == 'momentum':
        opt = tf.train.MomentumOptimizer(learning_rate, 0.9, use_locking=True, use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate, use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate, beta1=0.0, epsilon=1e-3, use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync"
        assert num_replicas is not None, "Need num_replicas to sync"

        opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=num_aggregate, total_num_replicas=num_replicas, use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(opt, average_decay=moving_average)

    train_op = opt.apply_gradients( zip(grads, tf_variables), global_step = train_step )

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    else:
        return train_op, learning_rate, grad_norm, opt
    


给定格式化的 cell 和 branch_length，输出对应的序列。序列中所有 input 的 index 都 +1，即从 0,1,2,3,4,5 变为 1,2,3,4,5,6。当 branch_length 等于 2 时，每个操作直接 +7，即从 0,1,2,3,4 变为 7,8,9,10,11。当 branch_length 等于 3 时，每个操作改用两个 token 编码（参照开头的编号表，大致是"操作类型 + 核/窗口大小"）：0→(7,12)，1→(8,11)，2→(8,12)，3→(9,11)，4→(10,11)。

In [5]:
def parse_arch_to_seq(cell, branch_length):
    """Encode one cell as a flat token sequence.

    `cell` is read as `B` groups of [input1, op1, input2, op2] (`B` is the
    module-level constant). Every input index is shifted by +1. With
    `branch_length == 2` each op becomes the single token `op + 7`; with
    `branch_length == 3` each op in 0..4 becomes a pair of tokens taken
    from the table below.

    Args:
        cell: Flat list with at least `4 * B` integer entries.
        branch_length: 2 or 3 -- the number of tokens emitted per branch.

    Returns:
        A list of `2 * B * branch_length` integer tokens.
    """
    assert branch_length in [2,3]

    # Two-token encoding of each op used when branch_length == 3.
    op_pairs = {0: (7, 12), 1: (8, 11), 2: (8, 12), 3: (9, 11), 4: (10, 11)}

    tokens = []
    for node in range(B):
        base = 4 * node
        for branch_offset in (0, 2):
            tokens.append(cell[base + branch_offset] + 1)
            op = cell[base + branch_offset + 1]
            if branch_length == 2:
                tokens.append(op + 7)
            else:
                # Unknown ops yield None here, so the unpacking fails with
                # TypeError just like the if-chain version did.
                first, second = op_pairs.get(op)
                tokens.extend([first, second])
    return tokens
        

与上面的函数相反：给定 seq 和 branch_length，解析出结构化的架构。seq 的前半部分对应 conv cell，后半部分对应 reduction cell，返回值为 [conv_arch, reduc_arch]。

In [6]:
def parse_seq_to_arch(seq, branch_length):
    """Decode a token sequence back into `[conv_arch, reduc_arch]`.

    `seq` holds two cells back to back (first half conv, second half
    reduction). Each cell consists of `B` nodes (`B` is the module-level
    constant) with two branches per node, and each branch occupies
    `branch_length` tokens: an input token (stored as index + 1) followed
    by either one op token (`op + 7`, branch_length 2) or a two-token op
    code (branch_length 3).

    Args:
        seq: Flat list of integer tokens covering both cells.
        branch_length: 2 or 3.

    Returns:
        `[conv_arch, reduc_arch]`, each a flat list of
        [input1, op1, input2, op2] per node. With branch_length 3, a token
        pair that matches no known op decodes to None.

    Raises:
        AssertionError: `branch_length` is not 2 or 3, or the length of
            `seq` is inconsistent with `B` and `branch_length`.
    """
    n = len(seq)
    assert branch_length in [2,3]
    # Two cells, B nodes per cell, two branches per node. Using B here
    # (instead of a literal 5) keeps the check in step with the loop below.
    assert n // 2 // B // 2 == branch_length

    def _recover_op(op1, op2):
        # Inverse of the two-token op encoding; falls through to None for
        # unrecognised token pairs.
        if op1 == 7:
            return 0
        if op1 == 8:
            if op2 == 11:
                return 1
            if op2 == 12:
                return 2
        if op1 == 9:
            return 3
        if op1 == 10:
            return 4

    def _parse_cell(cell_seq):
        cell_arch = []
        stride = 2 * branch_length  # tokens per node (two branches)
        for i in range(B):
            base = stride * i
            p1 = cell_seq[base] - 1
            p2 = cell_seq[base + branch_length] - 1
            if branch_length == 2:
                op1 = cell_seq[base + 1] - 7
                op2 = cell_seq[base + 3] - 7
            else:
                op1 = _recover_op(cell_seq[base + 1], cell_seq[base + 2])
                op2 = _recover_op(cell_seq[base + 4], cell_seq[base + 5])
            cell_arch.extend([p1, op1, p2, op2])
        return cell_arch

    conv_seq = seq[:n//2]
    reduc_seq = seq[n//2:]
    conv_arch = _parse_cell(conv_seq)
    reduc_arch = _parse_cell(reduc_seq)
    arch = [conv_arch, reduc_arch]
    return arch
    