In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

from tensorflow.keras.datasets.cifar10 import load_data
import tensorflow as tf
import numpy as np
import os
import shutil

# TensorBoard log directory
log_dir = "../logs"
# Remove logs left over from earlier runs so that old and new curves do not
# get mixed, then always recreate the directory so that it exists after this
# cell regardless of whether it existed before.
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)
os.makedirs(log_dir, exist_ok=True)

In [None]:
def build_graph(init_op):
    """
    Build the fully connected CIFAR classifier used to compare initializers.

    The network takes flattened 32*32*3 inputs, applies five ReLU dense layers
    (512, 256, 256, 256, 256 units) and a final 10-unit linear layer. Every
    dense kernel uses `init_op`, so the only difference between experiments
    is the initializer.

    :param init_op : kernel initializer given to every dense layer
    :return : tf.Graph holding the placeholders ('X', 'y'), the loss, the
              'metric/accuracy' tensor, the Adam train op, the global step
              and the scalar / histogram summaries
    """
    hidden_units = (512, 256, 256, 256, 256)

    graph = tf.Graph()
    with graph.as_default():
        X = tf.placeholder(tf.float32, shape=(None, 32*32*3), name="X")
        y = tf.placeholder(tf.int64, shape=(None,), name='y')

        # Stack the hidden layers; they are named dense_1 ... dense_5
        net = X
        for layer_no, units in enumerate(hidden_units, start=1):
            net = tf.layers.dense(net, units, activation=tf.nn.relu,
                                  kernel_initializer=init_op,
                                  name='dense_{}'.format(layer_no))

        logits = tf.layers.dense(net, 10, kernel_initializer=init_op,
                                 name='logit')

        with tf.variable_scope("loss"):
            loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

        with tf.variable_scope('metric'):
            is_correct = tf.nn.in_top_k(logits, y, 1)
            accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32), name='accuracy')

        # The train op is not returned; the training code looks it up through
        # the graph's TRAIN_OP collection.
        global_step = tf.train.create_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        optimizer.minimize(loss, global_step=global_step)

        # Summaries: scalar loss / accuracy and one histogram per hidden kernel
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
        kernels = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'dense_[1-5]/kernel')
        for idx, kernel in enumerate(kernels):
            tf.summary.histogram('dense_layer{}'.format(idx), kernel)

    return graph

def cifar_generator(batch_size, data, labels):
    '''
    Endless generator that yields the CIFAR dataset one mini-batch at a time.

    The first pass walks the data in its given order starting from the very
    first batch; the sample order is reshuffled before every later pass.
    Samples that do not fill a whole batch at the end of a pass are dropped
    (when the dataset is smaller than `batch_size`, the single batch holds
    every sample).

    :param batch_size : number of samples per batch (must be positive)
    :param data : array of CIFAR images indexed along the first axis
                  (the notebook passes flattened (Num, 32*32*3) arrays)
    :param labels : array of CIFAR labels of shape (Num,)
    :raises ValueError : if batch_size is not positive (raised when the first
                         batch is requested)
    '''
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    num_samples = len(data)
    # At least one step per pass so a dataset smaller than batch_size still
    # yields its samples.
    num_step = max(num_samples // batch_size, 1)
    indexes = np.arange(num_samples)
    while True:
        for step in range(num_step):
            batch_index = indexes[step*batch_size:(step+1)*batch_size]
            yield data[batch_index], labels[batch_index]
        # One full pass finished: reshuffle for the next one
        np.random.shuffle(indexes)

In [None]:
def train_cifar_model(graph, save_dir, train_generator, test_x, test_y,
                      n_epochs=20, num_steps=None):
    """
    Train `graph` and write TensorBoard summaries under `save_dir`.

    At the start of every epoch the model is evaluated on (test_x, test_y)
    and the result is written to `<save_dir>/test`; every 100 training steps
    the current training batch is evaluated and written to `<save_dir>/train`.
    Both results are also printed to the console.

    :param graph : tf.Graph exposing the tensors 'X:0', 'y:0' and
                   'metric/accuracy:0' plus non-empty TRAIN_OP, GLOBAL_STEP,
                   LOSSES and SUMMARIES collections
    :param save_dir : directory that receives the 'train' and 'test' event files
    :param train_generator : iterator yielding (batch_x, batch_y) training batches
    :param test_x : test inputs fed to 'X:0'
    :param test_y : test labels fed to 'y:0'
    :param n_epochs : number of epochs to run (default 20)
    :param num_steps : training steps per epoch; defaults to 50000 // 100,
                       i.e. 50000 training samples in batches of 100
    """
    if num_steps is None:
        num_steps = 50000 // 100  # number of samples : 50000, batch size : 100

    ##################
    # Prepare Training
    #     fetch the operations and tensors needed from the graph
    ##################
    # Input Tensor
    X = graph.get_tensor_by_name('X:0')
    y = graph.get_tensor_by_name('y:0')

    # Train Operation
    train_op = graph.get_collection(tf.GraphKeys.TRAIN_OP)[0]
    global_step = graph.get_collection(tf.GraphKeys.GLOBAL_STEP)[0]

    # Metric Operation
    loss = graph.get_collection(tf.GraphKeys.LOSSES)[0]
    accuracy = graph.get_tensor_by_name('metric/accuracy:0')

    # Summary Operation
    summary_collection = graph.get_collection(tf.GraphKeys.SUMMARIES)
    summary_op = tf.summary.merge(summary_collection)

    # Summary Writer
    train_writer = tf.summary.FileWriter(os.path.join(save_dir,"train"), graph, flush_secs=5)
    test_writer = tf.summary.FileWriter(os.path.join(save_dir,"test"), flush_secs=5)

    ###################
    # Run Training
    #     train the model and record its progress
    ###################
    console_format = "[{}] acc : {:2.2f}% | loss : {:.3f}  "
    try:
        with tf.Session(graph=graph) as sess:
            # Initialize variables
            init = [tf.local_variables_initializer(),
                    tf.global_variables_initializer()]
            sess.run(init)

            for epoch in range(n_epochs):
                # Record the test summary at the start of each epoch.
                # Uses the test_x / test_y arguments rather than notebook globals.
                summary, test_loss, test_acc = sess.run([summary_op, loss, accuracy],
                                                        feed_dict={X: test_x,
                                                                   y: test_y})
                test_writer.add_summary(summary, global_step=sess.run(global_step))
                print(console_format.format(" test ", test_acc*100, test_loss))

                for step in range(num_steps):
                    # Train the model on the next batch
                    batch_x, batch_y = next(train_generator)
                    sess.run(train_op,
                             feed_dict={X: batch_x,
                                        y: batch_y})

                    if step % 100 == 0:
                        # Record the train summary every 100 steps
                        summary, train_loss, train_acc = sess.run([summary_op, loss, accuracy],
                                                                  feed_dict={X: batch_x,
                                                                             y: batch_y})
                        train_writer.add_summary(summary, global_step=sess.run(global_step))
                        print(console_format.format(" train", train_acc*100, train_loss))
    finally:
        # Flush and release the event files even if training is interrupted
        train_writer.close()
        test_writer.close()

## CIFAR 데이터 셋 가져오기

In [None]:
# Training settings
n_epochs = 20      # number of epochs
batch_size = 100   # number of samples per mini-batch

In [None]:
# Load the CIFAR data as (train, test) pairs of images and labels
(x_train, y_train), (x_test, y_test) = load_data()

# Normalize the images (CIFAR-specific: divide the pixel values by 255)
# and flatten each one into a 32*32*3 vector
x_train = (x_train / 255.).reshape(-1, 32*32*3)
x_test = (x_test / 255.).reshape(-1, 32*32*3)

# Flatten the labels to 1-D int64 arrays
y_train = y_train.reshape(-1).astype(np.int64)
y_test = y_test.reshape(-1).astype(np.int64)

# Number of steps per epoch
num_steps = len(x_train) // batch_size

# Endless mini-batch generator over the training set
train_generator = cifar_generator(batch_size, x_train, y_train)

## 1. Zero Initialization

모든 Fully Connected Layer의 Weight들을 0으로 통일하였을 때, 어떤 식으로 학습되는지를 보고자 한다.

In [None]:
# Zero initialization: every dense kernel starts at 0
init_op = tf.initializers.zeros()
save_dir = os.path.join(log_dir, "zero-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


Loss가 늘 같은 값을 가진다. 모든 weight가 0이면 각 layer의 출력값이 모두 0이 되고, 그 결과 weight에 대한 gradient도 항상 0이 되어 weight가 전혀 변하지 않기 때문이다.

## 2. Constant Initialization

모든 Fully Connected Layer의 Weight들을 0.01으로 통일하였을 때, 어떤 식으로 학습되는지를 보고자 한다.

In [None]:
# Constant initialization: every dense kernel starts at 0.01
init_op = tf.initializers.constant(0.01)
save_dir = os.path.join(log_dir, "constant-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


모든 weight가 같은 값으로 초기화된다면, gradient가 layer별 모든 weight에 똑같이 적용되기 때문에, 사실상 1개의 weight만 존재하는 것과 같은 효과를 가진다. 그래서 전형적인 underfitting의 그래프를 그린다.

## 2. Normal distribution Initialization

모든 Fully Connected Layer의 Weight들을 정규분포(표준편차 0.01, 0.05, 0.1)에서 무작위로 추출하여 초기화하였을 때, 초기 학습이 어떤 식으로 동작하는지 보고자 한다.

### 표준편차가 0.01인 경우

In [None]:
# Normal-distribution initialization with stddev 0.01
init_op = tf.initializers.random_normal(stddev=0.01)
save_dir = os.path.join(log_dir, "normal-distribution_with_0.01_init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


### 표준편차가 0.05인 경우

In [None]:
# Normal-distribution initialization with stddev 0.05
init_op = tf.initializers.random_normal(stddev=0.05)
save_dir = os.path.join(log_dir, "normal-distribution_with_0.05_init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


### 표준편차가 0.1인 경우

In [None]:
# Normal-distribution initialization with stddev 0.1
init_op = tf.initializers.random_normal(stddev=0.1)
save_dir = os.path.join(log_dir, "normal-distribution_with_0.1_init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)

## 3. Uniform Distribution Initialization

### 최소/최대값이 ±0.01인 경우 (범위: -0.01 ~ 0.01)

In [None]:
# Uniform-distribution initialization over [-0.01, 0.01)
init_op = tf.initializers.random_uniform(minval=-0.01, maxval=0.01)
save_dir = os.path.join(log_dir, "uniform-distribution-with-0.01-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


### 최소/최대값이 ±0.05인 경우 (범위: -0.05 ~ 0.05)

In [None]:
# Uniform-distribution initialization over [-0.05, 0.05)
init_op = tf.initializers.random_uniform(minval=-0.05, maxval=0.05)
save_dir = os.path.join(log_dir, "uniform-distribution-with-0.05-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)

### 최소/최대값이 ±0.1인 경우 (범위: -0.1 ~ 0.1)

In [None]:
# Uniform-distribution initialization over [-0.1, 0.1)
init_op = tf.initializers.random_uniform(minval=-0.1, maxval=0.1)
save_dir = os.path.join(log_dir, "uniform-distribution-with-0.1-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)

### 적절한 표준편차를 정해주는 것은 어렵다.

(현재 정확히 왜 문제가 되는지 근거가 부족합니다, 작았을 때와 컸을 때의 문제를 서술하면 좋습니다.)
* 표준편차가 너무 크다면? -> <근거>
* 표준편차가 너무 작다면? -> <근거> 

각 layer 별로 적절한 표준편차를 잡아나가야 한다.

-----------

## 4. He Initialization

In [None]:
# He (normal) initialization
init_op = tf.initializers.he_normal()
save_dir = os.path.join(log_dir, "he-init/")

# Build the graph, then train it while logging summaries under save_dir
graph = build_graph(init_op=init_op)
train_cifar_model(graph=graph, save_dir=save_dir,
                  train_generator=train_generator,
                  test_x=x_test, test_y=y_test)


## 5. Glorot Initialization

In [None]:
# Build graph with Glorot (normal) initialization.
# build_graph already registers the loss / accuracy / kernel summaries, so no
# separate summary-attaching step is needed (the previous call to an
# undefined `attach_summary` helper raised NameError).
init_op = tf.initializers.glorot_normal()
graph = build_graph(init_op)

# Train model
save_dir = os.path.join(log_dir,"glorot-init/")
train_cifar_model(graph, save_dir, train_generator, x_test, y_test)

