# Import libraries

In [1]:
import numpy as np
import tensorflow as tf
#import matplotlib.pyplot as plt

# Topics
> ## 1. Build a model
>> ### 1.1 Create placeholder
>>> #### Coding 1: Create integer placeholder

>> ### 1.2 One-hot encoding
>> ### 1.3 Define the weight of CNN
>> ### 1.4 Understanding the blocks/layers
>>> #### Coding 2: Add some layers in the model


> ## 2. Optimization -- Gradient Descent
>> ### 2.1 Loss function and Learning rate
>>> #### Coding 3: Tune the learning rate on loss function

>> ### 2.2 Minibatch

> ## 3. Pre-trained model -- YOLOv3, ResNet (jupyter_2)
> ## 4. Training small dataset, using Tensorboard (jupyter_3)

# 1. Build a model
## 1.1 Create placeholder: 
> ## Pre-allocate the dimensions of the inputs and outputs
![Placeholder](https://imgur.com/lTf4ehx.png)

In [2]:
def create_placeholders(n_H0, n_W0, n_C0, n_y, n_state):
    """Create the three graph inputs used by this notebook.

    Args:
        n_H0: height of an input picture.
        n_W0: width of an input picture.
        n_C0: number of channels of an input picture.
        n_y: width of the output-class placeholder.
        n_state: width of the input-state placeholder.

    Returns:
        Tuple ``(X, Y, Z)``:
        X -- float32 placeholder of shape [None, n_H0, n_W0, n_C0] (input picture),
        Y -- int32 placeholder of shape [None, n_y] (output class),
        Z -- int32 placeholder of shape [None, n_state] (input state).
        The leading ``None`` leaves the first (batch) dimension unspecified.
    """
    picture_shape = [None, n_H0, n_W0, n_C0]
    class_shape = [None, n_y]
    state_shape = [None, n_state]

    # Keep the X, Y, Z creation order: the expected output below relies on the
    # automatic names Placeholder, Placeholder_1 and Placeholder_2.
    X = tf.placeholder(tf.float32, picture_shape)
    Y = tf.placeholder(tf.int32, class_shape)

    ### Coding 1: Create integer placeholder on input_state
    ### START CODE HERE ### (1 line)
    Z = tf.placeholder(tf.int32, state_shape)
    ### END CODE HERE ###

    return X, Y, Z

In [3]:
# Sizes handed to create_placeholders in the next cell.
# Picture height/width: 256 is used here; 64 and 32 are smaller values to try.
pic_height = 256 #64 #32
pic_width = 256 #64 #32
# Number of channels per picture.
pic_channel = 3
# Width of the output-class placeholder (n_y).
output_class = 6
# Width of the input-state placeholder (n_state).
input_state = 4

In [4]:
# Build the three placeholders from the sizes configured above, then print them.
# In the printed shapes, "?" is the unspecified (None) first dimension.
pic_placehold, action_placehold, state_placehold = create_placeholders(pic_height, pic_width, pic_channel, output_class, input_state)
print ("pic_placehold = " + str(pic_placehold))
print ("action_placehold = " + str(action_placehold))
print ("state_placehold = " + str(state_placehold))

pic_placehold = Tensor("Placeholder:0", shape=(?, 256, 256, 3), dtype=float32)
action_placehold = Tensor("Placeholder_1:0", shape=(?, 6), dtype=int32)
state_placehold = Tensor("Placeholder_2:0", shape=(?, 4), dtype=int32)


**Expected Output**

<table> 
<tr>
<td>
    pic_placehold = Tensor("Placeholder:0", shape=(?, 256, 256, 3), dtype=float32)

</td>
</tr>
<tr>
<td>
    action_placehold  = Tensor("Placeholder_1:0", shape=(?, 6), dtype=int32)

</td>
</tr>
<tr>
<td>
    state_placehold  = Tensor("Placeholder_2:0", shape=(?, 4), dtype=int32)
        
</td>
</tr>
</table>

## 1.2 One-hot encoding
> ### ![one-hot](https://imgur.com/jCjGQgi.jpg)

In [26]:
# Number of distinct actions; used as the one-hot depth in the next cell.
# Mapping of action id to meaning:
# action = {0:"right", 1:"left", 2:"front", 3:"back", 4:"up", 5:"down"}
action_type = 6

In [33]:
# One-hot encode the action input with depth action_type.
# NOTE(review): action_placehold was created with shape (?, 6), so one-hot
# encoding it adds another axis of size action_type on top of those 6 columns.
# If the placeholder is meant to carry integer action ids, it presumably
# should have a single column -- confirm the intended label format.
action_onehot = tf.one_hot(action_placehold , action_type)
# Uncomment to inspect the resulting tensor:
#print(action_onehot)

## 1.3 Define the weight of CNN: 
> ### Filter size and Channel (Create initialized parameters)
>> ## Filter example
![Sobel filter](https://imgur.com/YaAq4q7.png)
>> ## After filter
![Human](https://imgur.com/C8XqlHm.png)
>> ## Channel
![RGB](https://imgur.com/ygKlPRO.png)

In [26]:
def initialize_parameters():
    """Create the two CNN filter variables used in this section.

    Sets the graph-level random seed to 1, then creates
    W1 (4x4 filter, 3 input channels, 8 output channels) and
    W2 (2x2 filter, 8 input channels, 16 output channels),
    each with a Xavier initializer seeded with 0.

    Returns:
        dict with keys "W1" and "W2" mapping to the created variables.
    """
    # so that your "random" numbers match ours
    tf.set_random_seed(1)

    # (variable name, [filter size, filter size, input_channel, output_channel])
    filter_specs = (
        ("W1", [4, 4, 3, 8]),
        ("W2", [2, 2, 8, 16]),
    )

    # Created in W1, W2 order; each variable gets its own initializer instance.
    parameters = {}
    for var_name, var_shape in filter_specs:
        xavier = tf.contrib.layers.xavier_initializer(seed = 0)
        parameters[var_name] = tf.get_variable(var_name, var_shape, initializer=xavier)

    return parameters

In [78]:
# Have a look inside the weights.
# The demo runs in its own throw-away graph instead of calling
# tf.reset_default_graph(): resetting the default graph would orphan the
# placeholders created in section 1.1, and later cells that combine them with
# new variables would then mix tensors from two different graphs.
# A fresh graph also lets this cell be re-run without "W1"/"W2" already existing.
with tf.Graph().as_default():
    with tf.Session() as sess_test:
        parameters = initialize_parameters()
        init = tf.global_variables_initializer()
        sess_test.run(init)
        # [1,1,1] selects filter row 1, column 1, input channel 1 -> one value per output channel
        print("W1_first graph_channel values = " + str(parameters["W1"].eval()[1,1,1]))
        print("W2_first graph_channel values = " + str(parameters["W2"].eval()[1,1,1]))

W1_first graph_channel values = [ 0.00131723  0.14176141 -0.04434952  0.09197326  0.14984085 -0.03514394
 -0.06847463  0.05245192]
W2_first graph_channel values = [-0.08566415  0.17750949  0.11974221  0.16773748 -0.0830943  -0.08058
 -0.00577033 -0.14643836  0.24162132 -0.05857408 -0.19055021  0.1345228
 -0.22779644 -0.1601823  -0.16117483 -0.10286498]


**Expected Output:**

<table> 

    <tr>
        <td>
        W1_first graph_channel values = 
        </td>
        <td>
[ 0.00131723  0.14176141 -0.04434952  0.09197326  0.14984085 -0.03514394 <br>
 -0.06847463  0.05245192]
        </td>
    </tr>

    <tr>
        <td>
        W2_first graph_channel values = 
        </td>
        <td>
[-0.08566415  0.17750949  0.11974221  0.16773748 -0.0830943  -0.08058 <br>
 -0.00577033 -0.14643836  0.24162132 -0.05857408 -0.19055021  0.1345228 <br>
 -0.22779644 -0.1601823  -0.16117483 -0.10286498]
        </td>
    </tr>

</table>

## 1.4 Understanding the blocks/layers 
> ### CNN: strides, channel
> ### CNN to NN: Flatten()
> ### NN:  Fully_Connected()

> ### Some definitions of functions 
>> (see the TensorFlow API documentation for the other functions used: https://www.tensorflow.org/api_docs/python/tf)

> ## Hint for 1.4
- ## <span style="color:red">Graphic operations: CNN</span>
- ### tf.nn.conv2d()
- ### tf.nn.max_pool()
- ### tf.contrib.layers.flatten()
- ### tf.contrib.layers.fully_connected()

In [35]:
def conv_block(tensor_in, channel_in, channel_out, filter_height, filter_width, strides_height, strides_width, maxPool_height=2, maxPool_width=2):
    """Build one Conv -> ReLU -> max-pool block and print a table row per layer.

    Args:
        tensor_in: input tensor; indexed as [batch, height, width, channel] by the ops below.
        channel_in: number of input channels of the filter.
        channel_out: number of output channels of the filter (and size of the bias).
        filter_height, filter_width: spatial size of the convolution filter.
        strides_height, strides_width: convolution strides along height and width.
        maxPool_height, maxPool_width: pooling window size, also used as the pooling stride.

    Returns:
        The max-pooled tensor.
    """
    def print_row(row_format, tensor):
        # One table row: height, width, channel, then the full shape.
        dims = tensor.shape
        print(row_format % (dims[1], dims[2], dims[3], dims))

    tf.set_random_seed(1)

    # Filter weights and one bias per output channel.
    filter_shape = [filter_height, filter_width, channel_in, channel_out]
    w = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b = tf.Variable(tf.constant(0.1, shape=[channel_out]))

    conv_strides = [1, strides_height, strides_width, 1]
    conv = tf.nn.conv2d(tensor_in, w, strides=conv_strides, padding='SAME')
    print_row("Conv\t\t%d\t%d\t%d\t\t| %s", conv)

    # activate each node, with the bias added first
    activate_func = tf.nn.relu(conv + b)
    print_row("ReLU\t\t%d\t%d\t%d\t\t| %s", activate_func)

    # The pooling window doubles as the pooling stride (non-overlapping windows).
    pool_window = [1, maxPool_height, maxPool_width, 1]
    maxPool = tf.nn.max_pool(activate_func, ksize=pool_window, strides=pool_window, padding='SAME')
    print_row("max_pool\t%d\t%d\t%d\t\t| %s", maxPool)
    return maxPool

In [84]:
# Print the table header, then the shape of the input placeholder as the first row.
s = "---------------------------------------------------------------"
print(s)
print("Layer\t\tHeight\tWidth\tChannel\tNode\t| What you will see")
print(s)
print("Input\t\t%d\t%d\t%d\t\t| %s" %(pic_placehold.shape[1],pic_placehold.shape[2],pic_placehold.shape[3],pic_placehold.shape))

### Coding 2: Add some layers in the model
### Step 1: Tuning the stride for the first CNN layer
### Tuning part:
strides_height = 2
strides_width = 2
###
# First block: 3 input channels -> 8 output channels, 4x4 filter.
# conv_block prints the Conv / ReLU / max_pool rows of the table.
conv1 = conv_block(pic_placehold, 3, 8, 4, 4, strides_height, strides_width)

---------------------------------------------------------------
Layer		Height	Width	Channel	Node	| What you will see
---------------------------------------------------------------
Input		256	256	3		| (?, 256, 256, 3)
Conv		128	128	8		| (?, 128, 128, 8)
ReLU		128	128	8		| (?, 128, 128, 8)
max_pool	64	64	8		| (?, 64, 64, 8)


**Step 1: Expected Output**
<table>
    <tr>
        <td>
            Input | (?, 256, 256, 3)
        </td>
    </tr>
    <tr>
        <td>
            Conv | (?, 128, 128, 8)
        </td>
    </tr>
    <tr>
        <td>
            ReLU | (?, 128, 128, 8)
        </td>
    </tr>
    <tr>
        <td>
            max_pool | (?, 64, 64, 8)
        </td>
    </tr>
</table>

In [82]:
### Step 2: Tuning the stride for the second CNN layer + tune the channel
### Tuning part: divisible stride
# Reference values: they reproduce the expected output shown below this cell.
# Both strides divide the 64x64 output of Step 1 evenly (64/4 = 16, 64/2 = 32).
strides_height = 4
strides_width = 2
# channel_in must match the channel count of conv1 (8 output channels in Step 1).
channel_in = 8
channel_out = 16
###
conv2 = conv_block(conv1, channel_in, channel_out, 4, 4, strides_height, strides_width)

Conv		16	32	16		| (?, 16, 32, 16)
ReLU		16	32	16		| (?, 16, 32, 16)
max_pool	8	16	16		| (?, 8, 16, 16)


**Step 2: Expected Output**
<table>
    <tr>
        <td>
            Conv | (?, 16, 32, 16)
        </td>
    </tr>
    <tr>
        <td>
            ReLU | (?, 16, 32, 16)
        </td>
    </tr>
    <tr>
        <td>
            max_pool | (?, 8, 16, 16)
        </td>
    </tr>
</table>

In [83]:
### Step 3: Tuning the stride + build the third layer of CNN
### Tuning part: non-divisible stride
# Reference values: they reproduce the expected output shown below this cell.
# A width stride of 3 does not divide conv2's width of 16; with 'SAME'
# padding the output width is rounded up (16/3 -> 6).
strides_height = 2
strides_width = 3
###
# Third block: 16 input channels (conv2's output) -> 32 output channels, 4x4 filter.
conv3 = conv_block(conv2, 16, 32, 4, 4, strides_height, strides_width)

Conv		4	6	32		| (?, 4, 6, 32)
ReLU		4	6	32		| (?, 4, 6, 32)
max_pool	2	3	32		| (?, 2, 3, 32)


**Step 3: Expected Output**
<table>
    <tr>
        <td>
            Conv | (?, 4, 6, 32)
        </td>
    </tr>
    <tr>
        <td>
            ReLU | (?, 4, 6, 32)
        </td>
    </tr>
    <tr>
        <td>
            max_pool | (?, 2, 3, 32)
        </td>
    </tr>
</table>

## Questions
### 1. How do "Step 1", "Step 2" and "Step 3" work?
> #### CNN operation
>> ![CNN](https://imgur.com/FIy5Ou4.gif)

> #### Max Pooling operation (keep the largest number in each window)
>> ![Max Pool](https://imgur.com/ec0zNkC.png)

### 2. Why do we pass pic_placehold to conv_block in "Step 1"?

In [None]:
# TODO: give a correct 3-layer example

# TODO (channel checkpoint): assign arbitrary channels across 5 layers, keep layers 2 and 4 fixed, and ask what the input and output channels should be


In [68]:
def fc_block(tensor_in, node_out, activate_func):
    """Build one fully connected layer and print its row(s) of the layer table.

    Computes ``tensor_in @ w + b`` with a freshly created weight matrix and
    bias, then applies ``activate_func`` if one is given.

    Args:
        tensor_in: 2-D input tensor; its second dimension must be statically known.
        node_out: number of output nodes.
        activate_func: ``None`` for a purely linear layer, or a callable applied
            to the affine output (e.g. ``tf.nn.relu``, ``tf.nn.softmax``).
            Any callable is accepted; previously anything other than
            None/relu/softmax crashed with UnboundLocalError.

    Returns:
        The layer output tensor (after activation, if any).
    """
    # We do not care about variable reuse for now, so the variables are unnamed
    node_in = int(tensor_in.shape[1])
    w = tf.Variable(tf.random_normal([node_in, node_out], stddev=0.35))
    b = tf.Variable(tf.zeros([node_out]))

    # Library alternative for the whole layer:
    # tf.contrib.layers.fully_connected(tensor_in, num_outputs=node_out, activation_fn=activate_func)
    fully_connect = tf.add(tf.matmul(tensor_in, w), b)
    print("fully_connect\t\t\t\t%d\t| %s" % (fully_connect.shape[1],fully_connect.shape))

    if activate_func is None:
        return fully_connect

    # Row label for the activation; unknown callables fall back to their own name.
    if activate_func == tf.nn.relu:
        label = "ReLU"
    elif activate_func == tf.nn.softmax:
        label = "Softmax"
    else:
        label = getattr(activate_func, "__name__", "activation")

    activated = activate_func(fully_connect)
    print("%s\t\t\t\t\t%d\t| %s" % (label, activated.shape[1], activated.shape))
    return activated

In [87]:
# Mix CNN and NN layers into one model
s = "---------------------------------------------------------------"
print(s)
print("Layer\t\tHeight\tWidth\tChannel\tNode\t| What you will see")
print(s)
print("Input\t\t%d\t%d\t%d\t\t| %s" %(pic_placehold.shape[1],pic_placehold.shape[2],pic_placehold.shape[3],pic_placehold.shape))

### Step 4: Select max_pool size + Build 2 CNN layer + Select any filter size
# Reference solution matching the expected output below. Without these two
# lines the cell silently reuses conv2 from Step 2 and the printed shapes differ.
conv1 = conv_block(pic_placehold, 3, 16, 4, 4, 2, 2, 4, 1)
conv2 = conv_block(conv1, 16, 64, 2, 2, 1, 1)
###

# Flatten height x width x channel into one axis so the dense layers can consume it
# (library alternative: tf.contrib.layers.flatten(conv2)).
flat = tf.reshape(conv2, [-1, int(conv2.shape[1])*int(conv2.shape[2])*int(conv2.shape[3])])
print("flatten\t\t\t\t\t%d\t| %s" %(flat.shape[1],flat.shape))

fc_layer1 = fc_block(flat, 256, None)
fc_layer2 = fc_block(fc_layer1, 128, tf.nn.relu)
# Feed the softmax from fc_layer2; it was previously wired to fc_layer1,
# which left fc_layer2 disconnected from the prediction.
predicted_prob = fc_block(fc_layer2, output_class, tf.nn.softmax)

---------------------------------------------------------------
Layer		Height	Width	Channel	Node	| What you will see
---------------------------------------------------------------
Input		256	256	3		| (?, 256, 256, 3)
Conv		128	128	16		| (?, 128, 128, 16)
ReLU		128	128	16		| (?, 128, 128, 16)
max_pool	32	128	16		| (?, 32, 128, 16)
Conv		32	128	64		| (?, 32, 128, 64)
ReLU		32	128	64		| (?, 32, 128, 64)
max_pool	16	64	64		| (?, 16, 64, 64)
flatten					65536	| (?, 65536)
fully_connect				256	| (?, 256)
fully_connect				128	| (?, 128)
ReLU					128	| (?, 128)
fully_connect				6	| (?, 6)
Softmax					6	| (?, 6)


**Step 4: Expected Output**
<table>
    <tr>
        <td>
            Input | (?, 256, 256, 3)
        </td>
    </tr>
    <tr>
        <td>
            Conv | (?, 128, 128, 16)
        </td>
    </tr>
    <tr>
        <td>
            ReLU | (?, 128, 128, 16)
        </td>
    </tr>
    <tr>
        <td>
            max_pool | (?, 32, 128, 16)
        </td>
    </tr>
    <tr>
        <td>
            Conv | (?, 32, 128, 64)
        </td>
    </tr>
    <tr>
        <td>
            ReLU | (?, 32, 128, 64)
        </td>
    </tr>
    <tr>
        <td>
            max_pool | (?, 16, 64, 64)
        </td>
    </tr>
</table>

# 2. Optimization -- Gradient Descent
## 2.1 Loss function and Learning rate
> ### Gradient Descent: update the weight by learning_rate × slope (w ← w − learning_rate × slope)
>> ### 1. Loss function: (formula...)
![Optimization](https://imgur.com/LXBjfLb.png)
>> ### 2. Learning rate
![Learning rate](https://imgur.com/J8U8fu9.jpg)

> ## Meaning of loss function
>> ### The error between the "predicted" and the "actual" values ("預測"和"實際"誤差)

> ## Optimization Problems
>> ### 1. Local minimum (局部極小值)
>> ### 2. Saddle point

> ## Overfit

In [72]:
# Coefficients fed into placeholder x at run time. With [1, -10, 25] the cost
# below becomes w**2 - 10*w + 25 = (w - 5)**2.
coefficients = np.array([[1.],[-10.],[25.]])
# The single trainable weight, starting at 0.
w = tf.Variable(0,dtype=tf.float32)
# Placeholder for a 3x1 column of coefficients.
x = tf.placeholder(tf.float32,[3,1])

# The cost could also be spelled with explicit ops, along the lines of:
# cost = tf.add(tf.add(w**2,tf.multiply(-10.,w),25))
# Operator overloading lets us write ordinary arithmetic on tensors instead,
# with the coefficients taken from placeholder x (the input):
cost = x[0]*w**2 + x[1]*w + x[2]
# Alternative costs to experiment with.
# NOTE(review): the bracketed lists look like suggested coefficient values, and
# the x[3] variant would need a 4x1 placeholder -- confirm before enabling.
#[1,2]
#[2,4,1]
#cost = x[0]*w**4 + x[1]*w**3 + x[2]*w**2
#[5,8,3,1]
#cost = x[0]*w**6+ x[1]*w**5+ x[2]*w**2+ x[3]*w
# Broadcasting variant:
#cost = w*x # (1*3)*(3*1)

In [73]:
# Play with values between 0.9 and 0.01 and watch how the update size changes.
learning_rate = 0.01
train = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

# First way -- manage the session by hand:
#     session = tf.Session()
#     session.run(init)
#     print(session.run(w))
# Second way (used below) -- a `with` block, which also closes the session.
print("--------------------------------------------")
print("lr=%f\tweight\t\tvalue update" %learning_rate)
print("--------------------------------------------")
with tf.Session() as session:
    # initialize
    session.run(init)
    # wf holds the weight from the previous step so the per-step change can be printed
    wf = session.run(w)
    print("Initialize:\t%f" % wf)
    for i in range(100):
        # one gradient-descent step with the coefficients fed into x
        session.run(train, feed_dict={x: coefficients})
        # Read the weight once per step (it was evaluated three times before)
        # and print it next to its change; think about the curve and its lowest point.
        w_now = session.run(w)
        print("Epoch %d:\t%f\t%f" % (i+1, w_now, w_now - wf))
        wf = w_now

--------------------------------------------
lr=0.010000	weight		value update
--------------------------------------------
Initialize:	0.000000
Epoch 1:	0.100000	0.100000
Epoch 2:	0.198000	0.098000
Epoch 3:	0.294040	0.096040
Epoch 4:	0.388159	0.094119
Epoch 5:	0.480396	0.092237
Epoch 6:	0.570788	0.090392
Epoch 7:	0.659372	0.088584
Epoch 8:	0.746185	0.086813
Epoch 9:	0.831261	0.085076
Epoch 10:	0.914636	0.083375
Epoch 11:	0.996343	0.081707
Epoch 12:	1.076416	0.080073
Epoch 13:	1.154888	0.078472
Epoch 14:	1.231790	0.076902
Epoch 15:	1.307155	0.075364
Epoch 16:	1.381011	0.073857
Epoch 17:	1.453391	0.072380
Epoch 18:	1.524323	0.070932
Epoch 19:	1.593837	0.069514
Epoch 20:	1.661960	0.068123
Epoch 21:	1.728721	0.066761
Epoch 22:	1.794147	0.065426
Epoch 23:	1.858264	0.064117
Epoch 24:	1.921098	0.062835
Epoch 25:	1.982676	0.061578
Epoch 26:	2.043023	0.060346
Epoch 27:	2.102162	0.059139
Epoch 28:	2.160119	0.057957
Epoch 29:	2.216917	0.056798
Epoch 30:	2.272578	0.055662
Epoch 31:	2.327127	0.0545