In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
import math

from collections import deque
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os
os.chdir('C:\\code\\activ')

In [3]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

np.set_printoptions(precision=6, suppress=True)

In [13]:
goal = np.array([0 for i in range(21)]).reshape(1, 21)
goal[0][0] = 1

In [14]:
start = np.array([0 for i in range(21)]).reshape(1, 21)

In [16]:
print(goal)
print(start)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [11]:
# dqn paramater
GAMMA = 0.9
BATCH_SIZE = 64
TRAIN_FLAG = 4000
EPISODE_DONE = 100
EPS_DECAY = 0.99

In [18]:
def return_action(i):
    a = np.zeros((1, 21))
    j = i // 2

    if i % 2 == 0:
        a[0][j] = -0.1
    
    else:
        a[0][j] = 0.1
    
    return a

In [19]:
def return_state(s, a):
    ns = s + a
    return ns

In [20]:
def return_reward(ns, gs):
    dist = np.sqrt(np.sum(np.square(gs - ns)))
    loss = dist
    
    return loss

In [21]:
class DQN_Network(tf.keras.models.Model):
    def __init__(self):
        super(DQN_Network, self).__init__()
        self.input_layer = tf.keras.layers.Dense(128, input_shape=(21, ), activation='relu')

        self.hidden_layer = tf.keras.models.Sequential()
        self.hidden_layer.add(tf.keras.layers.Dense(128, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(128, activation='relu'))

        self.ouput_layer = tf.keras.layers.Dense(42, activation='linear')

    def call(self, x):
        i = self.input_layer(x)
        h = self.hidden_layer(i)
        o = self.ouput_layer(h)
        return o

In [26]:
class DQN_Agent:
    def __init__(self):
        self.train_model = self.set_model()
        self.target_model = self.set_model()

        self.p_memory = deque(maxlen=100000)
        self.n_memory = deque(maxlen=100000)
        self.episode = 1

        self.optim = tf.keras.optimizers.Adam(learning_rate=1e-10)
        self.loss_fn = tf.keras.losses.Huber()

    def set_model(self):
        net = DQN_Network()
        net.build(input_shape=(1, 21))

        optim = tf.keras.optimizers.Adam(learning_rate=1e-10)
        net.compile(optimizer=optim, loss=tf.keras.losses.Huber())
        return net

    def update_model(self):
        self.target_model.set_weights(self.train_model.get_weights())

    def memorize(self, cs, a_i, r, ns, d, sign):
        if d:
            self.episode += 1

        if sign == 0:
            self.n_memory.append(
                (
                    tf.convert_to_tensor(tf.cast(cs, tf.float32)),
                    a_i,
                    tf.convert_to_tensor(tf.cast(r, tf.float32)),
                    tf.convert_to_tensor(tf.cast(ns, tf.float32)),
                    d
                )
            )

        if sign == 1:
            self.p_memory.append(
                (
                    tf.convert_to_tensor(tf.cast(cs, tf.float32)),
                    a_i,
                    tf.convert_to_tensor(tf.cast(r, tf.float32)),
                    tf.convert_to_tensor(tf.cast(ns, tf.float32)),
                    d
                )
            )

    def convert_memory_to_input(self):
        n_batch = rand.sample(self.n_memory, BATCH_SIZE/2)
        p_batch = rand.sample(self.p_memory, BATCH_SIZE/2)

        batch = [sum(n_batch, []), sum(p_batch, [])]
        s, a_i, r, ns, d = zip(*batch)

        states = tf.convert_to_tensor(s).reshape(BATCH_SIZE, 21)
        action_indexs = tf.convert_to_tensor(a_i)
        rewards = tf.convert_to_tensor(r)
        next_states = tf.convert_to_tensor(ns).reshape(BATCH_SIZE, 21)
        dones = tf.convert_to_tensor(d)

        return states, action_indexs, rewards, next_states, dones

    def act(self, state):
        # if self.episode >= 0 and self.episode < 20:
        #     eps_threshold = 0.991 ** self.episode
        # else:
        #     eps_threshold = EPS_DECAY ** self.episode

        eps_threshold = 0.05 + (1 - 0.05) * math.exp(-1. * self.episode / 100)

        a_r = np.array(self.train_model(state))[0]

        if rand.random() > eps_threshold:
            a_i = np.argmin(a_r)
            c = 1

        else:
            a_i = rand.randint(0, 41)
            c = 0

        a = return_action(a_i)

        return a, a_i, c, eps_threshold

    def run(self):
        if len(self.n_memory) < TRAIN_FLAG or len(self.p_memory) < TRAIN_FLAG:
            return 1

        states, action_indexs, rewards, next_states, dones = self.convert_memory_to_input()
        loss = self.learn(states, action_indexs, rewards, next_states, dones)
    
        return loss.numpy()
        
    @tf.function
    def learn(self, states, action_indexs, rewards, next_states, dones):
        q_target = self.target_model(next_states)
        target_q = rewards + (1 - dones) * GAMMA * tf.reduce_min(q_target, axis=1, keepdims=True)

        with tf.GradientTape() as tape:
            current_q = self.train_model(states) # 현재 상황에서 할 수 있는 행동들의 q value
            current_q = tf.reduce_sum(current_q[action_indexs], axis=1, keepdims=True) # 실제 한 행동에 대한 q value

            loss = self.loss_fn(current_q, target_q)

        grads = tape.gradient(loss, self.train_model.trainable_weights)
        self.optim.apply_gradients(zip(grads, self.train_model.trainable_weights))

        return loss

In [27]:
agent = DQN_Agent()
rewards_hist = []
st_hist = []

for e in range(5000):
    counter = [0 for i in range(42)]
    state = start
    steps = 0
    rewards = 0
    c = 0

    if e % 200 == 0:
        agent.update_model()
        print("===update===")

    while True:
        action, idx, t, eps = agent.act(state)
        counter[idx] += 1
        c += t
        next_state = return_state(state, action)

        if reward < return_reward(next_state, goal):
            sign = 0
        else:
            sign = 1

        reward = return_reward(next_state, goal)

        if steps == EPISODE_DONE or all(state[0][i] == goal[0][i] for i in range(21)):
            done = 1
        else:
            done = 0

        agent.memorize(state, idx, reward, next_state, done, sign)
        loss = agent.run()
        
        state = next_state
        rewards += reward
        steps += 1

        # if steps == 1:
        #     print(f'steps: {steps}, reward: {reward}, a: {idx}')

        if done:
            rewards_hist.append(rewards)
            st_hist.append(state)
            print(f'============={e}=============')
            print(f"rewards: {round(rewards, 3)}, net_loss: {round(loss, 3)}, number of most decision: {max(counter)}, desicion tendecy: {c}, eps: {round(eps, 5)}")
            print(reward)

            break

===update===
rewards: 2335.212, net_loss: 1, number of most decision: 35, desicion tendecy: 11, eps: 0.99055
3.606937759374287
rewards: 2495.564, net_loss: 1, number of most decision: 37, desicion tendecy: 15, eps: 0.98119
3.769615364994153
rewards: 2855.859, net_loss: 1, number of most decision: 43, desicion tendecy: 24, eps: 0.97192
4.661544808322667
rewards: 2138.7, net_loss: 2.937000036239624, number of most decision: 42, desicion tendecy: 38, eps: 0.96275
3.410278580995987
rewards: 2632.747, net_loss: 3.009000062942505, number of most decision: 39, desicion tendecy: 55, eps: 0.95367
4.0853396431631
rewards: 2690.365, net_loss: 2.9489998817443848, number of most decision: 39, desicion tendecy: 51, eps: 0.94468
4.419275958796872
rewards: 2836.938, net_loss: 2.8350000381469727, number of most decision: 52, desicion tendecy: 59, eps: 0.93577
4.629254799641084
rewards: 3919.622, net_loss: 2.819000005722046, number of most decision: 52, desicion tendecy: 73, eps: 0.92696
5.3581713298475