In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
import math

from collections import deque
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os
# NOTE(review): hard-coded absolute Windows path. os.chdir raises if the
# directory does not exist, so this cell fails on any machine without
# C:\code\activ. Consider a configurable/relative project directory.
os.chdir('C:\\code\\activ')

In [3]:
from tensorflow.python.ops.numpy_ops import np_config
# Patch tf.Tensor with NumPy-style methods (e.g. `.reshape`), NumPy dtype
# promotion and NumPy-like indexing.
# NOTE(review): this is imported from a private TensorFlow module path; the
# public entry point is tf.experimental.numpy.experimental_enable_numpy_behavior().
np_config.enable_numpy_behavior()

# Print NumPy arrays with 6 decimal places and without scientific notation.
np.set_printoptions(precision=6, suppress=True)

In [4]:
# Goal state: a (1, 21) integer vector whose first component is 1, rest 0.
goal = np.zeros((1, 21), dtype=int)
goal[0, 0] = 1

In [5]:
# Start state: the (1, 21) integer zero vector.
start = np.zeros((1, 21), dtype=int)

In [6]:
# Show the goal and start vectors, one per line.
print(goal, start, sep='\n')

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [7]:
# DQN hyperparameters
GAMMA = 0.9  # discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 32  # transitions sampled from each replay buffer per training step
TRAIN_FLAG = 64  # minimum size both replay buffers must reach before training runs
EPISODE_DONE = 100  # step-count threshold that ends an episode
EPS_DECAY = 0.99  # NOTE(review): not read by any active code in the visible cells

In [8]:
def return_action(i):
    """Build the (1, 21) action vector for discrete action index ``i``.

    Index ``i`` addresses component ``i // 2`` of the vector: an even index
    steps that component by -0.1, an odd index by +0.1. Every other
    component stays 0.
    """
    component, is_odd = divmod(i, 2)

    a = np.zeros((1, 21))
    a[0, component] = 0.1 if is_odd else -0.1
    return a

In [9]:
def return_state(s, a):
    """Return the state reached from ``s`` by adding the action vector ``a``."""
    return s + a

In [10]:
def return_reward(ns, gs):
    """Return the Euclidean distance between state ``ns`` and goal ``gs``.

    Despite the name, the value is a cost: it is 0 when ``ns`` equals ``gs``
    and grows as the state moves away from the goal.
    """
    offset = gs - ns
    squared_total = np.sum(np.square(offset))
    return np.sqrt(squared_total)

In [11]:
class DQN_Network(tf.keras.models.Model):
    """Fully connected Q-network mapping a 21-dim state to 42 action values.

    Architecture: Dense(128, relu) -> Dense(128, relu) -> Dense(128, relu)
    -> Dense(42, linear).
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense

        self.input_layer = dense(128, input_shape=(21, ), activation='relu')
        self.hidden_layer = tf.keras.models.Sequential([
            dense(128, activation='relu'),
            dense(128, activation='relu'),
        ])
        # Attribute name (including its spelling) is kept unchanged so that
        # anything keyed on tracked attribute names keeps working.
        self.ouput_layer = dense(42, activation='linear')

    def call(self, x):
        """Run the forward pass and return one value per discrete action."""
        features = self.hidden_layer(self.input_layer(x))
        return self.ouput_layer(features)

In [14]:
class DQN_Agent:
    """DQN agent with two replay buffers and a separately synced target network.

    The network outputs are treated as costs: the greedy action is the
    arg-min of the predicted values and the bootstrap target uses the minimum
    next-state value.

    Transitions are stored in ``p_memory`` when ``sign == 1`` and in
    ``n_memory`` when ``sign == 0``; each training step learns from one batch
    of each buffer.
    """

    def __init__(self, learning_rate=1e-10):
        """Create the online/target networks, replay buffers and optimizer.

        Args:
            learning_rate: Adam learning rate used for the training optimizer
                and for compiling the networks.
        """
        # NOTE(review): the default of 1e-10 is kept for backward
        # compatibility. Adam's per-step update is on the order of the
        # learning rate, so with this default the weights barely move; pass a
        # larger value (e.g. 1e-3 .. 1e-4) to actually train.
        self.learning_rate = learning_rate

        self.train_model = self.set_model()
        self.target_model = self.set_model()

        self.p_memory = deque(maxlen=100000)
        self.n_memory = deque(maxlen=100000)
        self.episode = 1

        self.optim = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.loss_fn = tf.keras.losses.Huber()

    def set_model(self):
        """Build and compile a fresh ``DQN_Network`` for (1, 21) inputs."""
        net = DQN_Network()
        net.build(input_shape=(1, 21))

        optim = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        net.compile(optimizer=optim, loss=tf.keras.losses.Huber())
        return net

    def update_model(self):
        """Copy the online network weights into the target network."""
        self.target_model.set_weights(self.train_model.get_weights())

    def len_of(self):
        """Return ``(len(n_memory), len(p_memory))``."""
        return len(self.n_memory), len(self.p_memory)

    def memorize(self, cs, a_i, r, ns, d, sign):
        """Store one transition and advance the episode counter when done.

        Args:
            cs: current state.
            a_i: index of the action taken.
            r: value observed for the transition.
            ns: next state.
            d: truthy when the episode ended with this transition.
            sign: 0 stores into ``n_memory``, 1 stores into ``p_memory``;
                any other value stores nothing.
        """
        if d:
            self.episode += 1

        if sign == 0:
            memory = self.n_memory
        elif sign == 1:
            memory = self.p_memory
        else:
            return

        memory.append(
            (
                tf.cast(cs, tf.float32),
                # Plain int so a sampled batch of indices has one uniform dtype
                # regardless of whether it came from np.argmin or randint.
                int(a_i),
                tf.cast(r, tf.float32),
                tf.cast(ns, tf.float32),
                d,
            )
        )

    def convert_memory_to_input(self, batch):
        """Turn a list of stored transitions into batched tensors.

        Returns:
            ``(states, action_indexs, rewards, next_states, dones)`` where the
            state tensors have shape ``(len(batch), state_dim)`` and the other
            three have shape ``(len(batch),)``.
        """
        s, a_i, r, ns, d = zip(*batch)
        batch_len = len(s)

        # Stored states are (1, 21); flatten the singleton axis per sample.
        states = tf.reshape(tf.convert_to_tensor(s), (batch_len, -1))
        action_indexs = tf.convert_to_tensor(a_i)
        rewards = tf.convert_to_tensor(r)
        next_states = tf.reshape(tf.convert_to_tensor(ns), (batch_len, -1))
        dones = tf.convert_to_tensor(d)

        return states, action_indexs, rewards, next_states, dones

    def act(self, state):
        """Pick an action with an epsilon-greedy policy.

        Epsilon decays exponentially with the episode counter from 1.0
        towards 0.05.

        Returns:
            ``(a, a_i, c, eps_threshold)``: the action vector, its index,
            1 if the action was chosen greedily (0 if random), and the
            epsilon used.
        """
        eps_threshold = 0.05 + (1 - 0.05) * math.exp(-1. * self.episode / 100)

        if rand.random() > eps_threshold:
            # Only run the network when its output is actually used.
            a_r = np.array(self.train_model(state))[0]
            a_i = int(np.argmin(a_r))
            c = 1

        else:
            a_i = rand.randint(0, 41)
            c = 0

        a = return_action(a_i)

        return a, a_i, c, eps_threshold

    def run(self):
        """Run one training step on a batch from each replay buffer.

        Returns:
            The placeholder ``1`` while either buffer holds fewer than
            ``TRAIN_FLAG`` transitions; otherwise the loss (NumPy scalar) of
            the ``p_memory`` batch, which is trained last.
        """
        if len(self.n_memory) < TRAIN_FLAG or len(self.p_memory) < TRAIN_FLAG:
            return 1

        loss = None
        for memory in (self.n_memory, self.p_memory):
            batch = rand.sample(memory, BATCH_SIZE)
            loss = self.learn(*self.convert_memory_to_input(batch))

        return loss.numpy()

    @tf.function
    def learn(self, states, action_indexs, rewards, next_states, dones):
        """Apply one gradient step of the DQN update and return the loss.

        Args:
            states: ``(batch, state_dim)`` float tensor.
            action_indexs: ``(batch,)`` integer tensor of taken actions.
            rewards: ``(batch,)`` or ``(batch, 1)`` tensor.
            next_states: ``(batch, state_dim)`` float tensor.
            dones: ``(batch,)`` or ``(batch, 1)`` tensor of 0/1 flags.
        """
        # Column vectors, so the target is computed row by row instead of
        # broadcasting (batch,) against (batch, 1) into a (batch, batch) matrix.
        rewards = tf.reshape(tf.cast(rewards, tf.float32), (-1, 1))
        dones = tf.reshape(tf.cast(dones, tf.float32), (-1, 1))

        q_target = self.target_model(next_states)
        best_next_q = tf.reduce_min(q_target, axis=1, keepdims=True)
        target_q = rewards + (1.0 - dones) * GAMMA * best_next_q

        with tf.GradientTape() as tape:
            # Q values of every action available in the current states.
            all_q = self.train_model(states)
            # Q value of the action actually taken, selected per row with a
            # one-hot mask (reshaping would scramble the (batch, action) layout).
            action_mask = tf.one_hot(
                tf.cast(action_indexs, tf.int32), all_q.shape[-1], dtype=all_q.dtype
            )
            current_q = tf.reduce_sum(all_q * action_mask, axis=1, keepdims=True)

            # Keras losses take (y_true, y_pred).
            loss = self.loss_fn(target_q, current_q)

        grads = tape.gradient(loss, self.train_model.trainable_weights)
        self.optim.apply_gradients(zip(grads, self.train_model.trainable_weights))

        return loss

In [15]:
# Train the agent for 5000 episodes, logging a summary after each one.
agent = DQN_Agent()
rewards_hist = []  # accumulated per-episode sum of distances to the goal
st_hist = []       # final state of each episode

for e in range(5000):
    counter = [0] * 42  # how often each action index was chosen this episode
    state = start
    steps = 0
    reward = return_reward(state, goal)
    rewards = 0
    c = 0  # number of greedy (non-random) decisions this episode

    # Sync the target network every 200 episodes (including episode 0).
    if e % 200 == 0:
        agent.update_model()
        print("===update===")

    while True:
        action, idx, t, eps = agent.act(state)
        counter[idx] += 1
        c += t
        next_state = return_state(state, action)

        # sign 0: the step moved away from the goal; sign 1: it did not.
        next_reward = return_reward(next_state, goal)
        sign = 0 if reward < next_reward else 1
        reward = next_reward

        # The terminal test looks at the state just reached and uses a
        # tolerance: states are sums of 0.1 steps, which do not compare
        # exactly equal to the goal in floating point.
        reached_goal = np.allclose(next_state, goal, atol=1e-6)
        done = 1 if (steps == EPISODE_DONE or reached_goal) else 0

        agent.memorize(state, idx, reward, next_state, done, sign)
        loss = agent.run()

        state = next_state
        rewards += reward
        steps += 1

        if done:
            rewards_hist.append(rewards)
            st_hist.append(state)
            print(f'============={e}=============')
            print(f"rewards: {round(rewards, 3)}, net_loss: {round(loss, 3)}, number of most decision: {max(counter)}, desicion tendecy: {c}, eps: {round(eps, 5)}")
            print(reward)
            print(agent.len_of())

            break

===update===
rewards: 112.952, net_loss: 1, number of most decision: 6, desicion tendecy: 3, eps: 0.99055
1.3
(65, 36)
tf.Tensor(
[[-0.010304  0.018463 -0.000367 ...  0.041789 -0.084488  0.013527]
 [ 0.010655  0.016427  0.0108   ...  0.015083 -0.039764 -0.001033]
 [-0.005393  0.0068   -0.001461 ...  0.000693 -0.012059 -0.002953]
 ...
 [-0.013517  0.031307  0.002683 ...  0.04448  -0.088091  0.007215]
 [-0.030257  0.024665 -0.008627 ...  0.041527 -0.052116  0.002487]
 [-0.011403  0.028969  0.003707 ...  0.034716 -0.064551  0.001848]], shape=(32, 42), dtype=float32)
tf.Tensor(
[[-0.084488]
 [-0.061377]
 [-0.018317]
 [-0.085072]
 [-0.069329]
 [-0.107998]
 [-0.024856]
 [-0.014629]
 [-0.082265]
 [-0.072126]
 [-0.017931]
 [-0.080167]
 [-0.083253]
 [-0.072408]
 [-0.025468]
 [-0.0542  ]
 [-0.095283]
 [-0.041995]
 [-0.049403]
 [-0.066622]
 [-0.099002]
 [-0.085338]
 [-0.023726]
 [-0.084054]
 [-0.031288]
 [-0.087363]
 [-0.083736]
 [-0.087029]
 [-0.074701]
 [-0.088091]
 [-0.086961]
 [-0.067125]], s