In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
from sklearn.preprocessing import MinMaxScaler
from collections import deque

In [2]:
import os
os.chdir('C:\\code\\activ')

In [3]:
df_name = 'nov_nine_var.xlsx'

In [4]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

np.set_printoptions(precision=6, suppress=True)

In [7]:
dnn_model = tf.keras.models.load_model('./model/dnn.h5')
# gan_model = tf.keras.models.load_model()

In [8]:
# dqn paramater
GAMMA = 0.9
BATCH_SIZE = 128
ACTION_NUM = 3
EPISODE_DONE = 100

In [10]:
# set action lstm network
for i in range(ACTION_NUM):
    break
    globals()[f'action_net{i}'] = tf.keras.models.load_model('./model/action_lstm{0}.h5'.format(i))

In [35]:
df = pd.read_excel('./documents/' + df_name).iloc[:,1::]

scaler = MinMaxScaler()
X = scaler.fit_transform(df.iloc[:,0:21])

starting_state = X[-1]

In [37]:
def set_goal(goal_df_name):
    """ set goal destination
    Args:
        goal_df_name(str): df_name in documents/result/
    Returns:
        goal_state(ndArray, (1, 21)): the state of lowest rate in df
    """
    goal_df = pd.read_excel('./documents/result/' + goal_df_name).iloc[:,1::].to_numpy()
    index = goal_df[:,23].argmin()

    goal_state = goal_df[:,0:21][index]

    return goal_state

In [33]:
def return_action(s):
    """
    Args:
        s(ndArray, (1, 21)): the state
    Returns:
        a(ndArray, (ACTION_NUM, 21)): the action predicted by lstm
    """
    action_list = []
    for i in range(ACTION_NUM):
        action_list.append(globals()[f'action_net{i}'].predict(s, verbose=0))

    a = np.array(action_list)
    return a

In [29]:
def return_state(s, a):
    """ return s+a, which mean next state
    Args:
        s(ndArray, (1, 21)): the current state
        a(ndArray, (1, 21)): the action on the current state
    Returns:
        ns(ndArray, (1, 21)): the next state
    """
    return s + a

In [39]:
def return_reward(ns, gs, ap):
    """ evaluate current action
    Args:
        ns(ndArray, (1, 21)): the consequence of action in the current state
        gs(ndArray, (1, 21)): the destination
        ap(int): reality of action possible predicted by GAN
    Returns:
        reward(int): distance to destination + reality of action possible
        dist(float): distance to destination
    """
    dist = np.sqrt(np.sum(np.square(gs - ns)))
    
    return -(ap + dist), dist


In [31]:
class DQN_Network(tf.keras.models.Model):
    def __init__(self):
        super(DQN_Network, self).__init__()
        self.input_layer = tf.keras.layers.Dense(128, activation='relu')

        self.hidden_layer = tf.keras.models.Sequential()
        self.hidden_layer.add(tf.keras.layers.Dense(128, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(128, activation='relu'))

        self.ouput_layer = tf.keras.layers.Dense(ACTION_NUM, activation='linear')

    def call(self, x):
        a = return_action(x)
        i = self.input_layer(a)
        h = self.hidden_layer(h)
        o = self.ouput_layer(o)

        return o

In [41]:
class DQN_Agent:
    def __init__(self):
        self.train_model = self.set_model()
        self.target_model = self.set_model()

        self.memory = deque(maxlen=20000)
        self.episode = 1

    def set_model(self):
        net = DQN_Network()
        net.build(input_shape=(None, 21))

        optim = tf.keras.optimizers.Adam(learning_rate=1e-10)

        model = net.compile(optimizer=optim, loss='mse')
        return model

    def memorize(self, cs, a, a_i, r, ns, d):
        """ append to self.memory
        Args:
            cs(ndArray, (1, 21)): the current state
            a(ndArray, (1, 21)): the action on current state
            a_i(int): the index of the action chosen by the agent
            r(int): reward for action in the current state
            ns(ndArray, (1, 21)): the next state
            d(boolean): whether to proceed with the episode
        Returns:
            None
        """
        if d:
            self.episode += 1
        
        self.memory.append(
            (
                tf.convert_to_tensor(tf.cast(cs, tf.float32)),
                tf.convert_to_tensor(tf.cast(a, tf.float32)),
                a_i,
                tf.convert_to_tensor(tf.cast(r, tf.float32)),
                tf.convert_to_tensor(tf.cast(ns, tf.float32)),
                d
            )
        )

    def convert_memory_to_input(self):
        batch = rand.sample(self.memory, BATCH_SIZE)
        s, a, a_i, r, ns, d = zip(*batch)

        states = tf.convert_to_tensor(s).reshape(BATCH_SIZE, 21)
        actions = tf.convert_to_tensor(a).reshape(BATCH_SIZE, 21)
        action_indexs = tf.convert_to_tensor(a_i)
        rewards = tf.convert_to_tensor(r)
        next_states = tf.convert_to_tensor(ns).resahpe(BATCH_SIZE, 21)
        dones = tf.convert_to_tensor(d)

        return states, actions, action_indexs, rewards, next_states, dones

    def act(self, state):
        if self.episode >= 0 and self.episode < 200:
            eps_threshold = -(self.episode/1000)+1+(self.episode)*(self.episode-200)/300000
        else:
            eps_threshold = -(self.episode/1000)+1+(self.episode-200)*(self.episode-1000)

        a = return_action(state)
        r = self.train_model(state)

        if rand.random() > eps_threshold:
            a_i = np.argmax(r)
        else:
            a_i = rand.randint(0, ACTION_NUM-1)

        return a[a_i], a_i, eps_threshold

    def learn(self):
        if len(self.memory) < BATCH_SIZE:
            return

        states, actions, action_indexs, rewards, next_states, dones = self.convert_memory_to_input()

        current_q = self.model(states)
        target_q = self.target(next_states)

        for i in range(BATCH_SIZE):
            if dones[i]:
                next_q_value = rewards[i]
            else:
                next_q_value = rewards[i] + GAMMA * np.max(target_q[i])

            current_q[i][action_indexs[i]] = next_q_value

        self.train_model.train_on_batch(states, current_q)
        return self.train_model.evaluate(states, current_q, verbose=0)

In [42]:
agent = DQN_Agent()
sc_hist = []
st_hist = []

goal_state = set_goal('basic.xlsx')
for e in range(1000):
    state = starting_state
    steps = 0

    if e % 50 == 0:
        agent.target_model.set_weights(agent.train_model.get_weights())

    while True:
        action, idx, eps = agent.act(state)
        ap = 0
        # ap = gan_model.predict(action, verbose=0)

        next_state = return_state(state, action)
        reward, dist = return_reward(next_state, ap)

        if steps == EPISODE_DONE or state == goal_state:
            done = True
        else:
            done = False

        agent.memorize(state, action, idx, reward, next_state, done)
        agent.learn()

        state = next_state
        steps += 1

        if steps % 10 == 0:
            print(f'steps: {steps}, reward: {reward}, dist: {dist}')

        if done:
            print(f'============={e}=============')
            print(reward, eps)
            print("=============================")
            break

KeyError: 'action_net0'