In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
from sklearn.preprocessing import MinMaxScaler
from collections import deque

In [2]:
import os
os.chdir('C:\\code\\activ')

In [3]:
df_name = 'nov_nine_var.xlsx'

In [4]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

np.set_printoptions(precision=6, suppress=True)

In [5]:
dnn_model = tf.keras.models.load_model('./model/dnn.h5')

In [6]:
GAMMA = 0.9
BATCH_SIZE = 128
ACTION_NUM = 2
EPISODE_DONE = 1000

In [7]:
df = pd.read_excel('./documents/'+df_name).iloc[:,1:23]

scaler = MinMaxScaler()
X = scaler.fit_transform(df.iloc[:,0:21])
DATUM_STATE = X[-1].reshape(1, 21)
DATUM_RATE = df.iloc[:,21:22].iloc[-1].to_numpy()

In [8]:
change_rate_represent_df = pd.read_excel('./documents/other/change_rate_representative.xlsx').iloc[:,1::]
FLOOR = change_rate_represent_df[0]
TOP = change_rate_represent_df[1]
MODE = change_rate_represent_df[2]

In [9]:
CORR = df.corr().iloc[-1].to_numpy()[0:21]

In [10]:
def return_state(s, a):
    ns = s + a.numpy()
    return ns

In [11]:
def return_reward(ns, yp):
    dist_loss = np.sqrt(np.sum(np.square(DATUM_STATE - ns)))/21
    pop_loss = DATUM_RATE - yp

    return -(dist_loss + pop_loss)

In [12]:
def action_loss(y_true, y_pred):
    input_trick = y_true.reshape(32, 22)

    reward_loss = input_trick[:,21:22]
    next_state = input_trick[:,0:21]
    population = dnn_model(next_state)

    actions = tf.concat([y_pred[:,0:21], y_pred[:,22:43]], axis=0)

    step_loss = 0.0
for act in actions:
    for i in range(act.shape[0]):
        if act[i] < FLOOR[i] or act[i] > TOP[i]:
            step_loss += 0.1

        step_loss += (act[i] - MODE[i]) ** 2

    # ns * x = pp
    # ns/pp = 1/x
    # x = pp/ns
    # corr_loss = (corr-x)^2
    
    corr_loss = 0
    print(next_state.shape)
    # for j in range(32):
    for i in range(21):
        tmp = tf.math.reduce_sum(tf.math.divide(population, next_state[:,i]))
        corr_loss += (CORR[i]-tmp) ** 2
    
    return tf.cast(tf.math.add(step_loss+reward_loss, corr_loss), tf.float32)

In [13]:
def convert_reward_net_input(data_num, s, a, d=False):
    input_state = tf.convert_to_tensor([s for i in range(ACTION_NUM)]).reshape(data_num, ACTION_NUM, 21)

    if d:
        input_action = tf.convert_to_tensor([a for i in range(ACTION_NUM)]).reshape(data_num, ACTION_NUM, 21)
    else:
        input_action = a.reshape(data_num, ACTION_NUM, 21)

    x = tf.convert_to_tensor([input_state, input_action]).reshape(data_num, ACTION_NUM, 2, 21)
    x = tf.keras.layers.Flatten()(x)
    return tf.cast(x, tf.float32)

In [14]:
class Action_Network(tf.keras.models.Model):
    def __init__(self):
        super(Action_Network, self).__init__()
        self.input_layer = tf.keras.layers.Dense(64, activation='relu')
        
        self.hidden_layer = tf.keras.models.Sequential()
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))

        self.output_layer = tf.keras.models.Sequential()
        self.output_layer.add(tf.keras.layers.Dense(ACTION_NUM*22, activation='linear'))

    def call(self, x):
        # print("a: x: ", x.shape)
        i = self.input_layer(x)
        # print("a: i: ", i.shape)
        h = self.hidden_layer(i)
        # print("a: h: ", h.shape)
        o = self.output_layer(h)
        # print("a: o: ", o.shape)

        return tf.cast(o, tf.float32)

In [15]:
class Reward_Network(tf.keras.models.Model):
    def __init__(self):
        super(Reward_Network, self).__init__()
        self.input_layer = tf.keras.layers.Dense(64, activation='relu')

        self.hidden_layer = tf.keras.models.Sequential()
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))
        self.hidden_layer.add(tf.keras.layers.Dense(64, activation='relu'))

        self.output_layer = tf.keras.models.Sequential()
        self.output_layer.add(tf.keras.layers.Dense(ACTION_NUM, activation='linear'))

    def call(self, x):
        # print("r: x: ", x.shape)
        i = self.input_layer(x)
        # print("r: i: ", i.shape)
        h = self.hidden_layer(i)
        # print("r: h: ", h.shape)
        o = self.output_layer(h)
        # print("r: o: ", o.shape)

        return tf.cast(o, tf.float32)

In [16]:
class DQN_Network(tf.keras.models.Model):
    def __init__(self):
        super(DQN_Network, self).__init__()
        self.action_network = Action_Network()
        self.action_network.build(input_shape=(None, 21))
        self.action_network.compile(optimizer='adam', loss=action_loss)

        self.reward_network = Reward_Network()
        self.reward_network.build(input_shape=(None, ACTION_NUM*2*21))
        self.reward_network.compile(optimizer='adam', loss='mse')

    def call(self, x):
        action = self.action_network(x)
        input_data = convert_reward_net_input(x.shape[0], x, action[:,0:21], d=True)
        reward = self.reward_network(input_data)

        return reward

In [17]:
class DQN_Agent:
    def __init__(self):
        self.train_model = DQN_Network()
        # self.train_model.build(input_shape=(None, 21))
        self.train_model.compile(optimizer='adam', loss='mse')

        self.target_model = DQN_Network()
        # self.target_model.build(input_shape=(None, 21))
        self.target_model.compile(optimizer='adam', loss='mse')

        self.memory = deque(maxlen=200000)
        self.episode = 1
        self.max_reward_actions = {}

    def memorize(self, cs, a, a_i, r, ns, d):
        """
            cs: 현재 상태
            a: 현재 행동
            a_i: 현재 행동 번호(몇번 행동)
            r: 현재 행동에 대한 보상
            ns: 현재 상황에서 한 행동으로 인한 결과, 다음 상태
            s: 에피소드 진행 상태
        """

        done = d
        if done:
            self.episode += 1

        self.memory.append(
            (
                tf.convert_to_tensor(tf.cast(cs, tf.float32)),
                a,
                a_i,
                tf.convert_to_tensor(r),
                tf.convert_to_tensor(tf.cast(ns, tf.float32)),
                done
            )
        )

        try: # 현재 상황에서 최대의 보상을 주는 행동을 저장
            if self.max_reward_actions[tuple(map(tuple, cs))][1] < r:
                self.max_reward_actions[tuple(map(tuple, cs))] = [a, r]
        except KeyError:
            self.max_reward_actions[tuple(map(tuple, cs))] = [a, r]

    def convert_memory(self, length):
        batch = rand.sample(self.memory, length)
        s, a, a_i, r, ns, d = zip(*batch)

        states = tf.convert_to_tensor(s).reshape(length, 21)
        actions = tf.convert_to_tensor(a).reshape(length, 21)
        action_indexs = tf.convert_to_tensor(a_i)
        rewards = tf.convert_to_tensor(r)
        next_states = tf.convert_to_tensor(ns).reshape(length, 21)
        dones = tf.convert_to_tensor(d)

        # print(states.dtype, actions.dtype, action_indexs.dtype, rewards.dtype, next_states.dtype, dones.dtype)

        return states, actions, action_indexs, rewards, next_states, dones

    def act(self, state):
        if self.episode >= 0 and self.episode < 200:
            eps_threshold = -(self.episode/1000)+1+(self.episode)*(self.episode-200)/300000
        else:
            eps_threshold = -(self.episode/1000)+1+(self.episode-200)*(self.episode-1000)

        state = tf.cast(tf.convert_to_tensor(state), tf.float32)
        action = self.train_model.action_network(state).reshape(ACTION_NUM, 22).T[0:21].T

        x = convert_reward_net_input(1, state, action)
        reward = self.train_model.reward_network(x)

        if rand.random() > eps_threshold:
            act_index = int(tf.argmax(reward[0], 0))
        else:
            act_index = rand.randint(0, ACTION_NUM-1)

        return action[act_index], act_index, eps_threshold

    def learn(self):
        if len(self.memory) < BATCH_SIZE:
            return

        # reward_net_loss = self.reward_learn()

        # if reward_net_loss > 10:
        #     return

        action_net_loss = self.action_learn()

        # if action_net_loss > 10:
        #     return

        self.dqn_learn()

    def action_learn(self):
        states, actions, action_indexs, rewards, next_states, dones = self.convert_memory(len(self.max_reward_actions))

        max_reward = []
        for val in self.max_reward_actions.values():
            max_reward.append(val[1])
        max_reward = tf.convert_to_tensor(max_reward)

        expected_action = self.train_model.action_network(states).reshape(len(self.max_reward_actions), ACTION_NUM, 22).T[0:21].T

        x = convert_reward_net_input(len(self.max_reward_actions), states, expected_action)
        expected_reward = self.train_model.reward_network(x)

        a = tf.cast(max_reward.reshape(len(self.max_reward_actions), 1), tf.float32)
        b = expected_reward.reshape(len(self.max_reward_actions), 2).T[0].T.reshape(len(self.max_reward_actions), 1)

        reward_loss = tf.reduce_sum(tf.math.subtract(a, b))
        reward_loss = tf.convert_to_tensor([reward_loss for i in range(len(self.max_reward_actions))]).reshape(len(self.max_reward_actions), 1)
        
        trick_real = np.concatenate((next_states.reshape(len(self.max_reward_actions), 21), reward_loss), axis=1)
        # input_trick = trick_real.flatten()
        self.train_model.action_network.fit(states, trick_real, epochs=5, verbose=0)
        return self.train_model.action_network.evaluate(states, trick_real, verbose=0)

    def reward_learn(self):
        states, actions, action_indexs, rewards, next_states, dones = self.convert_memory(BATCH_SIZE)

        actions = tf.convert_to_tensor([actions for i in range(ACTION_NUM)]).reshape(BATCH_SIZE, ACTION_NUM, 21)
        
        x = convert_reward_net_input(BATCH_SIZE, states, actions)
        predicted_reward = self.train_model.reward_network(x)
        
        self.train_model.reward_network.fit(x, rewards, epochs=5, verbose=0)
        return self.train_model.reward_network.evaluate(x, rewards, verbose=0)

    def dqn_learn(self):
        states, actions, action_indexs, rewards, next_states, dones = self.convert_memory(BATCH_SIZE)

        current_q = self.train_model(tf.cast(states, tf.float32)).numpy()
        next_q = self.target_model(next_states)

        for i in range(BATCH_SIZE):
            if dones[i]:
                next_q_value = rewards[i]
            else:
                next_q_value = rewards[i] + GAMMA * np.max(next_q[i])

            current_q[i][action_indexs[i]] = next_q_value

        states = states.reshape(BATCH_SIZE, 21)

        self.train_model.fit(states, current_q, epochs=5, verbose=0)
        return self.train_model.evaluate(states, current_q, verbose=0)

In [18]:
agent = DQN_Agent()
sc_hist = []
st_hist = []
pop_hist = []

for e in range(1000):
    state = DATUM_STATE
    steps = 0

    if e % 5 == 0:
        agent.target_model.set_weights(agent.target_model.get_weights())
    
    while True:
        pred_y = dnn_model.predict(state, verbose=0)

        action, idx, eps = agent.act(state)

        next_state = return_state(state, action)
        reward = return_reward(state, pred_y)

        if steps == EPISODE_DONE or abs(reward[0][0]) > 1000000:
            done = True
        else:
            done = False

        agent.memorize(state, action, idx, reward, next_state, done)
        agent.learn()

        state = next_state
        steps += 1

        if steps % 10 == 0:
            print(f"steps: {steps}, reward: {reward}, pop: {pred_y}")

        if done:
            print(f"=================={e} done==================")
            print(reward, eps)
            print("==============================================")
            break

steps: 10, reward: [[-0.270316]], pop: [[0.440124]]
steps: 20, reward: [[-0.564475]], pop: [[0.394028]]
steps: 30, reward: [[-1.363472]], pop: [[0.188757]]
steps: 40, reward: [[-3.139391]], pop: [[0.010101]]
steps: 50, reward: [[-8.07559]], pop: [[0.]]
steps: 60, reward: [[-20.372745]], pop: [[0.876909]]
steps: 70, reward: [[-39.157586]], pop: [[0.]]
steps: 80, reward: [[-85.800047]], pop: [[0.]]
steps: 90, reward: [[-210.10703]], pop: [[0.]]
steps: 100, reward: [[-545.325334]], pop: [[0.]]
steps: 110, reward: [[-1172.93246]], pop: [[0.]]
steps: 120, reward: [[-3405.710475]], pop: [[0.]]
(32, 21)
(32, 21)
(32, 21)
(32, 21)
(32, 21)
steps: 130, reward: [[-6862.450376]], pop: [[0.]]
steps: 140, reward: [[-6944.195501]], pop: [[0.]]
steps: 150, reward: [[-6944.202616]], pop: [[0.]]
steps: 160, reward: [[-6944.180812]], pop: [[0.]]
steps: 170, reward: [[-6943.951026]], pop: [[0.]]
steps: 180, reward: [[-6943.885778]], pop: [[0.]]
steps: 190, reward: [[-6943.871137]], pop: [[0.]]
steps: 200