In [1]:
import torch
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
import math
from sklearn.preprocessing import MinMaxScaler
from collections import deque

In [2]:
def turtle():
    print("""		      거북거북거북거북거북거북거북
		    거북거북거북거북거북거북거북거북
		  거북거북거북거북거북거북거북거북거북
		거북거북거북거북거북거북거북거북거북거북
  북거북거북      거북거북거북거북거북거북거북거북거북거북거북
거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북이 꼬북이
  북거북거북      거북거북거북거북거북거북거북거북거북거북거북
	            거북거북	 거	 	   거	거북거북
 		거북거	북	   	    북	북거북
		 거북거	거	   	   거	  북거
		 거북				거북
""")


In [3]:
turtle()

		      거북거북거북거북거북거북거북
		    거북거북거북거북거북거북거북거북
		  거북거북거북거북거북거북거북거북거북
		거북거북거북거북거북거북거북거북거북거북
  북거북거북      거북거북거북거북거북거북거북거북거북거북거북
거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북거북이 꼬북이
  북거북거북      거북거북거북거북거북거북거북거북거북거북거북
	            거북거북	 거	 	   거	거북거북
 		거북거	북	   	    북	북거북
		 거북거	거	   	   거	  북거
		 거북				거북



In [4]:
import os
os.chdir('c:\\code\\activ')

In [5]:
df_name = 'nov_nine_var.xlsx'

In [6]:
dnn_model = tf.keras.models.load_model('./model/dnn.h5')

In [7]:
# dqn 에이전트용 하이퍼 파라미터 정의
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 100

GAMMA = 0.9
BATCH_SIZE = 2
ACTION_NUM = 2
EPISODE_DONE = 1000

In [8]:
df = pd.read_excel('./documents/'+df_name).iloc[:,1:23]

scaler = MinMaxScaler()
X = scaler.fit_transform(df.iloc[:,0:21])
DATUM_STATE = X[-1].reshape(1, 21) # 거리를 계산하기 위해 기준점을 설정
DATUM_RATE = df.iloc[:,21:22].iloc[-1].to_numpy() # 제일 최근 데이터의 수도권 과밀화율

In [9]:
pop_scaler = MinMaxScaler()
y = pop_scaler.fit_transform(df.iloc[:,21:22].to_numpy())

In [10]:
change_rate_represent_df = pd.read_excel('./documents/other/change_rate_representative.xlsx').iloc[:,1::]
FLOOR = change_rate_represent_df[0]
TOP = change_rate_represent_df[1]
MODE = change_rate_represent_df[2]

In [11]:
CORR = df.corr().iloc[-1].to_numpy()[0:21]

In [12]:
def return_state(s, a):
    # 현 상황에서 한 행동에 대한 상황
    # 즉, 다음 상황을 반환
    return a.detach().numpy()+s

In [13]:
def return_reward(next_state, y_pred):
    # dqn 에이전트가 학습할 보상
    dist = np.sqrt(np.sum(np.square(DATUM_STATE-next_state)))
    pop = DATUM_RATE - y_pred

    return -10*(dist+pop)

In [14]:
def action_loss(act):
    # 보폭과 변수 상관관계 계산
    # 즉, 행동에 대한 손실
    loss = 0
    actions = act.detach().numpy()

    # BATCH_SIZE, ACTION_NUM, 21
    for batch_action in actions:
        for act in batch_action:
            for i, var_act in enumerate(act):
                if var_act < FLOOR[i] or var_act > TOP[i]:
                    loss += 0.1

                loss += abs(var_act - MODE[i])
    return loss


In [15]:
def correlation_loss(s, a):
    # ns * x = pp
    # ns/pp = 1/x
    # x = pp/ns
    # corr_loss = (corr-x)^2
    ns = return_state(s.numpy(), a)
    pp = dnn_model(ns)

    pp = pop_scaler.transform(pp)
    
    corr_loss = 0
    for i, var_ns in enumerate(ns):
        corr_loss += (CORR[i]-(pp/var_ns)) ** 2
    return corr_loss/21

In [16]:
class Action_network(torch.nn.Module):
    # 현재 상황을 입력 받아서 행동을 출력하는 신경망
    # 보폭, 변수 상관관계, 보상 고려
    def __init__(self):
        super(Action_network, self).__init__()
        self.input_layer = torch.nn.Sequential(
            torch.nn.Linear(in_features=21, out_features=64),
            torch.nn.ReLU()
        )
        self.hidden_layer = torch.nn.Sequential(
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU()
        )
        self.output_layer = torch.nn.Sequential(
            torch.nn.Linear(64, 21*ACTION_NUM)
        )
        
    def forward(self, s):
        i = self.input_layer(s)
        h = self.hidden_layer(i)
        o = self.output_layer(h)
        o = o.reshape(len(s), ACTION_NUM, 21)

        # print(f"action network: state: {s.shape}, out: {o.shape}")

        return o

In [17]:
class Reward_network(torch.nn.Module):
    # 현재 상황과 행동을 입력 받아서 보상을 출력하는 신경망
    def __init__(self):
        super(Reward_network, self).__init__()
        self.flat = torch.nn.Flatten()
        self.input_layer = torch.nn.Sequential(
            torch.nn.Linear(in_features=ACTION_NUM*2*21, out_features=64),
            torch.nn.ReLU()
        )
        self.hidden_layer = torch.nn.Sequential(
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU()
        )
        self.output_layer = torch.nn.Sequential(
            torch.nn.Linear(64, ACTION_NUM)
        )

    def forward(self, data_num, s, a):
        s = torch.stack([s for i in range(ACTION_NUM)]).reshape(data_num, ACTION_NUM, 21)
        a = a.reshape(data_num, ACTION_NUM, 21)
        x = torch.stack([s, a]).reshape(data_num, ACTION_NUM, 2, 21)

        f = self.flat(x)
        i = self.input_layer(f)
        h = self.hidden_layer(i)
        o = self.output_layer(h)

        # print(f"reward network: state: {s.shape}, action: {a.shape}, out: {o.shape}")

        return o

In [18]:
class Dqn_network(torch.nn.Module):
    # 상황을 입력 받아 보상을 출력하는 신경망
    def __init__(self):
        super(Dqn_network, self).__init__()
        self.action_network = Action_network()
        self.reward_network = Reward_network()
    
    def forward(self, s):
        action = self.action_network(s)
        reward = self.reward_network(BATCH_SIZE, s, action)

        return reward

In [19]:
class Dqn_agent:
    def __init__(self):
        self.train_model = Dqn_network()
        self.target_model = Dqn_network()

        self.dqn_optimizer = torch.optim.Adam(self.train_model.parameters(), 0.00025)
        self.action_optimizer = torch.optim.Adam(self.train_model.action_network.parameters(), 0.00025)
        self.reward_optimizer = torch.optim.Adam(self.train_model.reward_network.parameters(), 0.00025)

        self.memory = deque(maxlen=200000)
        self.max_reward_actions = {}
        
        self.episode = 1

    def memorize(self, state, action, act_idx, reward, next_state, step):
        if step == EPISODE_DONE:
            done = True
            self.episode += 1
        else:
            done = False

        self.memory.append(
            (
                torch.from_numpy(state).to(torch.float32),
                action,
                act_idx,
                torch.from_numpy(reward).to(torch.float32),
                torch.from_numpy(next_state).to(torch.float32),
                done
            )
        )

        try:
            if self.max_reward_actions[tuple(map(tuple, state))][1] < reward:
                self.max_reward_actions[tuple(map(tuple, state))] = [action, reward]

        except KeyError:
            self.max_reward_actions[tuple(map(tuple, state))] = [action, reward]

    def convert_memory(self, length):
        batch = rand.sample(self.memory, BATCH_SIZE)
        states, actions, act_idxs, rewards, next_states, dones = zip(*batch)

        states = torch.stack(list(states), dim=0).reshape(BATCH_SIZE, 21)
        actions = torch.stack(list(actions), dim=0)
        act_idxs = torch.tensor(list(act_idxs))
        rewards = torch.stack(list(rewards), dim=0)
        next_states = torch.stack(list(next_states), dim=0).reshape(BATCH_SIZE, 21)
        dones = list(dones)

        return states, actions, act_idxs, rewards, next_states, dones

    def act(self, state):
        if self.episode >= 0 and self.episode < 200:
            eps_threshold = -(self.episode/1000)+1+(self.episode)*(self.episode-200)/300000
        else:
            eps_threshold = -(self.episode/1000)+1+(self.episode-200)*(self.episode-1000)
            
        state = torch.from_numpy(state).to(torch.float32)
        action = self.train_model.action_network(state)
        reward = self.train_model.reward_network(1, state, action)

        if rand.random() > eps_threshold:
            act_index = reward.detach().argmax()
        else:
            act_index = rand.randint(0, ACTION_NUM-1)

        return action[0][act_index], act_index, eps_threshold

    def learn(self):
        if len(self.memory) < BATCH_SIZE:
            return

        reward_net_loss = self.reward_learn()

        # if reward_net_loss > 10:
        #     return
        
        action_net_loss = self.action_learn()

        # if action_net_loss > 10:
        #     return
        
        self.dqn_learn()

    def action_learn(self):
        states, actions, act_idxs, rewards, next_states, dones = self.convert_memory(BATCH_SIZE)

        expected_action = self.train_model.action_network.forward(states)

        max_reward = []
        for val in self.max_reward_actions.values():
            max_reward.append(val[1])
        max_reward = torch.tensor(np.array(max_reward))

        expected_action = expected_action.reshape(BATCH_SIZE, ACTION_NUM, 21)
        expected_reward = self.train_model.reward_network(BATCH_SIZE, states, expected_action)

        step_loss = []
        for expect in expected_action:
            step_loss.append(action_loss(expected_action))
        step_loss = torch.tensor(np.array(step_loss))

        cor_loss = correlation_loss(states, actions)

        loss = max_reward.mean() - expected_reward.mean() + step_loss.mean() + cor_loss.mean()

        loss = loss.mean()

        self.action_optimizer.zero_grad()
        loss.backward()
        self.action_optimizer.step()

        return loss

    def reward_learn(self):
        states, actions, act_idxs, rewards, next_states, dones = self.convert_memory(BATCH_SIZE)
        actions = torch.stack([actions for i in range(BATCH_SIZE)]).reshape(BATCH_SIZE, ACTION_NUM, 21)
        predicted_reward = self.train_model.reward_network.forward(BATCH_SIZE, states, actions)
        
        expected_reward = []
        for i in range(BATCH_SIZE):
            sum_predicted_reward = 0
            for j in range(ACTION_NUM):
                sum_predicted_reward += predicted_reward[i][j]
            expected_reward.append(sum_predicted_reward/ACTION_NUM)
        expected_reward = torch.stack(expected_reward).reshape(BATCH_SIZE, 1)

        loss = torch.nn.functional.mse_loss(expected_reward, rewards)
        self.reward_optimizer.zero_grad()
        loss.backward(retain_graph=True)
        self.reward_optimizer.step()

        return loss

    def dqn_learn(self):
        states, actions, act_idxs, rewards, next_states, dones = self.convert_memory(BATCH_SIZE)

        current_q = self.train_model.forward(states)[0].gather(0, act_idxs).reshape(BATCH_SIZE)
        max_next_q = self.target_model.forward(next_states).detach().max(1)[0]

        expected_q = torch.empty_like(rewards)

        for i in range(BATCH_SIZE):
            if dones[i]:
                expected_q[i] = max_next_q
            else:
                expected_q[i] = rewards[i] + (GAMMA * max_next_q[i])

        expected_q = expected_q.reshape(BATCH_SIZE)

        loss = torch.nn.functional.mse_loss(current_q, expected_q)
        self.dqn_optimizer.zero_grad()
        loss.backward(retain_graph=True)
        self.dqn_optimizer.step()

        return loss

In [20]:
agent = Dqn_agent()
sc_hist = []
st_hist = []
pop_hist = []

for e in range(1000):
    state = DATUM_STATE
    steps = 0

    if e % 10 == 0:
        agent.target_model.parameters = agent.train_model.parameters

    while True:
        pred_y = dnn_model.predict(state, verbose=0)[0]

        action, action_index, eps = agent.act(state)
        next_state = return_state(state, action)

        reward = return_reward(next_state, pred_y)

        agent.memorize(state, action, action_index, reward, next_state, steps)
        agent.learn()

        state = next_state
        steps += 1

        if steps % 100 == 0:
            print(f"steps: {steps}, reward: {np.round(reward, 6)}, pop: {np.round(pred_y, 6)}")

        if steps == EPISODE_DONE:
            print("=============episode done=============")
            print(f"=========={e}, {eps:.6f}==========")
            break

steps: 100, reward: [-344.282044], pop: [0.551683]
steps: 200, reward: [-1127.45043], pop: [0.775053]
steps: 300, reward: [-2784.923052], pop: [0.976532]
steps: 400, reward: [-6013.795366], pop: [0.996343]
steps: 500, reward: [-12597.170541], pop: [1.]
steps: 600, reward: [-26984.28549], pop: [1.]
steps: 700, reward: [-61963.143435], pop: [1.]
steps: 800, reward: [-131112.408201], pop: [1.]
steps: 900, reward: [-254593.380977], pop: [1.]
steps: 1000, reward: [-474200.510408], pop: [1.]
steps: 100, reward: [-346.424322], pop: [0.515371]
steps: 200, reward: [-1134.377694], pop: [0.763639]
steps: 300, reward: [-2802.734944], pop: [0.975899]


KeyboardInterrupt: 