In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from collections import deque
import random as rand
import math

In [2]:
from sklearn.preprocessing import MinMaxScaler

def return_latest():
    df = pd.read_excel('aug_nine_var.xlsx')
    X = df.iloc[:,1:22]
    y = df.iloc[:,22:23].to_numpy()

    scaler = sklearn.preprocessing.MinMaxScaler()
    X = scaler.fit_transform(X)
    
    return X[-1], y[-1]

In [3]:
def return_reward(state, y_pred):
    x, _ = return_latest()
    REAL_X = x.reshape(1, 21)[0]
    
    real = REAL_X * 10
    state = state * 10
    
    d = abs(real - state)
    
    ds = 0
    for i in range(21):
        ds = ds + d[0][i]
    
    reward = 10000 / (ds + y_pred)
    return reward, ds

In [4]:
def return_state(action, state):
    if action % 2 == 0:
        value = -0.01
    
    else:
        value = 0.01

    j = int(action / 2)
    
    state[0][j] = state[0][j] + value
    next_state = state

    return next_state, j

In [17]:
class DqnAgent:
    def __init__(self):
        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Dense(256, input_dim=21, activation='relu'))
        self.model.add(tf.keras.layers.Dense(256, activation='relu'))
        self.model.add(tf.keras.layers.Dense(42, activation='linear'))
        
        self.model.compile(loss='mse', optimizer='adam')

        self.steps_done = 0 #학습 반복 시 증가
        self.memory = deque(maxlen=1000) #deque는 선입선출. 오래된 값을 지움
    
    def memorize(self, state, action, reward, next_state):
        self.memory.append((
                            state,
                            action,
                            reward,
                            next_state
        ))
        #현재 상태, 현재 행동, 현재 행동으로 인한 보상, 다음 상태
    
    def act(self, state):
        eps_threshold = eps_end + ((eps_start - eps_end) * math.exp(-1. * self.steps_done / eps_decay))
        self.steps_done = self.steps_done + 1

        if rand.random() > eps_threshold: #최대 보상
            return self.model.predict(state, verbose=0).max()
        
        else: #무작위
            return rand.randrange(10)

    def learn(self):
        if len(self.memory) < 64:
            return
        
        batch = rand.sample(self.memory, 64)
        states, actions, rewards, next_states = zip(*batch)
        
        states = tf.convert_to_tensor(states)
        actions = tf.convert_to_tensor(actions)
        rewards = tf.convert_to_tensor(rewards)
        next_states = tf.convert_to_tensor(next_states)

        states = tf.reshape(states, [len(states), 21])

        # print("states: {0}, actions: {1}, rewards: {2}, next_states: {3}".format(type(states), type(actions), type(rewards), type(next_states)))
        print("states: {0}, actions: {1}, rewards: {2}, next_states: {3}".format(states.shape, actions.shape, rewards.shape, next_states.shape))

        current_q = self.model.predict(states, verbose=0).gather(1, actions) # 64, 1. 그때 그때 한 행동들의 가치
        
#         print("cq: {0}, mq: {1}, eq: {2}, re: {3}".format(current_q.shape, max_next_q.shape, expected_q.shape, rewards.shape))
        return 0

In [18]:
episode = 1000
episode_step = 1000
eps_start = 0.9 #학습 시작 시 무작위 행동할 확률
eps_end = 0.05 #학습 종료 시 무작위 행동할 확률
eps_decay = 100 #학습이 반복되며 무작위로 행동할 확률 감소 값

#eps = epsilion

gamma = 0.8 #감마는 할인계수, 에이전트가 현재를 미래보다 더 가치있게 여기는 것

lr = 0.001
batch_size = 64

model = tf.keras.models.load_model('dnn.h5')
agent = DqnAgent()

score_history = []
state_history = []
predict_history = []

best_reward = 0

In [19]:
for e in range(1, episode+1):
    state, _ = return_latest()
    state = state.reshape(1, 21)
    steps = 0

    while True:
        predict_y = model.predict(state, verbose=0)
        predict_y = predict_y.astype(int)
        predict_y = predict_y[0][0]

        action = agent.act(state)

        next_state, j = return_state(action, state)
        reward, ds = return_reward(state, predict_y)

        agent.memorize(state, action, reward, next_state)
        a = agent.learn()
        
        if steps % 10 == 0:
            print("step: {0}, ds: {1}, y_pred: {2}, j: {3}, reward: {4}".format(steps, ds, predict_y, j, reward))
        
        if (best_reward < reward):
            best_reward = reward
            best_state = state

        state = next_state
        steps = steps + 1

        if steps == episode_step:
            print("=============episode done=============")
            print("episode: {0}, y_pred: {1}, score: {2}".format(e, predict_y, reward))
            print("=======================================")
            score_history.append(reward)
            state_history.append(scaler.inverse_transform(state))
            predict_history.append(predict_y)
            break


step: 0, ds: 0.1, y_pred: 42, j: 2, reward: 237.52969121140143
step: 10, ds: 0.8999999999999984, y_pred: 42, j: 0, reward: 233.1002331002331
step: 20, ds: 1.3, y_pred: 42, j: 0, reward: 230.9468822170901
step: 30, ds: 1.5000000000000007, y_pred: 42, j: 3, reward: 229.88505747126436
step: 40, ds: 2.0999999999999988, y_pred: 42, j: 2, reward: 226.75736961451247
step: 50, ds: 2.700000000000002, y_pred: 42, j: 3, reward: 223.71364653243847
step: 60, ds: 3.300000000000002, y_pred: 42, j: 0, reward: 220.75055187637966
states: (64, 21), actions: (64,), rewards: (64,), next_states: (64, 1, 21)


AttributeError: 'numpy.ndarray' object has no attribute 'gather'