In [1]:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import deque
import random as rand
import math
import matplotlib.pyplot as plt

In [2]:
# Print NumPy floats with 6 digits of precision and suppress scientific notation
# for small values, so the per-step reward printouts stay readable.
np.set_printoptions(precision=6, suppress=True)

In [3]:
# Load the saved Keras model from 'dnn.h5'. The training loop calls its
# predict() on the current (1, 21) state and feeds the result into the reward.
dnn_model = tf.keras.models.load_model('dnn.h5')

In [4]:
# Epsilon-greedy schedule used by Dqn_agent.act:
#   threshold = EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY)
EPS_START = 0.9
EPS_END = 0.01
EPS_DECAY = 200

# Multiplier applied to the minimum next-state Q-value when building the target in learn().
GAMMA = 0.8
# Number of transitions sampled from replay memory per learn() call.
batch_size = 1
# Step count at which the training loop ends an episode (compared against `steps`).
# NOTE(review): Dqn_agent.learn also compares this against the episode index — confirm intent.
episode_done = 2000

In [5]:
# Read columns 1..21 (by position) of the workbook: the 21 features the agent works on.
df = pd.read_excel('aug_nine_var.xlsx').iloc[:, 1:22]

# Learn the per-column min/max first, then map every column to the fitted range.
scaler = MinMaxScaler().fit(df)
X = scaler.transform(df)

In [6]:
def return_latest():
    """Return the last row of the workbook, scaled with the already-fitted `scaler`.

    The workbook is re-read from disk on every call, and only columns 1..21
    (by position) are kept, so the result is a 1-D array of 21 scaled values.
    """
    raw = pd.read_excel('aug_nine_var.xlsx').iloc[:, 1:22]
    return scaler.transform(raw)[-1]

In [7]:
def return_state(action, state):
    """Apply one discrete action to a state and return the resulting state.

    Each feature index j owns two actions: action 2*j lowers feature j by 0.01
    and action 2*j + 1 raises it by 0.01.

    Args:
        action: Action index. A plain int, a NumPy scalar or a one-element
            array/tensor are all accepted; the first element is used.
        state: Array indexed as ``state[0][j]`` (the notebook uses shape (1, 21)).

    Returns:
        A new array holding the updated state. ``state`` itself is left
        untouched, so a caller can keep it as the "before" half of a transition
        without it being overwritten by the "after" half.
    """
    # Normalise to a Python int so parity and index arithmetic do not depend on
    # tensor/array operator semantics.
    action = int(np.asarray(action).reshape(-1)[0])

    value = -0.01 if action % 2 == 0 else 0.01
    j = action // 2

    # Work on a copy: mutating the input made `state` and `next_state` the same object.
    next_state = np.array(state, copy=True)
    next_state[0][j] = next_state[0][j] + value

    return next_state

In [8]:
def return_reward(state, y_pred):
    """Score a state: 10 x its Euclidean distance from the latest real row, plus `y_pred`.

    The reference row comes from `return_latest()` reshaped to (1, 21), so
    `state` must broadcast against that shape. The result has whatever shape
    `y_pred` broadcasts to.
    """
    reference = return_latest().reshape(1, 21)
    offset = reference - state
    distance = np.sqrt(np.sum(np.square(offset)))

    return distance * 10 + y_pred

In [9]:
class Dqn_agent:
    """DQN-style agent with an online network, a target network and replay memory.

    Q-values are treated as costs: `act` picks the action with the smallest
    predicted value and `learn` bootstraps from the minimum next-state value.
    """

    # Size of the discrete action space (output width of the Q-network).
    N_ACTIONS = 42
    # The target network is refreshed once every this many episodes.
    TARGET_SYNC_EVERY = 10

    def __init__(self):
        self.target = self._create_model()
        self.model = self._create_model()

        # Start with both networks identical.
        self.target.set_weights(self.model.get_weights())

        self.memory = deque(maxlen=10000)
        # Episode at which the target was last synced; prevents re-syncing on
        # every learn() call made within the same episode.
        self._last_sync_episode = None

    def _create_model(self):
        """Build the Q-network: 21 inputs -> 3 x Dense(10, relu) -> N_ACTIONS linear outputs."""
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(10, input_shape=(21, ), activation='relu'))
        model.add(tf.keras.layers.Dense(10, activation='relu'))
        model.add(tf.keras.layers.Dense(10, activation='relu'))
        model.add(tf.keras.layers.Dense(self.N_ACTIONS, activation='linear'))

        model.compile(optimizer='rmsprop', loss='mse')

        return model

    def forward(self, x):
        """Return the online network's Q-values for `x`."""
        return self.model(x)

    def memorize(self, state, action, reward, next_state, done=False):
        """Store one transition in replay memory.

        Args:
            state, next_state: Arrays reshaped to (1, n_features). They are
                copied, so later in-place edits by the caller cannot rewrite
                what is already stored.
            action: Action index (int, NumPy scalar or one-element array/tensor).
            reward: Scalar or one-element array; stored as a Python float
                (float32 precision) instead of float16, which cannot resolve
                small changes in rewards of the magnitude this notebook prints.
            done: True when `next_state` ends the episode; `learn` then skips
                the bootstrap term. Defaults to False.
        """
        self.memory.append((
            np.array(state, dtype=np.float32).reshape(1, -1),
            int(np.asarray(action).reshape(-1)[0]),
            float(np.asarray(reward, dtype=np.float32).reshape(-1)[0]),
            np.array(next_state, dtype=np.float32).reshape(1, -1),
            bool(done),
        ))

    def act(self, state, episode):
        """Pick an action epsilon-greedily and return it as a plain int.

        The exploration threshold decays exponentially with `episode` from
        EPS_START towards EPS_END. Above the threshold the action with the
        smallest predicted Q-value is chosen; otherwise a uniformly random one.
        """
        eps_threshold = EPS_END + ((EPS_START - EPS_END) * math.exp(-1 * episode / EPS_DECAY))
        if rand.random() > eps_threshold:
            return int(np.argmin(self.forward(state)))
        return rand.randrange(self.N_ACTIONS)

    def learn(self, episode):
        """Run one fitting step on a random minibatch from replay memory.

        Does nothing until at least `batch_size` transitions are stored. The
        target network is synced once per episode whose index is a multiple of
        TARGET_SYNC_EVERY.
        """
        if len(self.memory) < batch_size:
            return

        # `if episode % 10:` was true for every episode NOT divisible by 10,
        # which re-synced the target on almost every call.
        if episode % self.TARGET_SYNC_EVERY == 0 and episode != self._last_sync_episode:
            self.target.set_weights(self.model.get_weights())
            self._last_sync_episode = episode

        batch = rand.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack every sampled transition, not just the first one.
        states = np.concatenate(states, axis=0)            # (batch_size, n_features)
        next_states = np.concatenate(next_states, axis=0)  # (batch_size, n_features)

        current_q = self.model.predict(states, verbose=0)
        next_q = self.target.predict(next_states, verbose=0)

        for i in range(batch_size):
            target_value = rewards[i]
            if not dones[i]:
                # Bootstrap from the cheapest next action (Q-values are costs here).
                target_value += GAMMA * float(np.min(next_q[i]))

            # Only the taken action's output is moved; the rest keep their predictions.
            current_q[i][actions[i]] = target_value

        self.model.fit(states, current_q, batch_size=batch_size, verbose=0)
        

In [14]:
# Train the agent. Every episode restarts from the latest scaled row of the
# workbook and runs for `episode_done` steps.
agent = Dqn_agent()
sc_hist = []   # reward at the last step of each episode
st_hist = []   # last state of each episode, mapped back through scaler.inverse_transform
pop_hist = []  # dnn_model prediction at the last step of each episode

for e in range(1, 200):
    state = return_latest().reshape(1, 21)
    steps = 0

    while True:
        pred_y = dnn_model.predict(state, verbose=0)

        # act() can return a NumPy scalar or a one-element tensor; use a plain int.
        action = int(np.asarray(agent.act(state, e)).reshape(-1)[0])

        # Pass a copy so `state` keeps the pre-action values: the stored
        # transition must not have state and next_state as the same array.
        next_state = return_state(action, state.copy())
        # The distance term of the reward is measured on the post-action state.
        reward = return_reward(next_state, pred_y)

        agent.memorize(state, action, reward, next_state)
        agent.learn(e)

        state = next_state
        steps = steps + 1

        if steps % 10 == 0:
            # .item() extracts the single value; int() on a (1, 1) array is deprecated.
            print(f"steps: {steps}, reward: {reward}, pop: {int(np.asarray(pred_y).item())}")

        if steps == episode_done:
            print("=============episode done=============")
            # Placeholders were {0}/{2}/{3} for three arguments, which raises IndexError.
            print("episode: {0}, score: {1}, y_pred {2}".format(e, reward, pred_y))
            print("=======================================")
            sc_hist.append(reward)
            st_hist.append(scaler.inverse_transform(state))
            pop_hist.append(pred_y)
            break

steps: 10, reward: [[42.594475]], pop: 42
steps: 20, reward: [[42.79346]], pop: 42
steps: 30, reward: [[43.065323]], pop: 42
steps: 40, reward: [[43.16742]], pop: 42
steps: 50, reward: [[43.147964]], pop: 42
steps: 60, reward: [[43.235397]], pop: 42
steps: 70, reward: [[43.307888]], pop: 42
steps: 80, reward: [[43.294895]], pop: 42
steps: 90, reward: [[43.289486]], pop: 42
steps: 100, reward: [[43.325176]], pop: 42
steps: 110, reward: [[43.388985]], pop: 42
steps: 120, reward: [[43.343697]], pop: 42
steps: 130, reward: [[43.487823]], pop: 42
steps: 140, reward: [[43.805286]], pop: 42
steps: 150, reward: [[43.838566]], pop: 42
steps: 160, reward: [[43.910053]], pop: 42
steps: 170, reward: [[43.890915]], pop: 42
steps: 180, reward: [[43.72577]], pop: 42
steps: 190, reward: [[43.758232]], pop: 42
steps: 200, reward: [[43.899258]], pop: 42
steps: 210, reward: [[43.974613]], pop: 42
steps: 220, reward: [[43.969257]], pop: 42
steps: 230, reward: [[44.38039]], pop: 42
steps: 240, reward: [[44

In [1]:
# Plot the end-of-episode dnn_model prediction for every recorded episode.
# reshape(-1) flattens whatever number of episodes was collected; a fixed
# reshape(99) fails whenever the history length is not exactly 99.
fig, ax = plt.subplots()
ax.plot(np.asarray(pop_hist).reshape(-1))
ax.set(title="dnn_model prediction at episode end", xlabel="episode", ylabel="pred_y")
plt.show()

NameError: name 'pop_hist' is not defined

In [None]:
# Plot the end-of-episode reward for every recorded episode.
# reshape(-1) flattens whatever number of episodes was collected; a fixed
# reshape(99) fails whenever the history length is not exactly 99.
fig, ax = plt.subplots()
ax.plot(np.asarray(sc_hist).reshape(-1))
ax.set(title="Reward at episode end", xlabel="episode", ylabel="reward")
plt.show()