In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
import math
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os
os.chdir('../../')

In [3]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

np.set_printoptions(precision=6, suppress=True)

In [4]:
real_data = pd.read_excel('./documents/nov_nine_var.xlsx').to_numpy()
goal_data = pd.read_excel('./documents/result/basic_formula.xlsx').to_numpy()

scaler = MinMaxScaler()
scaler = scaler.fit(real_data[:,1:22])

In [5]:
def load_data(data, i):
    data = scaler.transform(data[:,1:22])[i].reshape(1, 21)

    return np.round(data, 2)

def argmax(l):
    return max(range(len(l)), key=lambda i: l[i])

def argmin(l):
    return min(range(len(l)), key=lambda i: l[i])

In [6]:
start = load_data(real_data, -1)
goal = load_data(goal_data, argmin(goal_data[:,-1]))

print(goal[0])
print(start[0])

need_step = int(np.sum(abs(goal-start))*100)
print(need_step)

[0.51 0.5  0.5  0.49 0.5  0.5  0.5  0.5  0.5  0.5  0.49 0.5  0.5  0.5
 0.5  0.5  0.49 0.49 0.51 0.5  0.5 ]
[0.   0.98 0.   1.   0.99 1.   0.   0.78 0.09 0.46 0.4  0.58 0.27 0.32
 0.23 0.23 1.   0.03 0.5  0.43 1.  ]
689


In [7]:
# actor-critic hyperparmater
GAMMA = 0.99
EPISODE_DONE = need_step * 10
EPS = np.finfo(np.float32).eps.item()
ACTION_NUM = 42

In [8]:
def return_action(i):
    a = np.zeros((1, 21))
    j = i // 2

    if i % 2 == 0:
        a[0][j] = -0.01
    
    else:
        a[0][j] = 0.01
    
    return a

In [9]:
def return_state(s, a):
    ns = s + a
    return ns

In [10]:
def return_reward(ns, gs):
    dist = np.sqrt(np.sum(np.square(gs - ns)))

    end = 0
    for i in range(21):
        if ns[0][i] == gs[0][i]:
            end += 5
    
    reward = -dist + end
    return reward

In [11]:
class Actor_Critic(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.input_layer = tf.keras.models.Sequential()
        self.input_layer.add(tf.keras.layers.Dense(128, activation='relu'))
        self.input_layer.add(tf.keras.layers.Dense(128, activation='relu'))

        self.actor_layer = tf.keras.models.Sequential()
        self.actor_layer.add(tf.keras.layers.Dense(ACTION_NUM, activation='softmax'))

        self.critic_layer = tf.keras.models.Sequential()
        self.critic_layer.add(tf.keras.layers.Dense(1, activation='linear'))

    def call(self, x):
        i = self.input_layer(x)

        a = self.actor_layer(i)
        c = self.critic_layer(i)

        return a, c

In [21]:
class AC_agent:
    def __init__(self):
        self.model = Actor_Critic()
        self.optim = tf.keras.optimizers.RMSprop(learning_rate=1e-10)

        self.huber_loss = tf.keras.losses.Huber()

    def expected_q(self, rewards):
        discounted_sum = 0
        returns = np.zeros_like(rewards)
        for i, r in enumerate(rewards):
            discounted_sum = r + GAMMA * discounted_sum
            returns[i] = discounted_sum
        returns = (returns - np.mean(returns)) / (np.std(returns) + EPS)

        return returns.tolist()

    def act(self, state):
        action_prob, value = self.model(state)
        action = np.random.choice(ACTION_NUM, p=np.squeeze(action_prob))

        return action, action_prob, value

    def run(self, states, action_indexs, rewards, steps):
        returns = self.expected_q(rewards)

        states = tf.convert_to_tensor(states).reshape(steps, 42)
        action_indexs = tf.convert_to_tensor(action_indexs)
        returns = tf.convert_to_tensor(returns)

        loss = self.learn(states, action_indexs, returns)
        return loss

    @tf.function
    def learn(self, s, a, r):
        with tf.GradientTape() as tape:
            p, v = self.model(s)
            p = tf.reduce_sum(tf.one_hot(a, ACTION_NUM) * p, axis=1)
            v = v[0]

            adv = r - v
            log_p = tf.math.log(p)

            actor_loss = -tf.math.reduce_sum(log_p * adv)
            critic_loss = self.huber_loss(v, r)

            loss = actor_loss + critic_loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optim.apply_gradients(zip(grads, self.model.trainable_variables))

        return loss

In [24]:
agent = AC_agent()
for e in range(10000):
    state = start
    model_states_hist = []
    action_indexs_hist = []
    rewards_hist = []

    steps = 0
    episode_reward = 0

    done = 0

    while True:
        model_state = np.array([state, goal]).reshape(1, 42)
        a_i, a_p, c_v = agent.act(model_state)

        action = return_action(a_i)
        next_state = return_state(state, action)

        reward = return_reward(next_state, goal)
        
        model_states_hist.append(model_state)
        action_indexs_hist.append(a_i)
        rewards_hist.append(reward)

        state = next_state
        episode_reward += reward
        steps += 1

        if steps == EPISODE_DONE or all(state[0] == goal[0]):
            done = 1
        
        if done:
            loss = agent.run(model_states_hist, action_indexs_hist, rewards_hist, steps)
            print(f'{e}: {round(reward, 2)}, {max(rewards_hist)} | {np.round(loss[0], 2)}')
            break

0: -3.5543635154553344 | 4568.31005859375
1: -4.215151242838144 | 4593.240234375
2: -3.9330776753072083 | 4551.169921875
3: -3.7888124788645796 | 4536.009765625
4: -3.825689480342073 | 4537.14990234375
5: -4.159218676626652 | 4572.83984375
6: -3.6519173046497047 | 4559.68994140625
7: -3.938845008374916 | 4581.66015625
8: -4.247999529190174 | 4559.60009765625
9: -3.9754873915030835 | 4558.2001953125
10: -3.9843569117236473 | 4555.18017578125
11: -3.656241239305741 | 4563.259765625
12: -3.956854811589627 | 4582.33984375
13: -3.257499040675225 | 4525.22021484375
14: -3.6713076689375934 | 4576.75
15: -3.711805490593491 | 4590.919921875
16: -4.138224256852199 | 4591.83984375
17: -4.01707605106002 | 4547.080078125
18: -3.5422168200153985 | 4551.669921875
19: -3.558187740971514 | 4558.56982421875
20: 1.2019347030362972 | 4560.2900390625
21: -4.016603042372994 | 4604.8701171875
22: -3.8709042871143153 | 4556.60986328125
23: -4.029106600724278 | 4551.64013671875
24: -3.5252659474144625 | 4537.7

KeyboardInterrupt: 