In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand
import math
import tensorflow_probability as tfp
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os
# Move the working directory two levels up so the relative './documents/...'
# and './model/...' paths used later in this notebook resolve from there.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../../')

In [3]:
from tensorflow.python.ops.numpy_ops import np_config
# Turn on TensorFlow's NumPy-compatible behavior for tensors.
np_config.enable_numpy_behavior()

# Print NumPy arrays with 6 digits of precision and without scientific notation.
np.set_printoptions(precision=6, suppress=True)

In [4]:
# Read both spreadsheets into plain NumPy arrays.
real_data = pd.read_excel('./documents/nov_nine_var.xlsx').to_numpy()
goal_data = pd.read_excel('./documents/result/basic_formula.xlsx').to_numpy()

# Fit the min-max scaler on columns 1..21 of the real data; fit() returns the
# fitted scaler itself, so construction and fitting are chained.
scaler = MinMaxScaler().fit(real_data[:, 1:22])

In [5]:
def argmax(l):
    """Return the index of the largest element of ``l`` (first one on ties)."""
    positions = range(len(l))
    return max(positions, key=l.__getitem__)

def argmin(l):
    """Return the index of the smallest element of ``l`` (first one on ties)."""
    positions = range(len(l))
    return min(positions, key=l.__getitem__)

In [6]:
# Starting point: the last row of the real data, scaled with the fitted scaler
# and rounded to 2 decimals so it lies on the 0.01 grid that the actions move on.
start = np.round(scaler.transform(real_data[:,1:22])[-1].reshape(1, 21), 2)
# Target: the row of goal_data whose last column is smallest, scaled and rounded
# the same way.
goal = np.round(scaler.transform(goal_data[:,1:22])[argmin(goal_data[:,-1])].reshape(1, 21), 2)

print(goal[0])
print(start[0])

# Number of 0.01-sized moves separating start from goal (L1 distance / 0.01).
# Round before converting: int() truncates, so a product such as 688.9999999
# caused by floating-point error would otherwise be reported as 688.
need_step = int(round(float(np.sum(np.abs(goal - start))) * 100))
print(need_step)

[0.51 0.5  0.5  0.49 0.5  0.5  0.5  0.5  0.5  0.5  0.49 0.5  0.5  0.5
 0.5  0.5  0.49 0.49 0.51 0.5  0.5 ]
[0.   0.98 0.   1.   0.99 1.   0.   0.78 0.09 0.46 0.4  0.58 0.27 0.32
 0.23 0.23 1.   0.03 0.5  0.43 1.  ]
689


In [7]:
# Network input: the start row followed by the goal row, flattened into a single (1, 42) row.
model_state = np.reshape(np.array([start, goal]), (1, 42))

In [34]:
# Actor-critic hyperparameters
# Discount factor applied to future rewards/values.
GAMMA = 0.99
# Step cap for one training episode: ten times the number of 0.01 moves
# separating start from goal (need_step).
EPISODE_DONE = need_step * 10
# A learning update is run every LEARN_FREQ episodes.
LEARN_FREQ = 1
# Number of discrete actions: return_action decodes index i as coordinate
# i // 2 (21 coordinates) with direction i % 2, i.e. 21 * 2 = 42 actions.
ACTION_NUM = 42

In [9]:
def return_action(i):
    """Decode a discrete action index into a (1, 21) displacement row.

    Index ``i`` selects coordinate ``i // 2``; an even index moves that
    coordinate by -0.01 and an odd index by +0.01. All other entries are zero.
    """
    coordinate, is_odd = divmod(i, 2)
    a = np.zeros((1, 21))
    a[0][coordinate] = 0.01 if is_odd else -0.01
    return a

In [10]:
def return_state(s, a):
    """Return the state reached from ``s`` by adding displacement ``a``."""
    return s + a

In [11]:
def return_reward(ns, gs, tol=1e-6):
    """Score a state against the goal.

    The reward is minus the Euclidean distance between ``ns`` and ``gs`` plus a
    bonus of 5 for every coordinate of the first row that already sits on the
    goal value.

    Args:
        ns: next state, a 2-D array whose first row holds the coordinates.
        gs: goal state, same layout as ``ns``.
        tol: absolute tolerance for treating a coordinate as matching. States
            are built by repeatedly adding 0.01, so they carry floating-point
            error and an exact ``==`` against the rounded goal would almost
            never be true. The default is far below the 0.01 step size.

    Returns:
        The scalar reward ``-distance + 5 * matched_coordinates``.
    """
    dist = np.sqrt(np.sum(np.square(gs - ns)))

    # Count matching coordinates for however many the row holds, rather than
    # assuming exactly 21.
    matched = np.count_nonzero(np.isclose(ns[0], gs[0], rtol=0.0, atol=tol))

    return -dist + 5 * matched

In [12]:
class critic(tf.keras.Model):
  """Value network: a 128-unit ReLU hidden layer feeding one linear output unit."""

  def __init__(self):
    super().__init__()
    hidden_units = 128
    self.d1 = tf.keras.layers.Dense(hidden_units, activation='relu')
    self.v = tf.keras.layers.Dense(1, activation=None)

  def call(self, input_data):
    """Apply the hidden layer, then the linear value head, to ``input_data``."""
    return self.v(self.d1(input_data))
    

class actor(tf.keras.Model):
  def __init__(self):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(128,activation='relu')
    self.a = tf.keras.layers.Dense(2,activation='softmax')

  def call(self, input_data):
    x = self.d1(input_data)
    a = self.a(x)
    return a

In [13]:
class Agent:
    def __init__(self):
        self.gamma = GAMMA
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
        self.c_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
        self.actor = actor()
        self.critic = critic()
        self.clip_pram = 0.2

    def act(self,state):
        prob = self.actor(np.array([state]))
        prob_np = prob.numpy()
        dist = tfp.distributions.Categorical(probs=prob_np, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0][0]), prob[0]

    def actor_loss(self, probs, actions, adv, old_probs, closs):
        probability = probs
        entropy = tf.reduce_mean(tf.math.negative(tf.math.multiply(probability,tf.math.log(probability))))

        sur1 = []
        sur2 = []

        for pb, t, op,a  in zip(probability, adv, old_probs, actions):
                        t =  tf.constant(t)
                        ratio = tf.math.divide(pb[a],op[a])
                        s1 = tf.math.multiply(ratio,t)
                        s2 =  tf.math.multiply(tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram),t)
                        sur1.append(s1)
                        sur2.append(s2)

        sr1 = tf.stack(sur1)
        sr2 = tf.stack(sur2)

        loss = tf.math.negative(tf.reduce_mean(tf.math.minimum(sr1, sr2)) - closs + 0.001 * entropy)

        return loss

    def learn(self, states, actions,  adv , old_probs, discnt_rewards):
        discnt_rewards = tf.reshape(discnt_rewards, (len(discnt_rewards),))
        adv = tf.reshape(adv, (len(adv),))

        old_p = old_probs

        old_p = tf.reshape(old_p, (len(old_p),2))
        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            p = self.actor(states, training=True)
            v =  self.critic(states,training=True)
            v = tf.reshape(v, (len(v),))
            td = tf.math.subtract(discnt_rewards, v)
            c_loss = 0.5 * tf.keras.losses.mean_squared_error(discnt_rewards, v)
            a_loss = self.actor_loss(p, actions, adv, old_probs, c_loss)
            
        grads1 = tape1.gradient(a_loss, self.actor.trainable_variables)
        grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
        return a_loss, c_loss

In [14]:
def test_reward(agent, max_steps=None):
    """Roll the agent out from ``start`` and return the summed reward.

    Args:
        agent: object whose ``act(model_state)`` returns ``(action_index, probs)``.
        max_steps: upper bound on the number of steps. Defaults to
            EPISODE_DONE. Without a bound, a policy that never reaches the goal
            keeps this loop running forever.

    Returns:
        The sum of return_reward over the steps taken.
    """
    if max_steps is None:
        max_steps = EPISODE_DONE

    total_reward = 0
    state = start
    for _step in range(max_steps):
        model_state = np.array([state, goal]).reshape(1, 42)
        a, _ = agent.act(model_state)
        next_state = return_state(state, return_action(a))
        total_reward += return_reward(next_state, goal)
        state = next_state

        # Stop once the state just reached is on the goal. The comparison uses
        # a tolerance because states accumulate floating-point error from the
        # repeated 0.01 additions, so exact equality is unreliable.
        if np.allclose(state[0], goal[0], rtol=0.0, atol=1e-6):
            break

    return total_reward

In [21]:
def preprocess(states, actions, rewards, dones, values, gamma=None, lamb=0.95):
    """Turn a collected rollout into training arrays with GAE returns.

    Args:
        states: sequence of network inputs, one per step.
        actions: sequence of action indices, one per step.
        rewards: sequence of rewards, one per step.
        dones: sequence of done flags, one per step (1 on a terminal step,
            0 otherwise), as appended by the training loop.
        values: critic estimates; must hold one more entry than ``rewards``
            (the extra last entry is the bootstrap value).
        gamma: discount factor. Defaults to the module-level GAMMA.
        lamb: GAE lambda.

    Returns:
        ``(states, actions, returns, adv)`` where ``states`` is float32,
        ``actions`` is int8, ``returns`` is the float32 return target per step
        and ``adv`` is the advantage normalized to zero mean and unit std.
    """
    if gamma is None:
        gamma = GAMMA

    g = 0
    returns = []
    for i in reversed(range(len(rewards))):
        # Bootstrapping and accumulation must stop AFTER a terminal step, so the
        # multiplier is (1 - done). Using the done flag itself would do the
        # opposite: drop the future on ordinary steps and keep it on terminal ones.
        mask = 1 - dones[i]
        delta = rewards[i] + gamma * values[i + 1] * mask - values[i]
        g = delta + gamma * lamb * mask * g
        returns.append(g + values[i])

    returns.reverse()
    returns = np.array(returns, dtype=np.float32)

    adv = returns - np.asarray(values[:-1], dtype=np.float32)
    # The small epsilon keeps the division finite when all advantages are equal.
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)

    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int8)

    return states, actions, returns, adv

In [35]:
agent = Agent()
# return_reward is minus a distance plus a bonus, so episode totals can be
# negative. Starting the best score at 0 could therefore prevent any checkpoint
# from ever being saved; start from -inf so the first evaluation always counts.
best_reward = -np.inf
avg_reward_list = []

for e in range(10000):
    # Every LEARN_FREQ episodes: train on the collected rollout (except before
    # the very first episode), evaluate, then start fresh buffers.
    if e % LEARN_FREQ == 0:
        if e != 0:
            # preprocess needs one more value than rewards; reuse the last
            # critic estimate as the trailing bootstrap entry.
            values.append(c[0][0])

            states, actions, returns, adv = preprocess(states, actions, rewards, dones, values)
            al, cl = agent.learn(states, actions, adv, probs, returns)

            avg_reward = np.mean([test_reward(agent) for _ in range(5)])
            avg_reward_list.append(avg_reward)

            if avg_reward > best_reward:
                agent.actor.save('./model/ppo_actor')
                agent.critic.save('./model/ppo_critic')
                best_reward = avg_reward

            # NOTE(review): this stop condition needs the averaged reward to be
            # exactly 200; confirm that this threshold is meant for this task.
            if best_reward == 200:
                break

        all_aloss = []
        all_closs = []
        rewards = []
        states = []
        actions = []
        probs = []
        dones = []
        values = []

    state = start
    steps = 0
    episode_reward = 0

    done = 0

    while True:
        model_state = np.array([state, goal]).reshape(1, 42)
        a, p = agent.act(model_state)
        c = agent.critic(model_state).numpy()

        action = return_action(a)
        next_state = return_state(state, action)

        reward = return_reward(next_state, goal)

        # The episode ends at the step cap or when the state just reached is on
        # the goal. A tolerance is used because states accumulate floating-point
        # error from repeated 0.01 additions, which makes exact equality
        # against the rounded goal unreliable.
        reached_goal = np.allclose(next_state[0], goal[0], rtol=0.0, atol=1e-6)
        if steps == EPISODE_DONE or reached_goal:
            done = 1

        dones.append(done)
        rewards.append(reward)
        states.append(model_state[0])
        actions.append(a)
        probs.append(p[0])
        values.append(c[0][0])

        state = next_state
        episode_reward += reward
        steps += 1

        if done:
            print(f'{e}: {round(reward, 2)}')
            break

0: -53.97


KeyboardInterrupt: 