In [1]:
import gym
import random
import numpy as np
import time
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from gym.envs.registration import registry, register
from IPython.display import clear_output

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant.
# Re-running this cell in the same kernel can make gym raise a registration
# error because the id already exists; only that gym error is ignored so that
# unrelated failures (bad entry point, typos, Ctrl-C) still surface.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery' : False},
        max_episode_steps=100,
        reward_threshold=0.70, # optimum = 0.74
    )
except gym.error.Error:
    # Already registered earlier in this kernel session; reuse that entry.
    pass

env_name = 'FrozenLakeNoSlip-v0'
env = gym.make(env_name)
print(env.observation_space)
print(env.action_space)

Discrete(16)
Discrete(4)


In [3]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Supports a discrete action space (reads ``action_space.n``) and a
    continuous one (reads ``action_space.low``, ``.high`` and ``.shape``).
    """

    def __init__(self, env):
        """Cache the action-space properties needed for sampling.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # Test the space object itself. Taking type() of the comparison result
        # yields the ``bool`` class, which is always truthy, so the continuous
        # branch could never be reached.
        self.is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("action size", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("action range", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Args:
            state: current observation (unused by the random policy).

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise an
            array of shape ``action_shape`` drawn between the low and high
            bounds.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [4]:
class QNAgent(Agent):
    """Epsilon-greedy Q-learning agent backed by a single dense layer.

    The integer state is one-hot encoded and fed through a dense layer named
    ``q_table`` that outputs one value per action. Training minimises the
    squared error between the value of the taken action and a one-step
    bootstrapped target.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01, eps_decay=0.99):
        """Build the graph and open a session.

        Args:
            env: environment with discrete ``observation_space`` (its ``n`` is
                read) and an action space handled by ``Agent``.
            discount_rate: factor applied to the bootstrapped next-state value.
            learning_rate: learning rate passed to the Adam optimizer.
            eps_decay: multiplier applied to ``eps`` after each finished
                episode (defaults to the previously hard-coded 0.99).
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("state size", self.state_size)

        self.eps = 1.0
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build_model(self):
        """Create placeholders, the Q-value layer, the loss and the train op.

        Resets the default graph first, so any graph built earlier in the
        kernel is discarded.
        """
        tf.reset_default_graph()
        self.state_in = tf.placeholder(tf.int32, shape=[1])
        self.action_in = tf.placeholder(tf.int32, shape=[1])
        self.target_in = tf.placeholder(tf.float32, shape=[1])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        self.q_state = tf.layers.dense(self.state, units=self.action_size, name='q_table')
        # Mask with the one-hot action to pick out the value of the taken action.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: integer index of the current state.

        Returns:
            A random action with probability ``eps``, otherwise the argmax of
            the predicted action values for ``state``.
        """
        # Draw the random action first, then the exploration coin, so the
        # sequence of draws from ``random`` is unchanged; the network is only
        # evaluated when the greedy action is actually needed.
        action_random = super().get_action(state)
        if random.random() < self.eps:
            return action_random
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})
        return np.argmax(q_state)

    def train(self, experience):
        """Run one optimizer step on a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        if done:
            # No future value is bootstrapped from a terminal transition.
            max_q_next = 0.0
        else:
            q_next = self.sess.run(self.q_state, feed_dict={self.state_in: [next_state]})
            max_q_next = np.max(q_next)
        q_target = reward + self.discount_rate * max_q_next

        feed = {
            self.state_in: [state],
            self.action_in: [action],
            self.target_in: [q_target],
        }
        self.sess.run(self.optimizer, feed_dict=feed)

        # Explore less after every completed episode.
        if done:
            self.eps = self.eps * self.eps_decay

    def __del__(self):
        """Close the session if one was created."""
        # __init__ may have failed before ``sess`` was assigned.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

agent = QNAgent(env)

action size 4
state size 16




In [10]:
NUM_EPISODES = 100
total_reward = 0

# The kernel variable of the 'q_table' layer is the same graph object on every
# step, so look it up once instead of re-entering the variable scope per step.
with tf.variable_scope('q_table', reuse=True):
    q_table_kernel = tf.get_variable("kernel")

try:
    for ep in range(NUM_EPISODES):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.train((state, action, next_state, reward, done))
            state = next_state
            total_reward += reward

            # Live view: latest transition, running totals, rendered grid and
            # the current layer weights; cleared when the next step prints.
            print("s", state, "a", action)
            print("Episode: {}, Total Reward: {}, eps: {}".format(ep, total_reward, agent.eps))
            env.render()
            weights = agent.sess.run(q_table_kernel)
            print(weights)
            clear_output(wait=True)
finally:
    # Close the environment even if the loop is interrupted or a step raises.
    env.close()

s 15 a 2
Episode: 99, Total Reward: 100.0, eps: 0.002405009291311067
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 5.17819368e-04  3.82420607e-02  3.11563790e-01  1.84536967e-02]
 [ 7.11053088e-02 -6.97390079e-01  3.21342587e-01 -6.27411678e-02]
 [ 1.13401234e-01  1.73511013e-01  2.00005621e-01 -9.73510593e-02]
 [ 1.29013032e-01 -6.34758770e-01  1.91240132e-01 -2.57562071e-01]
 [-2.51524471e-05 -1.86018378e-01 -4.88577425e-01  7.81272277e-02]
 [ 5.54395318e-02  3.43335271e-01 -2.49466300e-02  3.97480011e-01]
 [-7.92857587e-01  2.14802340e-01 -5.58499038e-01 -3.06803151e-03]
 [ 2.50375271e-03  3.35695148e-01  5.36988854e-01 -4.60779667e-03]
 [-1.40862420e-01 -5.91722667e-01 -1.19295925e-01 -5.38777336e-02]
 [ 6.67535067e-02  1.66518927e-01  1.78903744e-01 -6.83779240e-01]
 [ 2.41397709e-01  2.23806813e-01 -5.50261915e-01  1.12169243e-01]
 [-1.59669280e-01  2.37816572e-02  1.88992798e-01 -2.25570530e-01]
 [ 1.55615091e-01  2.01305866e-01 -8.18233490e-02  3.13671827e-02]
 [-6.75615668e-01  1.