In [1]:
import gym
import random
import numpy as np
from gym.envs.registration import registry, register
import time
from IPython.display import clear_output

In [2]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant and build it.
env_name = 'FrozenLakeNoSlip-v0'

try:
    register(
        id=env_name,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery' : False},
        max_episode_steps=100,
        reward_threshold=0.70, # optimum = 0.74
    )
except gym.error.Error:
    # Re-running this cell in a live kernel tries to register the same id
    # again; that is the only failure we want to ignore. Anything else
    # (bad entry point, interrupt, ...) should surface instead of being
    # silently swallowed.
    pass

env = gym.make(env_name)
print(env.observation_space)
print(env.action_space)

Discrete(16)
Discrete(4)


In [3]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Handles a discrete action space (an integer in ``[0, n)``) as well as a
    continuous one (a uniform sample between ``low`` and ``high``).
    """

    def __init__(self, env):
        """Read the action-space description from ``env``.

        Args:
            env: object exposing an ``action_space`` attribute. For a
                discrete space ``action_space.n`` is read; otherwise
                ``action_space.low``, ``.high`` and ``.shape`` are read.
        """
        # Must be an isinstance check: `type(a == b)` evaluates to the class
        # `bool`, which is always truthy, so the continuous branch could
        # never be taken.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("action size", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("action range", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Returns:
            An int in ``range(action_size)`` for discrete spaces, otherwise
            an array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)
        return action

In [4]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps a ``[state_size, action_size]`` table of action values and decays
    the exploration rate ``eps`` once per finished episode.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1.0, eps_decay=0.99):
        """Set up hyper-parameters and the Q-table.

        Args:
            env: environment with a discrete ``observation_space`` (its
                ``.n`` is used as the number of table rows).
            discount_rate: factor applied to the best next-state value.
            learning_rate: step size of each Q-table update.
            eps: initial probability of taking a random action.
            eps_decay: factor ``eps`` is multiplied by whenever an
                experience with ``done=True`` is trained on.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("state size", self.state_size)

        self.eps = eps
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

    def build_model(self):
        """Create the Q-table filled with small random values in [0, 1e-3)."""
        self.q_table = 1e-3*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Pick an action epsilon-greedily for ``state``.

        With probability ``eps`` the random action from the base class is
        returned, otherwise the arg-max of the Q-table row.
        """
        action_greedy = np.argmax(self.q_table[state])
        # The random action is always drawn, even when the greedy one ends up
        # being used, to keep the sequence of random draws unchanged.
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition has no future value to bootstrap from.
        best_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * best_next

        td_error = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

        if done:
            # Explore a little less after every finished episode.
            self.eps = self.eps * self.eps_decay

agent = QAgent(env)

action size 4
state size 16


In [6]:
# Train the agent online while rendering every step.
NUM_EPISODES = 100   # number of episodes to run
FRAME_DELAY_S = 0.05 # pause after each rendered step so the output is readable

total_reward = 0
try:
    for ep in range(NUM_EPISODES):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.train((state, action, next_state, reward, done))
            state = next_state
            total_reward += reward

            print("s", state, "a", action)
            print("Episode: {}, Total Reward: {}, eps: {}".format(ep, total_reward, agent.eps))
            env.render()
            print(agent.q_table)
            time.sleep(FRAME_DELAY_S)
            # wait=True defers clearing until the next output arrives.
            clear_output(wait=True)
finally:
    # Close the environment even if the loop is interrupted or raises.
    env.close()

s 5 a 1
Episode: 99, Total Reward: 8.0, eps: 0.13397967485796175
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[3.47381445e-04 5.72585556e-04 7.91235856e-04 3.03838488e-04]
 [4.95917963e-04 3.79499243e-05 7.62584685e-04 7.51970248e-04]
 [2.98891019e-04 6.65092785e-04 8.46605763e-04 5.44226037e-04]
 [5.59094691e-04 4.16830512e-04 7.52246578e-04 6.33475690e-04]
 [8.30546981e-05 8.19996636e-04 5.65864964e-04 2.49178022e-04]
 [2.68983400e-04 7.77153514e-04 8.54062569e-04 8.03576647e-04]
 [5.55923385e-08 9.61861184e-04 9.24404283e-05 5.41781249e-04]
 [5.29812243e-05 3.29810394e-04 5.59267127e-04 1.86827181e-04]
 [9.45849436e-04 2.79700505e-04 1.06197361e-04 2.21854480e-04]
 [4.03987850e-05 5.21341995e-04 2.36545573e-04 4.96081711e-04]
 [2.00465940e-04 6.15841339e-03 6.20590136e-04 3.49934248e-04]
 [3.77398959e-04 8.47669971e-04 5.14171826e-05 9.95741055e-04]
 [8.31009006e-04 4.92553275e-05 7.84743053e-04 7.30921299e-04]
 [6.17432319e-04 2.24261998e-04 2.90919891e-03 7.12650405e-04]
 [2.72363632e-0