In [1]:
import gym
import random
import numpy as np
from gym.envs.registration import register
import time
from IPython.display import clear_output

In [2]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant and create it.
#
# Re-running this cell in a live kernel tries to register the same id again.
# Older gym releases raise gym.error.Error in that case, so only that error is
# ignored here. Any other failure (bad entry point, Ctrl-C, ...) now propagates
# instead of being silently swallowed by a bare `except`.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery' : False},
        max_episode_steps=100,
        reward_threshold=0.70, # optimum = 0.74
    )
except gym.error.Error:
    # Already registered by a previous run of this cell; safe to continue.
    pass

env_name = "FrozenLakeNoSlip-v0"

env = gym.make(env_name)

In [3]:
class Agent:
    """Baseline agent that picks actions uniformly at random.

    The constructor inspects ``env.action_space`` once and caches what is
    needed to sample from it later:

    * discrete space: ``action_size`` (number of actions, from ``.n``)
    * otherwise: ``action_low``, ``action_high`` and ``action_shape``
      (from ``.low``, ``.high`` and ``.shape``)
    """

    def __init__(self, env):
        """Cache the sampling parameters of ``env.action_space``."""
        # isinstance (not an exact type comparison) so that subclasses of
        # Discrete are also handled by the discrete branch.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Returns an int in ``[0, action_size)`` for a discrete space, otherwise
        an array of shape ``action_shape`` drawn uniformly between
        ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [7]:
class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Keeps a ``[state_size, action_size]`` table of action values and updates
    it from single transitions via :meth:`train`.

    Args:
        env: environment whose ``observation_space`` exposes ``.n`` and whose
            action space is discrete.
        discount_rate: factor applied to the best next-state value when
            building the update target.
        learning_rate: step size applied to the temporal-difference error.
        eps: initial probability of taking a random action (previously
            hard-coded to 1.0).
        eps_decay: factor ``eps`` is multiplied by whenever a transition with
            ``done=True`` is trained on (previously hard-coded to 0.99).

    Raises:
        ValueError: if the action space is not discrete; a Q-table needs a
            finite number of actions.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.2,
                 eps=1.0, eps_decay=0.99):
        super().__init__(env)
        if not self.is_discrete:
            # Fail early with a clear message; otherwise build_model would
            # fail on the missing `action_size` attribute.
            raise ValueError("QAgent requires a discrete action space")
        self.state_size = env.observation_space.n

        self.eps = eps
        self.eps_decay = eps_decay
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.build_model()

    def build_model(self):
        """Create the Q-table, filled with small random values in [0, 1e-4)."""
        # NOTE(review): the random init presumably avoids argmax always
        # picking action 0 on an untrained table -- confirm intent.
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Return the greedy action with probability ``1 - eps``, else a random one."""
        q_state = self.q_table[state]
        action_q = np.argmax(q_state)
        # The random action is drawn before the exploration coin flip, so the
        # order of random-number consumption is always the same.
        action_random = super().get_action(state)
        return action_q if self.eps < random.random() else action_random

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # When `done` is set, the next state contributes no future value.
        q_next = np.zeros([self.action_size]) if done else self.q_table[next_state]
        q_target = reward + self.discount_rate * np.max(q_next)

        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * q_update
        if done:
            # Explore less after every finished episode.
            self.eps = self.eps_decay * self.eps
        
# Instantiate the Q-learning agent for `env` using QAgent's default
# hyperparameters. Re-running this cell discards any learned Q-table.
agent = QAgent(env)

In [20]:
# Train the agent online: act, observe, update the Q-table, then redraw the
# board and the table after every single step.
NUM_EPISODES = 100
STEP_DELAY_S = 0.01  # short pause so each rendered frame is visible

total_reward = 0
for ep in range(NUM_EPISODES):
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state)
        print(f"state {state} action: {action}")

        # Take the step and learn from the resulting transition.
        next_state, reward, done, info = env.step(action)
        experience = (state, action, next_state, reward, done)
        agent.train(experience)

        total_reward += reward
        state = next_state

        # Show the board, running statistics and the current Q-table.
        env.render()
        print(f"Episode {ep}, Total Reward {total_reward}, eps {agent.eps}")
        print(agent.q_table)

        time.sleep(STEP_DELAY_S)
        # wait=True defers clearing until the next output arrives.
        clear_output(wait=True)

state 14 action: 2
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode 99, Total Reward 99.0, eps 0.002405009291311067
[[8.32972004e-01 8.58734026e-01 8.07982771e-01 8.32972005e-01]
 [8.32971990e-01 3.67899666e-08 6.73192239e-01 8.01478170e-01]
 [7.74972040e-01 4.05778873e-01 2.11438233e-02 3.88608473e-01]
 [2.07035976e-01 4.40081114e-05 8.13369749e-03 1.03272854e-05]
 [8.58734016e-01 8.85292810e-01 2.42580640e-14 8.32971982e-01]
 [7.81168516e-05 1.89693743e-05 9.41110435e-05 2.50635455e-06]
 [1.17956136e-05 8.38131407e-01 2.01766711e-05 1.58414392e-01]
 [3.48547056e-05 7.72091684e-05 2.91711450e-05 9.13222963e-05]
 [8.85290924e-01 1.29599499e-11 9.12673000e-01 8.58733289e-01]
 [8.85288128e-01 9.40900000e-01 9.40691153e-01 1.61149522e-09]
 [8.76192588e-01 9.69998640e-01 3.53286547e-08 5.51293032e-01]
 [8.56633101e-05 9.81646490e-05 1.30465638e-05 5.04467236e-05]
 [5.17190424e-05 1.87603069e-05 8.17049456e-05 7.70417503e-05]
 [4.63389254e-10 9.39888800e-01 9.70000000e-01 9.08098306e-01]
 [9.