# We will build a Q-table to solve the FrozenLake environment.
We disable the slippery-ice behaviour so the environment is deterministic and easier to reason about.

In [1]:
import gym
import numpy as np
import random
from time import sleep
from IPython.display import clear_output
from gym.envs.registration import register        # <<<--- To manipulate the registry of the FrozenLake Source to remove slipping 

# Let's rebuild the basic random agent from the last exercise

In [2]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Works with discrete action spaces (which expose ``n``) and with
    continuous ones (which expose ``low``, ``high`` and ``shape``).
    """

    def __init__(self, env):
        """Cache what is needed to sample from ``env.action_space``.

        Args:
            env: Environment object exposing an ``action_space`` attribute.
        """
        # isinstance instead of an exact type comparison, so subclasses of
        # Discrete are also recognised. The result is a bool (True/False).
        self.is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_size = env.action_space.shape

    def get_action(self):
        """Return one uniformly random action.

        Returns:
            An integer in ``[0, action_size)`` for a discrete space, otherwise
            an array of shape ``action_size`` drawn between ``action_low`` and
            ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        # np.random.uniform takes (low, high, output shape).
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_size)

# Register a non-slippery 4x4 FrozenLake variant under its own id.
try:
    register(
        id='FrozenLake-NoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery' : False},     # <<<--- Passing to register as a **kwarg...
        max_episode_steps=100,                                  # Refer https://github.com/openai/gym/blob/master/gym/envs/__init__.py
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell: the id is already registered, which is harmless.
    # Only gym's own error is ignored so that real mistakes (typos, bad
    # arguments, Ctrl-C) are no longer silently swallowed.
    pass

env = gym.make('FrozenLake-NoSlip-v0')
agent = Agent(env)
state = env.reset()

# Roll the random agent for at most 250 steps, redrawing the board in place.
for _ in range(250):
    action = agent.get_action()
    state, reward, done, info = env.step(action)
    if done:
        # Stop as soon as the episode ends; the terminal state is not rendered.
        break
    env.render()
    sleep(0.5)
    clear_output(wait=True)
env.close()

  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG


# We will now make the Q Agent

In [45]:
class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy action selection."""

    def __init__(self, env, learning_rate = 0.01, discount = 0.97):
        """Store the hyper-parameters and allocate the Q-table.

        Args:
            env: Environment whose ``observation_space.n`` gives the state count.
            learning_rate: Step size applied to each Q-value correction.
            discount: Weight given to the best next-state value.
        """
        super().__init__(env)
        self.learning_rate = learning_rate
        self.discount = discount
        self.epsilon = 1.0      # start fully exploratory
        self.state_size = env.observation_space.n
        self.make_table()

    def make_table(self):
        """(Re)create the Q-table with small random values in [0, 1e-4)."""
        table_shape = (self.state_size, self.action_size)
        self.QTable = 1e-4 * np.random.random(table_shape)

    def get_action(self, state):
        """Pick an action for ``state``: random with probability epsilon, else greedy."""
        best_action = np.argmax(self.QTable[state])
        random_action = super().get_action()      # <<<--- Calls for a random action
        if random.random() < self.epsilon:
            return random_action
        return best_action

    def train(self, exp):
        """Apply one Q-learning update from a single transition.

        Args:
            exp: Tuple ``(state, action, next_state, done, reward)``.
        """
        state, action, next_state, done, reward = exp

        # A finished episode has no future value to bootstrap from.
        if done:
            next_values = np.zeros([self.action_size])
        else:
            next_values = self.QTable[next_state]

        td_target = reward + self.discount * np.max(next_values)
        td_error = td_target - self.QTable[state, action]
        self.QTable[state, action] += self.learning_rate * td_error

        # Exploration shrinks only when an episode ends with a non-zero reward.
        if done and reward:
            self.epsilon *= 0.99

# Let's run the training loop now

In [46]:
# Swap the random agent for a Q-learning agent on the same environment,
# using QAgent's default learning_rate and discount.
agent = QAgent(env)

In [47]:
# Sanity check: a freshly constructed QAgent starts with epsilon = 1.0.
print(agent.epsilon)

1.0


In [48]:
# Train in batches of episodes until one batch is (almost) entirely successful.
EPISODES_PER_BATCH = 100
# Stop once at least this much reward is collected within one batch. Assuming
# each successful episode contributes a reward of 1, this means "at least 99
# of 100 episodes succeeded". The comparison is ">=" so a perfect batch also
# stops the loop; testing for exactly 99 would let a 100/100 batch run forever.
SUCCESS_TARGET = EPISODES_PER_BATCH - 1

while True:
    total_reward = 0        # reward accumulated over the current batch
    for episode in range(EPISODES_PER_BATCH):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.train((state, action, next_state, done, reward))
            state = next_state
            total_reward += reward
            # Live view: progress line, board, and the current Q-table.
            print(f"Episode : {episode}, Total Reward : {total_reward}, Epsilon : {agent.epsilon}")
            env.render()
            print(agent.QTable)
            sleep(0.01)
            clear_output(wait=True)
    if total_reward >= SUCCESS_TARGET:
        break


Episode : 99, Total Reward : 99.0, Epsilon : 0.0005714123018121786
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[5.57314055e-03 6.59822960e-01 4.15799400e-04 7.68978823e-03]
 [4.83919965e-03 9.13463483e-06 1.13457428e-04 9.20151709e-05]
 [8.94757745e-05 5.74736938e-04 4.55401007e-05 8.87939075e-05]
 [3.54008574e-05 5.27210073e-05 2.19170805e-05 2.49307648e-05]
 [2.02244228e-02 7.72199020e-01 1.37388909e-06 4.41949219e-03]
 [3.76523757e-05 2.14330868e-05 2.19622403e-06 2.29181631e-05]
 [5.49001008e-05 1.13850854e-02 5.39584207e-05 6.77495724e-05]
 [2.84705023e-05 9.80933346e-05 3.05642251e-05 2.36984576e-05]
 [1.79159318e-02 2.30251012e-05 8.59897344e-01 7.62311497e-03]
 [2.20257727e-02 9.22010876e-01 3.17004949e-02 3.20942257e-05]
 [4.44576634e-03 2.30900956e-01 7.13248162e-05 6.13902428e-04]
 [9.23496133e-06 9.78955466e-05 8.54181376e-05 9.94680607e-05]
 [1.95315576e-05 9.61275890e-05 9.28639529e-05 8.36582644e-05]
 [1.76566790e-05 4.93500582e-02 9.65366794e-01 3.94602263e-02]
 [7.53092454