In [16]:
#import packages

import gym
import random
import numpy as np
import time

In [17]:
#invoke the environment
env_name = "FrozenLake-v0"
# instantiate environment
env = gym.make(env_name)

# output variables for state and action
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)


Observation space: Discrete(16)
Action space: Discrete(4)


In [18]:
#create agent class
class Agent():
    def __init__(self, env):
        #initialize variable
        self.is_discrete = \
            type(env.action_space) == gym.spaces.discrete.Discrete

        if self.is_discrete:
        # save action size -discrete space
            self.action_numbers = env.action_space.n
            print ("Action size:", self.action_numbers)
        else:
        # indetify parameters for the action space
            self.action_min = env.action_space.min
            self.action_max = env.action_space.max
            self.action_dist = env.action_space.dist
            print("Action range:" , self.action_min,self.action_max)
    def  get_action(self, current_state):
        if self.is_discrete:
            action  = random.choice(range(self.action_numbers))
        else:
            #get an action that is uniformly selected from the range 
            action = np.random.uniform(self.action_min,self.action_max, self.action_dist)
        return action

In [19]:
# create Q-learning agent class
class QLAgent(Agent):
    def __init__(self, env, gamma = 0.97, alpha = 0.01):
        super().__init__(env)
        self.state_numbers = env.observation_space.n
        print('State size:', self.state_numbers)

        self.greedy = 1.0
        self.gamma = gamma
        self.alpha = alpha
        self.build_model()

    def build_model(self):
            self.q_table = 1e-4*np.random.random([self.state_numbers,self.action_numbers])

    def get_action(self, current_state):
            current_q = self.q_table[current_state]
            action_greedy = np.argmax(current_q)
            action_random  = super().get_action(current_state)
            return action_random if random.random() < self.greedy else action_greedy

    def train (self,current_state, action, new_state, reward, experience):
            done = experience

            new_q = self.q_table[new_state]
            new_q= np.zeros([self.action_numbers]) if done else new_q
            target_q = reward + self.gamma*np.max(new_q)

            update_q = target_q - self.q_table[current_state,action]
            self.q_table[current_state,action] += self.alpha*update_q

            if done:
                self.greedy = self.greedy * 0.99

In [20]:
agent = QLAgent(env)
total_reward = 0
for ep in range (200):
    current_state = env.reset()
    done = False
    while not done:
        action = agent.get_action(current_state)
        new_state, reward, done, info  = env.step(action)
        agent.train(current_state,action,new_state,reward,done)
        current_state = new_state
        total_reward += reward

        print("state:", current_state, "action:", action, "next_state:", new_state)
        print("episode: {}, total reward: {}, epsilon: {}" .format(ep,total_reward, agent.greedy))
        env.render()
        time.sleep(0.05)

Action size: 4
State size: 16
state: 4 action: 0 next_state: 4
episode: 0, total reward: 0.0, epsilon: 1.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 5 action: 1 next_state: 5
episode: 0, total reward: 0.0, epsilon: 0.99
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 1, total reward: 0.0, epsilon: 0.99
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 8 action: 2 next_state: 8
episode: 1, total reward: 0.0, epsilon: 0.99
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
state: 8 action: 0 next_state: 8
episode: 1, total reward: 0.0, epsilon: 0.99
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 1, total reward: 0.0, epsilon: 0.99
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 1, total reward: 0.0, epsilon: 0.99
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 5 action: 2 next_state: 5
episode: 1, total reward: 0.0, epsilon: 0.9801
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode:

state: 5 action: 2 next_state: 5
episode: 10, total reward: 1.0, epsilon: 0.8953382542587163
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 0 action: 2 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 2 next_state: 4
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 1 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542587163
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 11, total reward: 1.0, epsilon: 0.8953382542

state: 4 action: 0 next_state: 4
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 1 next_state: 0
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 1 next_state: 0
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 21, total reward: 1.0, epsilon: 0.8097278682212583
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 21, total reward: 1.0, epsilon: 0.8097278682212

state: 1 action: 2 next_state: 1
episode: 25, total reward: 1.0, epsilon: 0.7778213593991465
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 1 next_state: 2
episode: 25, total reward: 1.0, epsilon: 0.7778213593991465
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
state: 1 action: 0 next_state: 1
episode: 25, total reward: 1.0, epsilon: 0.7778213593991465
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 1 action: 2 next_state: 1
episode: 25, total reward: 1.0, epsilon: 0.7778213593991465
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 5 action: 2 next_state: 5
episode: 25, total reward: 1.0, epsilon: 0.7700431458051551
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 1 action: 1 next_state: 1
episode: 26, total reward: 1.0, epsilon: 0.7700431458051551
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 5 action: 2 next_state: 5
episode: 26, total reward: 1.0, epsilon: 0.7623427143471035
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 27, total reward: 1.0, epsilon: 0.762

state: 0 action: 3 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 2 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 2 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 2 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 1 action: 3 next_state: 1
episode: 38, total reward: 1.0, epsilon: 0.682554595010387
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 1 next_state: 2
episode: 38, total reward: 1.0, epsilon: 0.682554595010387

state: 1 action: 2 next_state: 1
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 2 next_state: 2
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
state: 6 action: 0 next_state: 6
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
state: 2 action: 0 next_state: 2
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
state: 1 action: 0 next_state: 1
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 1 action: 2 next_state: 1
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 1 next_state: 2
episode: 48, total reward: 1.0, epsilon: 0.617290140942288
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
state: 6 action: 0 next_state: 6
episode: 48, total reward: 1.0, epsilon: 0.61729014094

state: 3 action: 3 next_state: 3
episode: 57, total reward: 1.0, epsilon: 0.5639051904523876
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
state: 3 action: 0 next_state: 3
episode: 57, total reward: 1.0, epsilon: 0.5639051904523876
  (Left)
SFF[41mF[0m
FHFH
FFFH
HFFG
state: 3 action: 0 next_state: 3
episode: 57, total reward: 1.0, epsilon: 0.5639051904523876
  (Left)
SFF[41mF[0m
FHFH
FFFH
HFFG
state: 7 action: 0 next_state: 7
episode: 57, total reward: 1.0, epsilon: 0.5582661385478638
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 58, total reward: 1.0, epsilon: 0.5582661385478638
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 0 next_state: 4
episode: 58, total reward: 1.0, epsilon: 0.5582661385478638
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 58, total reward: 1.0, epsilon: 0.5582661385478638
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 8 action: 0 next_state: 8
episode: 58, total reward: 1.0, epsilon: 0.5582661385478

state: 5 action: 1 next_state: 5
episode: 68, total reward: 2.0, epsilon: 0.4998370298991989
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 4 action: 0 next_state: 4
episode: 69, total reward: 2.0, epsilon: 0.4998370298991989
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 5 action: 3 next_state: 5
episode: 69, total reward: 2.0, epsilon: 0.49483865960020695
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 70, total reward: 2.0, epsilon: 0.49483865960020695
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 70, total reward: 2.0, epsilon: 0.49483865960020695
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 70, total reward: 2.0, epsilon: 0.49483865960020695
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 0 next_state: 4
episode: 70, total reward: 2.0, epsilon: 0.49483865960020695
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 70, total reward: 2.0, epsilon: 0.494838

state: 1 action: 0 next_state: 1
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 1 action: 3 next_state: 1
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 0 next_state: 4
episode: 78, total reward: 2.0, epsilon: 0.45660974774391455
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 8 action: 1 next_state: 8
episode: 78, total reward: 2.0, epsilon: 0.456609

state: 8 action: 1 next_state: 8
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 8 action: 0 next_state: 8
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
state: 9 action: 3 next_state: 9
episode: 84, total reward: 2.0, epsilon: 0.4298890135238936
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
state: 5 action: 3 next_state: 5
episode: 84, total reward: 2.0, epsilon: 0.42559012338865465
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 0 action: 0 next_state: 0
episode: 85, total reward: 2.0, epsilon: 0.425590123388

state: 1 action: 1 next_state: 1
episode: 88, total reward: 2.0, epsilon: 0.41294967113388825
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 0 action: 3 next_state: 0
episode: 88, total reward: 2.0, epsilon: 0.41294967113388825
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 88, total reward: 2.0, epsilon: 0.41294967113388825
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 8 action: 1 next_state: 8
episode: 88, total reward: 2.0, epsilon: 0.41294967113388825
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
state: 4 action: 2 next_state: 4
episode: 88, total reward: 2.0, epsilon: 0.41294967113388825
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 5 action: 3 next_state: 5
episode: 88, total reward: 2.0, epsilon: 0.40882017442254937
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 4 action: 1 next_state: 4
episode: 89, total reward: 2.0, epsilon: 0.40882017442254937
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
state: 4 action: 3 next_state: 4
episode: 89, total reward: 2.0, epsilon: 0.408

state: 5 action: 1 next_state: 5
episode: 96, total reward: 2.0, epsilon: 0.37723664692350434
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 1 action: 1 next_state: 1
episode: 97, total reward: 2.0, epsilon: 0.37723664692350434
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 1 action: 3 next_state: 1
episode: 97, total reward: 2.0, epsilon: 0.37723664692350434
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 3 next_state: 2
episode: 97, total reward: 2.0, epsilon: 0.37723664692350434
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
state: 6 action: 0 next_state: 6
episode: 97, total reward: 2.0, epsilon: 0.37723664692350434
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
state: 5 action: 0 next_state: 5
episode: 97, total reward: 2.0, epsilon: 0.37346428045426927
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
state: 1 action: 1 next_state: 1
episode: 98, total reward: 2.0, epsilon: 0.37346428045426927
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
state: 2 action: 3 next_state: 2
episode: 98, total reward: 2.0, epsilon: 0.3734