In [None]:
#!pip install matplotlib

In [1]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
#Change environment name to whichever enevironment to be trained on. Check : https://gym.openai.com/envs/#toy_text
env = gym.make('FrozenLake-v0')

In [3]:
episodes = 10

for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        score, reward, done, info = env.step(env.action_space.sample())
        score += reward
        clear_output(wait=True)
        print("Episode: {}\nScore: {}".format(episode, score))

env.close()

Episode: 9
Score: 5.0


In [4]:
#Creating Q Table
action = env.action_space.n
state = env.observation_space.n

q_table = np.zeros((state, action))

In [None]:
q_table.shape

In [None]:
#To see q_table array

#q_table.shape
#q_table

In [5]:
#Parameters for Q Learning
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

rewards_all_episodes = []

In [9]:
#Q-Learning Algorithm

for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0
    
    for step in range(max_steps_per_episode):
        
        #Exploration vs Exploitation Trade-off
        exploration_threshold = random.uniform(0, 1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
        
        new_state, reward, done, info = env.step(action)
        
        #Update Q-Table
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + (learning_rate * (reward + discount_rate * (np.max(q_table[new_state, :]))))
        
        state = new_state
        rewards_current_episode += reward
        
        if done == True:
            break
        
        exploration_rate = min_exploration_rate + \
                                (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
        
        rewards_all_episodes.append(rewards_current_episode)
        
print("*********** Training Finished ***********")

*********** Training Finished ***********


In [10]:
#Most Optimal Q-Table
q_table

array([[0.54068213, 0.46857279, 0.46437806, 0.46668397],
       [0.33615464, 0.2773223 , 0.30054463, 0.46872215],
       [0.4155097 , 0.39765041, 0.41133141, 0.43636624],
       [0.34410967, 0.31100665, 0.25840948, 0.41994956],
       [0.56486933, 0.41555107, 0.29668138, 0.27415904],
       [0.        , 0.        , 0.        , 0.        ],
       [0.1931075 , 0.24668493, 0.40081547, 0.14779965],
       [0.        , 0.        , 0.        , 0.        ],
       [0.39575195, 0.34729593, 0.36212874, 0.6403392 ],
       [0.48755026, 0.6873846 , 0.50412454, 0.35192124],
       [0.65619665, 0.34588327, 0.29825087, 0.31341614],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.42556353, 0.45379913, 0.76704576, 0.54200276],
       [0.72387899, 0.88036834, 0.71638282, 0.71404724],
       [0.        , 0.        , 0.        , 0.        ]])

In [None]:
#Calculate and print average reward per thousand epidsodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), (num_episodes/1000))
count = 1000

print("Average per thousand episodes")
for r in rewards_per_thousand_episodes:
    print(count,  " : ", str(sum(r/1000)))
    count += 1000

In [None]:
#Visualise Agent
import time

for episode in range(3):
    state = env.reset()
    done = False
    print("Episode is : " + str(episode))
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.4)
        action = np.argmax(q_table[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****** Reached Goal ******")
                time.sleep(2)
                clear_output(wait=True)
            else:
                print("****** Failed ******")
                time.sleep(2)
                clear_output(wait=True)
            break
        
        state = new_state

env.close()

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
