In [1]:
import gym
import numpy as np
import random
import torch
import time
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
map = ['SFFF',
          'FHFH',
          'HFFH',
          'HFFG']



In [3]:
env = gym.make('FrozenLake-v0', is_slippery=False, desc = map)


# visualize 4x4 frozen lake
env.render()


[41mS[0mFFF
FHFH
HFFH
HFFG


In [4]:
# Total number of States and Actions
n_states = env.observation_space.n
n_actions = env.action_space.n
n_rows = 4
n_cols = 4
print( "States = ", n_states)
print( "Actions = ", n_actions)

States =  16
Actions =  4


In [5]:
def restrict_actions(Q, n_states, n_rows):

  Q.at[n_states -1, :] = np.zeros(n_actions,)
  for i in range( 0, n_states, n_rows): 
    Q.at[i,0] = np.NaN
  for i in range( n_rows -1 , n_states, n_rows): 
    Q.at[i,2] = np.NaN
  for i in range(0, n_rows):
    Q.at[i,3] = np.NaN
  for i in range(n_states - n_rows , n_states):
    Q.at[i,1 ]= np.NaN
  
  return Q
  

In [6]:
def choose_action(Q, state, epsilon):
  random_for_epsilon = np.random.rand()
  if random_for_epsilon <= epsilon:
    s = Q.loc[state].notna()
    vals = s[s].index.values
    action = random.choice(vals)
  else: 
    Q.loc[state] += np.random.rand(n_actions,)/100
    action = np.argmax(Q.loc[state])
  return action

In [9]:
def rewarder(new_state, reward):
  if map[rowsandcols(new_state)[0]][rowsandcols(new_state)[1]]== 'H':
    reward -= 20
  elif map[rowsandcols(new_state)[0]][rowsandcols(new_state)[1]]== 'F':
    reward -= 1 
  elif map[rowsandcols(new_state)[0]][rowsandcols(new_state)[1]]== 'S':
    reward -= 1
  else: #goal
    reward += 100

  return reward

In [7]:
##assign index to each state using state-matrix

state_matrix = np.arange(0,n_states).reshape(n_rows,n_cols)
state_matrix

def rowsandcols(state):
  ''' input: state returned by env
      output: location of state as (row,col) tuple'''
  return int(np.where(state_matrix ==state)[0]), int(np.where(state_matrix ==state)[1])

In [8]:
reps = 100
num_episodes = 1000

In [17]:
n_successes = []

for i_rep in range(reps):
  
  steps_total = [] # store number of steps taken in each episode
  rewards_total = [] #store reward obtained for each episode
  epsilon_total = [] #store epsilon obtained at the end of each episode
  terminal_state = [] 

  epsilon = 0.8
  epsilon_final = 0.1
  epsilon_decay = 0.999
  gamma = 0.90 # discount factor
  learning_rate = 0.9 #how important is the difference between q-val from q-table and what's observed

  Q = pd.DataFrame(np.random.rand(n_states,n_actions)/1000)
  Q.loc[15] = np.zeros(n_actions,)
  Q = restrict_actions(Q, n_states, n_rows)

  for i_episode in range(num_episodes):
    
    # resets the environment
    state = env.reset()
    step = 0
    reward = 0

  ## as epsilon decays with more timesteps, the prob. of selecting a random val < e decays --> more likely to exploit. 
    if epsilon > epsilon_final:
            epsilon *= epsilon_decay

    while True:
        
        step += 1
        
        action = choose_action(Q, state, epsilon)
  
         
        ## env gives reward and next state and whether we've reached terminal state upon taking action at current state.. 
        new_state, _ , done, info = env.step(action)

        ##if you want reward penalized at for each timestep
        reward = rewarder(new_state, reward)

        # filling the Q Table - 
        
        Q.loc[state, action] = (1- learning_rate)*Q.at[state, action] + learning_rate*(reward + gamma * np.max(Q.loc[new_state]))
        
        # Setting new state for next action
        state = new_state
        tile = map[rowsandcols(state)[0]][rowsandcols(state)[1]]
        #env.render()
        
        if done:
          #print(Q)
          
          terminal_state.append(tile)
          #steps_total.append(step)
          #rewards_total.append(reward)
          #epsilon_total.append(epsilon)
          #if i_episode % 10 == 0:
            #print('Episode: {} Reward: {} Steps Taken: {} Terminal State: {}, Epsilon: {}'.format(i_episode,reward, step, tile, epsilon))
          break
  n_successes.append(terminal_state.count('G'))

In [18]:
np.mean(n_successes)

315.28

In [19]:
np.std(n_successes)

15.988170627060493

In [15]:
import plotly.express as px

In [20]:

fig = px.bar( x= np.arange(1,101), y= n_successes)
fig.update_layout(
    title="N_success 4x4 with negative reward structure and restricted actions",
    xaxis=dict(
        title='rep',
        tickmode='linear'),
    yaxis_title="n_successes per 1000 episodes",
    font=dict(
        family="Courier New, monospace",
        size=18))

fig.show()