In [0]:
import gym
import numpy as np

from gym.envs.registration import register
register(
    id='Deterministic-4x4-FrozenLake-v0', # name given to this new environment
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', # env entry point
    kwargs={'map_name': '4x4', 'is_slippery': False} # argument passed to the env
)
"""We specify the start state at 1..
This can be reconfiguired as per our requirements """
env = gym.make('Deterministic-4x4-FrozenLake-v0') # load the environment
my_desk = [
    "GSFFF",
    "FFFFF",
    "FFFFG",
    "FFFFF",
    "FGFFG"
]


In [2]:

import gym

class CustomizedFrozenLake(gym.envs.toy_text.frozen_lake.FrozenLakeEnv):
    def __init__(self, **kwargs):
        super(CustomizedFrozenLake, self).__init__(**kwargs)

        for state in range(self.nS): # for all states
            for action in range(self.nA): # for all actions
                my_transitions = []
                for (prob, next_state, _, is_terminal) in self.P[state][action]:
                    row = next_state // self.ncol
                    col = next_state - row * self.ncol
                    tile_type = self.desc[row, col]
                    if tile_type == b'F':
                        reward = -1
                    elif tile_type == b'G':
                        reward = 10
                    else: 
                        reward = 0
                        

                    my_transitions.append((prob, next_state, reward, is_terminal))
                self.P[state][action] = my_transitions

from gym.envs.registration import register

register(
    id='Stochastic-5x5-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs={'desc': my_desk, 'is_slippery': False})
env = gym.make('Stochastic-5x5-FrozenLake-v0')
env.render()
print(env.action_space.n)
print(env.observation_space.n)


G[41mS[0mFFF
FFFFF
FFFFG
FFFFF
FGFFG
4
25


In [0]:
#Parameters
epsilon = 0.9
total_episodes = 5500
max_steps = 100
alpha = 0.70
gamma = 0.75




In [0]:
#Initializing the Q-matrix 
Q = np.zeros((env.observation_space.n, env.action_space.n)) 
#print(Q)

#Function to choose the next action 
def choose_action(state): 
	action=0
	if np.random.uniform(0, 1) < epsilon: 
		action = env.action_space.sample() 
	else: 
		action = np.argmax(Q[state, :]) 
	return action 

#Function to learn the Q-value 
def update(state, state2, reward, action, action2): 
	predict = Q[state, action] 
	target = reward + gamma * Q[state2, action2] 
	Q[state, action] = Q[state, action] + alpha * (target - predict) 
  
#print(Q)

In [16]:
#Initializing the reward 
reward=0

# Starting the SARSA learning 
for episode in range(total_episodes): 
	t = 0
	state1 = env.reset() 
	action1 = choose_action(state1) 

	while t < max_steps: 
		#Visualizing the training 
		env.render() 
		
		#Getting the next state 
		state2, reward, done, info = env.step(action1) 

		#Choosing the next action 
		action2 = choose_action(state2) 
		
		#Learning the Q-value 
		update(state1, state2, reward, action1, action2) 

		state1 = state2 
		action1 = action2 
		
		#Updating the respective vaLues 
		t += 1
		reward += -1
		
		#If at the end of learning process 
		if done: 
			break 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
FGFFG
  (Left)
GSFFF
[41mF[0mFFFF
FFFFG
FFFFF
FGFFG

G[41mS[0mFFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Up)
G[41mS[0mFFF
FFFFF
FFFFG
FFFFF
FGFFG

G[41mS[0mFFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Up)
G[41mS[0mFFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Down)
GSFFF
F[41mF[0mFFF
FFFFG
FFFFF
FGFFG
  (Right)
GSFFF
FF[41mF[0mFF
FFFFG
FFFFF
FGFFG
  (Up)
GS[41mF[0mFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Up)
GS[41mF[0mFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Down)
GSFFF
FF[41mF[0mFF
FFFFG
FFFFF
FGFFG
  (Up)
GS[41mF[0mFF
FFFFF
FFFFG
FFFFF
FGFFG
  (Right)
GSF[41mF[0mF
FFFFF
FFFFG
FFFFF
FGFFG
  (Right)
GSFF[41mF[0m
FFFFF
FFFFG
FFFFF
FGFFG
  (Down)
GSFFF
FFFF[41mF[0m
FFFFG
FFFFF
FGFFG
  (Left)
GSFFF
FFF[41mF[0mF
FFFFG
FFFFF
FGFFG
  (Right)
GSFFF
FFFF[41mF[0m
FFFFG
FFFFF
FGFFG
  (Up)
GSFF[41mF[0m
FFFFF
FFFFG
FFFFF
FGFFG
  (Left)
GSF[41mF[0mF
FFFFF
FFFFG
FFFFF
FGFFG
  (Up)
GSF[41mF[0mF
FFFFF
FFFFG
FFFFF
FGFFG
  (Up)
GSF[41mF[0mF
FFFFF
FFFFG
FFFFF
FGF

In [10]:
#Evaluating the performance 
print ("Performace : ", reward/total_episodes) 

#Visualizing the Q-matrix 
print(Q) 
"A positive performance is highly acceptable, given that with every step a penalty of -1 is incured"


Performace :  0.0
[[0.         0.         0.         0.        ]
 [1.         0.13274721 0.07648872 0.56953974]
 [0.54072524 0.12122495 0.15814572 0.0489783 ]
 [0.07776374 0.19327976 0.4114731  0.30618066]
 [0.08640925 0.3275145  0.17826196 0.08731207]
 [0.39577389 0.11300707 0.09764022 1.        ]
 [0.12497262 0.36135288 0.09676008 0.19409453]
 [0.13187797 0.08586729 0.36476933 0.07487011]
 [0.10040456 0.70236679 0.74703373 0.09673464]
 [0.2501247  1.         0.69773896 0.11994224]
 [0.13446335 0.20802134 0.18088891 0.56541349]
 [0.23756802 0.2693264  0.06477107 0.20374487]
 [0.05947002 0.08085113 0.08264882 0.12529833]
 [0.07471843 0.37871433 1.         0.41217226]
 [0.         0.         0.         0.        ]
 [0.42556701 0.63598816 0.59889383 0.32766487]
 [0.24501957 1.         0.30190637 0.11578603]
 [0.10318084 0.21278995 0.15782044 0.09638   ]
 [0.36341704 0.58674952 0.70785036 0.2871282 ]
 [0.13864157 1.         0.605381   1.        ]
 [0.58651346 0.49714581 1.         0.44345