# Grid World with Slippery Surfaces

S 0 0 0 G
0 X 0 X 0
0 0 S 0 0
0 X 0 X 0
0 0 0 0 0


α: Learning rate
γ: Discount factor
r: Reward for the current action
s ′: Next state after taking action a from state 𝑠


In [14]:
# Setting up Q-Learning 

import numpy as np
import random

# Grid parameters 
grid_size = 5
slippery_cells = [(1,1), (3,2), (2,3), (3, 3)] # Examples slipeery 

#Q-learnig table Setup
q_table = np.zeros((grid_size, grid_size, 4)) # 4 Actions: up, down, left, right


def choose_action(state, epsilon=0.1):
    if random.uniform(0, 1) < epsilon: # Exploration
        rchoice = random.choices([0,1,2,3])
        print("Exploring: Taking a random action")
        return rchoice
    else:
        action = np.argmax(q_table[state])  # Best-known action (exploiting)
        print("Exploiting: Taking the best-known action")
        return action
    
# Policy Gradient setup (The brain)
policy = np.full((grid_size, grid_size, 4), 0.25) # Initialize with equal prob for each action
    


In [15]:
# Policy Gradient setup (The brain)
policy = np.full((grid_size, grid_size, 4), 0.25) # Initialize with equal prob for each action
print(policy)

[[[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]]


# This is a simple basic policy update

In [16]:
def policy_gradient_step(state, action, reward): #
    if reward > 0:
        policy[state][action] += 0.01 * reward 
    else:
        policy[state][action] -= 0.01 * reward 

    # Normalize probablities to keep the valid
    policy[state] = policy[state] / np.sum(policy[state])
    print(policy[state])

# Define slippery effect
def apply_slippery(action):
    if random.uniform(0, 1) < 0.1:  # 10% slip chance
        return random.choice([0, 1, 2, 3])  # Random action
    return action

# Testing the my setuo

In [None]:
# Environment setup
goal = (1,4)
reward_goal = 1
reward_step = -0.01 # Small penalty to encourage shorter path
reward_slip = -0.1

# Run multiple episodes to test learing
num_episodes = 50
for ep in range(num_episodes):
    state = (0,0) # Starting position
    total_reward = 0
    epsilon = max(0.1, 1 - ep / num_episodes) # Decrease epsilon to shift exploring to exploiting

    for step in range(50): # Max step per episodes
        # Choose an action (explore/exploit on Q-learning)
        action = choose_action(state, epsilon)

        # Apply slippery effect if the cell is slippery 
        if state in slippery_cells:
            action = apply_slippery(action=action)

        # Move the agent based on the action (0: up, 1: down, 2: left, 3: right)
        next_state = (state[0] + (action == 1) - (action == 0), state[1] + (action == 3) - (action == 2))


        # Check if next_state is within bounds
        if not (0 <= next_state[0] < grid_size and 0 <= next_state[1] < grid_size):
            next_state = state  # Stay in place if out of bounds
            reward = reward_step  # Penalty for attempting invalid move
        elif next_state == goal:
            reward = reward_goal  # Reward for reaching the goal
        elif next_state in slippery_cells:
            reward = reward_slip  # Penalty for landing on a slippery cell
        else:
            reward = reward_step  # Default step penalty


        # Update Q-table (Q-learning update)
        old_q_value = q_table[state][action]
        next_best_q = np.max(q_table[next_state])
        q_table[state][action] = old_q_value + 0.1 * (reward + 0.9 * next_best_q - old_q_value)

        # Update Policy using Policy Gradient (if goal reached)
        policy_gradient_step(state, action, reward)
        # Move to the next state
        state = next_state
        total_reward += reward

        # End the episode if the goal is reached
        if state == goal:
            print(f"Episode {ep + 1}: Reached goal in {step + 1} steps with total reward {total_reward:.2f}")
            break
    else:
        print(f"Episode {ep + 1}: Did not reach the goal. Total reward: {total_reward:.2f}")

# Print final Q-table and policy probabilities for inspection
print("Final Q-table:")
print(q_table)
print("Final Policy Probabilities:")
print(policy)

Exploring: Taking a random action
[0.249975   0.249975   0.25007499 0.249975  ]
Exploring: Taking a random action
[0.24995001 0.24995001 0.25014998 0.24995001]
Exploring: Taking a random action
[0.24992501 0.250025   0.25012497 0.24992501]
Exploring: Taking a random action
[0.25000001 0.25       0.25009996 0.24990002]
Exploring: Taking a random action
[0.24997502 0.24997501 0.25017494 0.24987504]
Exploring: Taking a random action
[0.25005001 0.24995001 0.25014992 0.24985005]
Exploring: Taking a random action
[0.25002501 0.24992502 0.25012491 0.24992506]
Exploring: Taking a random action
[0.2501     0.24990003 0.2500999  0.24990007]
Exploring: Taking a random action
[0.25007499 0.24987504 0.25007489 0.24997507]
Exploring: Taking a random action
[0.25004999 0.24985006 0.25014988 0.24995008]
Exploring: Taking a random action
[0.25012497 0.24982507 0.25012487 0.24992508]
Exploring: Taking a random action
[0.25009997 0.24980009 0.25009986 0.25000008]
Exploring: Taking a random action
[0.250

In [19]:
# Environment setup
goal_position = (4, 4)
reward_goal = 1
reward_step = -0.01  # Small penalty to encourage shorter paths
reward_slip = -0.1   # Penalty for slipping

# Run multiple episodes to test learning
num_episodes = 50
for episode in range(num_episodes):
    state = (0, 0)  # Starting position
    total_reward = 0
    epsilon = max(0.1, 1 - episode / num_episodes)  # Decrease epsilon over time

    for step in range(10):  # Max steps per episode
        # Choose action (explore/exploit based on Q-learning)
        action = choose_action(state, epsilon)
        
        # Apply slippery effect if the cell is slippery
        if state in slippery_cells:
            action = apply_slippery(action)
        
        # Move agent based on action (0: up, 1: down, 2: left, 3: right)
        next_state = (state[0] + (action == 1) - (action == 0), 
                      state[1] + (action == 3) - (action == 2))

        # Check if next_state is within bounds
        if not (0 <= next_state[0] < grid_size and 0 <= next_state[1] < grid_size):
            next_state = state  # Stay in place if out of bounds
            reward = reward_step  # Penalty for attempting invalid move
        elif next_state == goal_position:
            reward = reward_goal  # Reward for reaching the goal
        elif next_state in slippery_cells:
            reward = reward_slip  # Penalty for landing on a slippery cell
        else:
            reward = reward_step  # Default step penalty

        # Update Q-table (Q-learning update)
        old_q_value = q_table[state][action]
        next_best_q = np.max(q_table[next_state])
        q_table[state][action] = old_q_value + 0.1 * (reward + 0.9 * next_best_q - old_q_value)

        # Update Policy using Policy Gradient (if goal reached)
        policy_gradient_step(state, action, reward)
        
        # Move to the next state
        state = next_state
        total_reward += reward

        # End the episode if the goal is reached
        if state == goal_position:
            print(f"Episode {episode + 1}: Reached goal in {step + 1} steps with total reward {total_reward:.2f}")
            break
    else:
        print(f"Episode {episode + 1}: Did not reach the goal. Total reward: {total_reward:.2f}")

# Print final Q-table and policy probabilities for inspection
print("Final Q-table:")
print(q_table)
print("Final Policy Probabilities:")
print(policy)


Exploring: Taking a random action
[0.24969836 0.24599188 0.24953119 0.25477857]
Exploring: Taking a random action
[0.24977338 0.24596728 0.24950624 0.25475309]
Exploring: Taking a random action
[0.24974841 0.24594269 0.24948129 0.25482761]
Exploring: Taking a random action
[0.24972343 0.2459181  0.24945635 0.25490212]
Exploring: Taking a random action
[0.24969846 0.24589351 0.24943141 0.25497662]
Exploring: Taking a random action
[0.2496735  0.24586892 0.24940647 0.25505112]
Exploring: Taking a random action
[0.24964853 0.24594432 0.24938153 0.25502562]
Exploring: Taking a random action
[0.24962357 0.24591973 0.24945658 0.25500012]
Exploring: Taking a random action
[0.24959861 0.24599513 0.24943164 0.25497462]
Exploring: Taking a random action
[0.24967364 0.24597054 0.2494067  0.25494912]
Episode 1: Did not reach the goal. Total reward: -0.10
Exploiting: Taking the best-known action
[0.24964868 0.24594594 0.24948175 0.25492363]
Exploring: Taking a random action
[0.24962372 0.24602134 0