In [7]:
"""
Small demo to illustrate how the plot function and the gridworld environment work
"""
import numpy as np
import optuna

from gridworld import *
from plot import *

In [28]:
# Build one random grid world (size 12, no water, mountain setting 0.3 --
# presumably the share of '#' cells; confirm against gridworld.Random).
env = Random(size=12, water=0, mountain=0.3)
# Dump the layout, one grid row per output line.
print(*env.grid, sep="\n")

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', '#', ' ', ' ', '#', ' ', 'S', ' ', '#']
[' ', ' ', ' ', '#', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', '#', '#', '#', ' ', '#', '#', ' ', ' ', '#', ' ']
['#', '#', ' ', ' ', '#', ' ', ' ', ' ', ' ', '#', '#', '#']
[' ', ' ', ' ', '#', '#', ' ', ' ', ' ', ' ', '#', ' ', ' ']
[' ', ' ', ' ', ' ', '#', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
['#', ' ', ' ', ' ', ' ', ' ', '#', ' ', '#', '#', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', '#', '#', ' ', ' ', ' ']
['#', ' ', ' ', ' ', ' ', '#', '#', '#', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', '#', ' ', ' ', '#', ' ', ' ', 'G', ' ', '#']
[' ', ' ', '#', '#', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [1]:

# Hyperparameters shared by the tabular learning cells below
GAMMA = 0.9                # discount applied to the bootstrapped next-state value
ALPHA = 0.1                # learning rate (step size of each Q-table update)
EPS = 0.1                  # probability that select_action picks a random action
EPISODES = 10_000          # number of training episodes per call
MAX_EPISODE_LENGTH = 200   # step cap after which an episode is cut off

In [76]:
def qlearning_episode(env):
    """Train a tabular Q-learning agent on ``env`` and log per-episode statistics.

    Runs ``EPISODES`` episodes. Each episode starts from ``env.reset()`` and
    ends when ``env.step`` reports ``done`` or after ``MAX_EPISODE_LENGTH``
    steps, whichever comes first. Actions come from ``select_action``
    (epsilon-greedy on the current Q-table).

    Args:
        env: environment exposing ``num_states()``, ``num_actions()``,
            ``reset() -> state`` and ``step(action) -> (next_state, reward, done)``.
            States and actions are used directly as Q-table indices.

    Returns:
        dict mapping the episode index to ``[episode_length, cum_reward]``,
        where ``cum_reward`` is the running total of rewards collected over
        all episodes up to and including that one.
    """
    # Every Q-value starts at 1.
    q_table = np.ones((env.num_states(), env.num_actions()))
    learning_data = {}
    cum_reward = 0

    for episode in range(EPISODES):
        state = env.reset()
        done = False
        episode_length = 0

        # Run until a terminal state or the step cap is reached.
        while not done and episode_length < MAX_EPISODE_LENGTH:
            # Behaviour policy: choose from the *current* table at every step,
            # so updates made on the previous step are already taken into account.
            action = select_action(state, q_table)
            next_state, reward, done = env.step(action)

            # Q-learning target: bootstrap from the greedy value of the next
            # state (max over all of its actions), independent of the action
            # that will actually be taken. Terminal transitions do not bootstrap.
            if done:
                target = reward
            else:
                target = reward + GAMMA * np.max(q_table[next_state])
            q_table[state, action] += ALPHA * (target - q_table[state, action])

            # Track the reward actually received, not the size of the Q update.
            cum_reward += reward

            state = next_state
            episode_length += 1

        learning_data[episode] = [episode_length, cum_reward]

    return learning_data


In [9]:

def select_action(state, q_table):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``EPS`` a uniformly random action index in
    ``[0, len(q_table[0]))`` is returned; otherwise the index of the largest
    entry in ``q_table[state]`` (first one on ties, as per ``np.argmax``).
    """
    explore = np.random.random() < EPS
    if explore:
        n_actions = len(q_table[0])
        return np.random.randint(0, n_actions)
    # Exploit: greedy action for this state.
    return np.argmax(q_table[state])


In [77]:
# Train on ten freshly generated grid worlds; data[k] holds the learning
# log returned by qlearning_episode for the k-th world.
data = {}
for run_idx in range(10):
    env = Random(size=12, water=0, mountain=0.3)
    data[run_idx] = qlearning_episode(env)



In [None]:
# Scatter plot of episode length against cumulative reward for run 9.
# Each log entry is [episode_length, cum_reward]; gather both columns first so
# the figure is drawn with a single scatter call instead of one artist per episode.
run_log = data[9]
steps = [entry[0] for entry in run_log.values()]
rewards = [entry[1] for entry in run_log.values()]
plt.scatter(steps, rewards, label='run 9 episodes')

# Adding labels and title
plt.xlabel('number of steps')
plt.ylabel('Cumulative reward')
plt.title('number of steps vs Cumulative reward')

# Adding a legend (the scatter above carries the label it displays)
plt.legend()

# Display the plot
plt.show()