In [1]:
# from agents.monte_carlo import MonteCarloAgent
from environment import TreasureCube

In [3]:
def test_cube(agent, max_episode=500, max_step=500):
    env = TreasureCube(max_step=max_step)

    for episode_num in range(0, max_episode):
        """Generate an episode using PI"""

        state = env.reset()
        terminate = False
        no_of_steps = 0
        episode_reward = 0

        # Start exploration
        while not terminate:
            action = agent.take_action(state)
            reward, terminate, next_state = env.step(action)
            episode_reward += reward
            # you can comment the following two lines, if the output is too much
            # env.render()  # comment
            # print(f'step: {t}, action: {action}, reward: {reward}')  # comment
            no_of_steps += 1
            agent.train(state, action, next_state, reward)
            state = next_state
        print(f'episode: {episode_num}, total_steps: {no_of_steps} episode reward: {episode_reward}')

In [78]:
action_space = ['left', 'right', 'forward', 'backward', 'up', 'down']

In [103]:
def create_random_policy(env):
    policy = {}

    for i in range(env.dim):
        for j in range(env.dim):
            for k in range(env.dim):
                p = {}

                for action in action_space:
                    p[action] = 1 / len(action_space)

                policy[(i, j, k)] = p

    return policy

In [80]:
def create_state_action_policy(env, policy):
    Q = {}

    for key in policy.keys():
        Q[key] = {action: 0.0 for action in action_space }

    return Q

In [81]:
from random import uniform, choice

In [87]:
def run_game(env, policy, display=True):
    env.reset()
    episode = []
    terminate = False
    no_of_steps = 0
    episode_reward = 0

    while not terminate:
        state = tuple(env.curr_pos)

        if display:
            clear_output(True)
            env.render()
        
        timestep = []
        timestep.append(state)
        n = uniform(0, sum(policy[state].values()))
        top_range = 0
        for probability in policy[state].items():
            top_range = probability[1]
            if n < top_range:
                action = probability[0]
                break
        
        reward, terminate, next_state = env.step(action)
        timestep.append(action)
        timestep.append(reward)

        episode.append(timestep)

    if display:
        clear_output(True)
        env.render()

    return episode

In [77]:
def test_policy(policy, env):
    wins = 0
    r = 100
    for i in range(r):
        w = run_game(env, policy, display=False)[-1][-1]
        if w == 1:
            wins += 1
    
    return wins / r

In [106]:
def monte_carlo_e_soft(env, episodes=100, policy=None, epsilon=0.01):
    if not policy:
        policy = create_random_policy(env)

    Q = create_state_action_policy(env, policy)

    returns = {}

    for _ in range(episodes):
        G = 0

        episode = run_game(env, policy, display=False)

        for i in reversed(range(len(episode))):
            s_t, a_t, r_t = episode[i]
            state_action = (s_t, a_t)
            G += r_t

            if not state_action in [(x[0], x[1]) for x in episode[0:i]]:
                if returns.get(state_action):
                    returns[state_action].append(G)

                else:
                    returns[state_action] = G

                Q[s_t][a_t] = sum(returns[state_action]) / len(returns[state_action])

                Q_list = list(map(lambda x: x[1], Q[s_t].items()))

                indices = [i for i, x in enumerate(Q_list) if x == max(Q_list)]
                max_Q = choice(indices)

                A_star = max_Q

                for a in policy[s_t].items():
                    if a[0] == A_star:
                        policy[s_t][a[0]] = 1 - epsilon + (epsilon / abs(sum(policy[s_t].values())))

                    else:
                        policy[s_t][a[0]] = (epsilon / abs(sum(policy[s_t].values())))
    
    return policy

In [108]:
env = TreasureCube(max_step=500)
policy = monte_carlo_e_soft(env, episodes=100)
test_policy(policy, env)

TypeError: &#39;float&#39; object is not iterable