# FrozenLake 8x8 solution

In [134]:
import gym
from gym import wrappers
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
import logging

In [135]:
env = gym.make("FrozenLake8x8-v0")

n_states = env.observation_space.n
n_actions = env.action_space.n

# constants, hyperparams
episode_max_steps = 1000
evaluate_n_times = 100
crossover_p = 0.5
mutation_p = 0.1
decay_mutation = 0.99

n_epoches = 100
pool_size = 100
n_crossovers = 50
n_mutations = 50

recording = True
recording_dir = "result"

if recording:
    # shut up logger
    gym.logger.setLevel(logging.ERROR)
    env = wrappers.Monitor(env, recording_dir)

In [136]:
def get_random_policy():
    return np.random.choice(n_actions, size=n_states)

In [137]:
def sample_reward(env, policy):
    s = env.reset()
    total_reward = 0
    for _ in range(episode_max_steps):
        s, reward, is_done, _ = env.step(policy[s])
        total_reward += reward
        if is_done:
            break
    return total_reward

In [138]:
def evaluate(policy, n_times=evaluate_n_times):
    rewards = [sample_reward(env, policy) for _ in range(n_times)]
    return float(np.mean(rewards))

In [139]:
def crossover(policy1, policy2, p):
    """
    for each state, with probability p take action from policy1, else policy2
    """
    idx = np.random.rand(len(policy1)) < p
    return np.choose(idx, [policy1, policy2])

In [140]:
def mutation(policy, p):
    """
    for each state, with probability p replace action with random action
    Tip: mutation can be written as crossover with random policy
    """
    return crossover(get_random_policy(), policy, p)

In [141]:
# initialization
pool = [get_random_policy() for _ in range(pool_size)]
scores = [evaluate(p) for p in pool]

In [142]:
for epoch in tqdm(range(n_epoches)):
    mutations = [mutation(random.choice(pool), mutation_p) 
                 for _ in range(n_mutations)]
    crossovers = [crossover(random.choice(pool), random.choice(pool), crossover_p)
                 for _ in range(n_crossovers)]
    pool.extend(mutations)
    scores.extend([evaluate(p) for p in mutations])
    pool.extend(crossovers)
    scores.extend([evaluate(p) for p in crossovers])

    # truncate the pool
    selected_indices = np.argsort(scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    scores = [scores[i] for i in selected_indices]

    best_score = scores[-1]
    print("Epoch %d: Best score: %.2f" % (epoch, best_score))
    
    mutation_p *= decay_mutation

Epoch 0: Best score: 0.05
Epoch 1: Best score: 0.06
Epoch 2: Best score: 0.06
Epoch 3: Best score: 0.08
Epoch 4: Best score: 0.08
Epoch 5: Best score: 0.08
Epoch 6: Best score: 0.18
Epoch 7: Best score: 0.18
Epoch 8: Best score: 0.18
Epoch 9: Best score: 0.18
Epoch 10: Best score: 0.26
Epoch 11: Best score: 0.43
Epoch 12: Best score: 0.43
Epoch 13: Best score: 0.43
Epoch 14: Best score: 0.67
Epoch 15: Best score: 0.67
Epoch 16: Best score: 0.67
Epoch 17: Best score: 0.67
Epoch 18: Best score: 0.67
Epoch 19: Best score: 0.67
Epoch 20: Best score: 0.67
Epoch 21: Best score: 0.74
Epoch 22: Best score: 0.74
Epoch 23: Best score: 0.77
Epoch 24: Best score: 0.77
Epoch 25: Best score: 0.77
Epoch 26: Best score: 0.85
Epoch 27: Best score: 0.91
Epoch 28: Best score: 0.92
Epoch 29: Best score: 0.92
Epoch 30: Best score: 0.92
Epoch 31: Best score: 0.92
Epoch 32: Best score: 0.92
Epoch 33: Best score: 0.93
Epoch 34: Best score: 0.93
Epoch 35: Best score: 0.94
Epoch 36: Best score: 0.94
Epoch 37: B

In [143]:
env.close()

https://gym.openai.com/evaluations/eval_u01ZYLDmRbq5Z1rY6iPvLA