# FrozenLake 8x8 solution

In [1]:
import gym
from gym import wrappers
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
import logging

In [2]:
env = gym.make("FrozenLake-v0")

n_states = env.observation_space.n
n_actions = env.action_space.n

# constants, hyperparams
episode_max_steps = 1000
evaluate_n_times = 100
crossover_p = 0.5
mutation_p = 0.1
decay_mutation = 0.99

n_epoches = 100
pool_size = 100
n_crossovers = 50
n_mutations = 50

recording = True
recording_dir = "result"

if recording:
    # shut up logger
    gym.logger.setLevel(logging.ERROR)
    env = wrappers.Monitor(env, recording_dir)

[2017-04-10 15:32:08,893] Making new env: FrozenLake-v0


In [3]:
def get_random_policy():
    return np.random.choice(n_actions, size=n_states)

In [4]:
def sample_reward(env, policy):
    s = env.reset()
    total_reward = 0
    for _ in range(episode_max_steps):
        s, reward, is_done, _ = env.step(policy[s])
        total_reward += reward
        if is_done:
            break
    return total_reward

In [5]:
def evaluate(policy, n_times=evaluate_n_times):
    rewards = [sample_reward(env, policy) for _ in range(n_times)]
    return float(np.mean(rewards))

In [6]:
def crossover(policy1, policy2, p):
    """
    for each state, with probability p take action from policy1, else policy2
    """
    idx = np.random.rand(len(policy1)) < p
    return np.choose(idx, [policy1, policy2])

In [7]:
def mutation(policy, p):
    """
    for each state, with probability p replace action with random action
    Tip: mutation can be written as crossover with random policy
    """
    return crossover(get_random_policy(), policy, p)

In [8]:
# initialization
pool = [get_random_policy() for _ in range(pool_size)]
scores = [evaluate(p) for p in pool]

In [9]:
for epoch in tqdm(range(n_epoches)):
    mutations = [mutation(random.choice(pool), mutation_p) 
                 for _ in range(n_mutations)]
    crossovers = [crossover(random.choice(pool), random.choice(pool), crossover_p)
                 for _ in range(n_crossovers)]
    pool.extend(mutations)
    scores.extend([evaluate(p) for p in mutations])
    pool.extend(crossovers)
    scores.extend([evaluate(p) for p in crossovers])

    # truncate the pool
    selected_indices = np.argsort(scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    scores = [scores[i] for i in selected_indices]

    best_score = scores[-1]
    print("Epoch %d: Best score: %.2f, mutation_p: %f" % (epoch, best_score, mutation_p))
    
    mutation_p *= decay_mutation

Epoch 0: Best score: 0.21, mutation_p: 0.100000
Epoch 1: Best score: 0.21, mutation_p: 0.099000
Epoch 2: Best score: 0.23, mutation_p: 0.098010
Epoch 3: Best score: 0.30, mutation_p: 0.097030
Epoch 4: Best score: 0.30, mutation_p: 0.096060
Epoch 5: Best score: 0.30, mutation_p: 0.095099
Epoch 6: Best score: 0.30, mutation_p: 0.094148
Epoch 7: Best score: 0.67, mutation_p: 0.093207
Epoch 8: Best score: 0.67, mutation_p: 0.092274
Epoch 9: Best score: 0.72, mutation_p: 0.091352
Epoch 10: Best score: 0.73, mutation_p: 0.090438
Epoch 11: Best score: 0.73, mutation_p: 0.089534
Epoch 12: Best score: 0.75, mutation_p: 0.088638
Epoch 13: Best score: 0.77, mutation_p: 0.087752
Epoch 14: Best score: 0.79, mutation_p: 0.086875
Epoch 15: Best score: 0.79, mutation_p: 0.086006
Epoch 16: Best score: 0.79, mutation_p: 0.085146
Epoch 17: Best score: 0.79, mutation_p: 0.084294
Epoch 18: Best score: 0.83, mutation_p: 0.083451
Epoch 19: Best score: 0.83, mutation_p: 0.082617
Epoch 20: Best score: 0.83, mu

In [10]:
env.close()

https://gym.openai.com/evaluations/eval_u01ZYLDmRbq5Z1rY6iPvLA