In [1]:
import numpy as np
import gym
from tqdm import tqdm
from collections import defaultdict

In [2]:
from gym import wrappers
def running_average(x, window_size, mode='valid'):
    """Return the largest value of the moving average of `x`.

    `x` is convolved with a uniform kernel of length `window_size` (each tap
    equal to 1 / window_size) using the given `np.convolve` mode, and the
    maximum of the smoothed sequence is returned as a scalar.
    """
    uniform_kernel = np.ones(window_size) / window_size
    smoothed = np.convolve(x, uniform_kernel, mode=mode)
    # Note: the result is a single number (the peak), not the smoothed array.
    return smoothed.max()

def check_solution(env, policy, n_episodes = 100, max_steps = 100, to_wrap = False, to_send = False, name2save = ''):
    """Play a tabular policy greedily and return its best windowed average reward.

    Parameters
    ----------
    env : environment providing `reset()`, `step(action)` and `close()`;
        `step` must return `(observation, reward, done, info)`.
    policy : array indexed by observation; the action taken in each step is
        `policy[observation].argmax()`.
    n_episodes : number of evaluation episodes to play.
    max_steps : upper bound on the number of steps played per episode.
    to_wrap : if True, `env` is wrapped in `wrappers.Monitor` writing to `name2save`.
    to_send : if True (and `to_wrap` is True), the monitor directory is passed to
        `gym.upload` once the evaluation is finished.
    name2save : directory handed to the monitor and to the upload.

    Returns
    -------
    The maximum of the running average of per-episode total reward, using a
    window of 100 episodes, or of `n_episodes` when fewer than 100 were played.

    Notes
    -----
    `env.close()` is called on the (possibly wrapped) environment before returning.
    """
    episode_rewards = np.zeros(n_episodes)
    if to_wrap:
        env = wrappers.Monitor(env, name2save)

    for i in range(n_episodes):
        observation = env.reset()
        for _ in range(max_steps):
            action = policy[observation].argmax()
            observation, reward, done, info = env.step(action)
            episode_rewards[i] += reward
            if done:
                break

    env.close()
    if to_wrap and to_send:
        # Credentials are taken from the environment instead of being stored in the source.
        import os
        gym.upload(name2save, api_key=os.environ.get('OPENAI_GYM_API_KEY'))

    # Clamp the window: with fewer episodes than the window, np.convolve(mode='valid')
    # swaps its operands and the result would be sum/100 rather than a true mean.
    window = min(100, n_episodes)
    return running_average(episode_rewards, window).max()

In [3]:
def select_a_with_epsilon_greedy(curr_s, q_value, epsilon=0.1):
    """Pick an action for state `curr_s` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random column index of `q_value`
    is returned; otherwise the index of the largest entry in row `curr_s`.
    """
    explore = np.random.rand() < epsilon
    if explore:
        n_actions = q_value.shape[1]
        return np.random.randint(n_actions)
    return np.argmax(q_value[curr_s, :])

In [4]:
def get_greedy_policy(Q):
    """Turn a 2-D action-value table into a one-hot greedy policy.

    Returns a float array with the same (states, actions) shape as `Q` holding
    a 1 at the arg-max action of every row and 0 elsewhere (the first maximum
    wins on ties).
    """
    n_states, n_actions = Q.shape
    greedy_actions = np.argmax(Q, axis=1)
    # Selecting rows of the identity matrix yields one one-hot vector per state.
    return np.eye(n_actions)[greedy_actions]

In [5]:
def q_learning(env, n_episodes = 1000, gamma = 0.999, alpha = 0.1, eps = 0.9, eps_decay = 0.995, kappa = 0.01):
    """Train a tabular Q-learning agent and return its greedy policy.

    Parameters
    ----------
    env : environment whose `observation_space.n` and `action_space.n` give the
        table size, and whose `step` returns `(observation, reward, done, info)`.
    n_episodes : number of training episodes.
    gamma : discount factor used in the bootstrap target.
    alpha : learning rate of the Q update.
    eps : initial exploration probability for epsilon-greedy action selection.
    eps_decay : factor applied to `eps` whenever an episode ends with a final
        reward above the running average of final rewards.
    kappa : smoothing factor of that running average.

    Returns
    -------
    The result of `get_greedy_policy(Q)` for the learned table `Q`.
    """
    # Per-episode step cap from the env spec; None when the tag is absent, in which
    # case an episode simply runs until the environment reports `done`.
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')

    ns = env.observation_space.n
    na = env.action_space.n

    Q = np.zeros((ns, na))

    avg_reward = None
    for _ in range(n_episodes):
        s = env.reset()
        t = 0
        while max_steps is None or t < max_steps:
            a = select_a_with_epsilon_greedy(s, Q, eps)
            new_s, reward, done, info = env.step(a)
            # Off-policy target: bootstrap from the best action value in the next state.
            Q[s, a] += alpha * (reward + gamma * Q[new_s].max() - Q[s, a])

            s = new_s
            t += 1

            if done:
                # Running average of the terminal reward, which is used for controlling an exploration rate
                # (This idea of controlling exploration rate by the terminal reward is suggested by JKCooper2)
                # See https://gym.openai.com/evaluations/eval_xSOlwrBsQDqUW7y6lJOevQ
                if avg_reward is None:
                    avg_reward = reward
                else:
                    avg_reward = kappa * reward + (1 - kappa) * avg_reward
                if reward > avg_reward:
                    # Bias the current policy toward exploitation
                    eps *= eps_decay
                break
    return get_greedy_policy(Q)

In [6]:
import itertools
def find_q_hyperparams(env):
    """Grid-search Q-learning hyper-parameters on `env`.

    Every combination of learning rate, initial epsilon, epsilon decay, kappa
    and episode count from the grids below is trained with `q_learning` and
    scored with `check_solution` over 250 evaluation episodes. The index and
    score of each run are printed, followed by the best parameter tuple.

    Returns
    -------
    A list of `(alpha, eps, eps_decay, kappa, n, score)` tuples, one per run,
    in the order the combinations were tried.
    """
    alphas = [0.01, 0.1, 0.5, 0.8]
    epsilons = [0.5, 0.7, 1.0]
    kappas = [0.001, 0.01, 0.1]
    n_episodes = [20000]
    eps_decays = [1.0, 0.995]

    # Start below any attainable score so the first run always sets best_params,
    # even when every score is zero or negative.
    best_score = float('-inf')
    best_params = None
    history = []
    grid = itertools.product(alphas, epsilons, eps_decays, kappas, n_episodes)
    for i, (alpha, eps, eps_decay, kappa, n) in enumerate(grid):
        policy = q_learning(env, n_episodes = n, eps = eps, eps_decay = eps_decay,
                            alpha = alpha, kappa = kappa)
        score = check_solution(env, policy, n_episodes=250)
        history.append((alpha, eps, eps_decay, kappa, n, score))
        if score > best_score:
            best_score = score
            best_params = (alpha, eps, eps_decay, kappa, n)
        print(i, score)
    print(best_params)
    return history

### Frozen Lake 8x8

In [7]:
# Environment for the Frozen Lake 8x8 experiment below.
lake_env = gym.make('FrozenLake8x8-v0')

[2017-07-03 16:52:49,825] Making new env: FrozenLake8x8-v0


In [8]:
%%time
# Fix NumPy's global seed so the epsilon-greedy draws made during training are repeatable,
# then train on Frozen Lake; gamma and kappa keep the q_learning defaults.
np.random.seed(42)
policy = q_learning(lake_env, n_episodes = 20000, 
                    eps = 0.7, eps_decay = 0.995,
                    alpha = 0.8)

CPU times: user 13.9 s, sys: 0 ns, total: 13.9 s
Wall time: 13.9 s


In [11]:
# Seed NumPy's global generator before evaluating; constructing a RandomState
# object without keeping it would seed nothing.
# NOTE(review): the environment's transition noise may come from its own RNG; if so,
# lake_env.seed(1) is also needed for a repeatable score — confirm for the installed gym.
np.random.seed(1)
# Evaluate with the Monitor wrapper enabled and upload the recording from 'qlake-1'.
check_solution(lake_env, policy, n_episodes = 1000, max_steps = 250, to_wrap = True, to_send = True,
               name2save='qlake-1')

[2017-07-03 16:53:36,018] Attempted to wrap env <FrozenLakeEnv instance> after .configure() was called. All wrappers must be applied before calling .configure()
[2017-07-03 16:53:36,022] Creating monitor directory qlake-1
[2017-07-03 16:53:36,025] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/qlake-1/openaigym.video.0.8332.video000000.json
[2017-07-03 16:53:36,048] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/qlake-1/openaigym.video.0.8332.video000001.json
[2017-07-03 16:53:36,082] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/qlake-1/openaigym.video.0.8332.video000008.json
[2017-07-03 16:53:36,141] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/qlake-1/openaigym.video.0.8332.video000027.json
[2017-07-03 16:53:36,236] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/qlake-1/openaigym.video.0.8332.video000064.json
[2017-07-03 1

0.94999999999999996

In [120]:
# Re-evaluate the same policy without the Monitor wrapper or upload
# (to_wrap and to_send keep their False defaults).
check_solution(lake_env, policy, n_episodes = 1000, max_steps = 250)

0.95999999999999996

In [130]:
# Environment for the Taxi experiment.
taxi_env = gym.make('Taxi-v1')

[2017-06-30 21:46:50,959] Making new env: Taxi-v1


In [131]:
%%time
# Same seed and hyper-parameters as the Frozen Lake run, applied to the Taxi environment.
np.random.seed(42)
taxi_policy = q_learning(taxi_env, n_episodes = 20000, 
                    eps = 0.7, eps_decay = 0.995,
                    alpha = 0.8)

CPU times: user 13.2 s, sys: 0 ns, total: 13.2 s
Wall time: 13.2 s


In [136]:
# Seed NumPy's global generator before evaluating; constructing a RandomState
# object without keeping it would seed nothing.
# NOTE(review): the environment's own randomness may use a separate RNG; if so,
# taxi_env.seed(1) is also needed for a repeatable score — confirm for the installed gym.
np.random.seed(1)
check_solution(taxi_env, taxi_policy, n_episodes = 1000, max_steps = 250)

10.81