In [1]:
import numpy as np
import gym
from tqdm import tqdm
from gym import wrappers

In [2]:
def running_average(x, window_size, mode='valid'):
    return np.convolve(x, np.ones(window_size) / window_size, mode=mode).max()

def check_solution(env, policy, n_episodes = 100, max_steps = 100, to_wrap = False, to_send = False, name2save = ''):
    ns = env.observation_space.n
    na = env.action_space.n
    count_dones = np.zeros(n_episodes)
    count_steps = np.zeros(n_episodes)
    if to_wrap:
        env = wrappers.Monitor(env, name2save)
        
    for i in range(n_episodes):
        observation = env.reset() 
        for step in range(max_steps): 
            action = policy[observation].argmax()
            observation, reward, done, info = env.step(action)
            count_dones[i] += reward
            count_steps[i] += 1
            if done:
                break
    
    env.close()
    if to_wrap and to_send:
        gym.upload(name2save, api_key='sk_bExD4VfCSQukGlQkYKBhdQ')
    
    return running_average(count_dones, 100).max()

In [3]:
def select_a_with_epsilon_greedy(curr_s, q_value, epsilon=0.1):
    a = np.argmax(q_value[curr_s, :])
    if np.random.rand() < epsilon:
        a = np.random.randint(q_value.shape[1])
    return a

In [4]:
def get_greedy_policy(Q):
    ns, na = Q.shape
    policy = np.zeros((ns, na))
    best_actions = Q.argmax(axis = 1)
    policy[np.arange(ns), best_actions] = 1 
    return policy

In [5]:
def sarsa(env, n_episodes = 1, eps = 1.0, eps_decay = 0.999,
          gamma = 0.99, alpha = 0.1, kappa = 0.01, max_steps = 100, print_options = True):
    eps *= 1.0
    alpha *= 1.0
    
    ns = env.observation_space.n
    na = env.action_space.n

    #terminal_states = find_terminal_states(env)
    Q = np.zeros((ns, na))
    """ 
    Q = np.random.normal(scale = 0.01, size = ns * na).reshape((ns, na))
    for s in range(ns):
        if terminal_states[s] == 1:
            Q[s] = np.zeros(na)
    """
    
    avg_reward = None

    for i in range(n_episodes):
        if i % 1000 == 0 and print_options:
            print(i)
        s = env.reset()
        a = np.random.choice(na, 1, p = policy[s])[0]
        for step in range(max_steps):
            new_s, reward, done, info = env.step(a)
            #new_a = np.random.choice(na, 1, p = policy[new_s])[0]
            new_a = select_a_with_epsilon_greedy(new_s, Q, epsilon = eps)
            
            if done:
                # Running average of the terminal reward, which is used for controlling an exploration rate
                # (This idea of controlling exploration rate by the terminal reward is suggested by JKCooper2)
                # See https://gym.openai.com/evaluations/eval_xSOlwrBsQDqUW7y6lJOevQ
                if avg_reward == None:
                    avg_reward = reward
                else:
                    avg_reward = kappa * reward + (1 - kappa) * avg_reward
                if reward > avg_reward:
                    # Bias the current policy toward exploitation
                    eps *= eps_decay
                    
            Q[s][a] += alpha  * (reward + gamma * Q[new_s][new_a] - Q[s][a])
            s = new_s
            a = new_a
            if done:
                break
    return get_eps_greedy_policy(Q)

In [6]:
def sarsa_lambda(env, n_episodes = 1000, gamma = 0.99, alpha = 0.1, lambda_coef = 0.0, 
                 eps = 0.1, eps_decay = 0.995, kappa = 0.01, max_steps = 100):
    ns = env.observation_space.n
    na = env.action_space.n
    
    E = np.zeros((ns, na))
    Q = np.zeros((ns, na))
    avg_reward = None
    for i in range(n_episodes):
        s = env.reset()
        a = np.random.choice(na)
        for t in range(max_steps):
            new_s, reward, done, info = env.step(a)
            new_a = select_a_with_epsilon_greedy(new_s, Q, epsilon = eps)
            delta = reward + gamma * Q[new_s][new_a] - Q[s][a]
            E[s][a] += 1
            Q += alpha * delta * E
            E *= (gamma * lambda_coef)
            s = new_s
            a = new_a
            
            if done:
                # Running average of the terminal reward, which is used for controlling an exploration rate
                # (This idea of controlling exploration rate by the terminal reward is suggested by JKCooper2)
                # See https://gym.openai.com/evaluations/eval_xSOlwrBsQDqUW7y6lJOevQ
                kappa = 0.01
                if avg_reward == None:
                    avg_reward = reward
                else:
                    avg_reward = kappa * reward + (1 - kappa) * avg_reward
                if reward > avg_reward:
                    # Bias the current policy toward exploitation
                    eps *= eps_decay
            
            if done:
                break
    policy = get_greedy_policy(Q)
    return policy

In [7]:
import itertools
def find_sarsa_hyperparams(env):
    alphas = [0.01, 0.1, 0.5]
    eps = [0.5, 0.7, 1.0]
    kappas = [0.001, 0.01, 0.1]
    n_episodes = [20000]
    eps_decays = [1.0, 0.995]
    
    best_score = 0
    history = []
    for (alpha, eps,eps_decay, kappa, n) in itertools.product(alphas, eps, eps_decays, kappas, n_episodes):
        policy = sarsa_lambda(env, n_episodes = n, eps = eps, eps_decay = eps_decay,  
                              alpha = alpha, kappa = kappa, lambda_coef = 0)
        score = check_solution(env,policy, n_episodes=250)
        history.append((alpha, eps,eps_decay, kappa, n, score))
        if score > best_score:
            best_score = score
            best_params = (alpha, eps,eps_decay, kappa, n)
            
    print(best_params)
    return history

## Frozer Lake 8x8

In [8]:
name = 'FrozenLake8x8-v0'
env = gym.make(name)

[2017-07-03 16:55:00,372] Making new env: FrozenLake8x8-v0


In [121]:
scores = []
for lambda_coef in np.linspace(0,1,11):
    print(lambda_coef)
    np.random.seed(42)
    policy_lambda = sarsa_lambda(env, n_episodes = 20000, 
                                 eps = 0.7, eps_decay = 0.995,
                                 alpha = 0.1, lambda_coef = lambda_coef)
    score = check_solution(env, policy_lambda, n_episodes = 1000, max_steps = 250)
    scores.append(score)

0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0


In [130]:
best_lambda = np.argmax(scores) * 0.1

In [133]:
best_lambda

0.20000000000000001

In [9]:
policy__lake_lambda = sarsa_lambda(env, n_episodes = 20000, 
                                 eps = 0.7, eps_decay = 0.995,
                                 alpha = 0.1, lambda_coef = 0.2)

In [12]:
check_solution(env, policy__lake_lambda, n_episodes = 1000, max_steps = 250)

0.92999999999999994

In [191]:
check_solution(env, policy__lake_lambda, n_episodes = 1000, max_steps = 250)

0.95999999999999996

#### Лучший средний результат 0.9 для $\lambda$=0.2

## Taxi-v1

In [200]:
name = 'Taxi-v1'
env_taxi = gym.make(name)

[2017-06-30 20:42:10,275] Making new env: Taxi-v1


In [165]:
history = find_sarsa_hyperparams(env_taxi)

(0.1, 0.7, 0.995, 0.1, 20000)


In [166]:
scores = []
for lambda_coef in np.linspace(0,1,11):
    print(lambda_coef)
    np.random.seed(42)
    policy_lambda = sarsa_lambda(env, n_episodes = 20000, 
                                 eps = 0.7, eps_decay = 0.995, kappa = 0.1,
                                 alpha = 0.1, lambda_coef = lambda_coef)
    score = check_solution(env, policy_lambda, n_episodes = 1000, max_steps = 250)
    scores.append(score)

0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0


In [193]:
best_lambda = np.argmax(scores) * 0.1

In [168]:
scores

[11.15,
 11.16,
 10.720000000000001,
 11.06,
 11.24,
 11.16,
 10.82,
 10.81,
 10.970000000000002,
 10.99,
 -1038.29]

In [204]:
policy_taxi_lambda = sarsa_lambda(env_taxi, n_episodes = 20000, 
                                 eps = 0.7, eps_decay = 0.995,
                                 alpha = 0.1, lambda_coef = best_lambda)

In [208]:
check_solution(env_taxi, policy_taxi_lambda, n_episodes = 1000, max_steps = 250)

11.08

In [210]:
best_lambda

0.40000000000000002

#### Лучший средний результат 11.08 для $\lambda$=0.4