In [2]:
import gym
import numpy as np
from IPython.display import Image
from tqdm import tqdm

In [3]:
def get_mean_rewards(rewards, transitions, policy):
    """Expected one-step reward of every state when acting according to `policy`.

    For each state s this evaluates
        sum_a sum_s' rewards[s, a, s'] * transitions[s, a, s'] * policy[s, a].

    rewards, transitions: 3-D arrays indexed [state, action, next_state].
    policy: 2-D array indexed [state, action]; broadcast over the next-state axis.
    Returns a 1-D array with one entry per state.
    """
    # Add a trailing axis so the action weights broadcast over next states.
    action_weights = policy[:, :, np.newaxis]
    weighted_rewards = (rewards * transitions) * action_weights
    return weighted_rewards.sum(axis=(1, 2))

In [4]:
def get_mean_transitions(rewards, transitions, policy):
    """State-to-state transition matrix induced by following `policy`.

    Entry [s, s'] is sum_a policy[s, a] * transitions[s, a, s'].

    rewards: not used; kept so the signature mirrors get_mean_rewards.
    transitions: 3-D array indexed [state, action, next_state].
    policy: 2-D array indexed [state, action].
    """
    # Add a trailing axis so the action weights broadcast over next states.
    action_weights = policy[:, :, np.newaxis]
    mixed = transitions * action_weights
    # Collapse the action axis, leaving [state, next_state].
    return mixed.sum(axis=1)

In [136]:
def evaluate_value_func(rewards, transitions, policy, gamma, max_iter, tolerance):
    """Iterative policy evaluation: estimate the state values of a fixed policy.

    Repeatedly applies the backup
        v <- mean_rewards + gamma * mean_transitions . v
    starting from v = 0.

    rewards, transitions: 3-D arrays indexed [state, action, next_state].
    policy: 2-D array indexed [state, action].
    gamma: discount factor.
    max_iter: upper bound on the number of backup sweeps.
    tolerance: stop as soon as no state value changes by more than this
        (absolute difference) between two sweeps. Pass None to always run
        exactly `max_iter` sweeps.

    Returns a 1-D array with one value per state.
    """
    mean_rewards = get_mean_rewards(rewards, transitions, policy)
    mean_transitions = get_mean_transitions(rewards, transitions, policy)

    v = np.zeros(rewards.shape[0])
    for _ in range(max_iter):
        new_v = mean_rewards + gamma * mean_transitions.dot(v)
        # Previously `tolerance` was accepted but never consulted, so every call
        # ran all max_iter sweeps; honour it with an absolute-difference check.
        converged = tolerance is not None and np.allclose(new_v, v, rtol=0.0, atol=tolerance)
        v = new_v
        if converged:
            break
    return v

In [6]:
def do_policy_iteration(rewards, transitions, gamma = 1,
                        max_iter_eval = 100, max_iter_update = 1000, tolerance = 10e-8):
    """Policy iteration: alternate policy evaluation and greedy improvement.

    rewards, transitions: 3-D arrays indexed [state, action, next_state].
    gamma: discount factor.
    max_iter_eval: sweep budget handed to evaluate_value_func on every round.
    max_iter_update: maximum number of improvement rounds.
    tolerance: forwarded unchanged to evaluate_value_func.

    Returns a one-hot array of shape (n_states, n_actions). The starting policy
    is all zeros, so that is what comes back when max_iter_update is 0.
    """
    n_states, n_actions = rewards.shape[0], rewards.shape[1]
    state_index = np.arange(n_states)
    current = np.zeros((n_states, n_actions))

    for _ in range(max_iter_update):
        values = evaluate_value_func(rewards, transitions, current, gamma, max_iter_eval, tolerance)

        # Q(s, a) = sum_s' P(s' | s, a) * (R(s, a, s') + gamma * V(s'))
        backed_up = rewards + gamma * values[np.newaxis, np.newaxis, :]
        q_values = (transitions * backed_up).sum(axis=2)

        # Greedy one-hot policy; argmax picks the first action on ties.
        greedy = np.zeros_like(current)
        greedy[state_index, np.argmax(q_values, axis=1)] = 1

        # A policy that is its own greedy improvement is a fixed point.
        if np.array_equal(current, greedy):
            break
        current = greedy

    return current

In [146]:
def do_value_iteration(rewards, transitions, gamma = 1,
                       max_iter_update = 1000, tolerance = 10e-8):
    """Value iteration followed by greedy policy extraction.

    rewards, transitions: 3-D arrays indexed [state, action, next_state].
    gamma: discount factor.
    max_iter_update: maximum number of Bellman optimality sweeps.
    tolerance: stop sweeping once no state value changes by more than this
        (absolute difference) between two sweeps. Pass None to always run
        exactly `max_iter_update` sweeps.

    Returns a one-hot array of shape (n_states, n_actions) that is greedy with
    respect to the final value estimate; ties go to the lowest action index.
    """
    ns = rewards.shape[0]
    na = rewards.shape[1]

    # Expected immediate reward per (state, action). It does not depend on v,
    # so compute it once instead of on every sweep.
    mean_rewards_by_action = (rewards * transitions).sum(axis=2)

    v = np.zeros(ns)
    for _ in range(max_iter_update):
        # transitions.dot(v) contracts the next-state axis -> shape (ns, na).
        s = mean_rewards_by_action + gamma * transitions.dot(v)
        new_v = s.max(axis=1)
        # Previously `tolerance` was accepted but ignored; honour it here.
        converged = tolerance is not None and np.allclose(new_v, v, rtol=0.0, atol=tolerance)
        v = new_v
        if converged:
            break

    # Greedy policy extraction from the final value estimate.
    q = (transitions * (rewards + gamma * v[np.newaxis, np.newaxis, :])).sum(axis=2)
    policy = np.zeros((ns, na))
    policy[np.arange(ns), np.argmax(q, axis=1)] = 1

    return policy

In [8]:
# Scratch check of ndarray.dot with a 3-D left operand: dotting a (2, 5, 3) array
# with a length-3 vector contracts the last axis and yields a (2, 5) result.
np.ones((2, 5, 3)).dot(np.ones(3))

array([[ 3.,  3.,  3.,  3.,  3.],
       [ 3.,  3.,  3.,  3.,  3.]])

In [9]:
# Scratch cell: display a length-3 vector of ones.
np.ones(3)

array([ 1.,  1.,  1.])

In [10]:
def get_params(env, ns, na):
    """Build dense reward and transition tables from an environment's `P` table.

    env: object whose `env.P[s][a]` is an iterable of
        (probability, next_state, reward, done) tuples.
    ns, na: number of states and actions to tabulate.

    Returns (rewards, transitions), both of shape (ns, na, ns):
      * transitions[s, a, s'] is the probability of landing in s', normalised
        so each (s, a) row sums to 1. A row with no probability mass is left
        at zero instead of becoming NaN.
      * rewards[s, a, s'] is the probability-weighted mean reward over all
        listed outcomes that lead to s' (0 where s' is unreachable).
    """
    transitions = np.zeros([ns, na, ns])
    rewards = np.zeros([ns, na, ns])
    for s in range(ns):
        for a in range(na):
            # Integer indexing returns views, so in-place updates hit the tables.
            probs = transitions[s, a]
            rews = rewards[s, a]
            for p_trans, next_s, rew, _done in env.P[s][a]:
                probs[next_s] += p_trans
                # Accumulate p * r so several outcomes reaching the same next
                # state are averaged rather than the last one overwriting the rest.
                rews[next_s] += p_trans * rew
            reachable = probs > 0
            rews[reachable] /= probs[reachable]
            total = probs.sum()
            if total > 0:
                probs /= total
    return rewards, transitions

In [52]:
def solve_env(env, method = 'policy_iter', to_wrap = False, to_send = False, max_k = 100,
              gamma = 0.999):
    """Compute a policy for a tabular Gym environment by dynamic programming.

    env: an environment instance, or a Gym environment id string (in which case
        the environment is created with gym.make). The environment must expose
        `observation_space.n`, `action_space.n` and a `P` transition table.
    method: 'value_iter' selects value iteration; any other value selects
        policy iteration.
    to_wrap, to_send: accepted for backward compatibility; this implementation
        does not use them.
    max_k: iteration budget passed as `max_iter_update` to the chosen solver.
    gamma: discount factor passed to the chosen solver.

    Returns the one-hot policy array of shape (n_states, n_actions).
    """
    # Some cells in this notebook call solve_env with an environment id string;
    # without this, such a call fails on env.reset().
    if isinstance(env, str):
        env = gym.make(env)
    env.reset()

    ns = env.observation_space.n
    na = env.action_space.n
    rewards, transitions = get_params(env, ns, na)

    if method == 'value_iter':
        best_policy = do_value_iteration(rewards, transitions, max_iter_update = max_k, gamma = gamma)
    else:
        best_policy = do_policy_iteration(rewards, transitions, max_iter_update = max_k, gamma = gamma)

    return best_policy

In [46]:
from helper import check_solution

## Среда FrozenLake8x8

In [50]:
# Create the 8x8 FrozenLake environment from its registered Gym id.
env = gym.make('FrozenLake8x8-v0')

[2017-07-03 14:53:06,099] Making new env: FrozenLake8x8-v0


#### Policy iteration

In [73]:
# Solve FrozenLake with policy iteration (solve_env's default max_k is used).
policy1 = solve_env(env, method = 'policy_iter')

In [99]:
# Fix NumPy's global seed before scoring the policy. check_solution comes from
# the local `helper` module; presumably it returns an average episode score
# (its implementation is not shown in this notebook, so confirm there).
np.random.seed(0)
check_solution(env, policy1)

0.6100000000000001

#### Value iteration

In [148]:
# Solve the same FrozenLake environment with value iteration for comparison.
policy2 = solve_env(env, method = 'value_iter')

In [149]:
# Same seed as for the policy-iteration check, then score the value-iteration policy.
np.random.seed(0)
check_solution(env, policy2)

0.63000000000000012

In [141]:
# Mark every (state, action) entry where the two policy matrices differ.
# NOTE(review): `policy3` is not assigned in any cell visible in this notebook,
# so this cell fails on a fresh kernel; presumably it is a leftover from a
# deleted cell and `policy2` may have been intended. Confirm before re-running.
not_equals = np.zeros_like(policy1)
not_equals[policy1 != policy3] = 1

In [142]:
# Row and column indices of the entries flagged as different.
not_equals.nonzero()

(array([ 0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
         8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
        17, 17, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27,
        27, 30, 30, 31, 31, 32, 32, 33, 33, 36, 36, 37, 37, 38, 38, 39, 39,
        43, 43, 44, 44, 47, 47, 50, 50, 55, 55, 57, 57, 60, 60, 61, 61]),
 array([0, 3, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 3, 0, 3, 0, 3, 0,
        3, 0, 3, 0, 2, 0, 2, 0, 1, 0, 3, 0, 3, 0, 2, 0, 3, 0, 2, 0, 2, 0, 3,
        0, 3, 0, 3, 0, 1, 0, 2, 0, 2, 0, 3, 0, 3, 0, 2, 0, 1, 0, 3, 0, 2, 0,
        1, 0, 3, 0, 2, 0, 1, 0, 2, 0, 1, 0, 1, 0, 2]))

#### Загрузим решение

In [218]:
# NOTE(review): the log recorded below is dated 2017-06-25, earlier than the
# other FrozenLake cells, and shows monitor wrapping and an upload. The
# solve_env definition visible in this notebook never reads to_wrap / to_send,
# so that output presumably came from an earlier version of the function.
# Confirm before relying on this cell.
solve_env('FrozenLake8x8-v0', method = 'value_iter', to_wrap = True, to_send = True)

[2017-06-25 12:56:30,200] Making new env: FrozenLake8x8-v0
[2017-06-25 12:56:30,274] Attempted to wrap env <FrozenLakeEnv instance> after .configure() was called. All wrappers must be applied before calling .configure()
[2017-06-25 12:56:30,275] Creating monitor directory FrozenLake8x8-v0_solution_value_iter
[2017-06-25 12:56:30,277] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/FrozenLake8x8-v0_solution_value_iter/openaigym.video.2.4312.video000000.json
[2017-06-25 12:56:30,307] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/FrozenLake8x8-v0_solution_value_iter/openaigym.video.2.4312.video000001.json
[2017-06-25 12:56:30,345] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/FrozenLake8x8-v0_solution_value_iter/openaigym.video.2.4312.video000008.json
[2017-06-25 12:56:30,410] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/FrozenLake8x8-v0_solution_value_iter/op

875.0


[2017-06-25 12:56:35,641] [FrozenLake8x8-v0] Uploading videos of 10 training episodes (2923 bytes)
[2017-06-25 12:56:36,250] [FrozenLake8x8-v0] Creating evaluation object from FrozenLake8x8-v0_solution_value_iter with learning curve and training video
[2017-06-25 12:56:36,757] 
****************************************************
You successfully uploaded your evaluation on FrozenLake8x8-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_fWkDWw9uSpWB6mdicnnjxg

****************************************************


## Среда Taxi-v1

In [157]:
# Create the Taxi environment from its registered Gym id.
taxi_env = gym.make('Taxi-v1')

[2017-07-03 15:22:50,653] Making new env: Taxi-v1


#### Policy iteration

In [158]:
# Solve Taxi with policy iteration.
# Note: this rebinds `policy1`, replacing the FrozenLake policy stored under that name.
policy1 = solve_env(taxi_env, method = 'policy_iter')

In [159]:
# Fix NumPy's global seed, then score the Taxi policy-iteration result with
# check_solution from the local `helper` module.
np.random.seed(0)
check_solution(taxi_env, policy1)

10.02

#### Value iteration

In [160]:
# Solve Taxi with value iteration.
# Note: this rebinds `policy2`, replacing the FrozenLake policy stored under that name.
policy2 = solve_env(taxi_env, method = 'value_iter')

In [161]:
# Same seed as for the policy-iteration check, then score the value-iteration policy.
np.random.seed(0)
check_solution(taxi_env, policy2)

10.270000000000001

In [165]:
# NOTE(review): the log recorded below is dated 2017-06-25, earlier than the
# other Taxi cells, and shows monitor wrapping and an upload. The solve_env
# definition visible in this notebook never reads to_wrap / to_send, so that
# output presumably came from an earlier version of the function. Confirm
# before relying on this cell.
solve_env('Taxi-v1', to_wrap = True, to_send = True)

[2017-06-25 10:52:54,505] Making new env: Taxi-v1


('last k ', 0)


[2017-06-25 10:52:54,916] Attempted to wrap env <TaxiEnv instance> after .configure() was called. All wrappers must be applied before calling .configure()
[2017-06-25 10:52:54,917] Creating monitor directory Taxi-v1_solution_policy_iter
[2017-06-25 10:52:54,918] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/Taxi-v1_solution_policy_iter/openaigym.video.1.4312.video000000.json
[2017-06-25 10:52:54,927] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/Taxi-v1_solution_policy_iter/openaigym.video.1.4312.video000001.json
[2017-06-25 10:52:54,950] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/Taxi-v1_solution_policy_iter/openaigym.video.1.4312.video000008.json
[2017-06-25 10:52:55,001] Starting new video recorder writing to /home/liza/Документы/Python-projects/RL/Taxi-v1_solution_policy_iter/openaigym.video.1.4312.video000027.json
[2017-06-25 10:52:55,063] Starting new video recorder writing to /h

10364.0


[2017-06-25 10:52:59,045] [Taxi-v1] Uploading videos of 10 training episodes (1817 bytes)
[2017-06-25 10:52:59,632] [Taxi-v1] Creating evaluation object from Taxi-v1_solution_policy_iter with learning curve and training video
[2017-06-25 10:53:00,348] 
****************************************************
You successfully uploaded your evaluation on Taxi-v1 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_9xUnOhbTkWuZyHDD9NpuQ

****************************************************
