# Обучение Taxi-v3 из Gymnasium при помощи кросс-энтропийного метода

Ноутбук сделан на основе материалов практического задания [курса](https://ods.ai/tracks/drlcourse23/tasks/drl23_hw_1).
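Проделан собственный эксперимент: параметр q сделан динамическим. q задаёт квантиль суммарных вознаграждений, выше которого траектории считаются лучшими и используются для обучения (то есть отбирается примерно доля 1 − q траекторий).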

In [1]:
import gym
import numpy as np
from src.agents import CrossEntropyAgent
from src.utils import get_trajectory, evaluate_agent
from hydra import compose, initialize


# Directory holding the Hydra experiment configs that the later cells load via compose().
# NOTE(review): Hydra resolves a relative config_path against the caller's location —
# confirm it matches the directory this notebook is run from.
config_path = 'configs/taxiv3/'
initialize(
    config_path=config_path,
    version_base=None,
)

hydra.initialize()

## Бейзлайн: константный q_param

In [2]:
# Baseline run: the elite-selection quantile q_param is the same on every iteration.
cfg = compose(config_name="exp_default")

env = gym.make(cfg.env.name)
state_n, action_n = cfg.env.state_n, cfg.env.action_n

agent = CrossEntropyAgent(state_n, action_n)
q_param = cfg.params.q_param
iteration_n = cfg.params.iteration_n
trajectory_n = cfg.params.trajectory_n

for iteration in range(iteration_n):
    # Policy evaluation: sample trajectory_n trajectories with the current agent
    # and compute the total reward of each one.
    trajectories = [get_trajectory(env, agent) for _ in range(trajectory_n)]
    total_rewards = [np.sum(trajectory['rewards']) for trajectory in trajectories]
    print('iteration:', iteration, 'mean total reward:', np.mean(total_rewards))

    # Policy improvement: keep only the trajectories whose total reward is
    # strictly above the q_param-quantile of this batch, then fit on them.
    quantile = np.quantile(total_rewards, q_param)
    elite_trajectories = [
        trajectory
        for trajectory, total_reward in zip(trajectories, total_rewards)
        if total_reward > quantile
    ]

    agent.fit(elite_trajectories)

iteration: 0 mean total reward: -768.189
iteration: 1 mean total reward: -690.4
iteration: 2 mean total reward: -600.454
iteration: 3 mean total reward: -494.388
iteration: 4 mean total reward: -333.726
iteration: 5 mean total reward: -177.77
iteration: 6 mean total reward: -96.065
iteration: 7 mean total reward: -50.389
iteration: 8 mean total reward: -22.977
iteration: 9 mean total reward: -5.707
iteration: 10 mean total reward: 3.13
iteration: 11 mean total reward: 5.342
iteration: 12 mean total reward: 6.258
iteration: 13 mean total reward: 6.728
iteration: 14 mean total reward: 6.739
iteration: 15 mean total reward: 7.102
iteration: 16 mean total reward: 6.647
iteration: 17 mean total reward: 7.151
iteration: 18 mean total reward: 7.148
iteration: 19 mean total reward: 7.41
iteration: 20 mean total reward: 6.97
iteration: 21 mean total reward: 7.124
iteration: 22 mean total reward: 7.435
iteration: 23 mean total reward: 7.502
iteration: 24 mean total reward: 7.171
iteration: 25 me

In [3]:
# Roll out a single trajectory with the trained agent, capped at the configured
# maximum length, then report its total reward and the agent's model.
trajectory = get_trajectory(
    env,
    agent,
    max_len=cfg.params.trajectory.max_len,
)
print('total reward:', sum(trajectory['rewards']))
print('model:', agent.model, sep='\n')

total reward: 7
model:
[[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.         0.         0.         0.         1.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]
 ...
 [0.         0.25       0.         0.5        0.25       0.        ]
 [0.         1.         0.         0.         0.         0.        ]
 [0.         0.         0.         1.         0.         0.        ]]


In [4]:
# Evaluate the baseline agent; 10_000 is passed through to evaluate_agent
# (presumably the number of evaluation episodes — confirm in src.utils).
mean, std = evaluate_agent(env, agent, 10_000)
print(f'Reward: {mean.round(2)}±{std.round(2)}')

Reward: 7.32±4.9


## Пробуем улучшение: линейно увеличиваем q_param с каждой итерацией

In [9]:
# Dynamic-q run: the elite-selection quantile follows a linear schedule
# from initial_q to end_q, one value per iteration.
cfg = compose(config_name="exp_dynamic_q")
env = gym.make(cfg.env.name)
state_n, action_n = cfg.env.state_n, cfg.env.action_n

agent = CrossEntropyAgent(state_n, action_n)
initial_q = cfg.params.initial_q
end_q = cfg.params.end_q
iteration_n = cfg.params.iteration_n
trajectory_n = cfg.params.trajectory_n

# linspace yields exactly iteration_n evenly spaced quantile levels.
qs = np.linspace(initial_q, end_q, iteration_n)
for iteration, q in enumerate(qs):
    # Policy evaluation: sample trajectory_n trajectories with the current agent
    # and compute the total reward of each one.
    trajectories = [get_trajectory(env, agent) for _ in range(trajectory_n)]
    total_rewards = [np.sum(trajectory['rewards']) for trajectory in trajectories]
    print('iteration:', iteration, 'mean total reward:', np.mean(total_rewards))

    # Policy improvement: keep only the trajectories whose total reward is
    # strictly above this iteration's quantile, then fit on them.
    quantile = np.quantile(total_rewards, q)
    elite_trajectories = [
        trajectory
        for trajectory, total_reward in zip(trajectories, total_rewards)
        if total_reward > quantile
    ]

    agent.fit(elite_trajectories)

iteration: 0 mean total reward: -774.525
iteration: 1 mean total reward: -706.394
iteration: 2 mean total reward: -635.415
iteration: 3 mean total reward: -544.687
iteration: 4 mean total reward: -433.251
iteration: 5 mean total reward: -302.729
iteration: 6 mean total reward: -171.996
iteration: 7 mean total reward: -94.623
iteration: 8 mean total reward: -48.965
iteration: 9 mean total reward: -21.567
iteration: 10 mean total reward: -5.809
iteration: 11 mean total reward: 1.792
iteration: 12 mean total reward: 4.913
iteration: 13 mean total reward: 6.455
iteration: 14 mean total reward: 6.983
iteration: 15 mean total reward: 7.095
iteration: 16 mean total reward: 7.421
iteration: 17 mean total reward: 7.274
iteration: 18 mean total reward: 7.274
iteration: 19 mean total reward: 7.335
iteration: 20 mean total reward: 7.339
iteration: 21 mean total reward: 7.509
iteration: 22 mean total reward: 7.495
iteration: 23 mean total reward: 7.448
iteration: 24 mean total reward: 7.374
iterati

In [10]:
# Evaluate the dynamic-q agent with the same evaluate_agent call as the baseline,
# so the two printed results are directly comparable.
mean, std = evaluate_agent(env, agent, 10_000)
print(f'Reward: {mean.round(2)}±{std.round(2)}')

Reward: 7.53±3.12


По сравнению с бейзлайном немного выросло среднее вознаграждение (7.32 → 7.53) и уменьшился его разброс (4.9 → 3.12).