# Обучение агента в среде Taxi-v3 из Gymnasium при помощи кросс-энтропийного метода

Ноутбук сделан на основе материалов практического задания [курса](https://ods.ai/tracks/drlcourse23/tasks/drl23_hw_1).

In [1]:
import gym
import numpy as np
from src.agents import CrossEntropyAgent
from src.utils import get_trajectory, evaluate_agent
from hydra import compose, initialize


# Directory holding the Hydra config files for this experiment.
config_path = 'configs/taxiv3/'
# Initialize Hydra against that directory (version_base=None is passed explicitly).
initialize(version_base=None, config_path=config_path)
# Compose the config named "config" into cfg, which the following cells read.
cfg = compose(config_name="config")

In [32]:
# Compose the config again (same call as at the end of the first cell).
# NOTE(review): presumably re-run to pick up edits to the config files
# without re-initializing Hydra — confirm.
cfg = compose(config_name="config")

In [33]:
# Build the environment and the cross-entropy agent from the composed config.
env = gym.make(cfg.env.name)
state_n, action_n = cfg.env.state_n, cfg.env.action_n
agent = CrossEntropyAgent(state_n, action_n)

# Training hyper-parameters: quantile level, number of iterations and
# number of trajectories collected per iteration.
q_param = cfg.params.q_param
iteration_n = cfg.params.iteration_n
trajectory_n = cfg.params.trajectory_n

for iteration in range(iteration_n):
    # Policy evaluation: collect trajectory_n trajectories with the current
    # agent and total up the rewards of each one.
    trajectories = [get_trajectory(env, agent) for _ in range(trajectory_n)]
    total_rewards = [np.sum(episode['rewards']) for episode in trajectories]
    print('iteration:', iteration, 'mean total reward:', np.mean(total_rewards))

    # Policy improvement: keep only trajectories whose total reward is
    # strictly above the q_param-quantile of this batch, then fit on them.
    # NOTE: the comparison is strict, so trajectories tied with the quantile
    # are dropped and the elite list can be empty when many totals coincide.
    quantile = np.quantile(total_rewards, q_param)
    elite_trajectories = [
        episode
        for episode, episode_return in zip(trajectories, total_rewards)
        if episode_return > quantile
    ]

    agent.fit(elite_trajectories)

iteration: 0 mean total reward: -766.61
iteration: 1 mean total reward: -682.484
iteration: 2 mean total reward: -580.202
iteration: 3 mean total reward: -460.93
iteration: 4 mean total reward: -297.426
iteration: 5 mean total reward: -161.062
iteration: 6 mean total reward: -86.585
iteration: 7 mean total reward: -45.662
iteration: 8 mean total reward: -18.505
iteration: 9 mean total reward: -3.542
iteration: 10 mean total reward: 3.431
iteration: 11 mean total reward: 5.131
iteration: 12 mean total reward: 5.974
iteration: 13 mean total reward: 6.616
iteration: 14 mean total reward: 6.559
iteration: 15 mean total reward: 6.726
iteration: 16 mean total reward: 7.026
iteration: 17 mean total reward: 7.122
iteration: 18 mean total reward: 7.245
iteration: 19 mean total reward: 6.881
iteration: 20 mean total reward: 6.862
iteration: 21 mean total reward: 6.898
iteration: 22 mean total reward: 7.08
iteration: 23 mean total reward: 6.999
iteration: 24 mean total reward: 7.128
iteration: 25

In [34]:
# Run one more trajectory with the trained agent, passing the configured
# max_len, and show its total reward.
trajectory = get_trajectory(
    env,
    agent,
    max_len=cfg.params.trajectory.max_len,
)
print('total reward:', sum(trajectory['rewards']))

# Dump the agent's model under a 'model:' header line.
print('model:', agent.model, sep='\n')

total reward: 7
model:
[[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]
 [0.         0.         0.         0.         1.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]
 ...
 [0.         1.         0.         0.         0.         0.        ]
 [0.         0.66666667 0.33333333 0.         0.         0.        ]
 [0.         0.         0.         1.         0.         0.        ]]


In [36]:
# Evaluate the trained agent; evaluate_agent returns two values, unpacked here
# as mean and std.
# NOTE(review): 10000 is presumably the number of evaluation episodes —
# confirm against src.utils.evaluate_agent.
mean, std = evaluate_agent(env, agent, 10000)
# Report both values rounded to two decimals.
print(f'Reward: {mean.round(2)}±{std.round(2)}')

Reward: 6.97±5.62
