# Cross-Entropy for Taxi-v3
После лекции 1 и практического занятия 1 требуется выполнить три домашних задания:

1. Пользуясь алгоритмом Кросс-Энтропии обучить агента решать задачу Taxi-v3 из Gym. Исследовать гиперпараметры алгоритма и выбрать лучшие.
2. Реализовать алгоритм Кросс-Энтропии с двумя типами сглаживания,  указанными в лекции 1. При выбранных в пункте 1 гиперпараметров сравнить их результаты с результатами алгоритма без сглаживания.
3. Реализовать модификацию алгоритм Кросс-Энтропии для стохастических сред, указанную в лекции 1. Сравнить ее результат с алгоритмами из пунктов 1 и 2.

## Импорт библиотек 

In [64]:
import gym
import numpy as np
import random
import time

import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'iframe'

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate

## Класс `CrossEntropyAgent`
Метод Кросс-Энтропии состоит из 2 шагов:
1. **Оценка стратегии** - поиск $\mathbb{E}_\pi[G]$

Пусть $\pi_0$ -- это исходная стратегия, $N$ -- это число итераций. На каждой итерации выбираем $K$ траекторий и выбираем *"элитные" траектории*: такие траектории, чья награда превышает $q$-квантиль. 

2. **Улучшение стратегии** - поиск $\pi' \geq \pi: \mathbb{E}_\pi[G] \geq\mathbb{E}_{\pi'}[G]$

Выбрав "элитные" траектории, обновляем стратегию следующим образом:$$ \pi_{n+1}(a|s) = \frac{\text{число пар $(a|s)$ в "элитных" траекториях}}{\text{число состояний $s$ в "элитных" траекториях}} $$

In [112]:
class CrossEntropyAgent:
    def __init__(self, state_n=500, action_n=6):
        self.state_n = state_n
        self.action_n = action_n
        self.model = np.ones((self.state_n, self.action_n)) / self.action_n

    def get_action(self, state):
        action = np.random.choice(np.arange(self.action_n), p=self.model[state])
        return int(action)

    def fit(self, elite_trajectories):
        new_model = np.zeros((self.state_n, self.action_n))
        for trajectory in elite_trajectories:
            for state, action in zip(trajectory["states"], trajectory["actions"]):
                new_model[state][action] += 1

        for state in range(self.state_n):
            if np.sum(new_model[state]) > 0:
                new_model[state] /= np.sum(new_model[state])
            else:
                new_model[state] = self.model[state].copy()

        self.model = new_model

## Методы для получения состояния и траекторий 

In [66]:
def get_state(obs):
    return obs


def get_trajectory(env, agent, max_len=1000, visualize=False):
    trajectory = {"states": [], "actions": [], "rewards": []}

    obs = env.reset()
    state = get_state(obs)

    for _ in range(max_len):
        trajectory["states"].append(state)

        action = agent.get_action(state)
        trajectory["actions"].append(action)

        obs, reward, done, _ = env.step(action)
        trajectory["rewards"].append(reward)

        state = get_state(obs)

        if visualize:
            time.sleep(0.5)
            env.render()

        if done:
            break

    return trajectory

## Класс `Environment` и метод для обучения агента

In [117]:
class Environment:
    def __init__(self, env_name='Taxi-v3', q_param=0.9, iteration_n=100, trajectory_n=50):
        self.env_name = env_name
        self.q_param = q_param
        self.iteration_n = iteration_n
        self.iteration_range = np.array(range(self.iteration_n))
        self.trajectory_n = trajectory_n

def train_agent(environment, agent, verbose=True):
    env = gym.make(environment.env_name)
    env.reset()
    mean_total_reward_per_iteration = np.zeros(environment.iteration_n)
    
    for iteration in environment.iteration_range:
    
        #policy evaluation
        trajectories = [get_trajectory(env, agent) for _ in range(environment.trajectory_n)]
        total_rewards = [np.sum(trajectory['rewards']) for trajectory in trajectories]
        mean_total_reward_per_iteration[iteration] = np.mean(total_rewards)
        if verbose:
            print('iteration:', iteration, 'mean total reward:', mean_total_reward_per_iteration[iteration])
    
        #policy improvement
        quantile = np.quantile(total_rewards, environment.q_param)
        elite_trajectories = []
        for trajectory in trajectories:
            total_reward = np.sum(trajectory['rewards'])
            if total_reward > quantile:
                elite_trajectories.append(trajectory)
    
        agent.fit(elite_trajectories)

    return mean_total_reward_per_iteration

## 1. Обучим агента и подберем гиперпараметры

### Создадим игру и обучим агента

In [114]:
environment = Environment()
agent = CrossEntropyAgent()

In [76]:
mean_total_reward_per_iteration = train_agent(environment, agent)

trajectory = get_trajectory(env, agent, max_len=100, visualize=False)
env.close()
print('total reward:', sum(trajectory['rewards']))
print('model:')
print(agent.model)

iteration: 0 mean total reward: -787.0
iteration: 1 mean total reward: -710.6
iteration: 2 mean total reward: -668.54
iteration: 3 mean total reward: -669.26
iteration: 4 mean total reward: -551.04
iteration: 5 mean total reward: -503.98
iteration: 6 mean total reward: -546.72
iteration: 7 mean total reward: -498.4
iteration: 8 mean total reward: -417.78
iteration: 9 mean total reward: -399.26
iteration: 10 mean total reward: -485.5
iteration: 11 mean total reward: -480.62
iteration: 12 mean total reward: -464.7
iteration: 13 mean total reward: -445.88
iteration: 14 mean total reward: -514.78
iteration: 15 mean total reward: -510.1
iteration: 16 mean total reward: -504.5
iteration: 17 mean total reward: -433.28
iteration: 18 mean total reward: -456.0
iteration: 19 mean total reward: -485.1
iteration: 20 mean total reward: -412.82
iteration: 21 mean total reward: -520.34
iteration: 22 mean total reward: -395.9
iteration: 23 mean total reward: -478.86
iteration: 24 mean total reward: -43

In [77]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=environment.iteration_range, y=mean_total_reward_per_iteration,
                    mode='lines+markers',
                    name='CE_agent_q_0.9_traj_50'))

fig.update_layout(
    title="Cross-Entropy Agent: q=0.9, trajectory_n=50",
    xaxis_title="Iteration",
    yaxis_title="Mean Total Reward",
    legend_title="Legend Title",
)

fig.show()

### Подбор гиперпараметров

In [138]:
def objective(trial):
    q_param = trial.suggest_float('q_param', 0, 1, log=False)
    trajectory_n = trial.suggest_int('trajectory_n', 10, 500)

    agent = CrossEntropyAgent()
    environment = Environment(q_param=q_param, trajectory_n=trajectory_n)
    
    mean_reward = train_agent(environment, agent, verbose=False)
    
    return mean_reward[-1]

In [139]:
sampler = TPESampler()#n_startup_trials=10
pruner = MedianPruner()

study = optuna.create_study(sampler=sampler, pruner=pruner, direction='maximize')  # For reward maximization

# Run the hyperparameter optimization
study.optimize(objective, n_trials=20)  # You can adjust n_trials

# Print the best hyperparameters and result
print("Best trial:")
trial = study.best_trial
print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"  {key}: {value}")

[I 2023-10-12 14:49:40,700] A new study created in memory with name: no-name-e1a07723-eb74-4d8a-981e-368462d0ff88
[I 2023-10-12 14:50:02,504] Trial 0 finished with value: 6.8995098039215685 and parameters: {'q_param': 0.43512845271204725, 'trajectory_n': 408}. Best is trial 0 with value: 6.8995098039215685.
[I 2023-10-12 14:50:28,529] Trial 1 finished with value: -270.0357142857143 and parameters: {'q_param': 0.9003996761756318, 'trajectory_n': 196}. Best is trial 0 with value: 6.8995098039215685.
[I 2023-10-12 14:52:45,025] Trial 2 finished with value: -168.5935960591133 and parameters: {'q_param': 0.04370846597180911, 'trajectory_n': 406}. Best is trial 0 with value: 6.8995098039215685.
[I 2023-10-12 14:53:13,720] Trial 3 finished with value: -461.0378787878788 and parameters: {'q_param': 0.947143760805957, 'trajectory_n': 132}. Best is trial 0 with value: 6.8995098039215685.
[I 2023-10-12 14:53:55,903] Trial 4 finished with value: -225.57926829268294 and parameters: {'q_param': 0.56

Best trial:
Value:  6.8995098039215685
Params: 
  q_param: 0.43512845271204725
  trajectory_n: 408


In [140]:
fig = plot_optimization_history(study, target_name='Reward')
fig.show()

In [141]:
fig = plot_parallel_coordinate(study)
fig.show()

Обучим агента на лучших параметрах:

In [142]:
environment = Environment(**trial.params)
agent = CrossEntropyAgent()

mean_total_reward_per_iteration = train_agent(environment, agent)

trajectory = get_trajectory(env, agent, max_len=100, visualize=False)
env.close()
print('total reward:', sum(trajectory['rewards']))
print('model:')
print(agent.model)

iteration: 0 mean total reward: -759.25
iteration: 1 mean total reward: -694.3872549019608
iteration: 2 mean total reward: -608.7720588235294
iteration: 3 mean total reward: -531.1617647058823
iteration: 4 mean total reward: -447.3014705882353
iteration: 5 mean total reward: -354.97549019607845
iteration: 6 mean total reward: -270.22058823529414
iteration: 7 mean total reward: -170.9436274509804
iteration: 8 mean total reward: -108.22058823529412
iteration: 9 mean total reward: -70.36029411764706
iteration: 10 mean total reward: -43.26960784313726
iteration: 11 mean total reward: -23.61519607843137
iteration: 12 mean total reward: -10.034313725490197
iteration: 13 mean total reward: -1.5416666666666667
iteration: 14 mean total reward: 2.4779411764705883
iteration: 15 mean total reward: 2.877450980392157
iteration: 16 mean total reward: 4.509803921568627
iteration: 17 mean total reward: 4.7818627450980395
iteration: 18 mean total reward: 5.703431372549019
iteration: 19 mean total reward

In [143]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=environment.iteration_range, y=mean_total_reward_per_iteration,
                    mode='lines+markers',
                    name='CE_agent_q_0.9_traj_50'))

fig.update_layout(
    title=f"Cross-Entropy Agent: q={environment.q_param:1f}, trajectory_n={environment.trajectory_n}",
    xaxis_title="Iteration",
    yaxis_title="Mean Total Reward",
    legend_title="Legend Title",
)

fig.show()

## 2. Сглаживания 
Обновление стратегии сильно зависит от случайности (начиная от случайной инициализации начальной стратегии, заканчивая ситуациями, когда в одном и том же состоянии в элитных траекториях выбирались неоптимальные действия), для того, чтобы с этим бороться, применяют техники сглаживания:
1. Сглаживание по Лапласу $$  \pi_{n+1}(a|s) = \frac{|(a|s)\in T_n| + \lambda}{|s\in T_n|+\lambda}, \text{ for }\lambda > 0  $$
2. Сглаживание стратегии $$  \pi_{n+1}(a|s) = \lambda\pi_{n+1}(a|s) + (1-\lambda)\pi_n(a|s), \text{ for }\lambda >0  $$

In [115]:
class CrossEntropyAgent:
    def __init__(
        self, state_n=500, action_n=6, smoothing=None, laplace_lambda=0.5, policy_lambda=0.5
    ):
        self.state_n = state_n
        self.action_n = action_n
        self.smoothing = smoothing
        self.laplace_lambda = laplace_lambda
        self.policy_lambda = policy_lambda
        self.model = np.ones((self.state_n, self.action_n)) / self.action_n

    def get_action(self, state):
        action = np.random.choice(np.arange(self.action_n), p=self.model[state])
        return int(action)

    def fit(self, elite_trajectories):
        new_model = np.zeros((self.state_n, self.action_n))
        for trajectory in elite_trajectories:
            for state, action in zip(trajectory["states"], trajectory["actions"]):
                new_model[state][action] += 1

        for state in range(self.state_n):
            if np.sum(new_model[state]) > 0:
                if self.smoothing == "laplace" or self.smoothing == "both":
                    new_model[state] += self.laplace_lambda
                    new_model[state] /= np.sum(new_model[state])
                else:
                    new_model[state] /= np.sum(new_model[state])
            else:
                new_model[state] = self.model[state].copy()

        if self.smoothing == "policy" or self.smoothing == "both":
            new_model = (
                self.policy_lambda * new_model + (1 - self.policy_lambda) * self.model
            )

        self.model = new_model

In [119]:
environment = Environment(iteration_n=100)
smoothings = [None, "laplace", "policy", "both"]
mean_rewards_per_smoothings = [None] * 4
for i, smoothing in enumerate(smoothings):
    env.reset()
    print(f"\n---------\n{i}: Smoothing = {smoothing}")
    agent = CrossEntropyAgent(smoothing=smoothing)
    mean_total_reward = train_agent(environment, agent)
    mean_rewards_per_smoothings[i] = mean_total_reward

    trajectory = get_trajectory(env, agent, max_len=100, visualize=False)
    print('total reward:', sum(trajectory['rewards']))


---------
0: Smoothing = None
iteration: 0 mean total reward: -769.78
iteration: 1 mean total reward: -734.88
iteration: 2 mean total reward: -576.22
iteration: 3 mean total reward: -552.4
iteration: 4 mean total reward: -522.34
iteration: 5 mean total reward: -619.52
iteration: 6 mean total reward: -619.18
iteration: 7 mean total reward: -634.58
iteration: 8 mean total reward: -553.64
iteration: 9 mean total reward: -612.72
iteration: 10 mean total reward: -664.06
iteration: 11 mean total reward: -555.6
iteration: 12 mean total reward: -651.4
iteration: 13 mean total reward: -553.0
iteration: 14 mean total reward: -579.1
iteration: 15 mean total reward: -646.48
iteration: 16 mean total reward: -629.12
iteration: 17 mean total reward: -626.08
iteration: 18 mean total reward: -513.02
iteration: 19 mean total reward: -521.08
iteration: 20 mean total reward: -566.9
iteration: 21 mean total reward: -535.36
iteration: 22 mean total reward: -578.0
iteration: 23 mean total reward: -529.9
ite

### Подбор гиперпараметров

In [122]:
def objective(trial):
    q_param = trial.suggest_float('q_param', 0, 1, log=False)
    trajectory_n = trial.suggest_int('trajectory_n', 10, 500)
    smoothing = trial.suggest_categorical('smoothing', [None, "laplace", "policy", "both"])
    laplace_lambda=trial.suggest_float('laplace_lambda', 0, 1, log=False)
    policy_lambda=trial.suggest_float('policy_lambda', 0, 1, log=False)
    

    agent = CrossEntropyAgent(smoothing=smoothing, laplace_lambda=laplace_lambda, policy_lambda=policy_lambda)
    environment = Environment(q_param=q_param, trajectory_n=trajectory_n)
    
    mean_reward = train_agent(environment, agent, verbose=False)
    
    return mean_reward[-1]

In [123]:
sampler = TPESampler()#n_startup_trials=10
pruner = MedianPruner()

study = optuna.create_study(sampler=sampler, pruner=pruner, direction='maximize')

study.optimize(objective, n_trials=10)  

print("Best trial:")
trial = study.best_trial
print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"  {key}: {value}")

[I 2023-10-12 14:24:31,022] A new study created in memory with name: no-name-a2bf70a5-62ae-4744-8479-6acdfdbcbdce
[I 2023-10-12 14:24:43,692] Trial 0 finished with value: 6.934782608695652 and parameters: {'q_param': 0.41892372441782144, 'trajectory_n': 138, 'smoothing': 'policy', 'laplace_lambda': 0.7755780490725028, 'policy_lambda': 0.21999652492336763}. Best is trial 0 with value: 6.934782608695652.
[I 2023-10-12 14:25:09,200] Trial 1 finished with value: -173.95857988165682 and parameters: {'q_param': 0.3785105156529047, 'trajectory_n': 169, 'smoothing': 'laplace', 'laplace_lambda': 0.6507360468460001, 'policy_lambda': 0.2387468040290298}. Best is trial 0 with value: 6.934782608695652.
[I 2023-10-12 14:25:31,200] Trial 2 finished with value: 7.3937360178970915 and parameters: {'q_param': 0.37250789161018794, 'trajectory_n': 447, 'smoothing': None, 'laplace_lambda': 0.8646139991968085, 'policy_lambda': 0.9118078004954452}. Best is trial 2 with value: 7.3937360178970915.
[I 2023-10-1

Best trial:
Value:  7.3937360178970915
Params: 
  q_param: 0.37250789161018794
  trajectory_n: 447
  smoothing: None
  laplace_lambda: 0.8646139991968085
  policy_lambda: 0.9118078004954452


In [128]:
fig = plot_optimization_history(study, target_name='Reward')
fig.show()

In [130]:
fig = plot_parallel_coordinate(study)
fig.show()

Как следует из графиков и результатов выше, агент лучше всего показывает себя без сглаживания, при этом среди сглаживаний лучше показало себя сглаживание по стратегии.

## 3. Модификация для стохастических сред
Если среда стохастическая, то можно использовать следующий трюк:
1. По стратегии $\pi_n$ cделаем выборку из $M$ детерминированных стратегий $\pi_{n,m}$
2. По стратегиям $\pi_{n,m}$ получим $K$ траекторий $\tau_{m,k}$
3. Вычислим $$  V_{\pi_{n,m}} \coloneqq \frac{1}{K}\sum_kG(\tau_{m,k}) $$
4. Выберем "элитные" траектории $T_n = \{ \tau_{m,k} : V_{\pi_{n,m}} > \gamma_q \}$, где $\gamma_q$ -- $q$-квантиль по выборке $V_{\pi_{n,m}}$
5. Дальше делается все то же самое

In [147]:
class StochasticEnvironment:
    def __init__(self, env_name='Taxi-v3', q_param=0.9, iteration_n=100, trajectory_n=50, packet_n=100):
        self.env_name = env_name
        self.q_param = q_param
        self.iteration_n = iteration_n
        self.iteration_range = np.array(range(self.iteration_n))
        self.trajectory_n = trajectory_n
        self.packet_n = packet_n
        
def train_agent_in_stochastic_env(environment, agent, verbose=True):
    env = gym.make(environment.env_name)
    env.reset()
    mean_total_reward_per_iteration = np.zeros(environment.iteration_n)
    
    for iteration in environment.iteration_range:
        packets = [
            [get_trajectory(env, agent) for _ in range(environment.trajectory_n)] 
            for _ in range(environment.packet_n)
        ]
    
        mean_reward_per_packet = [
            np.mean(
                [np.sum(trajectory['rewards']) for trajectory in trajectories]
            )
            for trajectories in packets
        ]
        mean_total_reward_per_iteration[iteration] = np.mean(mean_reward_per_packet)
        
        quantile = np.quantile(mean_reward_per_packet, environment.q_param)
        elite_trajectories = []
        for i, packet in enumerate(packets):
            if mean_reward_per_packet[i] > quantile:
                elite_trajectories.extend(packet)
    
        agent.fit(elite_trajectories)
    return mean_total_reward_per_iteration

### Создадим игру и обучим агента

In [149]:
environment = StochasticEnvironment()
agent = CrossEntropyAgent()

In [150]:
mean_total_reward_per_iteration = train_agent(environment, agent)

trajectory = get_trajectory(env, agent, max_len=100, visualize=False)
env.close()
print('total reward:', sum(trajectory['rewards']))
print('model:')
print(agent.model)

iteration: 0 mean total reward: -782.1
iteration: 1 mean total reward: -750.26
iteration: 2 mean total reward: -688.9
iteration: 3 mean total reward: -698.56
iteration: 4 mean total reward: -673.62
iteration: 5 mean total reward: -555.26
iteration: 6 mean total reward: -553.84
iteration: 7 mean total reward: -558.64
iteration: 8 mean total reward: -495.3
iteration: 9 mean total reward: -556.7
iteration: 10 mean total reward: -459.18
iteration: 11 mean total reward: -571.78
iteration: 12 mean total reward: -545.28
iteration: 13 mean total reward: -484.32
iteration: 14 mean total reward: -481.64
iteration: 15 mean total reward: -605.12
iteration: 16 mean total reward: -523.72
iteration: 17 mean total reward: -435.8
iteration: 18 mean total reward: -543.74
iteration: 19 mean total reward: -528.94
iteration: 20 mean total reward: -513.84
iteration: 21 mean total reward: -534.22
iteration: 22 mean total reward: -533.8
iteration: 23 mean total reward: -548.96
iteration: 24 mean total reward:

In [155]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=environment.iteration_range, y=mean_total_reward_per_iteration,
                    mode='lines+markers'))

fig.update_layout(
    title=f"Cross-Entropy Agent in Stochastic Environment: q={environment.q_param}, trajectory_n={environment.trajectory_n}, packet_n={environment.packet_n}",
    xaxis_title="Iteration",
    yaxis_title="Mean Total Reward",
    legend_title="Legend Title",
)

fig.show()

Как следует из графика выше, результаты вышли хуже, чем когда рассматривалась детерминированная среда.