In [None]:
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output


def show_progress_fixed(rewards_batch, log, percentile):
    """
    Plot training progress for the current batch of sessions.

    Appends ``[mean_reward, threshold]`` for this batch to ``log`` (mutating it
    in place), then draws two panels side by side:

    * left: the history of mean rewards and reward thresholds stored in ``log``;
    * right: a histogram of ``rewards_batch`` with a red vertical line at the
      ``percentile``-th percentile of the batch.

    :param rewards_batch: sequence of total rewards, one per session
    :param log: list of ``[mean_reward, threshold]`` pairs from previous calls
    :param percentile: percentile (0-100) used to compute the reward threshold
    """
    # Pad the histogram range by 12 on each side of the observed rewards.
    hist_range = [min(rewards_batch) - 12, max(rewards_batch) + 12]

    mean_reward = np.mean(rewards_batch)
    threshold = np.percentile(rewards_batch, percentile)
    log.append([mean_reward, threshold])

    # Transpose the log: history[0] holds the means, history[1] the thresholds.
    history = list(zip(*log))

    plt.figure(figsize=[8, 4])

    # Left panel: learning curves accumulated over all calls so far.
    plt.subplot(1, 2, 1)
    plt.plot(history[0], label="Mean rewards")
    plt.plot(history[1], label="Reward thresholds")
    plt.legend()
    plt.grid()

    # Right panel: reward distribution of the current batch.
    plt.subplot(1, 2, 2)
    plt.hist(rewards_batch, range=hist_range)
    plt.vlines([threshold], [0], [100], label="percentile", color="red")
    plt.legend()
    plt.grid()

    # Wipe the previous cell output only once the new one is ready to be shown.
    clear_output(wait=True)
    print("mean reward = %.3f, threshold=%.3f" % (mean_reward, threshold))
    plt.show()


In [None]:
from itertools import chain


def select_elites(states_batch, actions_batch, rewards_batch, percentile):
    """
    Select states and actions from sessions whose reward is >= the percentile threshold.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of rewards, rewards_batch[session_i]
    :param percentile: percentile (0-100) of rewards_batch used as the threshold

    :returns: elite_states, elite_actions, both 1D lists of states and the
        respective actions from elite sessions

    Elite states and actions are returned in their original order
    [i.e. sorted by session number and timestep within session].

    States are not assumed to be integers; they are passed through untouched.
    An empty batch yields two empty lists.
    """
    # np.percentile is not meaningful on an empty batch; nothing can be elite.
    if len(rewards_batch) == 0:
        return [], []

    reward_threshold = np.percentile(rewards_batch, percentile)

    elite_states, elite_actions = [], []
    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        # Inclusive comparison: a session sitting exactly on the threshold is
        # elite. With a strict ">" the best session is dropped whenever the
        # threshold equals the maximum reward (e.g. percentile=100 or all
        # rewards equal), leaving nothing to train on.
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions


In [None]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
import gymnasium as gym
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


env = gym.make("MountainCarContinuous-v0", render_mode="rgb_array")

# MountainCarContinuous-v0 exposes a continuous Box action space, which has no
# `.n` attribute (that exists only on Discrete spaces), so reading
# `env.action_space.n` raises AttributeError. Use the dimensionality of the
# action vector instead.
n_actions = env.action_space.shape[0]

# A regressor (not a classifier) because the action is a real-valued force.
# warm_start=True with max_iter=1 makes every `fit` call continue from the
# previously learned weights for a single pass, instead of retraining from
# scratch.
agent = MLPRegressor(
    hidden_layer_sizes=(42, 42),
    activation="tanh",
    solver="adam",
    learning_rate_init=0.01,
    max_iter=1,
    warm_start=True
)
# Feature scaler for observations; it must be fitted before `transform` is used.
scaler = StandardScaler()


In [None]:
def generate_session_for_agent(env, agent, scaler, t_max=10000):
    """
    Play a single episode in ``env`` by sampling actions from ``agent``.

    At every step the current observation is scaled with ``scaler.transform``,
    ``agent.predict_proba`` yields a probability per action, and an action
    index is drawn from that distribution with ``np.random.choice``.

    Note: this relies on ``env.action_space.n`` and ``agent.predict_proba``,
    i.e. a discrete action space and a probabilistic classifier-style agent.

    :param env: environment with the gymnasium ``reset`` / ``step`` API
    :param agent: model exposing ``predict_proba``
    :param scaler: fitted transformer exposing ``transform``
    :param t_max: maximum number of steps to play
    :returns: (states, actions, total_reward), where ``states`` holds the raw
        (unscaled) observations in which the respective ``actions`` were taken
    """
    visited_states = []
    taken_actions = []
    episode_return = 0.0

    observation, _ = env.reset()
    features = scaler.transform([observation])

    for _step in range(t_max):
        action_probs = agent.predict_proba(features)[0]
        action_count = env.action_space.n
        assert action_probs.shape == (action_count,), "Probabilities shape mismatch"

        action = np.random.choice(action_count, p=action_probs)
        next_observation, reward, terminated, truncated, _ = env.step(action)

        # Record the observation the action was chosen in, not the resulting one.
        visited_states.append(observation)
        taken_actions.append(action)
        episode_return += reward

        observation = next_observation
        episode_over = terminated or truncated
        if episode_over:
            break

        # Scale only when another step may follow.
        features = scaler.transform([observation])

    return visited_states, taken_actions, episode_return
