In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import torch
import sys, os
import pystk
import ray
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device = ', device)
# logging_level=50 is the numeric value of logging.CRITICAL.
# ignore_reinit_error=True keeps this cell re-runnable in a live kernel: without it,
# calling ray.init() a second time raises because Ray is already initialised.
ray.init(logging_level=50, ignore_reinit_error=True)

device =  cpu


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': None,
 'object_store_address': '/tmp/ray/session_2022-04-24_00-32-14_714468_37765/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-04-24_00-32-14_714468_37765/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-04-24_00-32-14_714468_37765',
 'metrics_export_port': 61077,
 'gcs_address': '127.0.0.1:54943',
 'address': '127.0.0.1:54943',
 'node_id': '765d4e5f33433a7e6b57b7c7af5b71fae0f0f861081aef80cd81f41c'}

In [None]:
from utils.track_actors import Agent, SteeringActor, DriftActor
from utils.utils import run_agent, rollout_many, show_trajectory_histogram
import numpy as np

In [None]:
# Run one agent driven by a default-constructed steering actor.
# NOTE(review): train=True presumably makes the agent act stochastically - confirm in utils.track_actors.
data = run_agent(
    Agent(SteeringActor(), train=True)
)

In [None]:
# Build 100 default-constructed steering actors.
many_actors = [SteeringActor() for _ in range(100)]

# Roll every actor out for 600 steps, one Agent per actor.
data = rollout_many(
    [Agent(candidate) for candidate in many_actors],
    n_steps=600,
)

# Rank the actors by the overall distance stored in the last entry of each rollout.
final_distances = [rollout[-1]['kart_info'].overall_distance for rollout in data]
good_initialization = many_actors[np.argmax(final_distances)]
bad_initialization = many_actors[np.argmin(final_distances)]

In [None]:
# Run the actor that covered the largest overall distance among the sampled initialisations.
data = run_agent(
    Agent(good_initialization)
)

Recall what we're trying to do in RL: maximize the expected return of a policy $\pi$ (or, equivalently, minimize a loss $L$)
$$
-L = E_{\tau \sim P_\pi}[R(\tau)],
$$
where $\tau = \{s_0, a_0, s_1, a_1, \ldots\}$ is a trajectory of states and actions.
The return of a trajectory is then defined as the sum of individual rewards $R(\tau) = \sum_k r(s_k)$ (we won't discount in this assignment).

Policy gradient computes the gradient of the loss $L$ using the log-derivative trick
$$
\nabla_\pi L = -E_{\tau \sim P_\pi}[\sum_k r(s_k) \nabla_\pi \sum_i \log \pi(a_i | s_i)].
$$
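Since the reward $r(s_k)$ only depends on actions $a_i$ taken in the past ($i < k$), rewards collected before action $a_i$ cannot be affected by it, and we can drop all terms with $k < i$. We additionally truncate the remaining sum to a horizon of $T$ steps after each action, which gives: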
$$
\nabla_\pi L = -E_{\tau \sim P_\pi}\left[\sum_i \left(\nabla_\pi \log \pi(a_i | s_i)\right)\left(\sum_{k=i}^{i+T} r(s_k) \right)\right].
$$
We will implement an estimator for this objective below. There are a few steps that we need to follow:

 * The expectation $E_{\tau \sim P_\pi}$ is estimated by averaging over rollouts of our policy
 * The log probability $\log \pi(a_i | s_i)$ is computed with `Categorical.log_prob`
 * The gradient is computed with the `.backward()` function
 * The gradient $\nabla_\pi L$ is then passed to a standard optimizer

In [None]:
from utils.reinforce import reinforce
import copy

# Work on a deep copy of the worst initialisation's network rather than the network
# object held by `bad_initialization` itself.
action_net = copy.deepcopy(bad_initialization.action_net)
actors = [
    SteeringActor(action_net, reward_type="lateral"),
]

# The iteration count is relatively high here to help force a good outcome from a bad initialization.
# NOTE(review): the first argument looks like the actor being trained and the second
# the full list of actors used in the rollouts - confirm against utils.reinforce.
best_steering_net = reinforce(
    actors[0],
    actors,
    n_epochs=5,
    n_iterations=100,
    n_trajectories=100,
    n_validations=100,
    T=10,
)

In [None]:
# Run the trained steering network once on its own.
data = run_agent(Agent(SteeringActor(best_steering_net)))
# Build one Agent per rollout. `[agent] * 100` would place the *same* Agent object in
# the list 100 times; separate instances match how the other cells call rollout_many.
data = rollout_many(
    [Agent(SteeringActor(best_steering_net)) for _ in range(100)],
    n_steps=600,
)

In [None]:
# Hand the rollouts currently stored in `data` to the histogram helper from utils.utils.
show_trajectory_histogram(data)

In [None]:
# Pair the trained steering network with a default-constructed drift actor.
# `new_action_net` is never imported or defined in this notebook, so the original call
# raised NameError on a fresh kernel. DriftActor() with no arguments is the same
# construction the drift-training cell uses for its candidate actors.
data = run_agent(Agent(SteeringActor(best_steering_net), DriftActor()))

In [None]:
# Train the drift action - this may need to be run a few times to get a good result.
many_actors = [DriftActor() for _ in range(100)]

# train=True for steering to introduce a bit of uncertainty; otherwise Drift may never
# beat the non-drift network.
data = rollout_many(
    [
        Agent(drift_candidate, SteeringActor(best_steering_net), train=True)
        for drift_candidate in many_actors
    ],
    n_steps=600,
)

# Keep the drift actor whose rollout ended with the largest overall distance.
good_initialization_drift = many_actors[
    np.argmax([rollout[-1]['kart_info'].overall_distance for rollout in data])
]

# The steering actor is created with train=False; the drift actor (index 1) is the one
# handed to reinforce as its first argument.
actors = [
    SteeringActor(best_steering_net, train=False),
    good_initialization_drift,
]
best_drift_net = reinforce(
    actors[1],
    actors,
    n_epochs=5,
    n_iterations=1000,
    n_trajectories=100,
    n_validations=20,
    T=15,
)

In [None]:
# Run an agent that combines the trained steering and drift networks.
data = run_agent(
    Agent(
        SteeringActor(best_steering_net),
        DriftActor(best_drift_net),
    )
)

In [None]:
from utils.utils import Rollout
# Create a remote Rollout on the 'hacienda' track.
# NOTE(review): 400 and 300 are presumably the render width and height - confirm in utils.utils.
viz_rollout = Rollout.remote(400, 300, track='hacienda')

# Run the combined steering + drift agent on that rollout instead of the default one.
data = run_agent(
    Agent(
        SteeringActor(best_steering_net),
        DriftActor(best_drift_net),
    ),
    rollout=viz_rollout,
)