In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import torch
import sys, os
import pystk
import ray
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device = ', device)
# logging_level=50 is the numeric value of logging.CRITICAL.
# ignore_reinit_error=True keeps this cell re-runnable: without it, calling
# ray.init() a second time in the same kernel raises a RuntimeError.
ray.init(logging_level=50, ignore_reinit_error=True)

In [None]:
from utils.actors import new_action_net, Actor, GreedyActor, SteeringActor
from utils.utils import run_agent, rollout_many
import numpy as np

In [None]:
# Create one action network via `new_action_net()` and hand it to `run_agent`,
# wrapped as Actor(SteeringActor(...)).
action_net = new_action_net()
steering_policy = SteeringActor(action_net)
data = run_agent(Actor(steering_policy))

In [None]:
# Number of candidate networks to create, and the `n_steps` value passed to
# `rollout_many` when scoring them.
N_CANDIDATES = 100
N_ROLLOUT_STEPS = 600

many_action_nets = [new_action_net() for _ in range(N_CANDIDATES)]

data = rollout_many(
    [Actor(SteeringActor(net)) for net in many_action_nets],
    n_steps=N_ROLLOUT_STEPS,
)

# Score each candidate once by the `overall_distance` stored in the last entry
# of its rollout, then reuse the scores for both selections.
# NOTE(review): assumes `rollout_many` returns results in the same order as the
# actors it was given -- the index lookup below relies on that.
final_distances = [d[-1]['kart_info'].overall_distance for d in data]

good_initialization = many_action_nets[np.argmax(final_distances)]
bad_initialization = many_action_nets[np.argmin(final_distances)]

In [4]:
# Run the lowest-scoring candidate (`bad_initialization`) through `GreedyActor`.
worst_policy = GreedyActor(SteeringActor(bad_initialization))
data = run_agent(worst_policy)



Recall what we're trying to do in RL: maximize the expected return of a policy $\pi$ (or, equivalently, minimize a loss $L$)
$$
-L = E_{\tau \sim P_\pi}[R(\tau)],
$$
where $\tau = \{s_0, a_0, s_1, a_1, \ldots\}$ is a trajectory of states and actions.
The return of a trajectory is then defined as the sum of individual rewards $R(\tau) = \sum_k r(s_k)$ (we won't discount in this assignment).

Policy gradient computes the gradient of the loss $L$ using the log-derivative trick
$$
\nabla_\pi L = -E_{\tau \sim P_\pi}[\sum_k r(s_k) \nabla_\pi \sum_i \log \pi(a_i | s_i)].
$$
Since the reward $r(s_k)$ only depends on actions $a_i$ taken in the past ($i < k$), we can further simplify the above equation (additionally truncating the sum of future rewards to a horizon of $T$ steps):
$$
\nabla_\pi L = -E_{\tau \sim P_\pi}\left[\sum_i \left(\nabla_\pi \log \pi(a_i | s_i)\right)\left(\sum_{k=i}^{i+T} r(s_k) \right)\right].
$$
We will implement an estimator for this objective below. There are a few steps that we need to follow:

 * The expectation $E_{\tau \sim P_\pi}$ is estimated by averaging over rollouts of our policy
 * The log probability $\log \pi(a_i | s_i)$ is computed with `Categorical.log_prob`
 * Gradient computation uses the `.backward()` function
 * The gradient $\nabla_\pi L$ is then used in a standard optimizer

In [None]:
from utils.reinforce import reinforce
import copy

# NOTE(review): the assignment below is disabled. It refers to
# `best_action_net`, which this cell only defines further down, so it cannot
# work on a fresh kernel -- confirm whether it should be removed.
# good_initialization = best_action_net

# Hand `reinforce` a deep copy so that `bad_initialization` itself is not the
# object being passed in.
action_net = copy.deepcopy(bad_initialization)
best_action_net = reinforce(
    action_net,
    n_epochs=5,
    n_iterations=200,
    n_trajectories=100,
    n_validations=100,
    T=10,
)

In [None]:
# Run the network returned by `reinforce` through `GreedyActor`.
trained_policy = GreedyActor(SteeringActor(best_action_net))
data = run_agent(trained_policy)

In [None]:
from utils.utils import Rollout
# Create a separate `Rollout` on the 'hacienda' track and run the trained
# network on it.
# NOTE(review): 400 and 300 are presumably the render width and height -- confirm
# against `Rollout`'s signature.
viz_rollout = Rollout.remote(400, 300, track='hacienda')
hacienda_policy = GreedyActor(SteeringActor(best_action_net))
data = run_agent(hacienda_policy, rollout=viz_rollout)