# Prerequisites

In [None]:
# Definitions
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
rc('animation', html='jshtml')

from matplotlib.animation import FuncAnimation

def animate_slingshot(trajectory, resolution_Hz=15, duration=None, fig_size=8):
    """Build a looping animation of a point moving along ``trajectory``.

    Parameters
    ----------
    trajectory : array-like
        Sequence of states, one per time step. Only the first two components
        of each state are used, as the (x, y) position. A flat sequence of
        scalars (or of one-component states) is animated along the x-axis
        with y = 0.
    resolution_Hz : float, default 15
        Playback rate in frames per second.
    duration : float or None, default None
        Length of the animation in seconds. ``None`` shows every time step
        exactly once; otherwise ``int(duration * resolution_Hz)`` frames are
        shown and the trajectory wraps around when it is shorter than that.
    fig_size : float, default 8
        Side length of the square figure, in inches.

    Returns
    -------
    matplotlib.animation.FuncAnimation
        The animation object. The axes are fixed to [-2, 2] x [-2, 2].

    Raises
    ------
    ValueError
        If ``trajectory`` contains no time steps.
    """
    points = np.array(trajectory)
    if points.ndim < 2:
        # A flat sequence is a list of scalar positions: one row per time step.
        # This has to happen BEFORE the column slice below, which would
        # otherwise keep only the first two time steps of a 1-D trajectory.
        points = points.reshape(-1, 1)

    n_steps = points.shape[0]
    if n_steps == 0:
        # Fail early with a clear message instead of a modulo-by-zero in update().
        raise ValueError("trajectory must contain at least one time step")

    # Keep only the position components of each state.
    points = points[:, :2]
    if points.shape[1] == 1:
        # Only x was provided: place the point on the line y = 0.
        points = np.column_stack((points[:, 0], np.zeros(n_steps)))

    if duration is None:
        frames = range(n_steps)
    else:
        frames = range(int(duration * resolution_Hz))

    # Suppress the static figure while building; interactive mode is switched
    # back on afterwards even if figure construction fails.
    plt.ioff()
    try:
        fig, ax = plt.subplots(figsize=(fig_size, fig_size))
        # set the axes limits
        ax.axis([-2, 2, -2, 2])
        ax.set_aspect("equal")
        ax.grid()
        # create a point in the axes
        point, = ax.plot(0, 1, marker="o")

        # Updating function, to be repeatedly called by the animation
        def update(t):
            # wrap around so that a long `duration` replays the trajectory
            x, y = points[int(t) % n_steps]
            point.set_data([x], [y])
            return point,

        ani = FuncAnimation(fig, update, interval=1000/resolution_Hz, blit=True, repeat=True,
                            frames=frames)
    finally:
        plt.ion()
    return ani

In [None]:
from scipy.linalg import norm

class ParametrizedDiscreteTimeSystem:
    """Discrete-time system driven by a parametrized controller.

    One step of a rollout calls ``controller(parameters, state)`` to obtain the
    action, ``state_transition_function(state, action)`` to obtain the next
    state, and ``running_reward(state, action)`` to score the step. Rewards are
    accumulated with a geometric discount.
    """

    def __init__(self,
                 state_transition_function,  # f(., .)
                 initial_state,              # x_0
                 controller,
                 running_reward,
                 discount
                 ):
        """Store the callables and constants that define the system.

        Parameters
        ----------
        state_transition_function : callable
            ``f(state, action) -> next_state``.
        initial_state
            State every rollout starts from.
        controller : callable
            ``controller(parameters, state) -> action``.
        running_reward : callable
            ``running_reward(state, action) -> scalar``.
        discount : float
            Per-step discount factor applied to the running reward.
        """
        self.__state_transition_function = state_transition_function
        self.__initial_state = initial_state
        self.__controller = controller
        self.__running_reward = running_reward
        self.__discount = discount

    def run_with_parameters(self,
                            parameters,      # \theta
                            steps=200):
        """Simulate one episode and return its trajectory, actions and reward.

        Parameters
        ----------
        parameters
            Controller parameters, passed unchanged to the controller.
        steps : int, default 200
            Maximum number of transitions to simulate.

        Returns
        -------
        tuple
            ``(trajectory, actions, total_reward)``: the visited states as an
            array (initial state first), the applied actions as an array, and
            the discounted sum of running rewards. ``trajectory`` always has
            one more entry than ``actions``.

        Notes
        -----
        If the transition or the reward raises ``OverflowError`` the episode
        stops early. The same 3-tuple is still returned, covering only the
        steps completed before the failure, so callers can always unpack it.
        """
        trajectory = [self.__initial_state]
        actions = []
        total_reward = 0
        accumulated_discount = 1
        for _ in range(steps):
            current_state = trajectory[-1]
            control_input = self.__controller(parameters, current_state)
            try:
                next_state = self.__state_transition_function(current_state, control_input)
                total_reward += self.__running_reward(current_state, control_input) * accumulated_discount
                accumulated_discount *= self.__discount
            except OverflowError:
                print("The trajectory blew up. Ending the episode prematurely.")
                break
            # Record the action only once its transition succeeded, so that
            # actions[k] always maps trajectory[k] to trajectory[k + 1].
            actions.append(control_input)
            trajectory.append(next_state)
        return np.array(trajectory), np.array(actions), total_reward

# Recap: Policy gradient

REINFORCE:

$$
\theta_{i + 1} := \theta_{i} + \alpha \mathbb{E}\left[\sum_{k = 0}^{N} \nabla_{\theta_i} \ln \rho_{\theta_i} (u_k | x_k) \ \cdot \ \sum_{l = 0}^{N} \gamma^l r(x_l, u_l)\right]
$$
$\theta_i \text{ -- controller parameters at $i$-th iteration}$,

$\rho_\theta(u | x) \text{ -- probability density of action $u$ when in state $x$}$,

$r(x, u) \text{ -- running reward,}$

$\gamma \text{ -- discounting factor.}$

# Aiming a slingshot

The following system describes the dynamics of a projectile fired from a slingshot. The one firing the slingshot determines the projectile's initial direction and velocity. Their goal is to land the projectile as close to the target as possible. Also, their hands are a bit shaky.
$$
\begin{aligned}
&\begin{cases}
 \left(x_{t + 1} \atop y_{t + 1} \right) = \left(x_{t} \atop y_{t} \right) + v_t\\
 v_{t + 1} = \begin{cases} v_t + \left({0 \atop -7\cdot 10^{-4}}\right) + u_t, & \text{if } y_t \geq 0\\ \left({0 \atop 0}\right), & \text{otherwise}\end{cases}\end{cases}\\
& \left(x_0 \atop y_0 \right) = \left(-1 \atop 0 \right) \\
& v_0 = \left(0 \atop 0 \right) \\
& u_0 \sim \mathcal{N}\left(\theta, \left({10^{-6} \atop 0}{0 \atop 10^{-6}}\right)\right), \theta \in \Theta := \{\theta' \in \mathbb{R}^2 \ | \ \lVert \theta' \rVert_2 \leq 0.04 \} \\
& u_t = 0, \ t \neq 0\\
& r(x_t, u_t) := -\lvert x_t - 1.5\rvert, \ \gamma = 0.99
\end{aligned}
$$

Your goal is to maximize the above reward by varying the parameters $\theta \in \Theta$ using **policy gradient**.

**Directive:** set $\theta_0 := \left(0 \atop 0\right)$

**Useful resources:**
* [Bivariate normal distribution](https://mathworld.wolfram.com/BivariateNormalDistribution.html)
* [Projected gradient descent](https://math.stackexchange.com/questions/571068/what-is-the-difference-between-projected-gradient-descent-and-ordinary-gradient)
* [Law of large numbers](https://en.wikipedia.org/wiki/Law_of_large_numbers)

In [None]:
# State layout is [x, y, v_x, v_y]: the projectile starts at rest at (-1, 0).
slingshot_initial_state = np.array((-1.0, 0.0, 0.0, 0.0))


def slingshot_transition_function(state, control):
    """Advance the projectile by one time step.

    ``state`` is laid out as ``[x, y, v_x, v_y]`` and ``control`` is a
    2-vector added to the velocity. The input array is not modified; the
    successor state is returned as a new array.
    """
    successor = state.copy()
    # The position always integrates the velocity held at the start of the step.
    successor[:2] += state[2:]

    below_ground = state[1] < 0
    if below_ground:
        # Once under y = 0 the projectile has landed: zero out the velocity.
        successor[2:] = 0
        return successor

    # Still in flight: gravity reduces v_y, then the control is added on top.
    successor[3] += -0.0007
    successor[2:] += control
    return successor

def slingshot_controller(parameters, state):
    """Return the action for ``state``: a noisy launch impulse, or zero.

    While ``state`` equals ``slingshot_initial_state`` the action is
    ``parameters`` plus zero-mean Gaussian noise (standard deviation 0.001 per
    component). In any other state the action is the zero vector.
    ``parameters`` must have Euclidean norm at most 0.04; this is asserted
    only when the launch impulse is actually produced.
    """
    already_launched = (state != slingshot_initial_state).any()
    if already_launched:
        return np.zeros(2)

    assert norm(parameters) <= 0.04, "The provided parameters are out of bounds. Fix your optimization."

    # "Shaky hands": perturb a copy so the caller's parameters stay untouched.
    hand_shake = np.random.normal(loc=0, scale=0.001, size=2)
    control = parameters.copy()
    control += hand_shake
    return control

def slingshot_running_reward(state, control_input):
    """Return minus the horizontal distance between ``state[0]`` and x = 1.5.

    ``control_input`` is accepted to match the reward interface but is unused.
    """
    horizontal_miss = np.abs(state[0] - 1.5)
    return -horizontal_miss
# Discount factor applied to the running reward at each step.
slingshot_discount = 0.99

# Bundle the slingshot dynamics, controller and reward into one simulatable system.
slingshot_system = ParametrizedDiscreteTimeSystem(
    state_transition_function=slingshot_transition_function,
    initial_state=slingshot_initial_state,
    controller=slingshot_controller,
    running_reward=slingshot_running_reward,
    discount=slingshot_discount,
)
    

## Example

In [None]:
# Roll out one episode with parameters pointing straight up, at the largest norm (0.04) the controller accepts.
trajectory, actions, total_reward = slingshot_system.run_with_parameters(np.array([0.0, 0.04]))
print("Total reward: %f" % total_reward)
# The first action is the launch impulse: the parameters plus Gaussian noise, so it varies between runs.
print("First action:", actions[0])
# Keep the animation as the cell's last expression so that the notebook renders it.
animate_slingshot(trajectory)

Total reward: -216.321517
First action: [3.54728747e-05 3.99833448e-02]


In [None]:
## YOUR SOLUTION