In [1]:
# a few packages we need to import
%matplotlib widget

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as animation
import IPython 

import torch

import pendulum

The goal of this homework is to implement the Q-learning with a neural network for the Q function to solve the inverted pendulum problem.

<img src='pendulum.png' width="120">

In the following, we write $x = \begin{pmatrix} \theta \\ \dot{\theta} \end{pmatrix}$ as the vector of states of the system.

## System dynamics
* The system dynamics is implemented in the `pendulum.py` function. The dynamics is implemented in `pendulum.step`.
* The allowed control inputs are $[-5,0,5]$

## Cost function
The goal is to find a policy that minimizes the following cost
$$\min \sum_{n=0}^N \alpha^n g(x,u)$$
where
$$g(x,v,u) = 0.01*(1-\cos(x-\pi))^2 + 0.001* v^2 + 0.00001*u^2$$
which gives a high cost for states far from $\pi$ (i.e. far from the inverted position) or states with non zero velocity or high controls



## Q-learning algorithm to implement
For each episode:
* Initialize the episode $x_0 = [0,0]$
* For each step of the episode:
    * Select $u_n$ using an $\epsilon$-greedy policy
    * Compute the next state $x_{n+1}$
    * Compute the target $y_n = g(x_n,u_n) + \alpha \min_a Q(x_{n+1},a)$
    * Do one SGD step on the neural network parameters to minimize $(Q(x,u) - y_t)^2$


## Parameters:
* Episode length 100 steps
* Discount factor $\alpha = 0,99$
* Learning rate (for SGD) 0.1
* $\epsilon = 0.1$



## Using PyTorch
You need to install and use PyTorch for the neural network and do the optimization. 

You may want to use the following functions:
* [`torch.optim.SGD`](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html)
* [`torch.nn.MSELoss`](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html)

The neural network is given below

In [2]:
## we define the neural network to be used for Q-learning
## 2 hidden layers with 64 nodes
## 2 inputs (state)
## 3 outputs for the 3 possible controls
D_in, H, D_out = 2, 64, 3

q_function = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

## we initialize the network parameters to 0
for params in q_function.parameters():
    params = torch.zeros_like(params)


### possible controls
possible_controls = np.array([-5.,0.,5.])

# Questions:
1. Implement the Q-learning algorithm described above
2. Test that it works with and without pushes using the code below
3. Plot the cost per episode (to visualize learning)
4. Plot the learned value function (in 2D as a function of pendulum position and velocity) as well as the policy.
5. Describe the algorithm and put the plots in a short report (max 2 pages) and include a video of the pendulum.

## Testing
You can test your results with the code below which use the Q-function to create a controller send to the `animate_robot` function.
You can choose to save also a movie of the animation and toggle the animation with a disturbance.

In [3]:
def cost_function(x, u):
    # Convert x to a PyTorch tensor if it's not already
    if not isinstance(x, torch.Tensor):
        x = torch.tensor(x, dtype=torch.float32)
    if not isinstance(u, torch.Tensor):
        u = torch.tensor(u, dtype=torch.float32)

    # Compute the cost using PyTorch operations
    return 0.01 * (1 - torch.cos(x[0] - np.pi))**2 + 0.001 * x[1]**2 + 0.00001 * u**2

optimizer = torch.optim.SGD(q_function.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()



# We need to define the q-learning parameters
alpha = 0.99
epsilon = 0.1
episode_length = 100
learning_rate = 0.1
episodes = 20000

# Now we need to define the Q-learning algorithm according to this
for episode in range(episodes):
    x = torch.tensor([0., 0.])
    for step in range(episode_length):
        # Select u_n using an epsilon-greedy policy
        if np.random.rand() < epsilon:
            u_n = np.random.choice(possible_controls)
        else:
            u_pred = torch.argmin(q_function(torch.as_tensor(x, dtype=torch.float32).unsqueeze(0))).item()
            u_n = possible_controls[u_pred]

        # Compute the next state x_{n+1}
        x_next = pendulum.step(x, u_n)

        # Compute the target y_n
        g_value = cost_function(x, u_n)
        q_value = q_function(torch.as_tensor(x_next, dtype=torch.float32).unsqueeze(0)).detach()

        y = g_value + alpha * torch.min(q_value)

        # Update the Q-function
        q_value = q_function(torch.as_tensor(x, dtype=torch.float32).unsqueeze(0))
        target = q_value.clone().detach()
        u_index = np.where(possible_controls == u_n)[0][0]
        target[0, u_index] = y

        # Compute the loss
        loss = loss_fn(q_value, target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the state
        x = x_next

    # Print the status of the episode
    if episode % 100 == 0:
        print(f"Episode {episode}, Step {step}, Loss {loss.item()}")


# def controller(x):


Episode 0, Step 99, Loss 0.00030738490750081837
Episode 100, Step 99, Loss 7.273672235896811e-05
Episode 200, Step 99, Loss 8.740862540435046e-07
Episode 300, Step 99, Loss 1.6727179172448814e-05
Episode 400, Step 99, Loss 7.322827150346711e-05
Episode 500, Step 99, Loss 5.733341367886169e-06
Episode 600, Step 99, Loss 1.620520720280183e-06
Episode 700, Step 99, Loss 0.0001474095624871552
Episode 800, Step 99, Loss 3.202239895472303e-07
Episode 900, Step 99, Loss 0.00029746326617896557
Episode 1000, Step 99, Loss 2.2971615180722438e-05
Episode 1100, Step 99, Loss 0.0011876699281856418
Episode 1200, Step 99, Loss 2.3640968720428646e-06
Episode 1300, Step 99, Loss 3.3658307074801996e-05
Episode 1400, Step 99, Loss 4.034489393234253e-06
Episode 1500, Step 99, Loss 6.417061285901582e-06
Episode 1600, Step 99, Loss 1.3776140804111492e-05
Episode 1700, Step 99, Loss 3.3423930290155113e-07
Episode 1800, Step 99, Loss 9.339417010778561e-05
Episode 1900, Step 99, Loss 1.9709432308445685e-06
Epi

In [4]:
x0 = np.zeros((2,1))
def controller(x):
    u_pred = torch.argmin(q_function(torch.as_tensor(x, dtype=torch.float).unsqueeze(0))).item()
    u = possible_controls[u_pred]
    return u
    
pendulum.animate_robot(x0,controller,push=False, save_movie=False)


In [5]:
x0 = np.zeros((2,1))
def controller(x):
    u_pred = torch.argmin(q_function(torch.as_tensor(x, dtype=torch.float).unsqueeze(0))).item()
    u = possible_controls[u_pred]
    return u
    
pendulum.animate_robot(x0,controller,push=True, save_movie=False)
