# Q-learning with neural networks

In [2]:
# a few packages we need to import
%matplotlib widget

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as animation
import IPython 
from tqdm import tqdm

import torch

import pendulum

The goal of this homework is to implement the Q-learning with a neural network for the Q function to solve the inverted pendulum problem.

<img src='pendulum.png' width="120">

In the following, we write $x = \begin{pmatrix} \theta \\ \dot{\theta} \end{pmatrix}$ as the vector of states of the system.

## System dynamics
* The system dynamics is implemented in the `pendulum.py` function. The dynamics is implemented in `pendulum.step`.
* The allowed control inputs are $[-5,0,5]$

## Cost function
The goal is to find a policy that minimizes the following cost
$$\min \sum_{n=0}^N \alpha^n g(x,u)$$
where
$$g(x,v,u) = 0.01*(1-\cos(x-\pi))^2 + 0.001* v^2 + 0.00001*u^2$$
which gives a high cost for states far from $\pi$ (i.e. far from the inverted position) or states with non zero velocity or high controls



## Q-learning algorithm to implement
For each episode:
* Initialize the episode $x_0 = [0,0]$
* For each step of the episode:
    * Select $u_n$ using an $\epsilon$-greedy policy
    * Compute the next state $x_{n+1}$
    * Compute the target $y_n = g(x_n,u_n) + \alpha \min_a Q(x_{n+1},a)$
    * Do one SGD step on the neural network parameters to minimize $(Q(x,u) - y_t)^2$


## Parameters:
* Episode length 100 steps
* Discount factor $\alpha = 0,99$
* Learning rate (for SGD) 0.1
* $\epsilon = 0.1$



## Using PyTorch
You need to install and use PyTorch for the neural network and do the optimization. 

You may want to use the following functions:
* [`torch.optim.SGD`](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html)
* [`torch.nn.MSELoss`](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html)

The neural network is given below

In [2]:
## we define the neural network to be used for Q-learning
## 2 hidden layers with 64 nodes
## 2 inputs (state)
## 3 outputs for the 3 possible controls
D_in, H, D_out = 2, 64, 3

q_function = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

## we initialize the network parameters to 0
for params in q_function.parameters():
    params = torch.zeros_like(params)


### possible controls
possible_controls = np.array([-5.,0.,5.])

# Questions:
1. Implement the Q-learning algorithm described above
2. Test that it works with and without pushes using the code below
3. Plot the cost per episode (to visualize learning)
4. Plot the learned value function (in 2D as a function of pendulum position and velocity) as well as the policy.
5. Describe the algorithm and put the plots in a short report (max 2 pages) and include a video of the pendulum.

## Testing
You can test your results with the code below which use the Q-function to create a controller send to the `animate_robot` function.
You can choose to save also a movie of the animation and toggle the animation with a disturbance.

In [3]:
# x0 = np.zeros((2,1))
# def controller(x):
#     u_pred = torch.argmin(q_function(torch.as_tensor(x, dtype=torch.float).unsqueeze(0))).item()
#     u = possible_controls[u_pred]
#     return u
    
# pendulum.animate_robot(x0,controller,push=True, save_movie=True)


In [4]:
def cost(x, u):
    return (0.01 * (1 - np.cos(x[0] - np.pi)) + 0.001 * x[1]**2 + 0.00001 * u**2)

In [5]:
# Make device agnostic code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
q_function.to(device=device)
# np.random.seed(42)
# torch.manual_seed(42)
N = 100
MAX_ITER = 5000
alpha = 0.99
lr = 0.1
epsilon = 0.1

# loss_fn = torch.nn.MSELoss(reduction='sum')
loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.SGD(q_function.parameters(), lr=lr)

In [6]:
for _ in tqdm(range(MAX_ITER), desc=f'Training in Progress'):
    xi = torch.tensor(np.zeros((2, )), device=device, dtype=torch.float)
    for _ in range(N):
        forward_pass = q_function.forward(xi.unsqueeze(0))
        if torch.rand(1).item() > epsilon:
            ui = torch.argmin(forward_pass).item()
        else:
            ui = torch.randint(0, 3, (1,)).item()

        with torch.no_grad():
            xip1 = torch.tensor(pendulum.step(
                x=xi.cpu().numpy(), u=possible_controls[ui]), device=device, dtype=torch.float)

            yi = torch.tensor((cost(x=xi.cpu().numpy(), u=possible_controls[ui]) + (alpha * torch.min(
                q_function.forward(xip1.unsqueeze(0))).item())), device=device, dtype=torch.float)

        loss = loss_fn(forward_pass.squeeze()[ui], yi)
        if ~torch.isnan(loss):
            pass
        # use the optimizer object to zero all of the gradients for the variables it will update,
        # i.e. the weights of the model. Checkout docs of torch.autograd.backward for more details.
        optimizer.zero_grad()

        # compute gradient of the loss with respect to model parameters (backward autodiff)
        loss.backward()

        # call the step function of the optimizer to make one update of the parameters
        optimizer.step()

        xi = torch.tensor(xip1, device=device, dtype=torch.float)

print(f"values of x0: {xi}")

  xi = torch.tensor(xip1, device=device, dtype=torch.float)
Training in Progress: 100%|██████████| 5000/5000 [17:56<00:00,  4.65it/s]


values of x0: tensor([5.5513, 0.6516], device='cuda:0')


In [10]:
# print(xi.shape)
# print(xi)

# x0 = np.zeros((2, 1))
x0 = np.array([5.5513, 0.6516]).reshape(2, 1)


def controller(x):
    u_pred = torch.argmin(q_function(torch.as_tensor(x, dtype=torch.float, device=device).unsqueeze(0))).item()
    u = possible_controls[u_pred]
    return u

# assert(x0.shape == (2,))
# assert(x0.shape == 2)
assert(x0.shape[0]==2)
x = np.zeros((2, 2))  # Replace n_rows and n_columns with actual dimensions
x[:,0] = x0[:,0]

pendulum.animate_robot(x0, controller, push=False, save_movie=True)

NameError: name 'q_function' is not defined

In [None]:
def traj_controller():
    for _ in tqdm(range(5000), desc=f'Running Simulation'):
        xi = torch.from_numpy(np.zeros((2, )), dtype=torch.float).to(device=device, dtype=torch.float)
        for _ in range(100):
            # First we do the forward pass
            forward_pass = q_function.forward(xi.unsqueeze(0))
            # implement epsilon greedy policy
            if torch.rand(1).item() > epsilon:
                ui = torch.argmin(forward_pass).item()
            else:
                ui = torch.randint(0, 3, (1,)).item()

            # No gradient calculation needed
            with torch.no_grad():
                xip1 = torch.from_numpy(pendulum.step(x=xi.cpu().numpy(), u=possible_controls[ui])).to(device=device, dtype=torch.float)
                cost_value = cost(x=xi.cpu().numpy(), u=possible_controls[ui])
                q_min_value = torch.min(q_function.forward(xip1.unsqueeze(0))).item()
                total_value = cost_value + (alpha * q_min_value)
                yi = torch.tensor(total_value, device=device, dtype=torch.float)
            
            loss = loss_fn(forward_pass.squeeze()[ui], yi)
            # Check if the loss is nan
            if ~torch.isnan(loss):
                pass

            # Zero all the gradients
            optimizer.zero_grad()

            # Compute the gradients
            loss.backward()

            # Update the weights
            optimizer.step()

            xi = torch.from_numpy(xip1.cpu(), dtype=torch.float).to(device=device, dtype=torch.float)

        # print(f"values of x0: {xi}")
    return xi
                