## Part 2 : REINFORCE algorithm

In [2]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pygame
import scipy as sc
from collections import deque
from tqdm import *

## Description

The system consists of two links connected linearly to form a chain, with one end of the chain fixed. The joint between the two links is actuated. The goal is to apply torques on the actuated joint to swing the free end of the linear chain above a given height while starting from the initial state of hanging downwards.

## Action Space
The action is discrete, deterministic, and represents the torque applied on the actuated joint between the two links.
    | Num | Action                                | Unit         |
    |-----|---------------------------------------|--------------|
    | 0   | apply -1 torque to the actuated joint | torque (N m) |
    | 1   | apply 0 torque to the actuated joint  | torque (N m) |
    | 2   | apply 1 torque to the actuated joint  | torque (N m) 

## Observation Space
The observation is a ndarray with shape (6,) that provides information about the two rotational joint angles as well as their angular velocities.

    | Num | Observation                  | Min                 | Max               |
    |-----|------------------------------|---------------------|-------------------|
    | 0   | Cosine of `theta1`           | -1                  | 1                 |
    | 1   | Sine of `theta1`             | -1                  | 1                 |
    | 2   | Cosine of `theta2`           | -1                  | 1                 |
    | 3   | Sine of `theta2`             | -1                  | 1                 |
    | 4   | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) |
    | 5   | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) |

where

theta1 is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly downwards.

theta2 is relative to the angle of the first link. An angle of 0 corresponds to having the same angle between the two links.

The angular velocities of theta1 and theta2 are bounded at ±4π, and ±9π rad/s respectively. A state of [1, 0, 1, 0, ..., ...] indicates that both links are pointing downwards.

## Rewards
The goal is to have the free end reach a designated target height in as few steps as possible, and as such all steps that do not reach the goal incur a reward of -1. Achieving the target height results in termination with a reward of 0. The reward threshold is -100.

## Starting State
Each parameter in the underlying state (theta1, theta2, and the two angular velocities) is initialized uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.

## Episode End
The episode ends if one of the following occurs:

1. Termination: The free end reaches the target height, which is constructed as: -cos(theta1) - cos(theta2 + theta1) > 1.0

2. Truncation: Episode length is greater than 500 (200 for v0)

In [2]:
env = gym.make('Acrobot-v1')

In [3]:
# To change the dynamics as described above
env.env.book_or_nips = 'nips'

In [4]:
n_actions = env.action_space.n
shape_states = env.observation_space.shape
print(n_actions)
print(shape_states)

3
(6,)


In [3]:
def policy(state,theta): #softmax
    p=state @ theta
    return sc.special.softmax(p)

def gradient_function(state,theta):
    z=[0,0,0]
    for i in range(3):
        z[i]=np.dot(state,1-policy(state,theta)[i])
    return np.array(z)

In [4]:
def reinforce(theta_0,lr,gamma,n_episode):
    env = gym.make("Acrobot-v1") #, render_mode="human"
    theta=theta_0
    History=[]
    
    for i in range(n_episode):
        X=[] #list of states
        A=[] #list of actions
        R=[] #list of rewards
        x,_=env.reset()
        n_move = 0 
        terminated=False
        truncated=False
        while not terminated and not truncated: #episode to fill the lists
            if n_move > 500:
                env.close()
                raise Exception("Too many attempts, failed")
            n_move += 1
            X.append(x)
            pol=policy(x, theta)
            #print(pol)
            action=np.random.choice([0,1,2], p=pol)
            #print(action)
            A.append(action)
            x, r, terminated, truncated, info = env.step(action)
            R.append(r)
        
        History.append(np.sum(R))
        n=0 
        while n<n_move: #list run for the adjustment of theta
          
            G=0
            for i in range(n+1,n_move):
                G=G+gamma**(i-n-1)*R[i]
                
            grad=np.transpose(gradient_function(X[n],theta))
            theta=theta+lr*gamma**n*G*grad
            
            n += 1
    #env.close()
    return theta, History

In [5]:
lr=0.001
n_episode=100
gamma=1
U=np.random.uniform(0,1,18)
theta_0=U.reshape((6,3))

In [6]:
reinforce(theta_0,lr,gamma,n_episode)
#plt.plot(History)
#plt.title("reward evolution")
#plt.show()

(array([[-124.60012089, -274.94165851, -152.15847092],
        [  29.83309144,    0.90688331,  -28.68907224],
        [ -75.99381911, -174.50116268,  -98.05783915],
        [  -1.71124265,    8.90064857,   11.26839414],
        [ 213.87047419,  -70.77670543, -282.68658015],
        [-433.91998441,   76.05018066,  511.99040072]]),
 [-148.0,
  -65.0,
  -92.0,
  -124.0,
  -65.0,
  -82.0,
  -64.0,
  -93.0,
  -74.0,
  -70.0,
  -72.0,
  -87.0,
  -89.0,
  -89.0,
  -86.0,
  -66.0,
  -64.0,
  -66.0,
  -81.0,
  -74.0,
  -90.0,
  -149.0,
  -65.0,
  -100.0,
  -90.0,
  -73.0,
  -89.0,
  -74.0,
  -87.0,
  -79.0,
  -182.0,
  -74.0,
  -89.0,
  -88.0,
  -73.0,
  -65.0,
  -78.0,
  -73.0,
  -64.0,
  -79.0,
  -208.0,
  -74.0,
  -177.0,
  -88.0,
  -64.0,
  -66.0,
  -92.0,
  -90.0,
  -72.0,
  -90.0,
  -87.0,
  -73.0,
  -90.0,
  -65.0,
  -76.0,
  -72.0,
  -66.0,
  -93.0,
  -76.0,
  -109.0,
  -87.0,
  -73.0,
  -105.0,
  -64.0,
  -64.0,
  -74.0,
  -64.0,
  -74.0,
  -89.0,
  -75.0,
  -72.0,
  -66.0,
  -85.0,
  