TODO:
 * Stepper motor skipping steps--more current (better power supply)?
 * Way to pause and resume (pendulum fell off!)
 * Set up experiments (WandB?)
 * Train!
 * Normalize action and observation spaces (see: https://ai.stackexchange.com/questions/21477/why-do-we-also-need-to-normalize-the-actions-values-on-continuous-action-spaces)

In [1]:
# !python -m pip install gymnasium==0.28.1
# !python -m pip install stable-baselines3[extra]==2.1.0

In [2]:
import time

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
from control_comms import ControlComms, StatusCode, DebugLevel

# Reinforcement model modules
import stable_baselines3 as sb3
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import KVWriter, Logger

# Check versions
print(f"gym version: {gym.__version__}")
print(f"sb3 version: {sb3.__version__}")

gym version: 0.28.1
sb3 version: 2.1.0


In [3]:
# Communication settings
SERIAL_PORT = "COM6"    # Check your devices
BAUD_RATE = 500000      # Must match what's in the Arduino code!
CTRL_TIMEOUT = 1.0      # Seconds
DEBUG_LEVEL = DebugLevel.DEBUG_ERROR

# Reinforcement learning settings
K_T = 1                 # Reward constant to multiply theta (angle of encoder)
K_DT = 0.1              # Reward constant to multiply dtheto/dt (angular velocity of encoder)
K_P = 0.01              # Reward constant to multiply phi (angle of stepper)
K_DP = 0.001            # Reward constant to multiply dphi/dt (angular velocity of stepper)
REWARD_OOB = -10_000    # Reward (penalty) for having the stepper motor move out of bounds (OOB)
ENC_OFFSET = 180.0      # Pendulum in the "up" position should be 0 deg
STP_MOVE_MIN = -10.0
STP_MOVE_MAX = 10.0
STP_ANGLE_MIN = -180.0  # Episode ends if stepper goes beyond this angle
STP_ANGLE_MAX = 180.0   # Episode ends if stepper goes beyond this angle
ENV_TIMEOUT = 10.0
RESET_SETTLE_TIME = 1.0

In [4]:
# Communication constants
CMD_SET_HOME = 0        # Set current stepper position as home (0 deg)
CMD_MOVE_TO = 1         # Move stepper to a particular position (deg)
CMD_MOVE_BY = 2         # Move stepper by a given amount (deg)
CMD_SET_STEP_MODE = 3   # Set step mode
CMD_SET_BLOCK_MODE = 4  # Set blocking mode
CMD_NOP = 5             # Take no action, just receive observation
STEP_MODE_1 = 0         # 1 division per step
STEP_MODE_2 = 1         # 2 divisions per step
STEP_MODE_4 = 2         # 4 divisions per step
STEP_MODE_8 = 3         # 8 divisions per step
STEP_MODE_16 = 4        # 16 divisions per step
STATUS_OK = 0           # Stepper idle
STATUS_STP_MOVING = 1   # Stepper is currently moving

# Set to desired step mode
STEP_MODE = STEP_MODE_8

In [5]:
# Close connection to Arduino board (if open)
try:
    controller.close()
except:
    pass

In [6]:
# Connect to Arduino board
controller = ControlComms(timeout=CTRL_TIMEOUT, debug_level=DEBUG_LEVEL)
ret = controller.connect(SERIAL_PORT, BAUD_RATE)
if ret is not StatusCode.OK:
    print("ERROR: Could not connect to board")

In [7]:
# Test basic comms
controller.step(CMD_SET_STEP_MODE, [STEP_MODE_8])
controller.step(CMD_SET_HOME, [0])
controller.step(CMD_SET_BLOCK_MODE, [1])
controller.step(CMD_MOVE_BY, [90])

(0, 13854, False, [0.0, 0.22])

In [8]:
# Close comms
controller.close()

## Build gym Environment

Subclass gymnasium.Env to create a custom environment. Learn more here:<br>
https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/

In [9]:
class Pendulum(gym.Env):
    """
    Subclass gymnasium Env class
    
    This is the gym wrapper class that allows our agent to interact with our environment. We need
    to implement four main methods: step(), reset(), render(), and close(). We should also define
    the action_space and observation space as class members.
    
    Note: on Windows, time.sleep() is only accurate to around 10ms. As a result, setting fps_limit
    will give you a "best effort" limit.
    
    More information: https://gymnasium.farama.org/api/env/
    """
    
    def __init__(
        self,
        serial_port,
        baud_rate,
        ctrl_timeout=1.0,
        debug_level=DebugLevel.DEBUG_NONE,
        env_timeout=0.0, 
        stp_mode=STEP_MODE_8, 
        stp_blocking=False
    ):
        """
        Set up the environment, action, and observation shapes. Optional tiemout in seconds.
        """
        
        # Call superclass's constructor
        super().__init__()
        
        # Connect to Arduino board
        self.ctrl = ControlComms(timeout=ctrl_timeout, debug_level=debug_level)
        try:
            self.ctrl.close()
        except:
            pass
        ret = self.ctrl.connect(serial_port, baud_rate)
        if ret is not StatusCode.OK:
            print("ERROR: Could not connect to board")
        
        # Define action space (scalar signifying how many degrees to move stepper by)
        self.action_space = gym.spaces.Box(
            low=STP_MOVE_MIN,
            high=STP_MOVE_MAX,
            shape=(1, 1),
            dtype=np.float32
        )
        
        # Define observation space 
        # [encoder angle, encoder angular velocity, stepper angle, stepper angular velocity]
        self.observation_space = gym.spaces.Box(
            low=np.array([-180, -np.inf, STP_ANGLE_MIN, -np.inf]),
            high=np.array([180, np.inf, STP_ANGLE_MAX, np.inf]),
            dtype=np.float32
        )
        
        # Record time from microcontroller and own elapsed time
        self.timestamp = 0
        self.timeout = env_timeout
        self.start_time = time.time()
        
        # Record previous encoder and stepper angles (to calculate velocities)
        self.angle_stp_prev = 0
        self.angle_enc_prev = 0
        
        # Set current stepper position as "home" and optionally set blocking
        self.ctrl.step(CMD_SET_STEP_MODE, [stp_mode])
        self.ctrl.step(CMD_SET_HOME, [0])
        if stp_blocking:
            self.ctrl.step(CMD_SET_BLOCK_MODE, [1])
        else:
            self.ctrl.step(CMD_SET_BLOCK_MODE, [0])
    
    def __del__(self):
        """
        Destructor: make sure to close the serial port
        """
        self.close()
    
    def step(self, action: np.ndarray):
        """
        What happens when you tell the stepper motor to do something then record the observation.
        """
        
        # Initialize return values
        obs = np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32)
        reward = 0.0
        info = {"error": False, "dtime": 0.0, "elapesed_time": 0.0}
        terminated = False
        truncated = False
        
        # Box is 2D NumPy array, action must be sent out as 1D list [...]
        action_list = action.flatten().tolist()
        
        # Move the stepper motor and wait for a response
        resp = self.ctrl.step(CMD_MOVE_BY, action_list)
        if resp:
            status, timestamp, terminated, angles = resp
            
            # Compute lapsed time from previous observation
            info["dtime"] = timestamp - self.timestamp
            self.timestamp = timestamp
            
            # Calculate velocities
            dtheta = (angles[0] - self.angle_enc_prev) / info["dtime"]
            dphi = (angles[1] - self.angle_stp_prev) / info["dtime"]
            
            # Construct observation
            obs[0] = angles[0] - ENC_OFFSET
            obs[1] = dtheta
            obs[2] = angles[1]
            obs[3] = dphi
                    
            # Calculate reward
            if (obs[2] >= STP_ANGLE_MIN) and (obs[2] <= STP_ANGLE_MAX):
                reward = -1 * (K_T * obs[0] ** 2 + 
                               K_DT * obs[1] ** 2 + 
                               K_P * obs[2] ** 2 +
                               K_DP * obs[3] ** 2)
            
            # Stepper motor is out of bounds--terminate episode
            else:
                reward = REWARD_OOB
                terminated = True
        
        # Something is wrong with communication
        else:
            print("ERROR: Could not communicate with Arduino")
            info["error"] = True
            terminated = True
        
        # Calculate elapsed time
        info["elapsed_time"] = time.time() - self.start_time
        
        # Check if we've exceeded the time limit
        if not terminated and self.timeout > 0.0 and info["elapsed_time"] >= self.timeout:
            truncated = True
        
        return obs, reward, terminated, truncated, info
    
    def reset(self, seed=None):
        """
        Return the pendulum to the starting position
        """
        
        # Initialize return values
        obs = np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32)
        info = {"error": False, "dtime": 0, "elapsed_time": 0.0}
        
        # Reset timer
        self.start_time = time.time()
        
        # Let the pendulum fall and return to the starting position
        time.sleep(RESET_SETTLE_TIME)
        resp = self.ctrl.step(CMD_MOVE_TO, [0.0])
        if resp:
            status, timestamp, terminated, angles = resp
            
            # Compute lapsed time from previous observation
            info["dtime"] = timestamp - self.timestamp
            self.timestamp = timestamp
            
            # Calculate velocities
            dtheta = (angles[0] - self.angle_enc_prev) / info["dtime"]
            dphi = (angles[1] - self.angle_stp_prev) / info["dtime"]
            
            # Construct observation
            obs[0] = angles[0] - ENC_OFFSET
            obs[1] = dtheta
            obs[2] = angles[1]
            obs[3] = dphi
            time.sleep(RESET_SETTLE_TIME)
            
        # Something is wrong with communication
        else:
            print("ERROR: Could not communicate with Arduino")
            info["error"] = True
            
        # Calculate elapsed time
        info["elapsed_time"] = time.time() - self.start_time
        
        return obs, info
    
    def close(self):
        """
        Close connection to Arduino
        """
        self.ctrl.close()

## Test gym Environment

Test the gym wrapper before training

In [10]:
# Create our environment
try:
    env.close()
except:
    pass
env = Pendulum(
        SERIAL_PORT,
        BAUD_RATE,
        ctrl_timeout=CTRL_TIMEOUT,
        debug_level=DEBUG_LEVEL,
        env_timeout=ENV_TIMEOUT, 
        stp_mode=STEP_MODE, 
        stp_blocking=True
)

In [11]:
# Try running the environment for a few steps
obs, info = env.reset()
obs_str = ", ".join([f"{val:.2f}" for val in obs])
if info["error"]:
    print("Stopping")
else:
    print(f"{'Step': ^8} | {'Observation': ^32} | {'Reward': ^16} | {'Done': ^8} | Info")
    print(f"{'Reset': ^8} | {obs_str: <32} | {0.0: <16} | {str(False): ^8} | {info}")
    for i in range(10):
        obs, reward, terminated, truncated, info = env.step(np.array([[-25]]))
        obs_str = ", ".join([f"{val:.2f}" for val in obs])
        print(f"{i: ^8} | {obs_str: <32} | {reward: <16.2f} | {str(terminated or truncated): ^8} | {info}")
        if info["error"]:
            print("Stopping")
            break
        if terminated or truncated:
            print("Episode done")
            break

  Step   |           Observation            |      Reward      |   Done   | Info
 Reset   | 168.90, 0.02, 0.00, 0.00         | 0.0              |  False   | {'error': False, 'dtime': 19225, 'elapsed_time': 2.013803482055664}
   0     | 177.00, 0.32, 0.00, 0.00         | -31329.01        |  False   | {'error': False, 'dtime': 1109, 'elapesed_time': 0.0, 'elapsed_time': 2.1107232570648193}
   1     | -157.50, 0.23, -24.97, -0.25     | -24812.49        |  False   | {'error': False, 'dtime': 99, 'elapesed_time': 0.0, 'elapsed_time': 2.2053277492523193}
   2     | -140.40, 0.42, -49.95, -0.53     | -19737.13        |  False   | {'error': False, 'dtime': 95, 'elapesed_time': 0.0, 'elapsed_time': 2.299238443374634}
   3     | -135.60, 0.47, -74.93, -0.79     | -18443.53        |  False   | {'error': False, 'dtime': 95, 'elapesed_time': 0.0, 'elapsed_time': 2.396799325942993}
   4     | -144.60, 0.36, -99.90, -1.01     | -21008.98        |  False   | {'error': False, 'dtime': 99, 'elapesed_tim

In [12]:
# Test timeout
obs, info = env.reset()
action = 2
if not info["error"]:
    for _ in range(1000):
        action = -2 if action == 2 else 2
        obs, reward, terminated, truncated, info = env.step(np.array([[action]]))
        if terminated or truncated:
            print("Episode done")
            break

Episode done


In [13]:
# Final environment check to make sure it works with Stable-Baselines3
env_checker.check_env(env)



In [18]:
# Close the environment
env.close()

## Train Model

In [19]:
# Create our environment
try:
    env.close()
except:
    pass
env = Pendulum(
        SERIAL_PORT,
        BAUD_RATE,
        ctrl_timeout=CTRL_TIMEOUT,
        debug_level=DEBUG_LEVEL,
        env_timeout=ENV_TIMEOUT, 
        stp_mode=STEP_MODE, 
        stp_blocking=True
)

In [20]:
# Function that tests the model in the given environment
def test_model(env, model):

    # Reset environment
    obs, info = env.reset()
    ep_len = 0
    ep_rew = 0

    # Run episode until complete
    while True:

        # Provide observation to policy to predict the next action
        action, _ = model.predict(obs)

        # Perform action, update total reward
        obs, reward, terminated, truncated, info = env.step(action)
        ep_rew += reward

        # Increase step counter
        ep_len += 1

        # Check to see if episode has ended
        if terminated or truncated:
            return ep_len, ep_rew

In [21]:
# Initialize model
# PPO docs: https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
# Policy networks: https://stable-baselines.readthedocs.io/en/master/modules/policies.html
# Hyperparameters from: https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml
model = sb3.PPO(
    'MlpPolicy',
    env,
    learning_rate=0.001,       # Learning rate of neural network (default: 0.0003)
    n_steps=1024,               # Number of steps per update (default: 2048)
    batch_size=64,              # Minibatch size for NN update (default: 64)
    gamma=0.9,                 # Discount factor (default: 0.99)
    ent_coef=0.0,               # Entropy, how much to explore (default: 0.0)
    use_sde=True,               # Use generalized State Dependent Exploration (default: False)
    sde_sample_freq=4,          # Number of steps before sampling new noise matrix (default -1)
    policy_kwargs={'net_arch': [64, 64]}, # 2 hidden layers, 1 output layer (default: [64, 64])
    verbose=0                   # Print training metrics (default: 0)
)

In [22]:
# Training and testing hyperparameters
NUM_ROUNDS = 20
NUM_TRAINING_STEPS_PER_ROUND = 5000
NUM_TESTS_PER_ROUND = 100
MODEL_FILENAME_BASE = "inverted-pendulum-ppo"

# Train and test the model for a number of rounds
avg_ep_lens = []
avg_ep_rews = []
for rnd in range(NUM_ROUNDS):

    # Train the model
    model.learn(total_timesteps=NUM_TRAINING_STEPS_PER_ROUND)

    # Save the model
    model.save(f"{MODEL_FILENAME_BASE}_{rnd}")

    # Test the model in several episodes
    avg_ep_len = 0
    avg_ep_rew = 0
    for ep in range(NUM_TESTS_PER_ROUND):
        ep_len, ep_rew = test_model(env, model)
        avg_ep_len += ep_len
        avg_ep_rew += ep_rew

    # Record and dieplay average episode length and reward
    avg_ep_len /= NUM_TESTS_PER_ROUND
    avg_ep_lens.append(avg_ep_len)
    avg_ep_rew /= NUM_TESTS_PER_ROUND
    avg_ep_rews.append(avg_ep_rew)
    print(f"Round {rnd} | average test length: {avg_ep_len}, average test reward: {avg_ep_rew}")

Round 0 | average test length: 143.01, average test reward: -2070948.0657016016
Round 1 | average test length: 227.12, average test reward: -6856389.350233657
Round 2 | average test length: 133.32, average test reward: -3321301.3769947374
Round 3 | average test length: 344.7, average test reward: -3820116.4364356752
Round 4 | average test length: 33.28, average test reward: -644523.9924307623
Round 5 | average test length: 56.74, average test reward: -1593507.5790542322
Round 6 | average test length: 35.83, average test reward: -654718.3901519384
Round 7 | average test length: 135.86, average test reward: -3355535.89865323


KeyboardInterrupt: 

In [None]:
# Plot test lengths and rewards
# TODO

In [None]:
# Close the environment
env.close()