# Bayesian Optimization Tutorial (Exercise): Using the Ax package to tune a linear controller that stabilizes a cart-pole
**By Wenjie Xu**

Throughout this exercise, you will see blocks of code. Some of them contain the following marker:
```python
# [TODO]
```

This marks the part of the code that you need to implement yourself.





# In this exercise, we provide the reward function to maximize, and you are asked to use the Ax package to tune the linear controller.

In [None]:
import gym
import numpy as np
import time
import matplotlib.pyplot as plt
from IPython.display import Video, display, clear_output
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import moviepy.editor as mpy 
import os
from render import show_video # show video implements the visualization of frames

# The goal is to stabilize the following cart-pole, both in position and angle.

In [None]:
# Show the reference cart-pole recording inline in the notebook output.
display(Video('./render_videos/cart_pole_play.mp4', embed=True))

In [None]:
# [TODO]: import relevant packages including Ax
# import Ax related packages


# We have implemented the function that evaluates the reward for you.

In [None]:
def run_cart_pole_experiment(Kp, Kd, Kpx, Kdx, num_episodes=5, max_steps=500, render=True):
    """Roll out a linear state-feedback controller on CartPole-v1 and score it.

    The controller computes
    ``force = Kp * theta + Kd * theta_dot + Kpx * x + Kdx * x_dot``
    and pushes the cart right (action 1) when the force is positive and
    left (action 0) otherwise.

    Args:
        Kp: Gain on the pole angle ``theta``.
        Kd: Gain on the pole angular velocity ``theta_dot``.
        Kpx: Gain on the cart position ``x``.
        Kdx: Gain on the cart velocity ``x_dot``.
        num_episodes: Number of episodes to run; must be at least 1.
        max_steps: Maximum number of steps per episode.
        render: When True (the default), capture an RGB frame at every step.
            Pass False to skip rendering, e.g. inside an optimization loop
            where only the reward is needed.

    Returns:
        A tuple ``(reward, frames)`` where ``reward`` is the *minimum* shaped
        total reward over all episodes (the worst case, not the mean) and
        ``frames`` is the list of frames captured during the *last* episode
        (empty when ``render`` is False).

    Raises:
        ValueError: If ``num_episodes`` is less than 1.
    """
    # Fail early with a clear message instead of an opaque reduction error
    # from np.min on an empty list after the environment was already created.
    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    def pd_controller(state):
        # Extract the state variables
        x, x_dot, theta, theta_dot = state

        # Linear feedback on all four state variables
        force = Kp * theta + Kd * theta_dot + Kpx * x + Kdx * x_dot

        # Convert force to discrete action (1 when force > 0, else 0)
        return 1 if force > 0 else 0

    # Initialize the CartPole environment.
    # NOTE: the calls below use the classic Gym API (reset() returns the
    # observation, step() returns a 4-tuple, render() takes a mode argument).
    env = gym.make('CartPole-v1')

    total_reward_list = []
    frames = []
    try:
        for episode in range(num_episodes):
            state = env.reset()
            total_reward = 0
            frames = []  # only the most recent episode's frames are kept
            for _ in range(max_steps):
                if render:
                    frames.append(env.render(mode='rgb_array'))  # Capture frame

                # Get action from the controller
                action = pd_controller(state)

                # Step the environment
                state, reward, done, _info = env.step(action)

                # Shaped reward: penalize the pole angle (state[2]) and the
                # cart velocity (state[1]); clipped at zero per step.
                total_reward += max((reward - 70 * np.abs(state[2]) - 5 * np.abs(state[1])), 0)

                if done:
                    break

            print(f"Episode {episode + 1}: Total Reward = {total_reward}")
            total_reward_list.append(total_reward)
    finally:
        # Release the environment even if a rollout raises.
        env.close()

    return np.min(total_reward_list), frames


In [None]:
# [TODO]: manually tune Kp, Kd, Kpx and Kdx yourself before calling the Ax package
# Kp / Kd act on the pole angle and angular velocity; Kpx / Kdx act on the
# cart position and velocity (see the control law in run_cart_pole_experiment).
manual_kp = 2.0
manual_kd = 1.0
manual_kpx = 2.0
manual_kdx = 1.0
# NOTE: despite the variable name, the first returned value is the *minimum*
# episode reward (np.min), not the mean; `frames` holds the last episode only.
mean_total_reward, frames = run_cart_pole_experiment(manual_kp, manual_kd, manual_kpx, manual_kdx, num_episodes=5)
show_video(frames)

# Now implement the tuning algorithm yourself using the Ax package

In [None]:
# [TODO]: define the Ax client and the interface to evaluate objective


def pd_to_frames(pd):
    """Run a single episode with the gains in ``pd`` and return its frames.

    ``pd`` is a mapping that provides the keys 'kp', 'kd', 'kpx' and 'kdx'.
    The reward returned by the rollout is discarded.
    """
    gains = [pd[key] for key in ('kp', 'kd', 'kpx', 'kdx')]
    _, rendered = run_cart_pole_experiment(*gains, num_episodes=1)
    return rendered




In [None]:
# [TODO]: run the optimization loop


In [None]:
# [TODO] plot the optimization trace


In [None]:
# [TODO] compare the running result with the parameters tuned by you manually and the parameters obtained by Ax 

# Example code of doing visualization
# init_frames = pd_to_frames(init_parameters)
# show_video(init_frames)

In [None]:
# `best_parameters` must be defined by your optimization code above: a dict
# with the keys 'kp', 'kd', 'kpx' and 'kdx' (the keys read by pd_to_frames).
best_frames = pd_to_frames(best_parameters)
show_video(best_frames)

# Bonus question.
Tune the controller using only pairwise comparison data. That is, in each step the algorithm proposes two solutions and you indicate which one you prefer. The algorithm then uses this comparison data to search for your preferred solution.

# Hint.
Refer to https://botorch.org/tutorials/preference_bo .
