<a href="https://colab.research.google.com/github/Wyatt-Kugler/Cartpole-RL-Model/blob/main/Cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install gymnasium
!pip install imageio
!pip install IPython
!pip install numpy
!pip install tqdm




In [19]:
from google.colab import drive
drive.mount('/content/drive')

save_folder = "/content/drive/MyDrive/Cartpole/Cartpole_GIFs/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import gymnasium as gym
import imageio
import numpy as np

from IPython.display import Image
from collections import defaultdict

In [22]:
import numpy as np
from collections import defaultdict

def discretize(obs, bins):
    """Convert continuous obs to discrete indices."""
    upper_bounds = [4.8, 5, 0.418, 5]  # CartPole-v1 env limits
    lower_bounds = [-4.8, -5, -0.418, -5]

    ratios = [(obs[i] - lower_bounds[i]) / (upper_bounds[i] - lower_bounds[i]) for i in range(4)]
    new_obs = [int(np.clip(r * (bins[i]-1), 0, bins[i]-1)) for i, r in enumerate(ratios)]
    return tuple(new_obs)

class CartAgent:
    def __init__(
        self,
        env,
        learning_rate,
        initial_epsilon,
        epsilon_decay,
        final_epsilon,
        discount_factor=0.95,
    ):
        """ Initializes Q-Learning Agent """
        self.env = env
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.training_error = []

    def get_action(self, obs):
        """ Choose action using epsilon-greedy policy """
        state_index = discretize(obs, bins=[10, 10, 10, 10])
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.q_values[state_index])

    def update(self, obs, action, reward, terminated, next_obs):
        """ Update Q-values """
        obs_index = discretize(obs, bins=[10, 10, 10, 10])
        next_index = discretize(next_obs, bins=[10, 10, 10, 10])

        # only include future reward if episode is not terminated
        future_q_value = np.max(self.q_values[next_index]) * (not terminated)
        target = reward + self.discount_factor * future_q_value

        td_error = target - self.q_values[obs_index][action]
        self.q_values[obs_index][action] += self.lr * td_error

        self.training_error.append(td_error)

    def decay_epsilon(self):
        """ Decay exploration rate """
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)


In [23]:
learning_rate = 0.05
n_episodes = 5000
start_epsilon = 1.0
epsilon_rate = 0.5
epsilon_decay = start_epsilon / (n_episodes / 1 + epsilon_rate)
final_epsilon = 0.02
recording_frequency = 500 # How often to record

env = gym.make("CartPole-v1", render_mode="rgb_array", max_episode_steps=1000)

agent = CartAgent(
    env=env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)


In [34]:
def reinforcement_learn(
    learning_rate,
    n_episodes,
    epsilon_decay,
    final_epsilon,
    start_epsilon = 1.0,
    recording_frequency = 500
):
  ''' Trains the agent using Q-Learning.

      Args:
        learning_rate: The learning rate to use.
        n_episodes: The number of episodes to train for.
        start_epsilon: The starting value of epsilon.
        epsilon_decay: The decay rate of epsilon.
        final_epsilon: The final value of epsilon.
  '''
  from tqdm import tqdm
  import imageio

  Recording_Number = 0
  gif_files = []

  episode_rewards = []

  for episode in tqdm(range(n_episodes)):
    recording = False

    total_reward = 0

    # Record every n episodes
    if episode % recording_frequency == 0:
      frames = []
      Recording_Number += 1
      recording = True
    obs, info = env.reset()

    # recording = False #For Grid Search


    if recording:
      frames.append(env.render())

    done = False

    while not done:
      action = agent.get_action(obs)
      next_obs, reward, terminated, truncated, info = env.step(action)
      total_reward += reward


      agent.update(obs, action, reward, terminated, next_obs)

      done = terminated or truncated

      if recording:
        frames.append(env.render())
      obs = next_obs
    episode_rewards.append(total_reward)
    if recording:
      gif_path = f"cartpole_loop_{Recording_Number}.gif"

      full_path = save_folder + gif_path

      imageio.mimsave(full_path, frames, fps=30)

      gif_files.append(full_path)

    agent.decay_epsilon()
  return episode_rewards

In [None]:
reinforcement_learn(learning_rate,n_episodes, epsilon_decay,final_epsilon)

 99%|█████████▊| 7890/8000 [04:49<00:03, 33.50it/s]

In [33]:
## Grid Searching ##

import os
import shutil
grid_search_dict = {}

learning_rates = [0.01, 0.05,0.1]
n_episode_list = [1000,3000]
epsilon_rate_list = [0.1,0.3,0.5,1]
final_epsilon_list = [0.01,0.05,0.1,0.2]

for i in range(len(learning_rates)):
  for j in range(len(n_episode_list)):
    for k in range(len(epsilon_rate_list)):
      for l in range(len(final_epsilon_list)):
        episode_rewards=reinforcement_learn(learning_rates[i],n_episode_list[j], epsilon_rate_list[k],final_epsilon_list[l])

        total_size = 0
        for m in episode_rewards[-5:]:
          total_size += m
        total_size = total_size/5

        grid_search_dict[f"Learning Rate: {learning_rates[i]}, Episodes: {n_episode_list[j]}, Epsilon Rate: {epsilon_rate_list[k]}, Final Epsilon: {final_epsilon_list[l]}"] = total_size
        shutil.rmtree(save_folder)
        os.makedirs(save_folder)
print(grid_search_dict)





100%|██████████| 1000/1000 [00:23<00:00, 41.71it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.10it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.26it/s]
100%|██████████| 1000/1000 [00:24<00:00, 40.70it/s]
100%|██████████| 1000/1000 [00:23<00:00, 43.01it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.29it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.46it/s]
100%|██████████| 1000/1000 [00:24<00:00, 41.45it/s]
100%|██████████| 1000/1000 [00:23<00:00, 42.13it/s]
100%|██████████| 1000/1000 [00:23<00:00, 41.67it/s]
100%|██████████| 1000/1000 [00:23<00:00, 41.86it/s]
100%|██████████| 1000/1000 [00:24<00:00, 40.55it/s]
100%|██████████| 1000/1000 [00:23<00:00, 42.78it/s]
100%|██████████| 1000/1000 [00:24<00:00, 40.44it/s]
100%|██████████| 1000/1000 [00:23<00:00, 41.89it/s]
100%|██████████| 1000/1000 [00:23<00:00, 41.90it/s]
100%|██████████| 3000/3000 [01:11<00:00, 41.79it/s]
100%|██████████| 3000/3000 [01:11<00:00, 41.77it/s]
100%|██████████| 3000/3000 [01:11<00:00, 41.99it/s]
100%|███████

{'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.1, Final Epsilon: 0.01': 174.8, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.1, Final Epsilon: 0.05': 158.4, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.1, Final Epsilon: 0.1': 164.4, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.1, Final Epsilon: 0.2': 169.4, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.3, Final Epsilon: 0.01': 163.0, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.3, Final Epsilon: 0.05': 176.8, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.3, Final Epsilon: 0.1': 164.6, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.3, Final Epsilon: 0.2': 157.2, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.5, Final Epsilon: 0.01': 160.8, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.5, Final Epsilon: 0.05': 168.0, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.5, Final Epsilon: 0.1': 162.0, 'Learning Rate: 0.01, Episodes: 1000, Epsilon Rate: 0.5, F


