<a href="https://colab.research.google.com/github/ParlitsisG/Cart_pole_Dqn/blob/main/cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gym pygame ray[rllib]==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ray[rllib]==2.2.0
  Downloading ray-2.2.0-cp39-cp39-manylinux2014_x86_64.whl (57.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting virtualenv>=20.0.24
  Downloading virtualenv-20.21.0-py3-none-any.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/

In [3]:
import gym
import numpy as np
from ray.rllib.algorithms.dqn import DQNConfig

In [4]:
class CustomEnv(gym.Env):
    def __init__(self, env_config: dict):
        
        # Construct & Init Environment
        self._env = gym.make('CartPole-v1')

        # Define Action Space: 2 Discrete Actions for cartpole
        self.action_space = gym.spaces.Discrete(2)

        # Define State (Observation) Space: A Continuous State Space represented by a vector of size (4,)
        self.observation_space = gym.spaces.Box(
             low=np.array([-4.8, -np.inf, -0.42, -np.inf]),
            high=np.array([4.8, np.inf, 0.42, np.inf]),
            dtype=np.float32
        )

    # Reset Environment & Init Episode
    def reset(self):
        observation = self._env.reset()
        return observation

    def step(self, action):
        observation, reward, done, info = self._env.step(action)
        return observation, reward, done, info

    def render(self, mode: str or None = None):
        self._env.render()

In [5]:
config = DQNConfig()
config.num_steps_sampled_before_learning_starts = 1000
config.train_batch_size = 64
config.replay_buffer_config.update({
    'capacity': 50000
})
# Pause episode and train
config.batch_mode = 'truncate_episodes'

# Disabling Dueling Feature
config.dueling = False  # later try with True

# Setting Epsilon
config.exploration_config.update({
    "initial_epsilon": 0.5,
    "final_epsilon": 0.01,
    "epsilon_timesteps": 1000,
})

# 1 Step per training
# config.rollout_fragment_length = 1

# Set seed to constant value to reproduce results
config.seed = 0

# Gamma is the discount factor in bellman equation
config.gamma = 0.99

# Set learning rate of neural network
config.lr = 0.0005

# Enable gpu
config.num_gpus = 1

In [6]:

agent = config.framework("tf").environment(env=CustomEnv, env_config={}).build()

2023-03-15 17:04:07,971	INFO worker.py:1538 -- Started a local Ray instance.
2023-03-15 17:04:17,164	INFO trainable.py:172 -- Trainable.setup took 11.825 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [7]:
def evaluate(agent, eval_env, eval_episodes):
    total_rewards = 0.0

    for _ in range(eval_episodes):
        done = False
        observation = eval_env.reset()

        while not done:
            action = agent.compute_single_action(observation=observation)
            observation, reward, done, _ = eval_env.step(action)
            total_rewards += reward

    return total_rewards/eval_episodes

In [None]:
num_steps = 1000
eval_env = CustomEnv(env_config={})
eval_episodes = 5


for i in range(num_steps):
    agent.train()

    if i % 1 == 0:
        average_rewards = evaluate(agent, eval_env, eval_episodes)
        print('i =', i, ', average rewards =', average_rewards)




i = 0 , average rewards = 97.6
i = 1 , average rewards = 15.4
i = 2 , average rewards = 24.0
i = 3 , average rewards = 69.2
i = 4 , average rewards = 97.4
i = 5 , average rewards = 154.4
i = 6 , average rewards = 135.4
i = 7 , average rewards = 272.6
i = 8 , average rewards = 240.8
i = 9 , average rewards = 203.8
i = 10 , average rewards = 161.4
i = 11 , average rewards = 115.2
i = 12 , average rewards = 128.4
i = 13 , average rewards = 266.4
i = 14 , average rewards = 99.8
i = 15 , average rewards = 93.0
i = 16 , average rewards = 145.0
i = 17 , average rewards = 107.4
i = 18 , average rewards = 146.2
i = 19 , average rewards = 125.8
i = 20 , average rewards = 203.8
i = 21 , average rewards = 301.8
i = 22 , average rewards = 98.0
i = 23 , average rewards = 142.6
i = 24 , average rewards = 189.8
i = 25 , average rewards = 145.4
i = 26 , average rewards = 115.8
i = 27 , average rewards = 248.8
i = 28 , average rewards = 200.6
i = 29 , average rewards = 193.0
i = 30 , average rewards = 1