# Import Dependencies

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! PyTorch can use the GPU.")
else:
    print("CUDA is not available. PyTorch will use the CPU.")

CUDA is available! PyTorch can use the GPU.


# Load Environment

In [27]:
environment_name = 'CartPole-v1'
env = gym.make(environment_name, render_mode='human')

In [4]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info, *extra_values = env.step(action)
        score += reward
    print(f'Episode: {episode}, Score: {score}')
env.close()

Episode: 1, Score: 16.0
Episode: 2, Score: 17.0
Episode: 3, Score: 41.0
Episode: 4, Score: 51.0
Episode: 5, Score: 35.0


In [5]:
env.reset()

error: display Surface quit

In [6]:
episodes = 5
for episode in range(1, episodes+1):
   print(episode)

1
2
3
4
5


In [7]:
env.step(1)

error: display Surface quit

In [10]:
env.action_space

Discrete(2)

In [11]:
env.action_space.sample()

0

In [12]:
env.observation_space.sample()

array([ 1.2979455e+00,  2.4623403e+38,  7.4686050e-02, -1.8711460e+38],
      dtype=float32)

# Understanding the Environment

In [13]:
env.action_space

Discrete(2)

In [14]:
env.action_space.sample()

1

In [15]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [16]:
env.observation_space.sample()

array([ 2.2585249e+00, -2.5807695e+38,  3.9841634e-01,  1.3596435e+38],
      dtype=float32)

# Train Model

In [17]:
log_path = os.path.join('Training', 'Logs')

In [18]:
log_path

'Training\\Logs'

In [19]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [20]:
import sys
print(sys.executable)  # Check Python interpreter path
print(sys.path)  # Check for unexpected paths

C:\Users\USER\anaconda3\envs\cv_cuda\python.exe
['C:\\Program Files\\JetBrains\\DataSpell 2023.3.4\\plugins\\python-ce\\helpers-pro\\jupyter_debug', 'C:\\Program Files\\JetBrains\\DataSpell 2023.3.4\\plugins\\python-ce\\helpers\\pydev', 'G:\\Reinforcement Learning', 'G:\\Reinforcement Learning', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\python38.zip', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\DLLs', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\lib', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda', '', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\lib\\site-packages', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\lib\\site-packages\\win32', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\lib\\site-packages\\win32\\lib', 'C:\\Users\\USER\\anaconda3\\envs\\cv_cuda\\lib\\site-packages\\Pythonwin']


In [21]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 372  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 355         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009008573 |
|    clip_fraction        | 0.0859      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00196     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.07        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0128     |
|    value_loss           | 46.3        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1d9b478f250>

# Save Model and reload

In [22]:
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_Model_CartPole')

In [23]:
model.save(ppo_path)

In [24]:
del model

In [25]:
model = PPO.load(ppo_path, env=env)

# Evaluation

In [1]:
evaluation_policy = evaluate_policy(model, env, n_eval_episodes=10, render=True)
env.close()

NameError: name 'evaluate_policy' is not defined