In [None]:
import torch.multiprocessing as mp
mp.set_start_method('spawn')

from process_cartpole import *
import gymnasium as gym
import numpy as np


N = 20000
batch_size = 4000
n_epochs = 4
alpha = 3e-4
gamma = 0.99
gae_lambda = 0.95
checkpoint_dir = 'tmp/ppo'
env = gym.make('CartPole-v1')
agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
                alpha=alpha, n_epochs=n_epochs, 
                input_dims=env.observation_space.shape)
agent.save_models()
agent.actor.share_memory()
agent.critic.share_memory()

trainer = AgentTrainer(agent, env, checkpoint_dir=checkpoint_dir, steps_to_post=N, batch_size=batch_size, collector_processes=1, normalize_reward=False, performance_games_to_sim=100, steps_between_performance_checks=3)
trainer.start_training()

... saving models ...
... loading models ...
First weights in actor: Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)


In [1]:
import gymnasium as gym
import numpy as np
from process_cartpole import *

env = gym.make('CartPole-v1', render_mode='human')
N = 20
batch_size = 5
n_epochs = 4
alpha = 0.0003
agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
                alpha=alpha, n_epochs=n_epochs, 
                input_dims=env.observation_space.shape)
n_games = 300

figure_file = 'plots/cartpole.png'

best_score = -1000
score_history = []
max_score = 0
episodes_without_best = 0


learn_iters = 0
avg_score = 0
n_steps = 0

memory = PPOMemory(batch_size)

for i in range(n_games):
    agent.load_models()
    observation = env.reset()[0]
    done = False
    truncated = False
    score = 0
    while not done and not truncated:
        action, prob, val = agent.choose_action(observation)
        observation_, reward, done, truncated, info = env.step(action)
        n_steps += 1
        score += reward
        memory.store_memory(observation, action, prob, val, reward, done)
        if n_steps % N == 0:
            agent.learn(memory)
            memory = PPOMemory(batch_size)
            learn_iters += 1
        observation = observation_
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    
    max_score = max(max_score, score)
    
    if avg_score > best_score:
        best_score = avg_score
        recent_std = np.std(score_history[-100:])
        agent.save_models()
        episodes_without_best = 0
        print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done} *')
    else:
        episodes_without_best += 1
        
    if episodes_without_best % 100 == 0 and episodes_without_best > 0:
        recent_std = np.std(score_history[-100:])
        print(f'episode {i:>5} | score {score:>6.1f} | avg {avg_score:>6.1f} | std {recent_std:>6.2f} | max score {max_score:>5.1f} | learning steps {learn_iters:>5} | done {done}')


... loading models ...


  state = torch.tensor([observation], dtype=torch.float).to(self.actor.device)


torch.return_types.max(
values=tensor(0.0032, device='cuda:0'),
indices=tensor(26, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0009, device='cuda:0'),
indices=tensor(34, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0257, device='cuda:0'),
indices=tensor(26, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0024, device='cuda:0'),
indices=tensor(34, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0037, device='cuda:0'),
indices=tensor(34, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0176, device='cuda:0'),
indices=tensor(26, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0140, device='cuda:0'),
indices=tensor(26, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0020, device='cuda:0'),
indices=tensor(242, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0028, device='cuda:0'),
indices=tensor(34, device='cuda:0'))
torch.return_types.max(
values=tensor(0.0031, device='cuda:0'),
indices=tensor(26, device=

KeyboardInterrupt: 

In [3]:
import torch.multiprocessing as mp

# Worker function that increments a shared counter with a lock
def increment_counter(counter, lock):
    for _ in range(1000):
        with lock:
            counter.value += 1
            
counter = mp.Value('i', 0)  # Shared counter
lock = mp.Lock()

processes = [mp.Process(target=increment_counter, args=(counter, lock)) for _ in range(4)]
for p in processes:
    p.start()
for p in processes:
    p.join()

print("Final counter value:", counter.value)

Final counter value: 0
