In [1]:
# Import thư viện
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch

from Env.environment import make_env
from Policy.ppo_agent import PPOAgent


In [2]:
# Select the compute device, seed the random number generators and build the
# environment.
SEED = 42

# The environment is seeded through make_env; torch and NumPy are seeded here
# too so that repeated "Restart & Run All" runs start from the same RNG state.
# NOTE(review): presumably PPOAgent draws its randomness from torch/NumPy -
# confirm against Policy/ppo_agent.py.
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = make_env("BipedalWalker-v3", seed=SEED, render_mode=None)

# Sizes of the observation and action vectors, read from the env's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]  # continuous actions


In [3]:
# Build the PPO agent with custom hyperparameters. The tunable values are
# grouped in one dict so they can be read (and changed) in a single place.
ppo_hyperparams = {
    "lr": 3e-4,
    "gamma": 0.99,
    "clip_epsilon": 0.2,
}
agent = PPOAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    **ppo_hyperparams,
)


In [4]:
# Training parameters.
num_episodes, max_steps = 5000, 2000  # episode count / step cap per episode
update_timestep = 2000                # run an update every this many env steps
timestep = 0                          # global step counter advanced by the training loop
rewards_all = []                      # total reward of each finished episode


In [7]:
# PPO training loop: collect transitions episode by episode and run a policy
# update every `update_timestep` environment steps.


def _ppo_update_from_buffer(agent, last_state):
    """Run one PPO update on the transitions currently buffered on ``agent``.

    Args:
        agent: Agent exposing ``critic``, ``device``, ``compute_returns``,
            ``update`` and the rollout lists ``states``, ``actions``,
            ``logprobs``, ``rewards`` and ``dones``.
        last_state: Observation that follows the last buffered transition; its
            critic value is passed to ``compute_returns`` as ``last_value``.
    """
    # Critic value of the state reached after the last buffered step.
    with torch.no_grad():
        last_value = agent.critic(torch.FloatTensor(last_state).to(agent.device)).item()

    returns = agent.compute_returns(agent.rewards, agent.dones, last_value)

    # Advantages = returns - critic values of the buffered states.
    with torch.no_grad():
        # Stack the list of states into a single array before building the tensor.
        states_array = np.array(agent.states)
        # reshape(-1) keeps the estimates 1-D even for a one-transition batch,
        # where a bare squeeze() would collapse them to a 0-d scalar.
        # NOTE(review): assumes the critic emits one value per state - confirm.
        values = agent.critic(torch.FloatTensor(states_array).to(agent.device)).reshape(-1).cpu().numpy()
    advantages = np.array(returns) - values

    # Bundle everything into the memory dict consumed by agent.update.
    memory = {
        'states': agent.states,
        'actions': agent.actions,
        'log_probs': agent.logprobs,
        'returns': returns,
        'advantages': advantages
    }
    agent.update(memory)


def _reset_rollout_buffer(agent):
    """Drop all buffered transitions by rebinding the rollout lists to new empty lists."""
    agent.states = []
    agent.actions = []
    agent.logprobs = []
    agent.rewards = []
    agent.dones = []


for episode in range(1, num_episodes + 1):
    state, _ = env.reset()
    ep_reward = 0

    for t in range(max_steps):
        timestep += 1

        # Choose an action and advance the environment by one step.
        action, action_logprob = agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store the transition in the agent's rollout buffer.
        agent.store_transition(state, action, action_logprob, reward, done)

        state = next_state
        ep_reward += reward

        # Once enough steps have been collected, update the model.
        if timestep % update_timestep == 0:
            _ppo_update_from_buffer(agent, state)
            if episode % 100 == 0:
                print(f"Action example: {agent.actions[0]}")
            # Clear the buffer only after the debug print above has read it.
            _reset_rollout_buffer(agent)

        # Leave the step loop as soon as the episode has ended.
        if done:
            break

    # Record the total reward of the episode.
    rewards_all.append(ep_reward)
    print(f"Episode {episode} Reward: {ep_reward:.2f}")



Episode 1 Reward: -120.75
Episode 2 Reward: -116.66
Episode 3 Reward: -119.85
Episode 4 Reward: -115.06
Episode 5 Reward: -136.37
Episode 6 Reward: -138.97
Episode 7 Reward: -115.96
Episode 8 Reward: -117.89
Episode 9 Reward: -120.11
Episode 10 Reward: -113.69
Episode 11 Reward: -115.07
Episode 12 Reward: -118.97
Episode 13 Reward: -122.32
Episode 14 Reward: -117.74
Episode 15 Reward: -116.85
Episode 16 Reward: -116.05
Episode 17 Reward: -114.97
Actor loss: 167.7064 | Critic loss: 1323.8552 | Entropy: 6.2429
Actor loss: 263.7082 | Critic loss: 2768.6624 | Entropy: 6.2437
Actor loss: 166.5308 | Critic loss: 3213.0020 | Entropy: 6.2444
Actor loss: 73.3194 | Critic loss: 2145.6599 | Entropy: 6.2456
Actor loss: 413.4577 | Critic loss: 3840.7100 | Entropy: 6.2469
Actor loss: 164.1097 | Critic loss: 2581.9976 | Entropy: 6.2494
Actor loss: 53.4071 | Critic loss: 1358.6202 | Entropy: 6.2520
Actor loss: 114.4208 | Critic loss: 1566.5765 | Entropy: 6.2544
Actor loss: 5.5483 | Critic loss: 740.42

In [None]:
# Final model update, performed only if transitions are still buffered.
# The training loop updates every `update_timestep` steps, so a partial
# rollout can be left over when training stops. The training loop calls
# `agent.update` with a memory dict, so the same dict is built here instead of
# calling it without arguments.
# NOTE(review): confirm that PPOAgent.update has no zero-argument form.
if len(agent.states) > 0:
    with torch.no_grad():
        # `state` is the last observation left behind by the training loop.
        last_value = agent.critic(torch.FloatTensor(state).to(agent.device)).item()
        # reshape(-1) keeps the estimates 1-D even if a single transition is left.
        # NOTE(review): assumes the critic emits one value per state - confirm.
        values = agent.critic(
            torch.FloatTensor(np.array(agent.states)).to(agent.device)
        ).reshape(-1).cpu().numpy()

    returns = agent.compute_returns(agent.rewards, agent.dones, last_value)
    advantages = np.array(returns) - values

    agent.update({
        'states': agent.states,
        'actions': agent.actions,
        'log_probs': agent.logprobs,
        'returns': returns,
        'advantages': advantages
    })

    # Clear the buffer so re-running this cell does not reuse the same data.
    agent.states = []
    agent.actions = []
    agent.logprobs = []
    agent.rewards = []
    agent.dones = []


In [None]:
# Plot the total reward obtained in each episode.
ax = plt.gca()  # current axes (created on demand), the same target plt.plot draws on
ax.plot(rewards_all)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("Training Rewards of PPO Agent")
plt.show()


In [None]:
# Save the PPO model.
# The path is relative to the working directory rather than an absolute,
# machine-specific location, so the notebook also runs on other machines.
# NOTE(review): assumes the notebook is run from the project root (the folder
# containing Env/ and Policy/) - confirm, or point model_dir elsewhere.
model_dir = Path("Model")
model_dir.mkdir(parents=True, exist_ok=True)  # create the folder on first use
save_path = str(model_dir / "actor_ppo.pth")
agent.save(save_path)
print(f"Model saved to {save_path}")
