In [1]:
import torch
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
from collections import deque
from tqdm import tqdm

from walker import PPO, Normalize

# ANSI escape sequences for colouring terminal output; RESET restores the
# terminal's default attributes after a coloured span.
RED, GREEN, YELLOW = "\033[31m", "\033[32m", "\033[33m"
BLUE, MAGENTA, CYAN = "\033[34m", "\033[35m", "\033[36m"
RESET = "\033[0m"

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Create the Walker2d-v4 Gymnasium environment. render_mode='rgb_array' is
# requested at construction time; no render call appears in this part of the
# notebook.
env = gym.make('Walker2d-v4', render_mode='rgb_array')

# Directory passed to both PPO and Normalize below. The path is relative, so it
# resolves against the notebook's working directory.
# NOTE(review): presumably this run folder holds the saved actor weights and
# the normalization statistics -- confirm against walker.py.
log_dir = "../runs/20240712_02-38-11/"

# Sizes of the observation and action vectors, taken from the first dimension
# of each space's shape.
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]

# Build the PPO agent and the observation normalizer from the project module.
ppo = PPO(N_S, N_A, log_dir)
normalize = Normalize(N_S, log_dir)

# Restore saved state for the actor network and the normalizer, and switch the
# actor to evaluation mode.
# NOTE(review): load_model() / load_params() take no arguments here, so they
# presumably read from log_dir given at construction -- confirm in walker.py.
ppo.actor_net.load_model()
ppo.actor_net.eval()
normalize.load_params()

In [5]:
# Evaluate the loaded policy: run a fixed number of episodes, print each
# episode's undiscounted return, then print the mean return over all episodes.
MAX_EPISODE_STEPS = 1000  # upper bound on environment steps per episode
test_total_reward = 0  # sum of episode scores, used for the average below
test_episodes = 10  # Number of episodes to test

try:
    for episode_id in range(test_episodes):
        # One reset per episode. Every observation is passed through
        # `normalize` before it is handed to the actor.
        state, _ = env.reset()
        state = normalize(state)
        print('initial state: ', state)
        score = 0
        for _ in range(MAX_EPISODE_STEPS):
            action = ppo.actor_net.choose_action(state)
            # Gymnasium's step() returns
            # (observation, reward, terminated, truncated, info).
            state, reward, terminated, truncated, _ = env.step(action)
            state = normalize(state)
            score += reward

            # The episode is over on either signal; stepping past a
            # truncation without resetting is not valid under the Gymnasium API.
            done = terminated or truncated
            if done:
                # No reset here: the next episode resets at the top of the loop.
                break

        test_total_reward += score
        print("episode: ", episode_id, "\tscore: ", score)

    if test_episodes > 0:
        print("average score: ", test_total_reward / test_episodes)
finally:
    # Release the simulator even if an episode raised part-way through.
    env.close()

initial state:  [ 0.89948147 -0.30370082  0.19691223  0.17621902 -2.49970686  0.15205039
  0.07187048 -2.39904975 -0.31266222  0.52422122  0.0749803   0.13616518
  0.10759177 -0.00303725  0.14393825  0.10335399 -0.00578812]
episode:  0 	score:  253.7049103505284
initial state:  [ 0.87996205 -0.31598652  0.13826823  0.16889033 -2.57546659  0.13656154
  0.06110285 -2.44353327 -0.33006872  0.53257142  0.06733975  0.13801706
  0.1135697  -0.00332282  0.14558178  0.1023115  -0.00410514]
episode:  1 	score:  231.4604113912506
initial state:  [ 8.90390024e-01 -3.57219008e-01  1.78728694e-01  1.90584569e-01
 -2.58058771e+00  1.39021913e-01  7.29734214e-02 -2.48603667e+00
 -3.62913306e-01  5.46710443e-01  5.04165306e-02  1.32331474e-01
  1.12832710e-01 -2.48476930e-03  1.41043340e-01  9.87518884e-02
 -6.58205488e-03]
episode:  2 	score:  320.59776397955517
initial state:  [ 9.56710980e-01 -3.88966792e-01  1.49134141e-01  1.53156308e-01
 -2.61293814e+00  1.29988455e-01  4.58987505e-02 -2.5458183