In [2]:
import os
import gym
from gym.wrappers import Monitor

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [5]:
env_name='CartPole-v0'
# Keep the Monitor-wrapped env under its own name. The factory lambda looks up
# its variable when it is called, so it must not close over `env`, which is
# rebound to the DummyVecEnv by the very same statement.
monitored_env=Monitor(gym.make(env_name),'./',force=True)
env=DummyVecEnv([lambda:monitored_env])

In [4]:
def run_env(env, episodes, random=True, model=""):
    """Play `episodes` episodes on `env` and return the reward sum of each one.

    Args:
        env: Environment exposing `reset()` and `step(action)`, where `step`
            returns `(next_state, reward, done, info)`. For random play it
            must also expose `action_space.sample()`.
        episodes: Number of episodes to run.
        random: When True, actions are sampled from `env.action_space`;
            otherwise they come from `model.predict(state)`.
        model: Agent exposing `predict(state)` that returns a pair whose first
            item is the action. Required when `random` is False.

    Returns:
        A list with one accumulated reward per episode. The list is empty when
        `random` is False and no model was supplied (a message is printed).
    """
    total_rewards=[]

    # Model-driven play needs an agent; keep the original "print and return
    # nothing useful" behaviour instead of raising.
    model_missing = model is None or (isinstance(model, str) and model == "")
    if not random and model_missing:
        print('Please enter the agent model')
        return total_rewards

    for _ in range(episodes):
        ep_rewards=0
        state=env.reset()
        done=False
        while not done:
            if random:
                action=env.action_space.sample()
            else:
                action,_extra=model.predict(state)
            state, reward, done, info=env.step(action)
            ep_rewards+=reward
        total_rewards.append(ep_rewards)

    # Returned on every path: previously the return sat inside the `else`
    # branch, so random play yielded None.
    return total_rewards

In [5]:
def run_experiment(env_name,env,random=True, episodes=5, model=""):
    """Run episodes on a fresh Monitor-wrapped environment and collect rewards.

    Args:
        env_name: Gym environment id; also the root folder of the render path.
        env: Unused. A new environment is always created from `env_name`; the
            parameter is kept so existing callers keep working.
        random: When True, run random actions and record under
            `<env_name>/render/base_performance`; otherwise run `model` and
            record under `<env_name>/render/model_performance`.
        episodes: Number of episodes to run (honoured in both modes).
        model: Agent passed through to `run_env` when `random` is False.

    Returns:
        Tuple `(env, total_rewards)`: the Monitor-wrapped environment that was
        created here and whatever `run_env` returned for it.
    """
    sub_dir='base_performance' if random else 'model_performance'
    render_path=os.path.join(env_name,'render',sub_dir)
    env=Monitor(gym.make(env_name),render_path,force=True)
    # Forward `episodes` in both modes; the random branch used to hard-code 5.
    total_rewards=run_env(env,episodes=episodes,random=random,model=model)
    return env, total_rewards

### Model Training

In [None]:
# TensorBoard logs for the PPO runs are grouped per environment.
log_path = os.path.join(env_name, "PPO", "logs")
print(log_path)

In [10]:
# PPO agent with an MLP policy on the vectorized env; training metrics are
# written for TensorBoard under log_path.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [11]:
# Train for 40,000 environment timesteps.
model.learn(total_timesteps=40000)

Logging to CartPole-v0\PPO\logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 719  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 825         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009532657 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00303    |
|    learning_rate        | 0.0003      |
|    loss                 | 5.86        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0152     |
|    value_loss           | 45          |
--------------------------------------

-----------------------------------------
| time/                   |             |
|    fps                  | 902         |
|    iterations           | 13          |
|    time_elapsed         | 29          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.004110072 |
|    clip_fraction        | 0.0343      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.535      |
|    explained_variance   | 0.912       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.961       |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.00804    |
|    value_loss           | 5.55        |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 913          |
|    iterations           | 14           |
|    time_elapsed         | 31           |
|    total_timesteps      | 2

<stable_baselines3.ppo.ppo.PPO at 0x1759060ab38>

In [6]:
# Location used to save the trained PPO model and to load it again later.
model_path = os.path.join(env_name, "PPO", "models")
model_path

'CartPole-v0\\PPO\\models'

In [9]:
# Persist the trained agent. This needs `model` from the training cells, so
# run them first in the same kernel session.
model.save(model_path)

NameError: name 'model' is not defined

In [18]:
# Close the vectorized training environment.
env.close()

### Evaluation of Model

In [6]:
env_name='CartPole-v0'
# Keep the Monitor-wrapped env under its own name. The factory lambda looks up
# its variable when it is called, so it must not close over `env`, which is
# rebound to the DummyVecEnv by the very same statement.
monitored_env=Monitor(gym.make(env_name),'./',force=True)
env=DummyVecEnv([lambda:monitored_env])

In [7]:
# Same location the trained model was saved to.
model_path=os.path.join(env_name,"PPO","models")

In [8]:
# Load the saved PPO model with the evaluation env attached. The env stays
# open here because evaluate_policy still steps through it; it is closed once
# the evaluation has finished.
model=PPO.load(model_path, env)

In [9]:
# Evaluate the loaded agent with rendering on. The displayed tuple is
# presumably (mean episode reward, std of episode reward); confirm against
# the stable-baselines3 evaluate_policy documentation.
evaluate_policy(model,env,render=True)



(200.0, 0.0)

In [10]:
# Close the evaluation environment once evaluation is done.
env.close()

In [None]:
# Drop the in-memory agent; the test section reloads it from disk.
del model

### Test model

In [None]:
env_name='CartPole-v0'
# Keep the Monitor-wrapped env under its own name. The factory lambda looks up
# its variable when it is called, so it must not close over `env`, which is
# rebound to the DummyVecEnv by the very same statement.
monitored_env=Monitor(gym.make(env_name),'./',force=True)
env=DummyVecEnv([lambda:monitored_env])

In [None]:
# Same location the trained model was saved to.
model_path=os.path.join(env_name,"PPO","models")

In [None]:
# Load the saved PPO model with the vectorized env attached.
model=PPO.load(model_path, env)
# The vectorized env is closed right away: run_experiment builds its own
# Monitor-wrapped env and does not step through the one passed to it.
env.close()

In [8]:
# Play 5 episodes with the trained agent. The returned env is discarded and
# only the per-episode rewards are kept.
_, total_rewards = run_experiment(env_name,env,random=False, episodes=5, model=model)

In [9]:
# One accumulated reward per test episode.
print(total_rewards)

[200.0, 200.0, 200.0, 200.0, 200.0]
