In [None]:
import gym
import pybullet_envs
import numpy as np
from sac_agent import Agent
import time
import matplotlib.pyplot as plt

In [None]:
if __name__ == "__main__":
    # --- Environment and agent -------------------------------------------
    env_id = 'Pendulum-v0'
    env = gym.make(env_id)
    # Agent comes from the local sac_agent module; hyper-parameter meanings
    # (alpha/beta/tau, layer sizes, ...) are defined there.
    agent = Agent(n_actions=env.action_space.shape[0], alpha=0.003, beta=0.008,
                  env=env, gamma=0.99, mem_size=50000, tau=0.005, fc1_dims=512, fc2_dims=256,
                  batch_size=128, chkpt_dir='models/')
    n_games = 501
    # Number of episodes to run before checkpointing is allowed.
    save_after = 300

    # Best 100-episode average seen since checkpointing became eligible;
    # starts at the environment's lowest possible reward so the first
    # eligible episode always produces a checkpoint.
    best_score = env.reward_range[0]
    score_history = []
    # When True, previously saved models are loaded and neither learning
    # nor checkpoint writing takes place.
    load_checkpoint = False

    if load_checkpoint:
        agent.load_models()

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            if not load_checkpoint:
                agent.learn()
            observation = observation_

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        # Checkpoint only when the running average reaches a new best.
        # Comparing two consecutive episode scores is too noisy: a mediocre
        # episode following a bad one would overwrite a better checkpoint.
        if i >= save_after and avg_score > best_score:
            best_score = avg_score
            if not load_checkpoint:
                agent.save_models()

        print('episode {} score {:.1f} avg score {:.1f}'.
              format(i, score, avg_score))



In [None]:
def plot_learning_curve(x, scores, window=100):
    """Plot the trailing running average of ``scores`` against ``x``.

    Args:
        x: x-axis values, one per entry in ``scores``.
        scores: sequence of per-episode scores.
        window: how many of the most recent scores (including the current
            one) are averaged at each point. Defaults to 100.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # The lower bound i - window + 1 keeps the slice at exactly `window`
        # entries (fewer near the start), so the average matches the title.
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    plt.plot(x, running_avg)
    plt.title(f'Running average of previous {window} scores')

# Number the episodes from the scores actually recorded rather than from
# n_games, so the plot still works if training was interrupted or shortened.
x = list(range(1, len(score_history) + 1))
plot_learning_curve(x, score_history)

In [None]:
# Evaluate the checkpointed agent over 100 episodes and collect the returns.
scores = []
agent.load_models()
# Passed as the second argument to choose_action; presumably switches the
# agent to its evaluation action selection (confirm in sac_agent).
evaluate = True

for i in range(100):
    obs = env.reset()
    actions = []  # actions taken in the current episode
    score = 0
    done = False
    while not done:
        action = agent.choose_action(obs, evaluate)
        obs, reward, done, info = env.step(action)
        score += reward
        actions.append(action)

    print(f"Done, points: {score}")
    scores.append(score)

# Close the environment once, after every episode has finished. Closing it
# inside the loop would release it while later episodes still reset and
# step it.
env.close()

In [None]:
# Average return over the evaluation episodes, shown as the cell's output.
np.asarray(scores).mean()