In [None]:
import gym
import numpy as np
from ddqn_agent import DDQNAgent
from utils import plot_learning_curve, make_env

# ---------------------------------------------------------------------------
# Experiment setup: environment, run flags, agent, and output bookkeeping.
# ---------------------------------------------------------------------------
env = make_env('PongNoFrameskip-v4')

n_games = 100            # number of episodes the loop below will attempt
load_checkpoint = False  # True: call agent.load_models() and skip store/learn
best_score = -np.inf     # highest running-average score observed so far

agent = DDQNAgent(
    gamma=0.99,
    epsilon=1.0,
    lr=0.0001,
    input_dims=env.observation_space.shape,
    n_actions=env.action_space.n,
    mem_size=50000,
    eps_min=0.1,
    batch_size=32,
    replace=10000,
    eps_dec=1e-5,
    chkpt_dir='models/',
    algo='DDQNAgent',
    env_name='PongNoFrameskip-v4',
)

if load_checkpoint:
    agent.load_models()

# Output figure name encodes algorithm, environment, learning rate and the
# number of games, e.g. "<algo>_<env>_lr<lr>_<n>games.png" under plots/.
fname = f'{agent.algo}_{agent.env_name}_lr{agent.lr!s}_{n_games!s}games'
figure_file = f'plots/{fname}.png'

# Running totals filled in by the episode loop.
n_steps = 0        # cumulative environment steps across all episodes
scores = []        # per-episode total reward
eps_history = []   # agent.epsilon recorded after each episode
steps_array = []   # cumulative step count recorded after each episode

# Main episode loop: play up to n_games episodes, training the agent on every
# step unless load_checkpoint is set, and record per-episode statistics.
for episode in range(n_games):
    observation = env.reset()
    score = 0
    done = False

    # Step the environment until it reports the episode is over.
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, _info = env.step(action)
        score += reward

        # With load_checkpoint set, transitions are neither stored nor
        # learned from.
        if not load_checkpoint:
            agent.store_transition(
                observation, action, reward, next_observation, int(done)
            )
            agent.learn()

        observation = next_observation
        n_steps += 1

    scores.append(score)
    steps_array.append(n_steps)

    # Running average over (at most) the last 100 episodes.
    avg_score = np.mean(scores[-100:])

    # best_score is printed before it is updated below, so the line shows the
    # best average prior to this episode.
    print(
        'episode: ', episode,
        'score: ', score,
        ' average score %.1f' % avg_score,
        'best score %.2f' % best_score,
        'epsilon %.2f' % agent.epsilon,
        'steps', n_steps,
    )

    if avg_score > best_score:
        # NOTE: checkpoint saving (agent.save_models(), guarded by
        # `not load_checkpoint`) is currently disabled here.
        best_score = avg_score

    eps_history.append(agent.epsilon)

    # With load_checkpoint set, stop once 18000 total steps have been played.
    if load_checkpoint and n_steps >= 18000:
        break

# 1-based episode indices. Not consumed by the call below, which passes the
# cumulative step counts (steps_array) as its first argument instead.
x = list(range(1, len(scores) + 1))

# Hand the recorded step counts, scores and epsilon history to the plotting
# helper together with the target figure path.
plot_learning_curve(steps_array, scores, eps_history, figure_file)


episode:  0 score:  -21.0  average score -21.0 best score -inf epsilon 0.99 steps 880
episode:  1 score:  -20.0  average score -20.5 best score -21.00 epsilon 0.98 steps 1922
episode:  2 score:  -21.0  average score -20.7 best score -20.50 epsilon 0.97 steps 2893
episode:  3 score:  -21.0  average score -20.8 best score -20.50 epsilon 0.96 steps 3877
episode:  4 score:  -21.0  average score -20.8 best score -20.50 epsilon 0.95 steps 4670
episode:  5 score:  -19.0  average score -20.5 best score -20.50 epsilon 0.94 steps 5652
episode:  6 score:  -20.0  average score -20.4 best score -20.50 epsilon 0.93 steps 6638
episode:  7 score:  -21.0  average score -20.5 best score -20.43 epsilon 0.92 steps 7546
episode:  8 score:  -20.0  average score -20.4 best score -20.43 epsilon 0.92 steps 8444
episode:  9 score:  -16.0  average score -20.0 best score -20.43 epsilon 0.90 steps 9785
episode:  10 score:  -21.0  average score -20.1 best score -20.00 epsilon 0.89 steps 10763
episode:  11 score:  -