In [1]:
# general imports
import cv2
import math
import numpy as np
import random

# reinforcement learning related imports
import re
import atari_py as ap
from collections import deque
from gym import make, ObservationWrapper, Wrapper
from gym.spaces import Box

# pytorch imports 
import torch
import torch.nn as nn
from torch import save
from torch.optim import Adam

In [2]:
class ConvDQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        ip_sz: shape of a single observation without the batch axis;
            ``ip_sz[0]`` is used as the input-channel count of the first conv.
        tot_num_acts: number of discrete actions (width of the output layer).
    """

    def __init__(self, ip_sz, tot_num_acts):
        super(ConvDQN, self).__init__()
        self._ip_sz = ip_sz
        self._tot_num_acts = tot_num_acts

        self.cnv1 = nn.Conv2d(ip_sz[0], 32, kernel_size=8, stride=4)
        self.rl = nn.ReLU()
        self.cnv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.cnv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # feat_sz probes the conv stack, so cnv1..cnv3 must already exist here.
        self.fc1 = nn.Linear(self.feat_sz, 512)
        self.fc2 = nn.Linear(512, tot_num_acts)

    def _conv_feats(self, x):
        """Run the conv stack and flatten everything but the batch axis."""
        op = self.rl(self.cnv1(x))
        op = self.rl(self.cnv2(op))
        op = self.rl(self.cnv3(op))
        return op.view(x.size()[0], -1)

    def forward(self, x):
        """Return one Q-value per action for every observation in batch ``x``."""
        op = self._conv_feats(x)
        op = self.rl(self.fc1(op))
        return self.fc2(op)

    @property
    def feat_sz(self):
        """Number of flattened features the conv stack yields for one observation."""
        # Probe on the same device/dtype as the weights so the property keeps
        # working after the module has been moved with .to(...); no graph is
        # needed for a pure shape query.
        probe_like = self.cnv1.weight
        with torch.no_grad():
            x = torch.zeros(1, *self._ip_sz, device=probe_like.device, dtype=probe_like.dtype)
            return self._conv_feats(x).size(1)

    def perf_action(self, stt, eps, dvc):
        """Pick an action epsilon-greedily.

        Args:
            stt: a single observation (array-like, no batch axis).
            eps: probability threshold; a uniform draw above it selects the
                greedy action, otherwise a uniformly random action is returned.
            dvc: torch device the observation is moved to before the forward pass.

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() > eps:
            stt = torch.from_numpy(np.float32(stt)).unsqueeze(0).to(dvc)
            # Action selection never back-propagates, so skip graph construction.
            with torch.no_grad():
                q_val = self.forward(stt)
            act = q_val.max(1)[1].item()
        else:
            act = random.randrange(self._tot_num_acts)
        return act

In [3]:
def calc_temp_diff_loss(mdl, tgt_mdl, bch, gm, dvc):
    """Compute the one-step TD (mean squared) loss and back-propagate it into ``mdl``.

    Args:
        mdl: online network; called on the batch of states, receives gradients.
        tgt_mdl: target network; called on the batch of next states, no gradients.
        bch: tuple ``(st, act, rwd, nxt_st, fin)`` of numpy arrays with a shared
            leading batch axis.
        gm: discount factor applied to the bootstrapped next-state value.
        dvc: torch device all tensors are moved to.

    Returns:
        The scalar loss as a detached tensor (gradients are already accumulated
        on ``mdl``'s parameters; callers may ignore the return value).
    """
    st, act, rwd, nxt_st, fin = bch

    st = torch.from_numpy(np.float32(st)).to(dvc)
    nxt_st = torch.from_numpy(np.float32(nxt_st)).to(dvc)
    # gather() requires int64 indices; the incoming array may be int32.
    act = torch.from_numpy(np.asarray(act)).long().to(dvc)
    rwd = torch.from_numpy(np.float32(rwd)).to(dvc)
    # Float mask so `1 - fin` is well defined for uint8 and bool inputs alike.
    fin = torch.from_numpy(np.float32(fin)).to(dvc)

    q_val = mdl(st).gather(1, act.unsqueeze(-1)).squeeze(-1)

    # The target is a constant w.r.t. the optimisation step, so do not build
    # an autograd graph through the target network at all.
    with torch.no_grad():
        nxt_q_val = tgt_mdl(nxt_st).max(1)[0]
        exp_q_val = rwd + gm * nxt_q_val * (1.0 - fin)

    loss = (q_val - exp_q_val).pow(2).mean()
    loss.backward()
    return loss.detach()


def upd_eps(epd):
    """Return the exploration rate for counter value ``epd``.

    The rate decays exponentially from ``EPS_STRT`` towards ``EPS_FINL`` with
    time constant ``EPS_DECAY``.
    """
    # NOTE(review): the training loop passes the global step counter here,
    # not an episode number, despite the parameter name.
    decay_factor = math.exp(-1 * ((epd + 1) / EPS_DECAY))
    return EPS_FINL + (EPS_STRT - EPS_FINL) * decay_factor


def models_init(env, dvc):
    """Build the online and target Q-networks for ``env`` on device ``dvc``.

    Both networks are sized from ``env.observation_space.shape`` and
    ``env.action_space.n``.

    Returns:
        ``(mdl, tgt_mdl)`` - the online network and the target network.
    """
    mdl = ConvDQN(env.observation_space.shape, env.action_space.n).to(dvc)
    tgt_mdl = ConvDQN(env.observation_space.shape, env.action_space.n).to(dvc)
    # Start the target as an exact copy of the online net; otherwise the first
    # updates bootstrap from an unrelated, randomly initialised network until
    # the first periodic sync happens.
    tgt_mdl.load_state_dict(mdl.state_dict())
    return mdl, tgt_mdl


def gym_to_atari_format(gym_env):
    """Convert a CamelCase name to lower snake_case, e.g. ``SpaceInvaders`` -> ``space_invaders``."""
    # Insert "_" before every capital letter that is not at the very start.
    with_separators = re.sub(r"(?<!^)(?=[A-Z])", "_", gym_env)
    return with_separators.lower()


def check_atari_env(env):
    """Return True when the gym id ``env`` maps to a game listed by ``atari_py``.

    Known variant markers and the ``-v<N>`` version suffix are stripped, the
    remainder is converted to snake_case and looked up in ``ap.list_games()``.
    """
    # NOTE(review): ids such as "Pong-ram-v4" leave a trailing "-" after the
    # markers are removed and therefore evaluate to False; "ramNoFrameSkip"
    # also differs in case from "NoFrameskip". Confirm whether that is intended.
    name = env
    for marker in ("Deterministic", "ramDeterministic", "ram", "NoFrameskip", "ramNoFrameSkip"):
        name = name.replace(marker, "")
    name = re.sub(r"-v\d+", "", name)
    return gym_to_atari_format(name) in ap.list_games()

In [4]:
class RepBfr:
    """Fixed-capacity experience replay buffer backed by a ``deque``.

    Once ``cap_max`` transitions are stored, pushing a new one silently drops
    the oldest.
    """

    def __init__(self, cap_max):
        self._bfr = deque(maxlen=cap_max)

    def push(self, st, act, rwd, nxt_st, fin):
        """Store one transition ``(st, act, rwd, nxt_st, fin)``."""
        self._bfr.append((st, act, rwd, nxt_st, fin))

    def smpl(self, bch_sz):
        """Draw ``bch_sz`` distinct transitions uniformly at random.

        Returns:
            Tuple of numpy arrays ``(st, act, rwd, nxt_st, fin)`` with a shared
            leading batch axis; ``act`` is int64, ``rwd`` float32, ``fin`` uint8.

        Raises:
            ValueError: (from ``np.random.choice``) if ``bch_sz`` exceeds the
                number of stored transitions.
        """
        idxs = np.random.choice(len(self._bfr), bch_sz, False)
        st, act, rwd, nxt_st, fin = zip(*(self._bfr[i] for i in idxs))
        # Pin the action dtype: the platform default integer may be 32-bit,
        # which torch.gather rejects as an index.
        return (np.array(st), np.array(act, dtype=np.int64), np.array(rwd, dtype=np.float32),
                np.array(nxt_st), np.array(fin, dtype=np.uint8))

    def __len__(self):
        return len(self._bfr)

In [5]:
class TrMetadata:
    """Bookkeeping for a training run: episode rewards, best values, step counter."""

    def __init__(self):
        self._avg = 0.0  # value reported by `avg` before any episode is recorded
        self._bst_rwd = -float("inf")
        self._bst_avg = -float("inf")
        self._rwds = []
        self._avg_rng = 100  # number of most recent episodes in the running average
        self._idx = 0

    @property
    def bst_rwd(self):
        """Highest single-episode reward recorded so far."""
        return self._bst_rwd

    @property
    def bst_avg(self):
        """Highest running average recorded so far."""
        return self._bst_avg

    @property
    def avg(self):
        """Mean reward over the most recent ``_avg_rng`` episodes (0.0 if none yet)."""
        window = self._rwds[self._avg_rng * -1:]
        if not window:
            # Avoid a ZeroDivisionError when queried before the first episode.
            return self._avg
        return sum(window) / len(window)

    @property
    def idx(self):
        """Current value of the step counter."""
        return self._idx

    def _upd_bst_rwd(self, epd_rwd):
        """Raise the best single-episode reward if ``epd_rwd`` beats it."""
        if epd_rwd > self.bst_rwd:
            self._bst_rwd = epd_rwd

    def _upd_bst_avg(self):
        """Raise the best running average if the current one beats it; report whether it did."""
        cur_avg = self.avg  # computed once instead of once per comparison/assignment
        if cur_avg > self.bst_avg:
            self._bst_avg = cur_avg
            return True
        return False

    def upd_rwds(self, epd_rwd):
        """Record an episode reward.

        Returns:
            True when the running average reached a new maximum, else False.
        """
        self._rwds.append(epd_rwd)
        self._upd_bst_rwd(epd_rwd)
        return self._upd_bst_avg()

    def upd_idx(self):
        """Advance the step counter by one."""
        self._idx += 1

In [6]:
class CCtrl(Wrapper):
    """Wrapper whose ``reset`` yields a rendered RGB frame for non-Atari environments."""

    def __init__(self, env, is_atari):
        super().__init__(env)
        self._is_atari = is_atari

    def reset(self):
        """Reset the wrapped env; return its observation (Atari) or a rendered frame (otherwise)."""
        first_obs = self.env.reset()
        if self._is_atari:
            return first_obs
        return self.env.render(mode="rgb_array")


class FrmDwSmpl(ObservationWrapper):
    """Convert RGB frames to 84x84 single-channel grayscale."""

    def __init__(self, env):
        super().__init__(env)
        self._width = 84
        self._height = 84
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, observation):
        """Grayscale, resize with area interpolation, and append a channel axis."""
        gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
        small = cv2.resize(gray, (self._width, self._height), interpolation=cv2.INTER_AREA)
        return np.expand_dims(small, axis=-1)


class MaxNSkpEnv(Wrapper):
    """Repeat each action ``skip`` times and return the pixel-wise max of the last two frames."""

    def __init__(self, env, atari, skip=4):
        super().__init__(env)
        self._atari = atari
        self._skip = skip
        # Only the two most recent frames take part in the max.
        self._obs_buffer = deque(maxlen=2)

    def step(self, act):
        """Apply ``act`` up to ``skip`` times, summing rewards and stopping early on termination."""
        rwd_sum = 0.0
        fin = None
        for _ in range(self._skip):
            frame, rwd, fin, log = self.env.step(act)
            if not self._atari:
                # Non-Atari envs are observed through their rendered image.
                frame = self.env.render(mode="rgb_array")
            self._obs_buffer.append(frame)
            rwd_sum += rwd
            if fin:
                break
        stacked = np.stack(self._obs_buffer)
        return stacked.max(axis=0), rwd_sum, fin, log

    def reset(self):
        """Clear the frame buffer, reset the wrapped env and seed the buffer with its observation."""
        self._obs_buffer.clear()
        first_obs = self.env.reset()
        self._obs_buffer.append(first_obs)
        return first_obs


class FrRstEnv(Wrapper):
    """On reset, take actions 1 and 2 once each before handing control to the agent.

    Raises:
        ValueError: if the unwrapped env reports fewer than three action meanings.
    """

    def __init__(self, env):
        Wrapper.__init__(self, env)
        if len(env.unwrapped.get_action_meanings()) < 3:
            raise ValueError("min required action space of 3!")

    def reset(self, **kwargs):
        """Reset the env, prime it with actions 1 and 2, and return the current observation."""
        self.env.reset(**kwargs)
        # presumably action 1 is FIRE (the caller applies this wrapper only when
        # "FIRE" is among the action meanings) - confirm against the env.
        obs, _, fin, _ = self.env.step(1)
        if fin:
            self.env.reset(**kwargs)
        obs, _, fin, _ = self.env.step(2)
        if fin:
            # The episode ended during priming: return the observation of the
            # fresh episode rather than the stale terminal frame.
            obs = self.env.reset(**kwargs)
        return obs

    def step(self, act):
        """Forward ``act`` to the wrapped env unchanged."""
        return self.env.step(act)


class FrmBfr(ObservationWrapper):
    """Keep a rolling window of observations along axis 0.

    The observation space bounds are the wrapped env's bounds repeated
    ``num_steps`` times along axis 0.
    """

    def __init__(self, env, num_steps, dtype=np.float32):
        super().__init__(env)
        self._dtype = dtype
        base_space = env.observation_space
        stacked_low = base_space.low.repeat(num_steps, axis=0)
        stacked_high = base_space.high.repeat(num_steps, axis=0)
        self.observation_space = Box(stacked_low, stacked_high, dtype=self._dtype)

    def reset(self):
        """Zero the window, reset the wrapped env and push its first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self._dtype)
        first_obs = self.env.reset()
        return self.observation(first_obs)

    def observation(self, observation):
        """Shift the window by one slot, store ``observation`` last, and return the window.

        The same ``self.buffer`` array object is returned on every call (it is
        updated in place).
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


class Img2Trch(ObservationWrapper):
    """Move axis 2 of each observation to the front (e.g. H, W, C -> C, H, W)."""

    def __init__(self, env):
        super(Img2Trch, self).__init__(env)
        obs_shape = tuple(self.observation_space.shape)
        # Mirror exactly what np.moveaxis(obs, 2, 0) does to the shape. Simply
        # reversing the shape would swap height and width for non-square frames.
        new_shape = (obs_shape[2],) + obs_shape[:2] + obs_shape[3:]
        # NOTE(review): bounds are declared as [0.0, 1.0] although this wrapper
        # itself does not rescale values - confirm against the wrapper order.
        self.observation_space = Box(low=0.0, high=1.0, shape=new_shape, dtype=np.float32)

    def observation(self, observation):
        """Return ``observation`` with axis 2 moved to position 0."""
        return np.moveaxis(observation, 2, 0)


class NormFlts(ObservationWrapper):
    """Cast observations to float32 and divide them by 255."""

    def observation(self, obs):
        as_float = np.asarray(obs, dtype=np.float32)
        return as_float / 255.0
        

def wrap_env(env_ip):
    """Create gym environment ``env_ip`` and apply the preprocessing wrapper chain.

    Order: CCtrl -> MaxNSkpEnv -> (FrRstEnv when the env has a "FIRE" action)
    -> FrmDwSmpl -> Img2Trch -> FrmBfr(4) -> NormFlts.
    """
    atari = check_atari_env(env_ip)
    env = MaxNSkpEnv(CCtrl(make(env_ip), atari), atari)

    try:
        if "FIRE" in env.unwrapped.get_action_meanings():
            env = FrRstEnv(env)
    except AttributeError:
        # Environments without get_action_meanings() simply skip this wrapper.
        pass

    for obs_wrapper in (FrmDwSmpl, Img2Trch):
        env = obs_wrapper(env)
    return NormFlts(FrmBfr(env, 4))

In [7]:
def upd_grph(mdl, tgt_mdl, opt, rpl_bfr, dvc, log):
    """Run one optimisation step once the replay buffer holds more than ``INIT_LEARN`` items.

    Every ``TGT_UPD_FRQ`` steps (by ``log.idx``) the target network is first
    overwritten with the online network's weights.
    """
    if len(rpl_bfr) <= INIT_LEARN:
        return
    if log.idx % TGT_UPD_FRQ == 0:
        tgt_mdl.load_state_dict(mdl.state_dict())
    opt.zero_grad()
    calc_temp_diff_loss(mdl, tgt_mdl, rpl_bfr.smpl(B_S), G, dvc)
    opt.step()


def fin_epsd(mdl, env, log, epd_rwd, epd, eps):
    """Record the finished episode, checkpoint on a new best running average, and print a summary.

    The checkpoint is written to ``"<env>.dat"`` using ``mdl.state_dict()``.
    """
    is_new_best = log.upd_rwds(epd_rwd)
    if is_new_best:
        print(f"checkpointing current model weights. highest running_average_reward of"
              f" {round(log.bst_avg, 3)} achieved!")
        save(mdl.state_dict(), f"{env}.dat")
    print(f"episode_num {epd}, curr_reward: {epd_rwd}, best_reward: {log.bst_rwd},"
          f" running_avg_reward: {round(log.avg, 3)}, curr_epsilon: {round(eps, 4)}")


def run_epsd(env, mdl, tgt_mdl, opt, rpl_bfr, dvc, log, epd, render=True):
    """Play one episode, storing transitions and training after every step.

    Args:
        env: wrapped environment to interact with.
        mdl: online network used for action selection and optimisation.
        tgt_mdl: target network used for bootstrapping.
        opt: optimiser stepping ``mdl``'s parameters.
        rpl_bfr: replay buffer receiving every transition.
        dvc: torch device used for inference and training.
        log: run metadata; its step counter drives epsilon and target syncs.
        epd: episode number, used only for reporting.
        render: when True (default, the previous behaviour) the environment is
            rendered on every step; pass False for headless or faster runs.
    """
    epd_rwd = 0.0
    st = env.reset()

    while True:
        eps = upd_eps(log.idx)
        act = mdl.perf_action(st, eps, dvc)
        if render:
            env.render()
        nxt_st, rwd, fin, _ = env.step(act)
        rpl_bfr.push(st, act, rwd, nxt_st, fin)
        st = nxt_st
        epd_rwd += rwd
        log.upd_idx()
        upd_grph(mdl, tgt_mdl, opt, rpl_bfr, dvc, log)
        if fin:
            # ENV (the environment id string) names the checkpoint file.
            fin_epsd(mdl, ENV, log, epd_rwd, epd, eps)
            break


def train(env, mdl, tgt_mdl, opt, rpl_bfr, dvc):
    """Run ``N_EPDS`` episodes back to back, sharing one ``TrMetadata`` log across them."""
    log = TrMetadata()
    epd = 0
    while epd < N_EPDS:
        run_epsd(env, mdl, tgt_mdl, opt, rpl_bfr, dvc, log, epd)
        epd += 1

In [8]:
# Hyperparameters read as module-level globals by the training functions.
B_S = 64  # batch size sampled from the replay buffer per optimisation step
ENV = "Pong-v4"  # gym environment id; also the stem of the checkpoint file name
EPS_STRT = 1.0  # epsilon at the start of the exponential decay
EPS_FINL = 0.005  # epsilon the decay converges to
EPS_DECAY = 100000  # time constant of the epsilon decay (in counter steps)
G = 0.99  # discount factor passed to the TD loss
INIT_LEARN = 10000  # replay buffer must hold more than this before training starts
LR = 1e-4  # Adam learning rate
MEM_CAP = 20000  # replay buffer capacity
N_EPDS = 50000  # number of episodes to run
TGT_UPD_FRQ = 1000  # copy online weights into the target net every this many steps

In [None]:
# Build the environment, networks, optimiser and replay buffer, then train.
env = wrap_env(ENV)
try:
    dvc = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    mdl, tgt_mdl = models_init(env, dvc)
    opt = Adam(mdl.parameters(), lr=LR)
    rpl_bfr = RepBfr(MEM_CAP)
    train(env, mdl, tgt_mdl, opt, rpl_bfr, dvc)
finally:
    # Release the environment even if training raises or the cell is
    # interrupted (e.g. KeyboardInterrupt from stopping the kernel).
    env.close()

checkpointing current model weights. highest running_average_reward of -20.0 achieved!
episode_num 0, curr_reward: -20.0, best_reward: -20.0, running_avg_reward: -20.0, curr_epsilon: 0.9971
checkpointing current model weights. highest running_average_reward of -19.5 achieved!
episode_num 1, curr_reward: -19.0, best_reward: -19.0, running_avg_reward: -19.5, curr_epsilon: 0.9937
episode_num 2, curr_reward: -21.0, best_reward: -19.0, running_avg_reward: -20.0, curr_epsilon: 0.991
episode_num 3, curr_reward: -21.0, best_reward: -19.0, running_avg_reward: -20.25, curr_epsilon: 0.9881
episode_num 4, curr_reward: -19.0, best_reward: -19.0, running_avg_reward: -20.0, curr_epsilon: 0.9846
episode_num 5, curr_reward: -20.0, best_reward: -19.0, running_avg_reward: -20.0, curr_epsilon: 0.9811
episode_num 6, curr_reward: -21.0, best_reward: -19.0, running_avg_reward: -20.143, curr_epsilon: 0.9779
episode_num 7, curr_reward: -21.0, best_reward: -19.0, running_avg_reward: -20.25, curr_epsilon: 0.9751

episode_num 77, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.41, curr_epsilon: 0.7872
episode_num 78, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.418, curr_epsilon: 0.785
episode_num 79, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.425, curr_epsilon: 0.7829
episode_num 80, curr_reward: -20.0, best_reward: -17.0, running_avg_reward: -20.42, curr_epsilon: 0.7807
episode_num 81, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.427, curr_epsilon: 0.7784
episode_num 82, curr_reward: -20.0, best_reward: -17.0, running_avg_reward: -20.422, curr_epsilon: 0.7758
episode_num 83, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.429, curr_epsilon: 0.7737
episode_num 84, curr_reward: -20.0, best_reward: -17.0, running_avg_reward: -20.424, curr_epsilon: 0.7714
episode_num 85, curr_reward: -21.0, best_reward: -17.0, running_avg_reward: -20.43, curr_epsilon: 0.7692
episode_num 86, curr_reward: -20.0, best_reward: -

episode_num 155, curr_reward: -20.0, best_reward: -16.0, running_avg_reward: -20.06, curr_epsilon: 0.6041
episode_num 156, curr_reward: -19.0, best_reward: -16.0, running_avg_reward: -20.05, curr_epsilon: 0.6017
episode_num 157, curr_reward: -21.0, best_reward: -16.0, running_avg_reward: -20.09, curr_epsilon: 0.5995
episode_num 158, curr_reward: -19.0, best_reward: -16.0, running_avg_reward: -20.09, curr_epsilon: 0.5975
episode_num 159, curr_reward: -20.0, best_reward: -16.0, running_avg_reward: -20.09, curr_epsilon: 0.5952
episode_num 160, curr_reward: -21.0, best_reward: -16.0, running_avg_reward: -20.09, curr_epsilon: 0.5931
episode_num 161, curr_reward: -19.0, best_reward: -16.0, running_avg_reward: -20.07, curr_epsilon: 0.5907
episode_num 162, curr_reward: -18.0, best_reward: -16.0, running_avg_reward: -20.04, curr_epsilon: 0.5884
episode_num 163, curr_reward: -20.0, best_reward: -16.0, running_avg_reward: -20.04, curr_epsilon: 0.5863
episode_num 164, curr_reward: -18.0, best_rewa

episode_num 233, curr_reward: -18.0, best_reward: -15.0, running_avg_reward: -19.61, curr_epsilon: 0.4462
episode_num 234, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -19.61, curr_epsilon: 0.4446
episode_num 235, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.6, curr_epsilon: 0.4428
episode_num 236, curr_reward: -21.0, best_reward: -15.0, running_avg_reward: -19.6, curr_epsilon: 0.4413
episode_num 237, curr_reward: -21.0, best_reward: -15.0, running_avg_reward: -19.61, curr_epsilon: 0.4399
episode_num 238, curr_reward: -18.0, best_reward: -15.0, running_avg_reward: -19.58, curr_epsilon: 0.438
episode_num 239, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.56, curr_epsilon: 0.4366
episode_num 240, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -19.58, curr_epsilon: 0.4343
episode_num 241, curr_reward: -18.0, best_reward: -15.0, running_avg_reward: -19.59, curr_epsilon: 0.4325
episode_num 242, curr_reward: -21.0, best_reward:

episode_num 300, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.24, curr_epsilon: 0.3347
episode_num 301, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -19.23, curr_epsilon: 0.3331
checkpointing current model weights. highest running_average_reward of -19.22 achieved!
episode_num 302, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -19.22, curr_epsilon: 0.3318
checkpointing current model weights. highest running_average_reward of -19.2 achieved!
episode_num 303, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.2, curr_epsilon: 0.3305
episode_num 304, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.2, curr_epsilon: 0.3288
checkpointing current model weights. highest running_average_reward of -19.18 achieved!
episode_num 305, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -19.18, curr_epsilon: 0.3274
episode_num 306, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -19.18, curr_epsilon: 0.32

checkpointing current model weights. highest running_average_reward of -18.56 achieved!
episode_num 352, curr_reward: -15.0, best_reward: -15.0, running_avg_reward: -18.56, curr_epsilon: 0.259
episode_num 353, curr_reward: -20.0, best_reward: -15.0, running_avg_reward: -18.57, curr_epsilon: 0.2579
checkpointing current model weights. highest running_average_reward of -18.54 achieved!
episode_num 354, curr_reward: -18.0, best_reward: -15.0, running_avg_reward: -18.54, curr_epsilon: 0.2566
episode_num 355, curr_reward: -21.0, best_reward: -15.0, running_avg_reward: -18.55, curr_epsilon: 0.2556
checkpointing current model weights. highest running_average_reward of -18.53 achieved!
episode_num 356, curr_reward: -17.0, best_reward: -15.0, running_avg_reward: -18.53, curr_epsilon: 0.2541
episode_num 357, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -18.56, curr_epsilon: 0.2529
episode_num 358, curr_reward: -19.0, best_reward: -15.0, running_avg_reward: -18.58, curr_epsilon: 0.

episode_num 412, curr_reward: -19.0, best_reward: -12.0, running_avg_reward: -18.17, curr_epsilon: 0.1862
episode_num 413, curr_reward: -18.0, best_reward: -12.0, running_avg_reward: -18.17, curr_epsilon: 0.1852
checkpointing current model weights. highest running_average_reward of -18.11 achieved!
episode_num 414, curr_reward: -12.0, best_reward: -12.0, running_avg_reward: -18.11, curr_epsilon: 0.1839
checkpointing current model weights. highest running_average_reward of -18.08 achieved!
episode_num 415, curr_reward: -18.0, best_reward: -12.0, running_avg_reward: -18.08, curr_epsilon: 0.1829
episode_num 416, curr_reward: -19.0, best_reward: -12.0, running_avg_reward: -18.08, curr_epsilon: 0.1819
checkpointing current model weights. highest running_average_reward of -18.07 achieved!
episode_num 417, curr_reward: -16.0, best_reward: -12.0, running_avg_reward: -18.07, curr_epsilon: 0.181
episode_num 418, curr_reward: -18.0, best_reward: -12.0, running_avg_reward: -18.07, curr_epsilon: 0.

checkpointing current model weights. highest running_average_reward of -17.16 achieved!
episode_num 467, curr_reward: -12.0, best_reward: -11.0, running_avg_reward: -17.16, curr_epsilon: 0.1313
checkpointing current model weights. highest running_average_reward of -17.15 achieved!
episode_num 468, curr_reward: -18.0, best_reward: -11.0, running_avg_reward: -17.15, curr_epsilon: 0.1306
checkpointing current model weights. highest running_average_reward of -17.12 achieved!
episode_num 469, curr_reward: -17.0, best_reward: -11.0, running_avg_reward: -17.12, curr_epsilon: 0.1297
checkpointing current model weights. highest running_average_reward of -17.1 achieved!
episode_num 470, curr_reward: -18.0, best_reward: -11.0, running_avg_reward: -17.1, curr_epsilon: 0.129
checkpointing current model weights. highest running_average_reward of -17.08 achieved!
episode_num 471, curr_reward: -16.0, best_reward: -11.0, running_avg_reward: -17.08, curr_epsilon: 0.128
checkpointing current model weight

checkpointing current model weights. highest running_average_reward of -16.18 achieved!
episode_num 518, curr_reward: -14.0, best_reward: -11.0, running_avg_reward: -16.18, curr_epsilon: 0.0927
checkpointing current model weights. highest running_average_reward of -16.14 achieved!
episode_num 519, curr_reward: -17.0, best_reward: -11.0, running_avg_reward: -16.14, curr_epsilon: 0.0922
episode_num 520, curr_reward: -21.0, best_reward: -11.0, running_avg_reward: -16.18, curr_epsilon: 0.0917
episode_num 521, curr_reward: -16.0, best_reward: -11.0, running_avg_reward: -16.17, curr_epsilon: 0.0911
episode_num 522, curr_reward: -17.0, best_reward: -11.0, running_avg_reward: -16.21, curr_epsilon: 0.0905
episode_num 523, curr_reward: -13.0, best_reward: -11.0, running_avg_reward: -16.18, curr_epsilon: 0.0898
checkpointing current model weights. highest running_average_reward of -16.12 achieved!
episode_num 524, curr_reward: -13.0, best_reward: -11.0, running_avg_reward: -16.12, curr_epsilon: 0

checkpointing current model weights. highest running_average_reward of -15.22 achieved!
episode_num 574, curr_reward: -8.0, best_reward: -6.0, running_avg_reward: -15.22, curr_epsilon: 0.0612
checkpointing current model weights. highest running_average_reward of -15.18 achieved!
episode_num 575, curr_reward: -8.0, best_reward: -6.0, running_avg_reward: -15.18, curr_epsilon: 0.0606
checkpointing current model weights. highest running_average_reward of -15.14 achieved!
episode_num 576, curr_reward: -12.0, best_reward: -6.0, running_avg_reward: -15.14, curr_epsilon: 0.0601
episode_num 577, curr_reward: -17.0, best_reward: -6.0, running_avg_reward: -15.16, curr_epsilon: 0.0597
checkpointing current model weights. highest running_average_reward of -15.09 achieved!
episode_num 578, curr_reward: -9.0, best_reward: -6.0, running_avg_reward: -15.09, curr_epsilon: 0.0592
checkpointing current model weights. highest running_average_reward of -15.04 achieved!
episode_num 579, curr_reward: -12.0, b

checkpointing current model weights. highest running_average_reward of -13.86 achieved!
episode_num 626, curr_reward: -6.0, best_reward: -6.0, running_avg_reward: -13.86, curr_epsilon: 0.0411
episode_num 627, curr_reward: -20.0, best_reward: -6.0, running_avg_reward: -13.87, curr_epsilon: 0.0408
episode_num 628, curr_reward: -10.0, best_reward: -6.0, running_avg_reward: -13.86, curr_epsilon: 0.0405
checkpointing current model weights. highest running_average_reward of -13.84 achieved!
episode_num 629, curr_reward: -17.0, best_reward: -6.0, running_avg_reward: -13.84, curr_epsilon: 0.0402
checkpointing current model weights. highest running_average_reward of -13.82 achieved!
episode_num 630, curr_reward: -14.0, best_reward: -6.0, running_avg_reward: -13.82, curr_epsilon: 0.0399
checkpointing current model weights. highest running_average_reward of -13.75 achieved!
episode_num 631, curr_reward: -8.0, best_reward: -6.0, running_avg_reward: -13.75, curr_epsilon: 0.0395
episode_num 632, cur

checkpointing current model weights. highest running_average_reward of -12.51 achieved!
episode_num 683, curr_reward: -14.0, best_reward: 1.0, running_avg_reward: -12.51, curr_epsilon: 0.0265
episode_num 684, curr_reward: -16.0, best_reward: 1.0, running_avg_reward: -12.55, curr_epsilon: 0.0263
episode_num 685, curr_reward: -12.0, best_reward: 1.0, running_avg_reward: -12.59, curr_epsilon: 0.0261
episode_num 686, curr_reward: -14.0, best_reward: 1.0, running_avg_reward: -12.61, curr_epsilon: 0.0259
episode_num 687, curr_reward: -13.0, best_reward: 1.0, running_avg_reward: -12.58, curr_epsilon: 0.0257
episode_num 688, curr_reward: -13.0, best_reward: 1.0, running_avg_reward: -12.59, curr_epsilon: 0.0255
episode_num 689, curr_reward: -14.0, best_reward: 1.0, running_avg_reward: -12.6, curr_epsilon: 0.0253
episode_num 690, curr_reward: -14.0, best_reward: 1.0, running_avg_reward: -12.54, curr_epsilon: 0.0252
checkpointing current model weights. highest running_average_reward of -12.43 ach

episode_num 739, curr_reward: -16.0, best_reward: 1.0, running_avg_reward: -11.13, curr_epsilon: 0.0175
episode_num 740, curr_reward: -9.0, best_reward: 1.0, running_avg_reward: -11.16, curr_epsilon: 0.0174
episode_num 741, curr_reward: -15.0, best_reward: 1.0, running_avg_reward: -11.19, curr_epsilon: 0.0173
episode_num 742, curr_reward: -6.0, best_reward: 1.0, running_avg_reward: -11.14, curr_epsilon: 0.0171
episode_num 743, curr_reward: -16.0, best_reward: 1.0, running_avg_reward: -11.21, curr_epsilon: 0.017
episode_num 744, curr_reward: -4.0, best_reward: 1.0, running_avg_reward: -11.1, curr_epsilon: 0.0169
episode_num 745, curr_reward: -11.0, best_reward: 1.0, running_avg_reward: -11.07, curr_epsilon: 0.0167
episode_num 746, curr_reward: -10.0, best_reward: 1.0, running_avg_reward: -11.08, curr_epsilon: 0.0167
episode_num 747, curr_reward: -7.0, best_reward: 1.0, running_avg_reward: -11.0, curr_epsilon: 0.0165
checkpointing current model weights. highest running_average_reward of 

episode_num 804, curr_reward: -12.0, best_reward: 6.0, running_avg_reward: -9.91, curr_epsilon: 0.0116
episode_num 805, curr_reward: -19.0, best_reward: 6.0, running_avg_reward: -9.99, curr_epsilon: 0.0115
episode_num 806, curr_reward: -11.0, best_reward: 6.0, running_avg_reward: -10.01, curr_epsilon: 0.0114
episode_num 807, curr_reward: -13.0, best_reward: 6.0, running_avg_reward: -10.06, curr_epsilon: 0.0114
episode_num 808, curr_reward: -8.0, best_reward: 6.0, running_avg_reward: -10.0, curr_epsilon: 0.0113
episode_num 809, curr_reward: -8.0, best_reward: 6.0, running_avg_reward: -10.04, curr_epsilon: 0.0113
episode_num 810, curr_reward: -14.0, best_reward: 6.0, running_avg_reward: -10.06, curr_epsilon: 0.0112
episode_num 811, curr_reward: -6.0, best_reward: 6.0, running_avg_reward: -10.05, curr_epsilon: 0.0112
episode_num 812, curr_reward: -6.0, best_reward: 6.0, running_avg_reward: -10.03, curr_epsilon: 0.0111
episode_num 813, curr_reward: -12.0, best_reward: 6.0, running_avg_rewa

episode_num 874, curr_reward: -8.0, best_reward: 6.0, running_avg_reward: -9.53, curr_epsilon: 0.0084
episode_num 875, curr_reward: -6.0, best_reward: 6.0, running_avg_reward: -9.55, curr_epsilon: 0.0083
episode_num 876, curr_reward: -6.0, best_reward: 6.0, running_avg_reward: -9.47, curr_epsilon: 0.0083
episode_num 877, curr_reward: -10.0, best_reward: 6.0, running_avg_reward: -9.51, curr_epsilon: 0.0083
episode_num 878, curr_reward: 3.0, best_reward: 6.0, running_avg_reward: -9.49, curr_epsilon: 0.0082
episode_num 879, curr_reward: -8.0, best_reward: 6.0, running_avg_reward: -9.49, curr_epsilon: 0.0082
checkpointing current model weights. highest running_average_reward of -9.42 achieved!
episode_num 880, curr_reward: -5.0, best_reward: 6.0, running_avg_reward: -9.42, curr_epsilon: 0.0082
checkpointing current model weights. highest running_average_reward of -9.38 achieved!
episode_num 881, curr_reward: -9.0, best_reward: 6.0, running_avg_reward: -9.38, curr_epsilon: 0.0081
checkpoint

checkpointing current model weights. highest running_average_reward of -8.9 achieved!
episode_num 945, curr_reward: -8.0, best_reward: 13.0, running_avg_reward: -8.9, curr_epsilon: 0.0066
checkpointing current model weights. highest running_average_reward of -8.88 achieved!
episode_num 946, curr_reward: -2.0, best_reward: 13.0, running_avg_reward: -8.88, curr_epsilon: 0.0066
checkpointing current model weights. highest running_average_reward of -8.86 achieved!
episode_num 947, curr_reward: -5.0, best_reward: 13.0, running_avg_reward: -8.86, curr_epsilon: 0.0066
checkpointing current model weights. highest running_average_reward of -8.82 achieved!
episode_num 948, curr_reward: -9.0, best_reward: 13.0, running_avg_reward: -8.82, curr_epsilon: 0.0066
checkpointing current model weights. highest running_average_reward of -8.81 achieved!
episode_num 949, curr_reward: -11.0, best_reward: 13.0, running_avg_reward: -8.81, curr_epsilon: 0.0066
episode_num 950, curr_reward: -12.0, best_reward: 1

episode_num 1002, curr_reward: -15.0, best_reward: 13.0, running_avg_reward: -6.72, curr_epsilon: 0.0059
episode_num 1003, curr_reward: -3.0, best_reward: 13.0, running_avg_reward: -6.66, curr_epsilon: 0.0059
episode_num 1004, curr_reward: -7.0, best_reward: 13.0, running_avg_reward: -6.72, curr_epsilon: 0.0059
episode_num 1005, curr_reward: -12.0, best_reward: 13.0, running_avg_reward: -6.69, curr_epsilon: 0.0059
episode_num 1006, curr_reward: -6.0, best_reward: 13.0, running_avg_reward: -6.67, curr_epsilon: 0.0059
episode_num 1007, curr_reward: -5.0, best_reward: 13.0, running_avg_reward: -6.63, curr_epsilon: 0.0059
checkpointing current model weights. highest running_average_reward of -6.57 achieved!
episode_num 1008, curr_reward: -5.0, best_reward: 13.0, running_avg_reward: -6.57, curr_epsilon: 0.0058
episode_num 1009, curr_reward: -10.0, best_reward: 13.0, running_avg_reward: -6.57, curr_epsilon: 0.0058
episode_num 1010, curr_reward: -10.0, best_reward: 13.0, running_avg_reward: -

episode_num 1065, curr_reward: -6.0, best_reward: 14.0, running_avg_reward: -5.23, curr_epsilon: 0.0055
episode_num 1066, curr_reward: 5.0, best_reward: 14.0, running_avg_reward: -5.11, curr_epsilon: 0.0054
episode_num 1067, curr_reward: -2.0, best_reward: 14.0, running_avg_reward: -5.02, curr_epsilon: 0.0054
episode_num 1068, curr_reward: -14.0, best_reward: 14.0, running_avg_reward: -5.05, curr_epsilon: 0.0054
episode_num 1069, curr_reward: -13.0, best_reward: 14.0, running_avg_reward: -5.13, curr_epsilon: 0.0054
episode_num 1070, curr_reward: -12.0, best_reward: 14.0, running_avg_reward: -5.15, curr_epsilon: 0.0054
episode_num 1071, curr_reward: -1.0, best_reward: 14.0, running_avg_reward: -5.08, curr_epsilon: 0.0054
episode_num 1072, curr_reward: -14.0, best_reward: 14.0, running_avg_reward: -5.21, curr_epsilon: 0.0054
episode_num 1073, curr_reward: 1.0, best_reward: 14.0, running_avg_reward: -5.26, curr_epsilon: 0.0054
episode_num 1074, curr_reward: -7.0, best_reward: 14.0, runnin

episode_num 1137, curr_reward: -15.0, best_reward: 14.0, running_avg_reward: -4.73, curr_epsilon: 0.0052
episode_num 1138, curr_reward: 7.0, best_reward: 14.0, running_avg_reward: -4.63, curr_epsilon: 0.0052
episode_num 1139, curr_reward: -5.0, best_reward: 14.0, running_avg_reward: -4.59, curr_epsilon: 0.0052
episode_num 1140, curr_reward: -1.0, best_reward: 14.0, running_avg_reward: -4.66, curr_epsilon: 0.0052
episode_num 1141, curr_reward: -1.0, best_reward: 14.0, running_avg_reward: -4.66, curr_epsilon: 0.0052
episode_num 1142, curr_reward: -14.0, best_reward: 14.0, running_avg_reward: -4.76, curr_epsilon: 0.0052
episode_num 1143, curr_reward: -6.0, best_reward: 14.0, running_avg_reward: -4.91, curr_epsilon: 0.0052
episode_num 1144, curr_reward: -11.0, best_reward: 14.0, running_avg_reward: -5.03, curr_epsilon: 0.0052
episode_num 1145, curr_reward: -14.0, best_reward: 14.0, running_avg_reward: -5.04, curr_epsilon: 0.0052
episode_num 1146, curr_reward: -14.0, best_reward: 14.0, runn

episode_num 1216, curr_reward: -10.0, best_reward: 14.0, running_avg_reward: -6.5, curr_epsilon: 0.0051
episode_num 1217, curr_reward: -7.0, best_reward: 14.0, running_avg_reward: -6.56, curr_epsilon: 0.0051
episode_num 1218, curr_reward: -15.0, best_reward: 14.0, running_avg_reward: -6.57, curr_epsilon: 0.0051
episode_num 1219, curr_reward: 1.0, best_reward: 14.0, running_avg_reward: -6.47, curr_epsilon: 0.0051
episode_num 1220, curr_reward: -9.0, best_reward: 14.0, running_avg_reward: -6.55, curr_epsilon: 0.0051
episode_num 1221, curr_reward: -6.0, best_reward: 14.0, running_avg_reward: -6.48, curr_epsilon: 0.0051
episode_num 1222, curr_reward: 8.0, best_reward: 14.0, running_avg_reward: -6.35, curr_epsilon: 0.0051
episode_num 1223, curr_reward: -15.0, best_reward: 14.0, running_avg_reward: -6.46, curr_epsilon: 0.0051
episode_num 1224, curr_reward: -16.0, best_reward: 14.0, running_avg_reward: -6.56, curr_epsilon: 0.0051
episode_num 1225, curr_reward: -1.0, best_reward: 14.0, running

episode_num 1296, curr_reward: -8.0, best_reward: 14.0, running_avg_reward: -5.64, curr_epsilon: 0.005
episode_num 1297, curr_reward: -18.0, best_reward: 14.0, running_avg_reward: -5.81, curr_epsilon: 0.005
episode_num 1298, curr_reward: -5.0, best_reward: 14.0, running_avg_reward: -5.73, curr_epsilon: 0.005
episode_num 1299, curr_reward: -9.0, best_reward: 14.0, running_avg_reward: -5.7, curr_epsilon: 0.005
episode_num 1300, curr_reward: -9.0, best_reward: 14.0, running_avg_reward: -5.69, curr_epsilon: 0.005
episode_num 1301, curr_reward: -14.0, best_reward: 14.0, running_avg_reward: -5.88, curr_epsilon: 0.005
episode_num 1302, curr_reward: -8.0, best_reward: 14.0, running_avg_reward: -5.91, curr_epsilon: 0.005
episode_num 1303, curr_reward: -4.0, best_reward: 14.0, running_avg_reward: -5.86, curr_epsilon: 0.005
episode_num 1304, curr_reward: -12.0, best_reward: 14.0, running_avg_reward: -5.9, curr_epsilon: 0.005
episode_num 1305, curr_reward: -10.0, best_reward: 14.0, running_avg_rew

episode_num 1376, curr_reward: -8.0, best_reward: 16.0, running_avg_reward: -7.21, curr_epsilon: 0.005
episode_num 1377, curr_reward: -10.0, best_reward: 16.0, running_avg_reward: -7.24, curr_epsilon: 0.005
episode_num 1378, curr_reward: -6.0, best_reward: 16.0, running_avg_reward: -7.27, curr_epsilon: 0.005
episode_num 1379, curr_reward: -8.0, best_reward: 16.0, running_avg_reward: -7.28, curr_epsilon: 0.005
episode_num 1380, curr_reward: -10.0, best_reward: 16.0, running_avg_reward: -7.23, curr_epsilon: 0.005
episode_num 1381, curr_reward: 2.0, best_reward: 16.0, running_avg_reward: -7.22, curr_epsilon: 0.005
episode_num 1382, curr_reward: -14.0, best_reward: 16.0, running_avg_reward: -7.27, curr_epsilon: 0.005
episode_num 1383, curr_reward: -4.0, best_reward: 16.0, running_avg_reward: -7.3, curr_epsilon: 0.005
episode_num 1384, curr_reward: -5.0, best_reward: 16.0, running_avg_reward: -7.34, curr_epsilon: 0.005
episode_num 1385, curr_reward: -13.0, best_reward: 16.0, running_avg_rew

checkpointing current model weights. highest running_average_reward of -4.38 achieved!
episode_num 1455, curr_reward: 1.0, best_reward: 16.0, running_avg_reward: -4.38, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of -4.15 achieved!
episode_num 1456, curr_reward: 13.0, best_reward: 16.0, running_avg_reward: -4.15, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of -4.03 achieved!
episode_num 1457, curr_reward: -3.0, best_reward: 16.0, running_avg_reward: -4.03, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of -3.95 achieved!
episode_num 1458, curr_reward: 3.0, best_reward: 16.0, running_avg_reward: -3.95, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of -3.86 achieved!
episode_num 1459, curr_reward: 3.0, best_reward: 16.0, running_avg_reward: -3.86, curr_epsilon: 0.005
episode_num 1460, curr_reward: 5.0, best_reward: 16.0

checkpointing current model weights. highest running_average_reward of 0.11 achieved!
episode_num 1503, curr_reward: 3.0, best_reward: 17.0, running_avg_reward: 0.11, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of 0.2 achieved!
episode_num 1504, curr_reward: 2.0, best_reward: 17.0, running_avg_reward: 0.2, curr_epsilon: 0.005
episode_num 1505, curr_reward: -8.0, best_reward: 17.0, running_avg_reward: 0.19, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of 0.3 achieved!
episode_num 1506, curr_reward: 5.0, best_reward: 17.0, running_avg_reward: 0.3, curr_epsilon: 0.005
episode_num 1507, curr_reward: -9.0, best_reward: 17.0, running_avg_reward: 0.28, curr_epsilon: 0.005
episode_num 1508, curr_reward: -11.0, best_reward: 17.0, running_avg_reward: 0.24, curr_epsilon: 0.005
episode_num 1509, curr_reward: 2.0, best_reward: 17.0, running_avg_reward: 0.27, curr_epsilon: 0.005
checkpointing current model weights

episode_num 1563, curr_reward: -3.0, best_reward: 18.0, running_avg_reward: 1.65, curr_epsilon: 0.005
episode_num 1564, curr_reward: -3.0, best_reward: 18.0, running_avg_reward: 1.46, curr_epsilon: 0.005
episode_num 1565, curr_reward: -3.0, best_reward: 18.0, running_avg_reward: 1.44, curr_epsilon: 0.005
episode_num 1566, curr_reward: -2.0, best_reward: 18.0, running_avg_reward: 1.33, curr_epsilon: 0.005
episode_num 1567, curr_reward: 4.0, best_reward: 18.0, running_avg_reward: 1.43, curr_epsilon: 0.005
episode_num 1568, curr_reward: -1.0, best_reward: 18.0, running_avg_reward: 1.49, curr_epsilon: 0.005
episode_num 1569, curr_reward: -11.0, best_reward: 18.0, running_avg_reward: 1.43, curr_epsilon: 0.005
episode_num 1570, curr_reward: -1.0, best_reward: 18.0, running_avg_reward: 1.32, curr_epsilon: 0.005
episode_num 1571, curr_reward: 3.0, best_reward: 18.0, running_avg_reward: 1.22, curr_epsilon: 0.005
episode_num 1572, curr_reward: 11.0, best_reward: 18.0, running_avg_reward: 1.37, c

checkpointing current model weights. highest running_average_reward of 2.46 achieved!
episode_num 1636, curr_reward: -3.0, best_reward: 18.0, running_avg_reward: 2.46, curr_epsilon: 0.005
episode_num 1637, curr_reward: -2.0, best_reward: 18.0, running_avg_reward: 2.46, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of 2.49 achieved!
episode_num 1638, curr_reward: 5.0, best_reward: 18.0, running_avg_reward: 2.49, curr_epsilon: 0.005
episode_num 1639, curr_reward: -1.0, best_reward: 18.0, running_avg_reward: 2.4, curr_epsilon: 0.005
checkpointing current model weights. highest running_average_reward of 2.57 achieved!
episode_num 1640, curr_reward: 15.0, best_reward: 18.0, running_avg_reward: 2.57, curr_epsilon: 0.005
episode_num 1641, curr_reward: 4.0, best_reward: 18.0, running_avg_reward: 2.47, curr_epsilon: 0.005
episode_num 1642, curr_reward: -2.0, best_reward: 18.0, running_avg_reward: 2.43, curr_epsilon: 0.005
episode_num 1643, curr_reward: 