## 7장 벽돌깨기 게임 학습 프로그램

In [1]:
# 구현에 사용할 패키지 임포트
import numpy as np
from collections import deque
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym import spaces
from gym.spaces.box import Box


In [2]:
# 실행환경 설정
# 참고：https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py

import cv2
cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
    """Trick 1, No-Operation: start every episode with a random number of no-ops.

    After a reset the wrapper plays between 1 and ``noop_max`` no-op actions so
    that episodes begin from varied states rather than one fixed start state.
    """

    def __init__(self, env, noop_max=30):
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        # When set, this exact number of no-ops is used instead of a random draw.
        self.override_num_noops = None
        self.noop_action = 0
        # Action index 0 must be the emulator's NOOP for this trick to be valid.
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset the wrapped env, then apply the no-op action 1..noop_max times.

        Returns the observation produced by the last no-op step (or by the
        extra reset if that step ended the episode).
        """
        self.env.reset(**kwargs)

        noops = self.override_num_noops
        if noops is None:
            # Upper bound is exclusive, hence noop_max + 1.
            noops = self.unwrapped.np_random.randint(
                1, self.noop_max + 1)  # pylint: disable=E1101
        assert noops > 0

        obs = None
        for _step in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                # The episode ended during the no-ops: start over.
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    """Trick 2, Episodic Life: report the loss of a life as the end of an episode.

    The underlying game is only truly reset once it reports ``done`` itself, so
    after a lost life the next episode continues from the same block layout.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        # True only when the wrapped env itself signalled the end of the game.
        self.was_real_done = True

    def step(self, action):
        """Step the env and flag ``done`` whenever a life was lost."""
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done

        # Compare the current life count with the previous one. Requiring
        # remaining > 0 matters for games (e.g. Qbert) that stay at zero lives
        # for a few frames: we then wait for the env to advertise done itself.
        remaining = self.env.unwrapped.ale.lives()
        life_lost = 0 < remaining < self.lives
        # Always refresh the stored count so bonus lives are tracked as well.
        self.lives = remaining

        return obs, reward, (True if life_lost else done), info

    def reset(self, **kwargs):
        """Fully reset only after a real game over; otherwise just step past
        the lost-life state with a single no-op action."""
        if not self.was_real_done:
            # No-op step to advance from the terminal / lost-life state.
            obs, _reward, _done, _info = self.env.step(0)
        else:
            obs = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Trick 3, Max and Skip: repeat each action for ``skip`` frames.

    The observation returned is the element-wise maximum of the frames seen in
    the last two iterations of the skip loop; rewards are summed over the loop.
    """

    def __init__(self, env, skip=4):
        gym.Wrapper.__init__(self, env)
        # Two most recent raw frames, used for max pooling across time steps.
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat ``action``, accumulate the reward, and max-pool the buffer."""
        second_last = self._skip - 2
        last = self._skip - 1

        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == second_last:
                self._obs_buffer[0] = obs
            if i == last:
                self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break

        # The observation returned on a done=True frame does not matter.
        return self._obs_buffer.max(axis=0), total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its observation."""
        return self.env.reset(**kwargs)


class WarpFrame(gym.ObservationWrapper):
    """Trick 4, Warp Frame: convert frames to 84x84 grayscale, as in the
    DQN Nature paper implementation."""

    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        self.width = 84
        self.height = 84
        # Single-channel uint8 image with the channel axis last.
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(self.height, self.width, 1),
            dtype=np.uint8,
        )

    def observation(self, frame):
        """Return ``frame`` as a (height, width, 1) grayscale image."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(
            gray, (self.width, self.height), interpolation=cv2.INTER_AREA)
        # Append the trailing channel axis.
        return np.expand_dims(resized, axis=-1)


class WrapPyTorch(gym.ObservationWrapper):
    """Move the last observation axis to the front for PyTorch.

    ``observation`` applies ``transpose(2, 0, 1)``, so an observation of shape
    ``(d0, d1, d2)`` is returned with shape ``(d2, d0, d1)``. In this file the
    wrapped env is ``WarpFrame``, whose frames are (height, width, 1).
    """

    def __init__(self, env=None):
        super(WrapPyTorch, self).__init__(env)
        obs_shape = self.observation_space.shape
        # The advertised shape must match what observation() really returns:
        # transpose(2, 0, 1) gives (shape[2], shape[0], shape[1]). The previous
        # [2], [1], [0] ordering swapped the two spatial axes, which is only
        # harmless when both have the same size (as with 84x84 frames).
        self.observation_space = Box(
            self.observation_space.low[0, 0, 0],
            self.observation_space.high[0, 0, 0],
            [obs_shape[2], obs_shape[0], obs_shape[1]],
            dtype=self.observation_space.dtype)

    def observation(self, observation):
        """Return ``observation`` with its last axis moved to the front."""
        return observation.transpose(2, 0, 1)


In [3]:

# 실행환경 생성 함수

# 병렬 실행환경
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def make_env(env_id, seed, rank):
    """Return a zero-argument factory that builds one wrapped environment.

    SubprocVecEnv takes such factories so that every worker process can
    construct its own environment; ``seed + rank`` is used as that
    environment's seed.
    """

    def _thunk():
        # Raw emulator -> random no-op starts -> frame skipping with max pooling.
        base = gym.make(env_id)
        skipped = MaxAndSkipEnv(NoopResetEnv(base, noop_max=30), skip=4)
        skipped.seed(seed + rank)  # set the random seed
        # Life loss ends an episode -> 84x84 grayscale -> channel axis first.
        return WrapPyTorch(WarpFrame(EpisodicLifeEnv(skipped)))

    return _thunk


In [4]:
# 상수 정의

# Use BreakoutNoFrameskip-v4 rather than Breakout-v0: v0 automatically skips
# 2-4 frames on its own, so the variant without that feature is used here.
# References:
# https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26
# https://github.com/openai/gym/blob/5cb12296274020db9bb6378ce54276b31e7002da/gym/envs/__init__.py#L371
ENV_NAME = 'BreakoutNoFrameskip-v4'

NUM_SKIP_FRAME = 4  # number of frames to skip
NUM_STACK_FRAME = 4  # number of frames stacked into one state
NOOP_MAX = 30  # maximum number of no-op frames applied right after a reset
NUM_PROCESSES = 16  # number of environments run in parallel
NUM_ADVANCED_STEP = 5  # number of steps per advantage (n-step) rollout
GAMMA = 0.99  # time discount rate

TOTAL_FRAMES = 10e6  # total number of frames used for training
# Total number of network updates: one update consumes
# NUM_ADVANCED_STEP * NUM_PROCESSES frames, which gives 125,000 updates here.
NUM_UPDATES = int(TOTAL_FRAMES / NUM_ADVANCED_STEP / NUM_PROCESSES)


In [5]:
# A2C 손실함수를 계산하기 위한 상수
# Weights of the value-loss and entropy terms in the A2C loss, and the
# threshold used for gradient-norm clipping
value_loss_coef, entropy_coef, max_grad_norm = 0.5, 0.01, 0.5

# Settings for the RMSprop optimizer
lr, eps, alpha = 7e-4, 1e-5, 0.99


In [6]:
# GPU 사용 설정
# Pick the GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)


cuda


In [7]:
# 메모리 클래스 정의


class RolloutStorage(object):
    """Fixed-length rollout buffer used for n-step advantage learning.

    ``observations``, ``masks`` and ``returns`` hold ``num_steps + 1`` entries
    along the first axis, while ``rewards`` and ``actions`` hold ``num_steps``.
    Entry 0 of ``observations``/``masks`` is the state the rollout starts from.
    All tensors are placed on the module-level ``device``.
    """

    def __init__(self, num_steps, num_processes, obs_shape):
        """Allocate the buffers.

        Args:
            num_steps: number of transitions stored per rollout.
            num_processes: number of parallel environments.
            obs_shape: shape of one stacked observation, e.g. (4, 84, 84);
                it is unpacked into the trailing tensor dimensions.
        """
        # Remember the rollout length so that insert() wraps on this buffer's
        # own size rather than on a module-level constant.
        self.num_steps = num_steps

        self.observations = torch.zeros(
            num_steps + 1, num_processes, *obs_shape).to(device)
        self.masks = torch.ones(num_steps + 1, num_processes, 1).to(device)
        self.rewards = torch.zeros(num_steps, num_processes, 1).to(device)
        self.actions = torch.zeros(
            num_steps, num_processes, 1).long().to(device)

        # Discounted returns, filled in by compute_returns().
        self.returns = torch.zeros(num_steps + 1, num_processes, 1).to(device)
        self.index = 0  # slot the next transition is written to

    def insert(self, current_obs, action, reward, mask):
        """Store one transition at the current index and advance the index.

        The resulting observation and mask go to slot ``index + 1``; the
        reward and action that led there go to slot ``index``.
        """
        self.observations[self.index + 1].copy_(current_obs)
        self.masks[self.index + 1].copy_(mask)
        self.rewards[self.index].copy_(reward)
        self.actions[self.index].copy_(action)

        # Wrap on the buffer's own length (was the global NUM_ADVANCED_STEP,
        # which indexes out of range for any other num_steps).
        self.index = (self.index + 1) % self.num_steps

    def after_update(self):
        """Carry the newest observation and mask over to slot 0 so that the
        next rollout starts from where this one ended."""
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value):
        """Fill ``returns`` with the discounted return of every stored step.

        The last slot is bootstrapped with ``next_value``; the loop then walks
        backwards from the final step to the first, so the final step is a
        1-step return, the one before it a 2-step return, and so on. A mask of
        0 cuts the bootstrap at an episode boundary.
        """
        self.returns[-1] = next_value
        for ad_step in reversed(range(self.rewards.size(0))):
            self.returns[ad_step] = self.returns[ad_step + 1] * \
                GAMMA * self.masks[ad_step + 1] + self.rewards[ad_step]


In [8]:
# A2C 신경망 구성


def init(module, gain):
    """Initialise a layer in place and return it.

    The bias is set to zero and the weight is given an orthogonal
    initialisation scaled by ``gain``.
    """
    module.bias.data.fill_(0)
    nn.init.orthogonal_(module.weight.data, gain=gain)
    return module


class Flatten(nn.Module):
    """Layer that flattens everything after the first axis into one axis,
    turning the convolutional feature maps into a vector per sample."""

    def forward(self, x):
        batch_size = x.size(0)
        return x.view(batch_size, -1)


class Net(nn.Module):
    """Actor-critic network: a shared convolutional trunk followed by a
    critic head (state value) and an actor head (one logit per action)."""

    def __init__(self, n_out):
        """Build the network; ``n_out`` is the number of available actions."""
        super().__init__()

        relu_gain = nn.init.calculate_gain('relu')

        # Shared trunk. The input has NUM_STACK_FRAME channels because that
        # many frames are stacked into one state.
        # Output size per conv: (input - kernel + 2 * padding) / stride + 1
        self.conv = nn.Sequential(
            # 84x84 -> 20x20
            init(nn.Conv2d(NUM_STACK_FRAME, 32, kernel_size=8, stride=4),
                 gain=relu_gain),
            nn.ReLU(),
            # 20x20 -> 9x9
            init(nn.Conv2d(32, 64, kernel_size=4, stride=2), gain=relu_gain),
            nn.ReLU(),
            # 9x9 -> 7x7
            init(nn.Conv2d(64, 64, kernel_size=3, stride=1), gain=relu_gain),
            nn.ReLU(),
            # 64 feature maps of 7x7 -> one flat vector -> 512 features
            Flatten(),
            init(nn.Linear(64 * 7 * 7, 512), gain=relu_gain),
            nn.ReLU(),
        )

        # Critic head: a single output, the state value.
        self.critic = init(nn.Linear(512, 1), gain=1.0)

        # Actor head: one output per action.
        self.actor = init(nn.Linear(512, n_out), gain=0.01)

        # Put the network into training mode.
        self.train()

    def forward(self, x):
        """Return ``(state value, action logits)`` for a batch of states.

        Pixel values are divided by 255 to map [0, 255] onto [0, 1].
        """
        features = self.conv(x / 255.0)
        return self.critic(features), self.actor(features)

    def act(self, x):
        """Sample one action per state from the softmax policy."""
        _value, logits = self(x)
        # dim=1 runs the softmax across the action axis.
        return F.softmax(logits, dim=1).multinomial(num_samples=1)

    def get_value(self, x):
        """Return the critic's state value for state ``x``."""
        return self(x)[0]

    def evaluate_actions(self, x, actions):
        """Return the state value, the log-probability of the given
        ``actions``, and the mean entropy of the policy."""
        value, logits = self(x)

        # dim=1 runs both softmaxes across the action axis.
        log_probs = F.log_softmax(logits, dim=1)
        probs = F.softmax(logits, dim=1)

        # Log-probability of the action that was actually taken.
        action_log_probs = log_probs.gather(1, actions)
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        return value, action_log_probs, dist_entropy


In [9]:
# 에이전트의 두뇌 역할을 하는 클래스로, 모든 에이전트가 공유한다


class Brain(object):
    """Owns the shared actor-critic network and its optimizer, and applies
    one A2C update per rollout."""

    def __init__(self, actor_critic):
        """Store the network and create its RMSprop optimizer.

        Args:
            actor_critic: the network to train (an instance of ``Net``).
        """
        self.actor_critic = actor_critic

        # To start from previously trained weights:
        # filename = 'weight.pth'
        # param = torch.load(filename, map_location='cpu')
        # self.actor_critic.load_state_dict(param)

        # Optimizer that updates the network weights.
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr=lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        """Run one gradient step using every step stored in ``rollouts``.

        Reads ``rollouts.observations``, ``rollouts.actions`` and
        ``rollouts.returns``.
        """
        obs_shape = rollouts.observations.size()[2:]  # e.g. (4, 84, 84)
        # Take the rollout dimensions from the stored actions instead of the
        # module constants, so the update matches whatever buffer it is given.
        num_steps, num_processes = rollouts.actions.size()[:2]

        # Evaluate all stored steps in one flattened batch of
        # num_steps * num_processes states (the final bootstrap observation is
        # excluded by [:-1]).
        values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
            rollouts.observations[:-1].view(-1, *obs_shape),
            rollouts.actions.view(-1, 1))

        # Restore the (num_steps, num_processes, 1) layout used by returns.
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        # Detach the advantages so they act as constants in the policy term
        # and no gradient reaches the critic through it.
        action_gain = (advantages.detach() * action_log_probs).mean()

        total_loss = (value_loss * value_loss_coef -
                      action_gain - dist_entropy * entropy_coef)

        self.optimizer.zero_grad()  # clear old gradients
        total_loss.backward()  # backpropagate
        # Clip the overall gradient norm to max_grad_norm so that one update
        # cannot change the weights too much.
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(), max_grad_norm)

        self.optimizer.step()  # apply the weight update


In [10]:
# Breakout을 실행하는 환경 클래스


class Environment:
    """Driver that trains the shared A2C network on parallel Breakout
    environments."""

    def run(self):
        """Build the environments and network, then run the training loop.

        Every update collects NUM_ADVANCED_STEP steps from each of the
        NUM_PROCESSES environments, computes the discounted returns and
        applies one network update. Progress is printed every 500 updates and
        the weights are saved every 12500 updates and once more at the end.
        The worker processes are shut down when the method exits, including
        when it exits through an exception.
        """
        # Seed the random number generators
        seed_num = 1
        torch.manual_seed(seed_num)
        if use_cuda:
            torch.cuda.manual_seed(seed_num)

        # Use a single PyTorch CPU thread. The thread count used to be passed
        # as seed_num, which only gave 1 because the seed happens to be 1.
        torch.set_num_threads(1)

        # Build the multi-process environment
        envs = [make_env(ENV_NAME, seed_num, i) for i in range(NUM_PROCESSES)]
        envs = SubprocVecEnv(envs)

        try:
            # Create the Brain shared by all agents
            n_out = envs.action_space.n  # number of actions (4 for Breakout)
            actor_critic = Net(n_out).to(device)
            global_brain = Brain(actor_critic)

            # Storage
            obs_shape = envs.observation_space.shape  # (1, 84, 84)
            obs_shape = (obs_shape[0] * NUM_STACK_FRAME,
                         *obs_shape[1:])  # (4, 84, 84)
            # Stacked frames of every process: (16, 4, 84, 84)
            current_obs = torch.zeros(NUM_PROCESSES, *obs_shape).to(device)
            rollouts = RolloutStorage(
                NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)
            # Reward accumulated in the episode currently running
            episode_rewards = torch.zeros([NUM_PROCESSES, 1])
            # Total reward of the most recently finished episode
            final_rewards = torch.zeros([NUM_PROCESSES, 1])

            # Start from the initial state
            obs = envs.reset()
            obs = torch.from_numpy(obs).float()  # (16, 1, 84, 84)
            current_obs[:, -1:] = obs  # newest frame goes into the last slot

            # The current state is the first state of the rollout
            rollouts.observations[0].copy_(current_obs)

            # Main loop
            for j in tqdm(range(NUM_UPDATES)):
                # One iteration per step of the advantage rollout
                for step in range(NUM_ADVANCED_STEP):

                    # Choose the actions
                    with torch.no_grad():
                        action = actor_critic.act(rollouts.observations[step])

                    cpu_actions = action.squeeze(1).cpu().numpy()

                    # Advance all environments by one step;
                    # obs has shape (16, 1, 84, 84)
                    obs, reward, done, info = envs.step(cpu_actions)

                    # Turn the rewards from shape (16,) into a (16, 1) tensor
                    # and add them to the running episode totals
                    reward = np.expand_dims(np.stack(reward), 1)
                    reward = torch.from_numpy(reward).float()
                    episode_rewards += reward

                    # Per process: 0 if the episode just ended, otherwise 1
                    masks = torch.FloatTensor(
                        [[0.0] if done_ else [1.0] for done_ in done])

                    # For finished episodes, replace the recorded final reward
                    # with the total just reached; leave the others untouched
                    final_rewards *= masks
                    final_rewards += (1 - masks) * episode_rewards

                    # Restart the running total of finished episodes
                    episode_rewards *= masks

                    masks = masks.to(device)

                    # Clear the frame stack of finished episodes; the mask is
                    # broadcast from (16, 1) to (16, 1, 1, 1)
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)

                    # Shift the stack one frame towards the front and put the
                    # newest frame last. Source and destination are
                    # overlapping views of one tensor, so copy from a clone to
                    # keep the result independent of the copy order.
                    obs = torch.from_numpy(obs).float()
                    current_obs[:, :-1] = current_obs[:, 1:].clone()
                    current_obs[:, -1:] = obs

                    # Store this step's transition
                    rollouts.insert(current_obs, action.data, reward, masks)

                # Value estimate of the state reached after the last step,
                # used to bootstrap the returns
                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.observations[-1]).detach()

                # Compute the discounted return of every step
                rollouts.compute_returns(next_value)

                # Update the network, then roll the storage over
                global_brain.update(rollouts)
                rollouts.after_update()

                # Logging: print intermediate results
                if j % 500 == 0:
                    print("finished frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}".
                          format(j*NUM_PROCESSES*NUM_ADVANCED_STEP,
                                 final_rewards.mean(),
                                 final_rewards.median(),
                                 final_rewards.min(),
                                 final_rewards.max()))

                # Save the weights periodically
                if j % 12500 == 0:
                    torch.save(global_brain.actor_critic.state_dict(),
                               'weight_'+str(j)+'.pth')

            # End of the main loop: save the final weights
            torch.save(global_brain.actor_critic.state_dict(), 'weight_end.pth')
        finally:
            # Always shut down the worker processes, even after an error.
            envs.close()
        

In [11]:
# Run: create the training driver and start the training loop
breakout_env = Environment()
breakout_env.run()


  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  result = entry_point.load(False)
  0%|          | 5/125000 [00:00<1:43:39, 20.10it/s]

finished frames 0, mean/median reward 0.0/0.0, min/max reward 0.0/0.0


  0%|          | 506/125000 [00:19<1:20:56, 25.64it/s]

finished frames 40000, mean/median reward 0.5/0.0, min/max reward 0.0/2.0


  1%|          | 1004/125000 [00:38<1:17:40, 26.60it/s]

finished frames 80000, mean/median reward 1.0/0.0, min/max reward 0.0/4.0


  1%|          | 1505/125000 [00:58<1:17:22, 26.60it/s]

finished frames 120000, mean/median reward 0.4/0.0, min/max reward 0.0/3.0


  2%|▏         | 2004/125000 [01:17<1:14:06, 27.66it/s]

finished frames 160000, mean/median reward 0.6/0.0, min/max reward 0.0/2.0


  2%|▏         | 2506/125000 [01:36<1:12:05, 28.32it/s]

finished frames 200000, mean/median reward 0.6/0.0, min/max reward 0.0/2.0


  2%|▏         | 3006/125000 [01:54<1:17:20, 26.29it/s]

finished frames 240000, mean/median reward 1.6/1.0, min/max reward 0.0/6.0


  3%|▎         | 3503/125000 [02:14<1:36:06, 21.07it/s]

finished frames 280000, mean/median reward 1.8/1.0, min/max reward 1.0/5.0


  3%|▎         | 4004/125000 [02:33<1:19:11, 25.46it/s]

finished frames 320000, mean/median reward 2.3/2.0, min/max reward 1.0/5.0


  4%|▎         | 4505/125000 [02:52<1:15:48, 26.49it/s]

finished frames 360000, mean/median reward 2.1/2.0, min/max reward 1.0/4.0


  4%|▍         | 5006/125000 [03:10<1:13:29, 27.22it/s]

finished frames 400000, mean/median reward 2.9/2.0, min/max reward 0.0/10.0


  4%|▍         | 5504/125000 [03:29<1:14:57, 26.57it/s]

finished frames 440000, mean/median reward 4.0/3.0, min/max reward 0.0/10.0


  5%|▍         | 6005/125000 [03:49<1:11:48, 27.62it/s]

finished frames 480000, mean/median reward 3.4/3.0, min/max reward 0.0/11.0


  5%|▌         | 6506/125000 [04:07<1:12:09, 27.37it/s]

finished frames 520000, mean/median reward 5.6/4.0, min/max reward 3.0/15.0


  6%|▌         | 7004/125000 [04:27<1:21:00, 24.28it/s]

finished frames 560000, mean/median reward 5.1/4.0, min/max reward 0.0/13.0


  6%|▌         | 7505/125000 [04:46<1:11:20, 27.45it/s]

finished frames 600000, mean/median reward 5.3/5.0, min/max reward 1.0/15.0


  6%|▋         | 8006/125000 [05:04<1:09:07, 28.21it/s]

finished frames 640000, mean/median reward 7.2/6.0, min/max reward 3.0/18.0


  7%|▋         | 8506/125000 [05:21<1:07:01, 28.96it/s]

finished frames 680000, mean/median reward 6.6/5.0, min/max reward 0.0/17.0


  7%|▋         | 9006/125000 [05:39<1:08:16, 28.31it/s]

finished frames 720000, mean/median reward 6.8/6.0, min/max reward 2.0/13.0


  8%|▊         | 9506/125000 [05:57<1:10:10, 27.43it/s]

finished frames 760000, mean/median reward 7.5/5.0, min/max reward 3.0/28.0


  8%|▊         | 10006/125000 [06:14<1:06:09, 28.97it/s]

finished frames 800000, mean/median reward 4.9/3.0, min/max reward 1.0/15.0


  8%|▊         | 10506/125000 [06:33<1:13:22, 26.01it/s]

finished frames 840000, mean/median reward 7.4/7.0, min/max reward 1.0/20.0


  9%|▉         | 11004/125000 [06:52<1:15:37, 25.12it/s]

finished frames 880000, mean/median reward 12.4/10.0, min/max reward 4.0/29.0


  9%|▉         | 11502/125000 [07:11<1:14:05, 25.53it/s]

finished frames 920000, mean/median reward 7.9/5.0, min/max reward 0.0/30.0


 10%|▉         | 12003/125000 [07:32<1:08:06, 27.65it/s]

finished frames 960000, mean/median reward 9.0/7.0, min/max reward 1.0/23.0


 10%|█         | 12504/125000 [07:52<1:13:48, 25.40it/s]

finished frames 1000000, mean/median reward 4.6/4.0, min/max reward 0.0/12.0


 10%|█         | 13003/125000 [08:11<1:21:31, 22.90it/s]

finished frames 1040000, mean/median reward 5.5/5.0, min/max reward 0.0/15.0


 11%|█         | 13504/125000 [08:30<1:06:31, 27.93it/s]

finished frames 1080000, mean/median reward 7.1/6.0, min/max reward 2.0/17.0


 11%|█         | 14003/125000 [08:47<1:21:35, 22.67it/s]

finished frames 1120000, mean/median reward 6.8/6.0, min/max reward 0.0/15.0


 12%|█▏        | 14504/125000 [09:06<1:07:14, 27.39it/s]

finished frames 1160000, mean/median reward 14.8/5.0, min/max reward 1.0/119.0


 12%|█▏        | 15004/125000 [09:26<1:11:21, 25.69it/s]

finished frames 1200000, mean/median reward 15.7/12.0, min/max reward 4.0/56.0


 12%|█▏        | 15506/125000 [09:45<1:06:03, 27.62it/s]

finished frames 1240000, mean/median reward 8.2/6.0, min/max reward 0.0/25.0


 13%|█▎        | 16004/125000 [10:03<1:05:38, 27.67it/s]

finished frames 1280000, mean/median reward 7.1/5.0, min/max reward 1.0/21.0


 13%|█▎        | 16505/125000 [10:21<1:15:29, 23.95it/s]

finished frames 1320000, mean/median reward 13.8/3.0, min/max reward 0.0/116.0


 14%|█▎        | 17006/125000 [10:40<1:04:10, 28.05it/s]

finished frames 1360000, mean/median reward 13.6/12.0, min/max reward 2.0/33.0


 14%|█▍        | 17504/125000 [10:59<1:15:10, 23.83it/s]

finished frames 1400000, mean/median reward 17.4/14.0, min/max reward 5.0/49.0


 14%|█▍        | 18005/125000 [11:19<1:05:46, 27.11it/s]

finished frames 1440000, mean/median reward 21.2/9.0, min/max reward 0.0/192.0


 15%|█▍        | 18506/125000 [11:38<1:04:15, 27.62it/s]

finished frames 1480000, mean/median reward 22.4/13.0, min/max reward 5.0/172.0


 15%|█▌        | 19006/125000 [11:56<1:02:35, 28.22it/s]

finished frames 1520000, mean/median reward 10.6/9.0, min/max reward 1.0/28.0


 16%|█▌        | 19505/125000 [12:14<1:05:44, 26.75it/s]

finished frames 1560000, mean/median reward 10.9/5.0, min/max reward 0.0/65.0


 16%|█▌        | 20006/125000 [12:34<1:01:38, 28.38it/s]

finished frames 1600000, mean/median reward 12.2/7.0, min/max reward 1.0/40.0


 16%|█▋        | 20504/125000 [12:53<1:02:45, 27.75it/s]

finished frames 1640000, mean/median reward 1.9/1.0, min/max reward 0.0/8.0


 17%|█▋        | 21005/125000 [13:12<1:06:40, 25.99it/s]

finished frames 1680000, mean/median reward 16.0/11.0, min/max reward 1.0/40.0


 17%|█▋        | 21506/125000 [13:30<1:02:48, 27.47it/s]

finished frames 1720000, mean/median reward 16.1/11.0, min/max reward 3.0/60.0


 18%|█▊        | 22003/125000 [13:49<1:20:44, 21.26it/s]

finished frames 1760000, mean/median reward 12.6/11.0, min/max reward 3.0/33.0


 18%|█▊        | 22504/125000 [14:13<1:29:39, 19.05it/s]

finished frames 1800000, mean/median reward 19.1/11.0, min/max reward 0.0/53.0


 18%|█▊        | 23003/125000 [14:37<1:14:59, 22.67it/s]

finished frames 1840000, mean/median reward 27.8/20.0, min/max reward 4.0/104.0


 19%|█▉        | 23504/125000 [15:01<1:14:47, 22.62it/s]

finished frames 1880000, mean/median reward 14.1/8.0, min/max reward 1.0/58.0


 19%|█▉        | 24003/125000 [15:25<1:13:33, 22.88it/s]

finished frames 1920000, mean/median reward 11.2/5.0, min/max reward 2.0/40.0


 20%|█▉        | 24504/125000 [15:47<1:12:03, 23.25it/s]

finished frames 1960000, mean/median reward 8.9/7.0, min/max reward 0.0/22.0


 20%|██        | 25005/125000 [16:10<1:16:59, 21.64it/s]

finished frames 2000000, mean/median reward 26.3/13.0, min/max reward 0.0/105.0


 20%|██        | 25503/125000 [16:32<1:13:13, 22.65it/s]

finished frames 2040000, mean/median reward 17.8/9.0, min/max reward 2.0/85.0


 21%|██        | 26004/125000 [16:55<1:15:00, 22.00it/s]

finished frames 2080000, mean/median reward 31.4/21.0, min/max reward 4.0/140.0


 21%|██        | 26505/125000 [17:17<1:12:34, 22.62it/s]

finished frames 2120000, mean/median reward 21.9/15.0, min/max reward 0.0/56.0


 22%|██▏       | 27005/125000 [17:40<1:20:31, 20.28it/s]

finished frames 2160000, mean/median reward 25.2/16.0, min/max reward 1.0/152.0


 22%|██▏       | 27504/125000 [18:03<1:11:20, 22.78it/s]

finished frames 2200000, mean/median reward 32.9/15.0, min/max reward 2.0/185.0


 22%|██▏       | 28005/125000 [18:24<1:00:03, 26.92it/s]

finished frames 2240000, mean/median reward 46.8/34.0, min/max reward 0.0/200.0


 23%|██▎       | 28503/125000 [18:43<1:02:13, 25.85it/s]

finished frames 2280000, mean/median reward 31.2/21.0, min/max reward 1.0/129.0


 23%|██▎       | 29004/125000 [19:02<1:05:38, 24.37it/s]

finished frames 2320000, mean/median reward 28.6/11.0, min/max reward 0.0/257.0


 24%|██▎       | 29506/125000 [19:21<1:01:23, 25.92it/s]

finished frames 2360000, mean/median reward 19.9/14.0, min/max reward 4.0/49.0


 24%|██▍       | 30004/125000 [19:41<1:09:52, 22.66it/s]

finished frames 2400000, mean/median reward 14.1/10.0, min/max reward 0.0/51.0


 24%|██▍       | 30504/125000 [20:00<58:00, 27.15it/s]

finished frames 2440000, mean/median reward 16.3/12.0, min/max reward 1.0/77.0


 25%|██▍       | 31004/125000 [20:18<59:55, 26.14it/s]  

finished frames 2480000, mean/median reward 29.2/11.0, min/max reward 0.0/192.0


 25%|██▌       | 31506/125000 [20:37<57:02, 27.32it/s]

finished frames 2520000, mean/median reward 15.9/5.0, min/max reward 0.0/60.0


 26%|██▌       | 32004/125000 [20:56<56:19, 27.52it/s]

finished frames 2560000, mean/median reward 9.9/4.0, min/max reward 0.0/51.0


 26%|██▌       | 32504/125000 [21:15<56:25, 27.32it/s]

finished frames 2600000, mean/median reward 32.3/26.0, min/max reward 0.0/75.0


 26%|██▋       | 33004/125000 [21:34<54:27, 28.15it/s]

finished frames 2640000, mean/median reward 11.3/5.0, min/max reward 0.0/29.0


 27%|██▋       | 33505/125000 [21:53<57:15, 26.63it/s]

finished frames 2680000, mean/median reward 26.4/8.0, min/max reward 0.0/208.0


 27%|██▋       | 34006/125000 [22:10<53:25, 28.39it/s]

finished frames 2720000, mean/median reward 25.0/15.0, min/max reward 3.0/106.0


 28%|██▊       | 34503/125000 [22:30<1:05:19, 23.09it/s]

finished frames 2760000, mean/median reward 24.9/8.0, min/max reward 1.0/106.0


 28%|██▊       | 35004/125000 [22:54<1:09:35, 21.55it/s]

finished frames 2800000, mean/median reward 33.2/24.0, min/max reward 4.0/112.0


 28%|██▊       | 35506/125000 [23:14<53:04, 28.10it/s]

finished frames 2840000, mean/median reward 15.8/6.0, min/max reward 0.0/81.0


 29%|██▉       | 36004/125000 [23:34<1:09:37, 21.30it/s]

finished frames 2880000, mean/median reward 34.8/10.0, min/max reward 0.0/189.0


 29%|██▉       | 36506/125000 [23:55<55:23, 26.62it/s]

finished frames 2920000, mean/median reward 21.0/7.0, min/max reward 0.0/68.0


 30%|██▉       | 37005/125000 [24:13<1:01:01, 24.04it/s]

finished frames 2960000, mean/median reward 35.8/6.0, min/max reward 0.0/158.0


 30%|███       | 37504/125000 [24:33<52:25, 27.82it/s]

finished frames 3000000, mean/median reward 5.1/5.0, min/max reward 0.0/13.0


 30%|███       | 38005/125000 [24:51<50:37, 28.64it/s]

finished frames 3040000, mean/median reward 32.0/20.0, min/max reward 2.0/113.0


 31%|███       | 38506/125000 [25:10<53:07, 27.14it/s]

finished frames 3080000, mean/median reward 40.8/16.0, min/max reward 2.0/198.0


 31%|███       | 39004/125000 [25:29<1:05:08, 22.00it/s]

finished frames 3120000, mean/median reward 28.7/16.0, min/max reward 3.0/162.0


 32%|███▏      | 39505/125000 [25:49<51:11, 27.83it/s]

finished frames 3160000, mean/median reward 30.9/7.0, min/max reward 0.0/188.0


 32%|███▏      | 40005/125000 [26:07<50:07, 28.26it/s]

finished frames 3200000, mean/median reward 25.1/15.0, min/max reward 0.0/70.0


 32%|███▏      | 40504/125000 [26:28<1:13:31, 19.16it/s]

finished frames 3240000, mean/median reward 30.6/11.0, min/max reward 0.0/242.0


 33%|███▎      | 41005/125000 [26:51<1:05:42, 21.30it/s]

finished frames 3280000, mean/median reward 38.2/12.0, min/max reward 0.0/181.0


 33%|███▎      | 41503/125000 [27:14<1:05:59, 21.09it/s]

finished frames 3320000, mean/median reward 40.6/11.0, min/max reward 0.0/212.0


 34%|███▎      | 42004/125000 [27:34<47:36, 29.05it/s]

finished frames 3360000, mean/median reward 29.2/9.0, min/max reward 2.0/242.0


 34%|███▍      | 42504/125000 [27:54<53:13, 25.83it/s]

finished frames 3400000, mean/median reward 11.6/4.0, min/max reward 0.0/56.0


 34%|███▍      | 43006/125000 [28:13<49:33, 27.57it/s]

finished frames 3440000, mean/median reward 27.1/11.0, min/max reward 0.0/177.0


 35%|███▍      | 43505/125000 [28:33<58:32, 23.20it/s]

finished frames 3480000, mean/median reward 14.8/10.0, min/max reward 0.0/43.0


 35%|███▌      | 44004/125000 [28:53<1:05:50, 20.50it/s]

finished frames 3520000, mean/median reward 19.1/5.0, min/max reward 0.0/129.0


 36%|███▌      | 44505/125000 [29:19<1:07:36, 19.84it/s]

finished frames 3560000, mean/median reward 17.4/6.0, min/max reward 0.0/73.0


 36%|███▌      | 45004/125000 [29:42<1:00:49, 21.92it/s]

finished frames 3600000, mean/median reward 40.3/18.0, min/max reward 2.0/158.0


 36%|███▋      | 45505/125000 [30:04<58:17, 22.73it/s]  

finished frames 3640000, mean/median reward 39.8/13.0, min/max reward 0.0/271.0


 37%|███▋      | 46004/125000 [30:27<1:01:34, 21.38it/s]

finished frames 3680000, mean/median reward 25.7/16.0, min/max reward 0.0/82.0


 37%|███▋      | 46503/125000 [30:49<54:58, 23.79it/s]

finished frames 3720000, mean/median reward 53.8/10.0, min/max reward 0.0/327.0


 38%|███▊      | 47003/125000 [31:08<55:02, 23.62it/s]

finished frames 3760000, mean/median reward 40.6/26.0, min/max reward 0.0/113.0


 38%|███▊      | 47503/125000 [31:31<58:39, 22.02it/s]

finished frames 3800000, mean/median reward 76.1/51.0, min/max reward 0.0/290.0


 38%|███▊      | 48005/125000 [31:52<47:06, 27.24it/s]

finished frames 3840000, mean/median reward 58.8/24.0, min/max reward 0.0/304.0


 39%|███▉      | 48506/125000 [32:12<49:10, 25.93it/s]

finished frames 3880000, mean/median reward 56.1/31.0, min/max reward 1.0/237.0


 39%|███▉      | 49006/125000 [32:32<49:56, 25.36it/s]

finished frames 3920000, mean/median reward 31.4/13.0, min/max reward 1.0/186.0


 40%|███▉      | 49505/125000 [32:53<1:00:25, 20.83it/s]

finished frames 3960000, mean/median reward 70.2/45.0, min/max reward 2.0/365.0


 40%|████      | 50003/125000 [33:16<58:17, 21.44it/s]

finished frames 4000000, mean/median reward 45.2/18.0, min/max reward 0.0/164.0


 40%|████      | 50503/125000 [33:39<54:35, 22.74it/s]

finished frames 4040000, mean/median reward 40.8/9.0, min/max reward 0.0/269.0


 41%|████      | 51006/125000 [34:00<44:50, 27.50it/s]

finished frames 4080000, mean/median reward 33.2/16.0, min/max reward 2.0/188.0


 41%|████      | 51504/125000 [34:20<45:29, 26.93it/s]

finished frames 4120000, mean/median reward 6.4/4.0, min/max reward 0.0/18.0


 42%|████▏     | 52005/125000 [34:38<52:15, 23.28it/s]

finished frames 4160000, mean/median reward 20.8/12.0, min/max reward 1.0/80.0


 42%|████▏     | 52505/125000 [34:56<41:34, 29.06it/s]

finished frames 4200000, mean/median reward 9.1/6.0, min/max reward 1.0/37.0


 42%|████▏     | 53004/125000 [35:14<41:24, 28.98it/s]

finished frames 4240000, mean/median reward 38.5/12.0, min/max reward 2.0/283.0


 43%|████▎     | 53505/125000 [35:32<41:19, 28.83it/s]

finished frames 4280000, mean/median reward 6.0/3.0, min/max reward 0.0/49.0


 43%|████▎     | 54004/125000 [35:50<42:15, 28.01it/s]

finished frames 4320000, mean/median reward 70.8/20.0, min/max reward 6.0/296.0


 44%|████▎     | 54504/125000 [36:10<48:49, 24.07it/s]

finished frames 4360000, mean/median reward 23.9/4.0, min/max reward 0.0/214.0


 44%|████▍     | 55006/125000 [36:31<40:54, 28.52it/s]

finished frames 4400000, mean/median reward 20.9/14.0, min/max reward 1.0/66.0


 44%|████▍     | 55504/125000 [36:49<40:56, 28.29it/s]

finished frames 4440000, mean/median reward 48.0/27.0, min/max reward 4.0/200.0


 45%|████▍     | 56005/125000 [37:07<40:45, 28.21it/s]

finished frames 4480000, mean/median reward 7.2/6.0, min/max reward 0.0/21.0


 45%|████▌     | 56506/125000 [37:25<40:09, 28.43it/s]

finished frames 4520000, mean/median reward 31.0/8.0, min/max reward 0.0/173.0


 46%|████▌     | 57004/125000 [37:43<40:36, 27.91it/s]

finished frames 4560000, mean/median reward 8.4/2.0, min/max reward 0.0/40.0


 46%|████▌     | 57505/125000 [38:01<40:37, 27.69it/s]

finished frames 4600000, mean/median reward 35.3/17.0, min/max reward 0.0/180.0


 46%|████▋     | 58006/125000 [38:19<39:06, 28.55it/s]

finished frames 4640000, mean/median reward 19.4/13.0, min/max reward 0.0/57.0


 47%|████▋     | 58504/125000 [38:37<39:56, 27.75it/s]

finished frames 4680000, mean/median reward 27.4/7.0, min/max reward 0.0/157.0


 47%|████▋     | 59005/125000 [38:54<38:48, 28.34it/s]

finished frames 4720000, mean/median reward 16.1/8.0, min/max reward 0.0/114.0


 48%|████▊     | 59506/125000 [39:12<37:57, 28.75it/s]

finished frames 4760000, mean/median reward 36.4/35.0, min/max reward 4.0/94.0


 48%|████▊     | 60005/125000 [39:30<38:48, 27.91it/s]

finished frames 4800000, mean/median reward 109.6/15.0, min/max reward 0.0/319.0


 48%|████▊     | 60504/125000 [39:47<36:55, 29.12it/s]

finished frames 4840000, mean/median reward 54.7/19.0, min/max reward 1.0/319.0


 49%|████▉     | 61005/125000 [40:05<38:11, 27.93it/s]

finished frames 4880000, mean/median reward 33.7/10.0, min/max reward 0.0/253.0


 49%|████▉     | 61504/125000 [40:22<36:11, 29.24it/s]

finished frames 4920000, mean/median reward 65.9/15.0, min/max reward 4.0/355.0


 50%|████▉     | 62005/125000 [40:40<36:22, 28.86it/s]

finished frames 4960000, mean/median reward 34.2/16.0, min/max reward 0.0/183.0


 50%|█████     | 62506/125000 [40:58<37:19, 27.90it/s]

finished frames 5000000, mean/median reward 12.3/1.0, min/max reward 0.0/135.0


 50%|█████     | 63004/125000 [41:15<35:31, 29.08it/s]

finished frames 5040000, mean/median reward 14.4/8.0, min/max reward 0.0/63.0


 51%|█████     | 63505/125000 [41:33<35:57, 28.51it/s]

finished frames 5080000, mean/median reward 53.3/21.0, min/max reward 0.0/185.0


 51%|█████     | 64006/125000 [41:51<36:28, 27.87it/s]

finished frames 5120000, mean/median reward 36.1/19.0, min/max reward 2.0/215.0


 52%|█████▏    | 64504/125000 [42:08<35:16, 28.58it/s]

finished frames 5160000, mean/median reward 33.7/9.0, min/max reward 0.0/281.0


 52%|█████▏    | 65005/125000 [42:26<34:53, 28.66it/s]

finished frames 5200000, mean/median reward 68.1/32.0, min/max reward 4.0/256.0


 52%|█████▏    | 65506/125000 [42:44<35:10, 28.19it/s]

finished frames 5240000, mean/median reward 46.1/19.0, min/max reward 0.0/216.0


 53%|█████▎    | 66004/125000 [43:01<34:39, 28.37it/s]

finished frames 5280000, mean/median reward 31.9/12.0, min/max reward 0.0/253.0


 53%|█████▎    | 66505/125000 [43:19<35:21, 27.57it/s]

finished frames 5320000, mean/median reward 46.3/18.0, min/max reward 0.0/348.0


 54%|█████▎    | 67004/125000 [43:36<32:49, 29.45it/s]

finished frames 5360000, mean/median reward 48.3/15.0, min/max reward 1.0/260.0


 54%|█████▍    | 67505/125000 [43:54<33:37, 28.50it/s]

finished frames 5400000, mean/median reward 50.3/25.0, min/max reward 0.0/368.0


 54%|█████▍    | 68004/125000 [44:11<32:52, 28.89it/s]

finished frames 5440000, mean/median reward 17.0/10.0, min/max reward 0.0/55.0


 55%|█████▍    | 68505/125000 [44:29<32:54, 28.61it/s]

finished frames 5480000, mean/median reward 54.9/20.0, min/max reward 4.0/374.0


 55%|█████▌    | 69006/125000 [44:47<33:35, 27.78it/s]

finished frames 5520000, mean/median reward 54.8/21.0, min/max reward 1.0/374.0


 56%|█████▌    | 69504/125000 [45:04<31:31, 29.33it/s]

finished frames 5560000, mean/median reward 25.6/26.0, min/max reward 0.0/73.0


 56%|█████▌    | 70005/125000 [45:22<31:34, 29.03it/s]

finished frames 5600000, mean/median reward 64.2/29.0, min/max reward 0.0/344.0


 56%|█████▋    | 70506/125000 [45:40<31:43, 28.62it/s]

finished frames 5640000, mean/median reward 85.0/39.0, min/max reward 5.0/344.0


 57%|█████▋    | 71005/125000 [45:57<31:08, 28.90it/s]

finished frames 5680000, mean/median reward 38.0/10.0, min/max reward 1.0/344.0


 57%|█████▋    | 71506/125000 [46:15<30:56, 28.81it/s]

finished frames 5720000, mean/median reward 51.5/16.0, min/max reward 0.0/209.0


 58%|█████▊    | 72004/125000 [46:33<33:00, 26.76it/s]

finished frames 5760000, mean/median reward 46.9/25.0, min/max reward 5.0/198.0


 58%|█████▊    | 72506/125000 [46:50<31:00, 28.21it/s]

finished frames 5800000, mean/median reward 4.9/3.0, min/max reward 0.0/14.0


 58%|█████▊    | 73004/125000 [47:08<30:17, 28.61it/s]

finished frames 5840000, mean/median reward 35.1/13.0, min/max reward 0.0/126.0


 59%|█████▉    | 73505/125000 [47:26<29:47, 28.81it/s]

finished frames 5880000, mean/median reward 74.5/21.0, min/max reward 0.0/357.0


 59%|█████▉    | 74004/125000 [47:44<30:40, 27.70it/s]

finished frames 5920000, mean/median reward 30.1/16.0, min/max reward 4.0/168.0


 60%|█████▉    | 74505/125000 [48:01<29:34, 28.45it/s]

finished frames 5960000, mean/median reward 33.8/8.0, min/max reward 1.0/318.0


 60%|██████    | 75006/125000 [48:19<29:14, 28.50it/s]

finished frames 6000000, mean/median reward 31.4/9.0, min/max reward 1.0/164.0


 60%|██████    | 75504/125000 [48:37<29:52, 27.61it/s]

finished frames 6040000, mean/median reward 33.3/16.0, min/max reward 0.0/178.0


 61%|██████    | 76005/125000 [48:54<28:19, 28.83it/s]

finished frames 6080000, mean/median reward 26.3/18.0, min/max reward 0.0/83.0


 61%|██████    | 76506/125000 [49:12<28:24, 28.46it/s]

finished frames 6120000, mean/median reward 54.8/43.0, min/max reward 4.0/241.0


 62%|██████▏   | 77004/125000 [49:31<28:11, 28.37it/s]

finished frames 6160000, mean/median reward 20.0/6.0, min/max reward 0.0/76.0


 62%|██████▏   | 77505/125000 [49:49<28:33, 27.72it/s]

finished frames 6200000, mean/median reward 36.1/23.0, min/max reward 0.0/216.0


 62%|██████▏   | 78006/125000 [50:07<27:26, 28.54it/s]

finished frames 6240000, mean/median reward 73.8/13.0, min/max reward 0.0/343.0


 63%|██████▎   | 78504/125000 [50:25<29:12, 26.54it/s]

finished frames 6280000, mean/median reward 5.4/0.0, min/max reward 0.0/34.0


 63%|██████▎   | 79005/125000 [50:43<27:19, 28.06it/s]

finished frames 6320000, mean/median reward 25.2/4.0, min/max reward 0.0/286.0


 64%|██████▎   | 79506/125000 [51:01<26:21, 28.76it/s]

finished frames 6360000, mean/median reward 56.9/22.0, min/max reward 2.0/322.0


 64%|██████▍   | 80004/125000 [51:19<27:05, 27.67it/s]

finished frames 6400000, mean/median reward 50.7/31.0, min/max reward 0.0/268.0


 64%|██████▍   | 80505/125000 [51:36<25:26, 29.15it/s]

finished frames 6440000, mean/median reward 39.0/11.0, min/max reward 0.0/253.0


 65%|██████▍   | 81006/125000 [51:54<25:33, 28.70it/s]

finished frames 6480000, mean/median reward 22.7/10.0, min/max reward 0.0/65.0


 65%|██████▌   | 81504/125000 [52:11<25:21, 28.58it/s]

finished frames 6520000, mean/median reward 29.4/8.0, min/max reward 0.0/195.0


 66%|██████▌   | 82005/125000 [52:29<24:46, 28.93it/s]

finished frames 6560000, mean/median reward 33.7/15.0, min/max reward 0.0/133.0


 66%|██████▌   | 82506/125000 [52:47<24:48, 28.54it/s]

finished frames 6600000, mean/median reward 47.6/19.0, min/max reward 0.0/393.0


 66%|██████▋   | 83004/125000 [53:05<24:32, 28.53it/s]

finished frames 6640000, mean/median reward 52.6/26.0, min/max reward 0.0/225.0


 67%|██████▋   | 83505/125000 [53:23<24:19, 28.43it/s]

finished frames 6680000, mean/median reward 82.1/42.0, min/max reward 0.0/323.0


 67%|██████▋   | 84006/125000 [53:41<24:46, 27.58it/s]

finished frames 6720000, mean/median reward 61.7/23.0, min/max reward 0.0/285.0


 68%|██████▊   | 84504/125000 [53:58<23:06, 29.20it/s]

finished frames 6760000, mean/median reward 26.1/15.0, min/max reward 1.0/171.0


 68%|██████▊   | 85005/125000 [54:16<23:27, 28.42it/s]

finished frames 6800000, mean/median reward 43.1/24.0, min/max reward 3.0/218.0


 68%|██████▊   | 85506/125000 [54:34<23:38, 27.84it/s]

finished frames 6840000, mean/median reward 57.8/15.0, min/max reward 0.0/298.0


 69%|██████▉   | 86005/125000 [54:51<23:12, 28.00it/s]

finished frames 6880000, mean/median reward 50.4/14.0, min/max reward 0.0/323.0


 69%|██████▉   | 86506/125000 [55:09<22:21, 28.69it/s]

finished frames 6920000, mean/median reward 55.2/40.0, min/max reward 5.0/135.0


 70%|██████▉   | 87004/125000 [55:26<21:37, 29.28it/s]

finished frames 6960000, mean/median reward 85.1/40.0, min/max reward 7.0/342.0


 70%|███████   | 87506/125000 [55:44<21:46, 28.69it/s]

finished frames 7000000, mean/median reward 56.8/21.0, min/max reward 2.0/300.0


 70%|███████   | 88004/125000 [56:02<22:03, 27.96it/s]

finished frames 7040000, mean/median reward 47.1/6.0, min/max reward 0.0/301.0


 71%|███████   | 88505/125000 [56:19<20:57, 29.01it/s]

finished frames 7080000, mean/median reward 59.6/7.0, min/max reward 0.0/355.0


 71%|███████   | 89006/125000 [56:37<21:23, 28.04it/s]

finished frames 7120000, mean/median reward 41.4/4.0, min/max reward 0.0/355.0


 72%|███████▏  | 89504/125000 [56:54<20:28, 28.89it/s]

finished frames 7160000, mean/median reward 45.8/21.0, min/max reward 0.0/269.0


 72%|███████▏  | 90005/125000 [57:12<21:48, 26.74it/s]

finished frames 7200000, mean/median reward 38.7/7.0, min/max reward 0.0/254.0


 72%|███████▏  | 90506/125000 [57:30<20:11, 28.48it/s]

finished frames 7240000, mean/median reward 60.8/37.0, min/max reward 0.0/163.0


 73%|███████▎  | 91004/125000 [57:48<20:34, 27.54it/s]

finished frames 7280000, mean/median reward 24.2/7.0, min/max reward 0.0/234.0


 73%|███████▎  | 91505/125000 [58:05<19:21, 28.85it/s]

finished frames 7320000, mean/median reward 67.9/12.0, min/max reward 0.0/320.0


 74%|███████▎  | 92006/125000 [58:23<19:09, 28.70it/s]

finished frames 7360000, mean/median reward 42.1/12.0, min/max reward 0.0/295.0


 74%|███████▍  | 92504/125000 [58:41<20:04, 26.98it/s]

finished frames 7400000, mean/median reward 33.5/19.0, min/max reward 4.0/119.0


 74%|███████▍  | 93005/125000 [58:58<18:38, 28.61it/s]

finished frames 7440000, mean/median reward 47.8/18.0, min/max reward 0.0/243.0


 75%|███████▍  | 93506/125000 [59:16<18:32, 28.30it/s]

finished frames 7480000, mean/median reward 35.8/7.0, min/max reward 1.0/238.0


 75%|███████▌  | 94004/125000 [59:34<18:42, 27.61it/s]

finished frames 7520000, mean/median reward 39.8/19.0, min/max reward 1.0/187.0


 76%|███████▌  | 94505/125000 [59:52<18:11, 27.94it/s]

finished frames 7560000, mean/median reward 56.9/12.0, min/max reward 2.0/342.0


 76%|███████▌  | 95006/125000 [1:00:09<17:29, 28.57it/s]

finished frames 7600000, mean/median reward 38.4/28.0, min/max reward 0.0/113.0


 76%|███████▋  | 95504/125000 [1:00:27<17:10, 28.62it/s]

finished frames 7640000, mean/median reward 75.4/22.0, min/max reward 0.0/285.0


 77%|███████▋  | 96005/125000 [1:00:45<17:35, 27.47it/s]

finished frames 7680000, mean/median reward 85.9/58.0, min/max reward 0.0/266.0


 77%|███████▋  | 96506/125000 [1:01:03<16:50, 28.21it/s]

finished frames 7720000, mean/median reward 31.6/5.0, min/max reward 0.0/212.0


 78%|███████▊  | 97004/125000 [1:01:20<16:54, 27.61it/s]

finished frames 7760000, mean/median reward 49.4/22.0, min/max reward 4.0/310.0


 78%|███████▊  | 97505/125000 [1:01:38<16:07, 28.41it/s]

finished frames 7800000, mean/median reward 42.8/9.0, min/max reward 0.0/145.0


 78%|███████▊  | 98006/125000 [1:01:56<16:17, 27.63it/s]

finished frames 7840000, mean/median reward 14.1/6.0, min/max reward 0.0/61.0


 79%|███████▉  | 98504/125000 [1:02:13<15:16, 28.91it/s]

finished frames 7880000, mean/median reward 46.9/21.0, min/max reward 0.0/343.0


 79%|███████▉  | 99005/125000 [1:02:31<15:10, 28.55it/s]

finished frames 7920000, mean/median reward 43.1/35.0, min/max reward 9.0/118.0


 80%|███████▉  | 99506/125000 [1:02:49<15:39, 27.13it/s]

finished frames 7960000, mean/median reward 36.4/12.0, min/max reward 0.0/159.0


 80%|████████  | 100004/125000 [1:03:06<14:29, 28.74it/s]

finished frames 8000000, mean/median reward 42.2/16.0, min/max reward 0.0/248.0


 80%|████████  | 100505/125000 [1:03:24<14:15, 28.64it/s]

finished frames 8040000, mean/median reward 39.1/10.0, min/max reward 3.0/220.0


 81%|████████  | 101006/125000 [1:03:42<14:21, 27.85it/s]

finished frames 8080000, mean/median reward 12.4/7.0, min/max reward 0.0/80.0


 81%|████████  | 101504/125000 [1:03:59<13:39, 28.67it/s]

finished frames 8120000, mean/median reward 67.4/39.0, min/max reward 4.0/296.0


 82%|████████▏ | 102005/125000 [1:04:17<13:26, 28.53it/s]

finished frames 8160000, mean/median reward 21.9/9.0, min/max reward 0.0/54.0


 82%|████████▏ | 102506/125000 [1:04:35<13:14, 28.29it/s]

finished frames 8200000, mean/median reward 31.4/11.0, min/max reward 0.0/166.0


 82%|████████▏ | 103004/125000 [1:04:52<13:07, 27.94it/s]

finished frames 8240000, mean/median reward 122.3/44.0, min/max reward 0.0/310.0


 83%|████████▎ | 103505/125000 [1:05:10<12:32, 28.56it/s]

finished frames 8280000, mean/median reward 7.4/2.0, min/max reward 0.0/46.0


 83%|████████▎ | 104006/125000 [1:05:28<12:10, 28.76it/s]

finished frames 8320000, mean/median reward 54.6/29.0, min/max reward 3.0/165.0


 84%|████████▎ | 104504/125000 [1:05:46<12:15, 27.86it/s]

finished frames 8360000, mean/median reward 62.7/10.0, min/max reward 2.0/212.0


 84%|████████▍ | 105005/125000 [1:06:03<11:45, 28.36it/s]

finished frames 8400000, mean/median reward 58.2/24.0, min/max reward 0.0/285.0


 84%|████████▍ | 105506/125000 [1:06:21<11:35, 28.05it/s]

finished frames 8440000, mean/median reward 52.7/27.0, min/max reward 4.0/261.0


 85%|████████▍ | 106004/125000 [1:06:39<11:28, 27.59it/s]

finished frames 8480000, mean/median reward 64.5/11.0, min/max reward 0.0/336.0


 85%|████████▌ | 106505/125000 [1:06:57<11:19, 27.23it/s]

finished frames 8520000, mean/median reward 15.9/6.0, min/max reward 2.0/66.0


 86%|████████▌ | 107006/125000 [1:07:14<10:32, 28.43it/s]

finished frames 8560000, mean/median reward 39.3/19.0, min/max reward 0.0/247.0


 86%|████████▌ | 107504/125000 [1:07:32<10:07, 28.81it/s]

finished frames 8600000, mean/median reward 24.4/7.0, min/max reward 0.0/173.0


 86%|████████▋ | 108005/125000 [1:07:50<10:14, 27.66it/s]

finished frames 8640000, mean/median reward 57.3/26.0, min/max reward 0.0/303.0


 87%|████████▋ | 108504/125000 [1:08:07<09:30, 28.93it/s]

finished frames 8680000, mean/median reward 68.2/16.0, min/max reward 3.0/384.0


 87%|████████▋ | 109005/125000 [1:08:25<09:29, 28.06it/s]

finished frames 8720000, mean/median reward 36.9/13.0, min/max reward 0.0/281.0


 88%|████████▊ | 109506/125000 [1:08:43<09:10, 28.16it/s]

finished frames 8760000, mean/median reward 56.6/19.0, min/max reward 0.0/342.0


 88%|████████▊ | 110004/125000 [1:09:00<08:37, 28.95it/s]

finished frames 8800000, mean/median reward 43.3/22.0, min/max reward 0.0/163.0


 88%|████████▊ | 110505/125000 [1:09:18<08:30, 28.37it/s]

finished frames 8840000, mean/median reward 23.8/12.0, min/max reward 1.0/65.0


 89%|████████▉ | 111006/125000 [1:09:36<08:37, 27.07it/s]

finished frames 8880000, mean/median reward 65.4/53.0, min/max reward 2.0/266.0


 89%|████████▉ | 111504/125000 [1:09:54<08:10, 27.54it/s]

finished frames 8920000, mean/median reward 81.1/26.0, min/max reward 0.0/320.0


 90%|████████▉ | 112005/125000 [1:10:11<07:29, 28.91it/s]

finished frames 8960000, mean/median reward 74.6/23.0, min/max reward 0.0/286.0


 90%|█████████ | 112506/125000 [1:10:29<07:21, 28.29it/s]

finished frames 9000000, mean/median reward 82.8/10.0, min/max reward 0.0/382.0


 90%|█████████ | 113004/125000 [1:10:47<07:15, 27.53it/s]

finished frames 9040000, mean/median reward 39.1/13.0, min/max reward 0.0/271.0


 91%|█████████ | 113505/125000 [1:11:05<07:04, 27.11it/s]

finished frames 9080000, mean/median reward 7.3/1.0, min/max reward 0.0/52.0


 91%|█████████ | 114006/125000 [1:11:22<06:20, 28.91it/s]

finished frames 9120000, mean/median reward 21.8/12.0, min/max reward 0.0/79.0


 92%|█████████▏| 114504/125000 [1:11:40<06:08, 28.52it/s]

finished frames 9160000, mean/median reward 46.7/11.0, min/max reward 0.0/168.0


 92%|█████████▏| 115005/125000 [1:11:58<05:54, 28.22it/s]

finished frames 9200000, mean/median reward 20.4/8.0, min/max reward 0.0/79.0


 92%|█████████▏| 115506/125000 [1:12:16<05:42, 27.72it/s]

finished frames 9240000, mean/median reward 54.4/17.0, min/max reward 0.0/352.0


 93%|█████████▎| 116004/125000 [1:12:33<05:10, 28.93it/s]

finished frames 9280000, mean/median reward 26.6/4.0, min/max reward 0.0/269.0


 93%|█████████▎| 116505/125000 [1:12:51<05:05, 27.85it/s]

finished frames 9320000, mean/median reward 70.2/16.0, min/max reward 0.0/259.0


 94%|█████████▎| 117006/125000 [1:13:09<04:44, 28.12it/s]

finished frames 9360000, mean/median reward 24.5/5.0, min/max reward 0.0/127.0


 94%|█████████▍| 117504/125000 [1:13:26<04:24, 28.31it/s]

finished frames 9400000, mean/median reward 14.6/11.0, min/max reward 3.0/42.0


 94%|█████████▍| 118005/125000 [1:13:44<04:02, 28.84it/s]

finished frames 9440000, mean/median reward 46.9/18.0, min/max reward 4.0/265.0


 95%|█████████▍| 118506/125000 [1:14:02<03:48, 28.36it/s]

finished frames 9480000, mean/median reward 32.3/9.0, min/max reward 0.0/175.0


 95%|█████████▌| 119004/125000 [1:14:20<03:32, 28.24it/s]

finished frames 9520000, mean/median reward 35.6/16.0, min/max reward 1.0/165.0


 96%|█████████▌| 119505/125000 [1:14:37<03:50, 23.89it/s]

finished frames 9560000, mean/median reward 77.9/22.0, min/max reward 0.0/301.0


 96%|█████████▌| 120006/125000 [1:14:56<03:01, 27.53it/s]

finished frames 9600000, mean/median reward 31.0/8.0, min/max reward 0.0/262.0


 96%|█████████▋| 120504/125000 [1:15:14<02:39, 28.13it/s]

finished frames 9640000, mean/median reward 62.4/18.0, min/max reward 0.0/291.0


 97%|█████████▋| 121005/125000 [1:15:32<02:22, 27.97it/s]

finished frames 9680000, mean/median reward 43.1/9.0, min/max reward 0.0/347.0


 97%|█████████▋| 121506/125000 [1:15:50<02:05, 27.82it/s]

finished frames 9720000, mean/median reward 34.2/8.0, min/max reward 0.0/310.0


 98%|█████████▊| 122004/125000 [1:16:08<01:47, 27.95it/s]

finished frames 9760000, mean/median reward 50.9/13.0, min/max reward 0.0/240.0


 98%|█████████▊| 122505/125000 [1:16:26<01:29, 27.72it/s]

finished frames 9800000, mean/median reward 33.4/10.0, min/max reward 1.0/300.0


 98%|█████████▊| 123006/125000 [1:16:43<01:09, 28.59it/s]

finished frames 9840000, mean/median reward 52.8/14.0, min/max reward 0.0/264.0


 99%|█████████▉| 123504/125000 [1:17:01<01:05, 22.80it/s]

finished frames 9880000, mean/median reward 29.0/12.0, min/max reward 1.0/89.0


 99%|█████████▉| 124005/125000 [1:17:19<00:35, 28.37it/s]

finished frames 9920000, mean/median reward 14.6/4.0, min/max reward 0.0/62.0


100%|█████████▉| 124506/125000 [1:17:38<00:17, 27.49it/s]

finished frames 9960000, mean/median reward 47.0/17.0, min/max reward 0.0/289.0


100%|██████████| 125000/125000 [1:17:55<00:00, 26.73it/s]
