In [1]:
import numpy as np
import os
os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces


USE_PIL = True
if USE_PIL:
    # you should use pillow-simd, as it is faster than standard Pillow
    from PIL import Image
else:
    import cv2
    cv2.ocl.setUseOpenCL(False)


class TimeLimit(gym.Wrapper):
    """End an episode once a fixed number of steps has been taken.

    Args:
        env: environment to wrap.
        max_episode_steps: number of steps after which ``done`` is forced to
            ``True``. ``None`` (the default) disables the limit.
    """

    def __init__(self, env, max_episode_steps=None):
        super(TimeLimit, self).__init__(env)
        self._max_episode_steps = max_episode_steps
        self._elapsed_steps = 0

    def step(self, ac):
        """Step the wrapped env and flag truncation when the budget is spent.

        Returns:
            The ``(observation, reward, done, info)`` tuple of the wrapped
            env; on truncation ``done`` is ``True`` and
            ``info['TimeLimit.truncated']`` is set to ``True``.
        """
        observation, reward, done, info = self.env.step(ac)
        self._elapsed_steps += 1
        limit = self._max_episode_steps
        # Comparing an int with None raises TypeError on Python 3, so the
        # "no limit" default has to be checked explicitly.
        if limit is not None and self._elapsed_steps >= limit:
            done = True
            info['TimeLimit.truncated'] = True
        return observation, reward, done, info

    def reset(self, **kwargs):
        """Reset the step counter and the wrapped environment."""
        self._elapsed_steps = 0
        return self.env.reset(**kwargs)

class ClipActionsWrapper(gym.Wrapper):
    """Sanitise actions before they reach the wrapped environment.

    Each action is passed through ``np.nan_to_num`` and then clipped to the
    ``low``/``high`` bounds of ``self.action_space``.
    """

    def step(self, action):
        """Clean and clip ``action``, then step the wrapped environment."""
        # Relies on the module-level numpy import; no per-call import needed.
        action = np.nan_to_num(action)
        action = np.clip(action, self.action_space.low, self.action_space.high)
        return self.env.step(action)

    def reset(self, **kwargs):
        """Forward the reset to the wrapped environment unchanged."""
        return self.env.reset(**kwargs)


class NoopResetEnv(gym.Wrapper):
    """Start every episode with a random number of no-op actions."""

    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.

        No-op is assumed to be action 0 (asserted against the action
        meanings of the unwrapped environment).
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        # When set, this fixed count is used instead of a random draw.
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset, then perform between 1 and ``noop_max`` no-op steps.

        If the episode ends during the no-ops the environment is reset again
        and the remaining no-ops continue from there.
        """
        self.env.reset(**kwargs)
        num_noops = self.override_num_noops
        if num_noops is None:
            num_noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)  # pylint: disable=E1101
        assert num_noops > 0
        frame = None
        for _step in range(num_noops):
            frame, _, finished, _ = self.env.step(self.noop_action)
            if finished:
                frame = self.env.reset(**kwargs)
        return frame

    def step(self, ac):
        """Pass the action straight through to the wrapped environment."""
        return self.env.step(ac)

class FireResetEnv(gym.Wrapper):
    """Press FIRE after every reset for games that stay frozen until fired."""

    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing.

        Asserts that action 1 is 'FIRE' and that at least three actions exist
        (action 2 is used as the second priming step).
        """
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        """Reset the env, then take actions 1 and 2 to get the game going.

        Returns:
            The observation the agent should act on next.
        """
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            # The env was just reset, so hand back the fresh observation
            # rather than the terminal frame produced by the step above.
            obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Pass the action straight through to the wrapped environment."""
        return self.env.step(ac)

class EpisodicLifeEnv(gym.Wrapper):
    """Treat the loss of a life as the end of an episode."""

    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.

        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        """Step the env and report ``done`` whenever a life was lost."""
        obs, reward, game_over, info = self.env.step(action)
        self.was_real_done = game_over
        remaining = self.env.unwrapped.ale.lives()
        # Only a drop to a still-positive life count is turned into a
        # terminal signal: for Qbert the counter can sit at 0 for a few
        # frames, and the real reset must wait until the env reports done.
        lost_life = 0 < remaining < self.lives
        # Always store the new count so bonus lives are tracked as well.
        self.lives = remaining
        done = True if lost_life else game_over
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.

        After a mere life loss a single no-op step (action 0) is taken
        instead, so all states stay reachable and the learner does not need
        to know about the episodic-life trick.
        """
        if not self.was_real_done:
            # no-op step to advance from terminal/lost life state
            obs, _reward, _done, _info = self.env.step(0)
        else:
            obs = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and max-pool the last two frames."""

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # Two-slot uint8 buffer holding the most recent raw observations
        # (used for max pooling across time steps).
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        reward_sum = 0.0
        done = None
        final_idx = self._skip - 1
        for idx in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if idx == final_idx - 1:
                self._obs_buffer[0] = obs
            elif idx == final_idx:
                self._obs_buffer[1] = obs
            reward_sum += reward
            if done:
                break
        # The observation on the done=True frame doesn't matter, so the
        # (possibly stale) buffer content is pooled unconditionally.
        pooled = self._obs_buffer.max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self, **kwargs):
        """Forward the reset to the wrapped environment."""
        return self.env.reset(**kwargs)

class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that keeps only the sign of every reward."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        clipped = np.sign(reward)
        return clipped


class WarpFrame(gym.ObservationWrapper):
    """Resize frames (and optionally convert them to grayscale)."""

    def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
        """
        Warp frames to 84x84 as done in the Nature paper and later work.

        For environments with dictionary observations, `dict_space_key`
        selects the entry that should be warped; the other entries are left
        untouched.
        """
        super().__init__(env)
        self._width = width
        self._height = height
        self._grayscale = grayscale
        self._key = dict_space_key
        channels = 1 if self._grayscale else 3

        warped_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(self._height, self._width, channels),
            dtype=np.uint8,
        )
        if self._key is not None:
            source_space = self.observation_space.spaces[self._key]
            self.observation_space.spaces[self._key] = warped_space
        else:
            source_space = self.observation_space
            self.observation_space = warped_space
        # The wrapped space must describe a 3-D uint8 image.
        assert source_space.dtype == np.uint8 and len(source_space.shape) == 3

    def observation(self, obs):
        """Return ``obs`` with its image resized to (height, width, channels)."""
        keyed = self._key is not None
        frame = obs[self._key] if keyed else obs

        if USE_PIL:
            image = Image.fromarray(frame)
            if self._grayscale:
                image = image.convert("L")
            image = image.resize((self._width, self._height))
            frame = np.array(image)
        else:
            if self._grayscale:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            frame = cv2.resize(
                frame, (self._width, self._height),
                interpolation=cv2.INTER_AREA
            )
        if self._grayscale:
            # Grayscale conversion drops the channel axis; add it back.
            frame = np.expand_dims(frame, -1)

        if not keyed:
            return frame
        # Copy so the caller's dictionary observation is not modified.
        obs = obs.copy()
        obs[self._key] = frame
        return obs

class LazyFrames(object):
    """Lazily concatenated stack of frames.

    The object keeps a reference to the list of frames it was given and only
    concatenates them (along axis 0) the first time the data is needed. The
    result is cached and the list reference is dropped afterwards. It exists
    purely to optimize memory usage, which can be huge for DQN's 1M frames
    replay buffers, so it should only be converted to a numpy array right
    before being passed to the model.
    """

    def __init__(self, frames):
        self._frames = frames
        self._out = None

    def _force(self):
        """Return the concatenated array, building and caching it on first use."""
        if self._out is not None:
            return self._out
        self._out = np.concatenate(self._frames, axis=0)
        self._frames = None
        return self._out

    def __array__(self, dtype=None):
        """numpy conversion hook; casts only when ``dtype`` is given."""
        merged = self._force()
        return merged if dtype is None else merged.astype(dtype)

    def __len__(self):
        return len(self._force())

    def __getitem__(self, i):
        return self._force()[i]

    def count(self):
        """Return the shape of the concatenated array without its first axis."""
        merged = self._force()
        return merged.shape[1:merged.ndim]

    def frame(self, i):
        """Return entry ``i`` along the first axis of the concatenated array."""
        return self._force()[i, ...]


class FrameStack(gym.Wrapper):
    """Stack the ``k`` most recent observations along the first axis."""

    def __init__(self, env, k):
        """Stack k last frames.

        Returns lazy array, which is much more memory efficient.

        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        # Bounded deque: appending a new frame evicts the oldest one.
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        # LazyFrames concatenates on axis 0, hence the first dimension grows.
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(shp[0] * k, shp[1], shp[2]),
            dtype=env.observation_space.dtype,
        )

    def reset(self, **kwargs):
        """Reset the env and fill the stack with ``k`` copies of the first frame.

        Keyword arguments are forwarded to the wrapped environment, matching
        the other wrappers in this file.
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        """Step the env, push the new frame and return the stacked observation."""
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Wrap the current ``k`` frames in a ``LazyFrames`` object."""
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))

class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to float32 and divide them by 255."""

    def __init__(self, env):
        super().__init__(env)
        unit_box = gym.spaces.Box(
            low=0,
            high=1,
            shape=env.observation_space.shape,
            dtype=np.float32,
        )
        self.observation_space = unit_box

    def observation(self, observation):
        """Return the observation as a float32 array divided by 255."""
        # careful! This undoes the memory optimization, use
        # with smaller replay buffers only.
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0

class SkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and return the last observation."""

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat the action, sum the rewards and return the last frame seen.

        The repetition stops early as soon as the wrapped env reports done.
        """
        reward_sum = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            reward_sum += reward
            if done:
                break
        return obs, reward_sum, done, info

    def reset(self, **kwargs):
        """Forward the reset to the wrapped environment."""
        return self.env.reset(**kwargs)


def make_atari(env_id, max_episode_steps=None,
               skip_noop=False, skip_maxskip=False):
    """Create an Atari env with the basic reset/frame-skip wrappers applied.

    Args:
        env_id: gym id; its spec id must contain 'NoFrameskip'.
        max_episode_steps: if not ``None``, wrap the env in ``TimeLimit``.
        skip_noop: if true, do not apply ``NoopResetEnv``.
        skip_maxskip: if true, use ``SkipEnv`` instead of ``MaxAndSkipEnv``.

    Returns:
        The wrapped environment.
    """
    env = gym.make(env_id)
    assert 'NoFrameskip' in env.spec.id
    if not skip_noop:
        env = NoopResetEnv(env, noop_max=30)
    # Exactly one of the two frame-skip wrappers is always applied.
    skip_wrapper = SkipEnv if skip_maxskip else MaxAndSkipEnv
    env = skip_wrapper(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env


class ImageToPyTorch(gym.ObservationWrapper):
    """
    Change image shape to CWH by swapping the first and the third axis.

    The declared observation space mirrors what ``observation`` returns:
    the wrapped shape with axes 0 and 2 exchanged, and the value range and
    dtype of the wrapped space (no rescaling happens here).
    """
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_space = self.observation_space
        old_shape = old_space.shape
        # swapaxes(obs, 2, 0) turns (H, W, C) into (C, W, H), so the declared
        # shape has to use that order as well.
        new_shape = (old_shape[2], old_shape[1], old_shape[0])
        self.observation_space = gym.spaces.Box(
            low=float(old_space.low.min()),
            high=float(old_space.high.max()),
            shape=new_shape,
            dtype=old_space.dtype)

    def observation(self, observation):
        """Return the observation with axes 0 and 2 swapped."""
        return np.swapaxes(observation, 2, 0)


def wrap_deepmind(env, episode_life=True, clip_rewards=True,
                  frame_stack=False, scale=False, pytorch_img=False,
                  frame_stack_count=4, skip_firereset=False):
    """Configure environment for DeepMind-style Atari.

    Wrappers are applied in this order (each only when enabled):
    ``EpisodicLifeEnv``, ``FireResetEnv`` (only if the game has a 'FIRE'
    action and ``skip_firereset`` is false), ``WarpFrame`` (always),
    ``ImageToPyTorch``, ``ScaledFloatFrame``, ``ClipRewardEnv`` and
    ``FrameStack`` with ``frame_stack_count`` frames.
    """
    if episode_life:
        env = EpisodicLifeEnv(env)

    has_fire = 'FIRE' in env.unwrapped.get_action_meanings()
    if has_fire and not skip_firereset:
        env = FireResetEnv(env)

    env = WarpFrame(env)

    optional_wrappers = (
        (pytorch_img, ImageToPyTorch),
        (scale, ScaledFloatFrame),
        (clip_rewards, ClipRewardEnv),
    )
    for enabled, wrapper in optional_wrappers:
        if enabled:
            env = wrapper(env)

    if frame_stack:
        env = FrameStack(env, frame_stack_count)
    return env

In [2]:
from typing import Tuple, List, Union
from collections import namedtuple, deque


# One transition as stored in the replay memory.
Experience = namedtuple(
    "Experience",
    ["state", "action", "reward", "done", "next_state"],
)


class ReplayMemory:
    """
    Uniform experience replay buffer (original Replay Memory by Lin).

    Meant for vanilla DQN: no prioritized replay and no bootstrapping with
    n>1. Stores experiences in a bounded deque, so the oldest entries are
    dropped once the capacity is reached.
    """

    def __init__(self, capacity: int) -> None:
        """
        Args:
            capacity: maximum number of experiences kept in the buffer
        """
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, sample: Experience) -> None:
        """
        Store one experience.

        Args:
            sample: tuple(state, action, reward, done, next_state)
        """
        self.buffer.append(sample)

    def sample(self, batch_size: int = 1) -> Tuple:
        """
        Draw a batch uniformly at random, without replacement.

        Args:
            batch_size: number of experiences to draw

        Returns:
            Arrays (states, actions, rewards, dones, next_states); rewards
            are float32 and dones are bool.
        """
        picked = np.random.choice(len(self), batch_size, replace=False)
        # Transpose the list of experiences into one sequence per field.
        columns = zip(*(self.buffer[i] for i in picked))
        states, actions, rewards, dones, next_states = columns

        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=bool),
            np.array(next_states),
        )

    

    

   


import torch
import torch.nn as nn
    
class DQN(nn.Module):
    """
    Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        n_in: input shape as (channels, dim1, dim2); only ``n_in[0]`` is used
            for the first conv layer, the full shape sizes the linear head.
        n_out: number of outputs (one Q-value per action).
    """
    def __init__(self, n_in, n_out):
        super(DQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(n_in[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(n_in)

        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_out)
        )

        # Kept so that code reading ``net.dtype`` keeps working; forward()
        # does not depend on it.
        self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv stack output for ``shape``."""
        # Shape probe only - no need to record an autograd graph.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        """Return the network output for a batch ``x``.

        The input (frames are stored as uint8) is cast to float32 and moved
        to the device the parameters live on, so the model also works when it
        sits on the CPU of a CUDA-capable machine.
        NOTE(review): values are cast but not rescaled here - confirm whether
        a division by 255 is expected upstream.
        """
        param = next(self.parameters())
        x = x.to(device=param.device, dtype=torch.float32)
        conv_out = self.conv(x).view(x.size()[0], -1)
        return self.fc(conv_out)

In [3]:
import torch.optim as optim
import torch.nn.functional as F
import time

# --- hyperparameters -------------------------------------------------------
RM_CAPACITY = 100000               # replay memory size
GAMMA = 0.99                       # discount factor
N_EPISODES = 1000                  # number of training episodes
EPSILON_START = 1.0                # initial exploration rate
EPSILON_END = 0.01                 # final exploration rate
EPSILON_DECAY_LAST_FRAME = 100000  # frames over which epsilon decays linearly
LEARNING_RATE = 1e-5
MEMORY_START_SIZE = 10000          # learning starts once the memory is larger than this
BATCH_SIZE = 32
TARGET_UPDATE_AFTER = 10000        # frames between target network syncs

# --- bookkeeping -----------------------------------------------------------
frame_idx = 0
ts_frame = 0
obtained_returns = []

# --- environment, networks, optimizer --------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
replay_memory = ReplayMemory(RM_CAPACITY)
env_name = 'PongNoFrameskip-v4'
env = wrap_deepmind(
    make_atari(env_name, skip_noop=False, skip_maxskip=True),
    pytorch_img=True,
    frame_stack=False,
)

target_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
policy_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Both networks start from the same weights.
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)


def calc_loss_old(batch, policy_net, target_net, device=device):
    """Compute the smooth-L1 loss between Q(s, a) and the one-step TD target.

    Args:
        batch: (states, actions, rewards, dones, next_states) numpy arrays.
        policy_net: network whose Q-values are being trained.
        target_net: network used to evaluate the next states.
        device: torch device the tensors are moved to.
    """
    states, actions, rewards, dones, next_states = batch

    def on_device(array):
        return torch.from_numpy(array).to(device)

    states_v = on_device(states)
    next_states_v = on_device(next_states)
    actions_v = on_device(actions)
    rewards_v = on_device(rewards)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q-value of the action that was actually taken in each state.
    q_all = policy_net(states_v)
    q_taken = q_all.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        next_q = target_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value.
        next_q[done_mask] = 0.0
    next_q = next_q.detach()
    td_target = (next_q * GAMMA) + rewards_v

    return F.smooth_l1_loss(q_taken, td_target)

def calc_loss(batch, policy_net, target_net, device=device):
    """
    Calculate the smooth-L1 (Huber) loss of the one-step TD error for a batch.

    Args:
        batch: (states, actions, rewards, dones, next_states) numpy arrays,
            as returned by ``ReplayMemory.sample``.
        policy_net: network whose Q-values are being trained.
        target_net: network used to evaluate the next states.
        device: torch device the tensors are moved to.

    Returns:
        Scalar loss tensor, differentiable w.r.t. ``policy_net``.
    """
    states, actions, rewards, dones, next_states = batch

    states_v = torch.from_numpy(states).to(device)
    next_states_v = torch.from_numpy(next_states).to(device)
    # gather() needs an int64 index; the numpy default integer type is
    # platform dependent, so cast explicitly.
    actions_v = torch.from_numpy(actions).to(device=device, dtype=torch.int64)
    rewards_v = torch.from_numpy(rewards).to(device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    # Q-value of the action that was actually taken in each state.
    state_action_values = policy_net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant w.r.t. the optimised parameters.
    with torch.no_grad():
        next_state_values = target_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value.
        next_state_values[done_mask] = 0.0
    expected_state_action_values = (next_state_values * GAMMA) + rewards_v

    return F.smooth_l1_loss(state_action_values, expected_state_action_values)


#EPSILON

#don't need two loops...

# Epsilon-greedy DQN training loop: one outer iteration per episode, one
# inner iteration per environment step.
for episode in range(N_EPISODES):
    print("Start episode %d" % episode)
    state = env.reset()
    return_val = 0
    done = False
    ts = time.time()

    while not done:
        frame_idx += 1
        # Linear decay from EPSILON_START down to EPSILON_END.
        epsilon = max(EPSILON_END, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # Play step
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # Add the batch axis via indexing: np.array([...], copy=False)
            # raises ValueError on NumPy >= 2 whenever a copy is required.
            state_v = torch.tensor(np.asarray(state)[None]).to(device)
            # Action selection needs no gradients.
            with torch.no_grad():
                q_vals = policy_net(state_v)
            _, action = torch.max(q_vals, dim=1)
            action = int(action.item())

        next_state, reward, done, _ = env.step(action)

        return_val += reward

        # save experience
        exp = Experience(state, action, reward, done, next_state)
        replay_memory.append(exp)

        state = next_state

        if len(replay_memory) > MEMORY_START_SIZE:
            # learning step (move into its own function later)
            optimizer.zero_grad()

            batch = replay_memory.sample(BATCH_SIZE)
            loss_t = calc_loss(batch, policy_net, target_net, device=device)
            loss_t.backward()
            optimizer.step()

        # Periodically sync the target network with the policy network.
        if frame_idx % TARGET_UPDATE_AFTER == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("frame_idx = %2f, ts = %2f, tsframe = %2f, time = %2f" % (frame_idx, ts, ts_frame, time.time()))
    # Frames per second over the episode that just finished.
    speed = (frame_idx - ts_frame) / (time.time() - ts)
    ts_frame = frame_idx

    print("Episode %d completed, timesteps played: %d, return: %d, speed: %f, epsilon: %f"
                            % (episode, frame_idx, return_val, speed, epsilon))
    obtained_returns.append(return_val)
    m_reward = np.mean(obtained_returns[-100:])
    print("Mean return of last 100 games: %f" % m_reward)

Start episode 0
frame_idx = 849.000000, ts = 1621348988.177198, tsframe = 0.000000, time = 1621348988.842739
Episode 0 completed, timesteps played: 849, return: -21, speed: 1275.522015, epsilon: 0.991510
Mean return of last 100 games: -21.000000
Start episode 1
frame_idx = 1697.000000, ts = 1621348988.851435, tsframe = 849.000000, time = 1621348989.519976
Episode 1 completed, timesteps played: 1697, return: -21, speed: 1268.319449, epsilon: 0.983030
Mean return of last 100 games: -21.000000
Start episode 2
frame_idx = 2512.000000, ts = 1621348989.529930, tsframe = 1697.000000, time = 1621348990.176505
Episode 2 completed, timesteps played: 2512, return: -21, speed: 1260.369030, epsilon: 0.974880
Mean return of last 100 games: -21.000000
Start episode 3
frame_idx = 3573.000000, ts = 1621348990.187931, tsframe = 2512.000000, time = 1621348991.023861
Episode 3 completed, timesteps played: 3573, return: -19, speed: 1269.146771, epsilon: 0.964270
Mean return of last 100 games: -20.500000
St

KeyboardInterrupt: 

In [43]:
# Scratch cell: draw a tiny batch and inspect what the target network
# produces for the next states.
batch = replay_memory.sample(2)
states, actions, rewards, dones, next_states = batch

# Same numpy -> tensor conversion as in calc_loss.
states_v = torch.from_numpy(states).to(device)
next_states_v = torch.from_numpy(next_states).to(device)
actions_v = torch.from_numpy(actions).to(device)
rewards_v = torch.from_numpy(rewards).to(device)
done_mask = torch.BoolTensor(dones).to(device)

# Note: element [1] of max(1) holds the argmax indices, not the values.
next_state_values = target_net(next_states_v).max(1)[1] # shape looks right
next_state_values
#target_net(next_states_v).gather(1, [1,2])


tensor([0, 0], device='cuda:0')

In [18]:
class DQN(nn.Module):
    """Fully connected Q-network with hidden layers of 128, 64 and 32 units.

    Args:
        n_in: input shape; only ``n_in[0]`` (the feature count) is used.
        n_out: number of outputs (one Q-value per action).
    """

    def __init__(self, n_in, n_out):
        super(DQN, self).__init__()

        widths = (n_in[0], 128, 64, 32, n_out)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # No activation after the output layer.
        layers.pop()
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the linear stack for a batch ``x``."""
        q_values = self.fc(x)
        return q_values

In [39]:
import torch.optim as optim
import torch.nn.functional as F
import time

# --- hyperparameters -------------------------------------------------------
RM_CAPACITY = 100000               # replay memory size
GAMMA = 0.99                       # discount factor
N_EPISODES = 1000                  # number of training episodes
EPSILON_START = 1.0                # initial exploration rate
EPSILON_END = 0.01                 # final exploration rate
EPSILON_DECAY_LAST_FRAME = 100000  # frames over which epsilon decays linearly
LEARNING_RATE = 1e-5
MEMORY_START_SIZE = 10000          # learning starts once the memory is larger than this
BATCH_SIZE = 32
TARGET_UPDATE_AFTER = 10000        # frames between target network syncs

# --- bookkeeping -----------------------------------------------------------
frame_idx = 0
ts_frame = 0
obtained_returns = []

# --- environment, networks, optimizer --------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
replay_memory = ReplayMemory(RM_CAPACITY)
ENV_NAME = "CartPole-v0"
env = gym.make(ENV_NAME)
target_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
policy_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Both networks start from the same weights.
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)


def calc_loss(batch, net, tgt_net, device=device):
    """Calculate the smooth-L1 (Huber) loss of the one-step TD error.

    Args:
        batch: (states, actions, rewards, dones, next_states) numpy arrays.
        net: network whose Q-values are being trained.
        tgt_net: network used to evaluate the next states.
        device: torch device the tensors are moved to.

    Returns:
        Scalar loss tensor, differentiable w.r.t. ``net``.
    """
    states, actions, rewards, dones, next_states = batch

    states_v = torch.from_numpy(states).to(device)
    next_states_v = torch.from_numpy(next_states).to(device)
    # gather() needs an int64 index; the numpy default integer type is
    # platform dependent, so cast explicitly.
    actions_v = torch.from_numpy(actions).to(device=device, dtype=torch.int64)
    rewards_v = torch.from_numpy(rewards).to(device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    # Q-value of the action that was actually taken in each state.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # Use the network that was passed in, not a module-level global.
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value.
        next_state_values[done_mask] = 0.0
    expected_state_action_values = (next_state_values * GAMMA) + rewards_v

    return F.smooth_l1_loss(state_action_values, expected_state_action_values)



#EPSILON

#don't need two loops...

# Epsilon-greedy DQN training loop: one outer iteration per episode, one
# inner iteration per environment step.
for episode in range(N_EPISODES):
    print("Start episode %d" % episode)
    # Observations are cast to float32 before they are stored or fed to the network.
    state = env.reset().astype('float32')
    print(state.dtype)
    return_val = 0
    done = False
    ts = time.time()

    while not done:
        frame_idx += 1
        # Linear decay from EPSILON_START down to EPSILON_END.
        epsilon = max(EPSILON_END, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # Play step
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # Add the batch axis via indexing: np.array([...], copy=False)
            # raises ValueError on NumPy >= 2 whenever a copy is required.
            state_v = torch.tensor(np.asarray(state)[None]).to(device)
            # Action selection needs no gradients.
            with torch.no_grad():
                q_vals = policy_net(state_v)
            _, action = torch.max(q_vals, dim=1)
            action = int(action.item())

        next_state, reward, done, _ = env.step(action)
        next_state = next_state.astype('float32')

        return_val += reward

        # save experience (ReplayMemory exposes append(), not add())
        exp = Experience(state, action, reward, done, next_state)
        replay_memory.append(exp)

        state = next_state

        if len(replay_memory) > MEMORY_START_SIZE:
            # learning step (move into its own function later)
            optimizer.zero_grad()

            batch = replay_memory.sample(BATCH_SIZE)
            loss_t = calc_loss(batch, policy_net, target_net, device=device)
            loss_t.backward()
            optimizer.step()

        # Periodically sync the target network with the policy network.
        if frame_idx % TARGET_UPDATE_AFTER == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("frame_idx = %2f, ts = %2f, tsframe = %2f, time = %2f" % (frame_idx, ts, ts_frame, time.time()))
    # Frames per second over the episode that just finished.
    speed = (frame_idx - ts_frame) / (time.time() - ts)
    ts_frame = frame_idx

    print("Episode %d completed, timesteps played: %d, return: %d, speed: %f, epsilon: %f"
                            % (episode, frame_idx, return_val, speed, epsilon))
    obtained_returns.append(return_val)
    m_reward = np.mean(obtained_returns[-100:])
    print("Mean return of last 100 games: %f" % m_reward)

Start episode 0
float32
frame_idx = 32.000000, ts = 1619447718.807167, tsframe = 0.000000, time = 1619447718.807891
Episode 0 completed, timesteps played: 32, return: 32, speed: 42662.977750, epsilon: 0.999680
Mean return of last 100 games: 32.000000
Start episode 1
float32
frame_idx = 67.000000, ts = 1619447718.808240, tsframe = 32.000000, time = 1619447718.808751
Episode 1 completed, timesteps played: 67, return: 35, speed: 66365.569620, epsilon: 0.999330
Mean return of last 100 games: 33.500000
Start episode 2
float32
frame_idx = 99.000000, ts = 1619447718.808846, tsframe = 67.000000, time = 1619447718.809276
Episode 2 completed, timesteps played: 99, return: 32, speed: 71889.516872, epsilon: 0.999010
Mean return of last 100 games: 33.000000
Start episode 3
float32
frame_idx = 113.000000, ts = 1619447718.809381, tsframe = 99.000000, time = 1619447718.809580
Episode 3 completed, timesteps played: 113, return: 14, speed: 65172.315205, epsilon: 0.998870
Mean return of last 100 games: 2

Start episode 245
float32
frame_idx = 5375.000000, ts = 1619447719.026117, tsframe = 5343.000000, time = 1619447719.027696
Episode 245 completed, timesteps played: 5375, return: 32, speed: 19904.749815, epsilon: 0.946250
Mean return of last 100 games: 22.440000
Start episode 246
float32
frame_idx = 5399.000000, ts = 1619447719.027838, tsframe = 5375.000000, time = 1619447719.028901
Episode 246 completed, timesteps played: 5399, return: 24, speed: 22075.284211, epsilon: 0.946010
Mean return of last 100 games: 22.470000
Start episode 247
float32
frame_idx = 5426.000000, ts = 1619447719.029019, tsframe = 5399.000000, time = 1619447719.030122
Episode 247 completed, timesteps played: 5426, return: 27, speed: 23957.310768, epsilon: 0.945740
Mean return of last 100 games: 22.600000
Start episode 248
float32
frame_idx = 5439.000000, ts = 1619447719.030239, tsframe = 5426.000000, time = 1619447719.030908
Episode 248 completed, timesteps played: 5439, return: 13, speed: 18756.777434, epsilon: 0.

frame_idx = 9659.000000, ts = 1619447719.225719, tsframe = 9615.000000, time = 1619447719.228609
Episode 417 completed, timesteps played: 9659, return: 44, speed: 15096.063476, epsilon: 0.903410
Mean return of last 100 games: 23.710000
Start episode 418
float32
frame_idx = 9680.000000, ts = 1619447719.228710, tsframe = 9659.000000, time = 1619447719.230041
Episode 418 completed, timesteps played: 9680, return: 21, speed: 15512.572032, epsilon: 0.903200
Mean return of last 100 games: 23.770000
Start episode 419
float32
frame_idx = 9698.000000, ts = 1619447719.230355, tsframe = 9680.000000, time = 1619447719.231381
Episode 419 completed, timesteps played: 9698, return: 18, speed: 17170.223334, epsilon: 0.903020
Mean return of last 100 games: 23.560000
Start episode 420
float32
frame_idx = 9710.000000, ts = 1619447719.231753, tsframe = 9698.000000, time = 1619447719.232866
Episode 420 completed, timesteps played: 9710, return: 12, speed: 10567.215620, epsilon: 0.902900
Mean return of last

frame_idx = 10301.000000, ts = 1619447719.831824, tsframe = 10281.000000, time = 1619447719.875255
Episode 450 completed, timesteps played: 10301, return: 20, speed: 459.818564, epsilon: 0.896990
Mean return of last 100 games: 21.850000
Start episode 451
float32
frame_idx = 10321.000000, ts = 1619447719.875798, tsframe = 10301.000000, time = 1619447719.928323
Episode 451 completed, timesteps played: 10321, return: 20, speed: 380.308105, epsilon: 0.896790
Mean return of last 100 games: 21.830000
Start episode 452
float32
frame_idx = 10334.000000, ts = 1619447719.928816, tsframe = 10321.000000, time = 1619447719.961400
Episode 452 completed, timesteps played: 10334, return: 13, speed: 398.666033, epsilon: 0.896660
Mean return of last 100 games: 21.770000
Start episode 453
float32
frame_idx = 10344.000000, ts = 1619447719.961778, tsframe = 10334.000000, time = 1619447719.987482
Episode 453 completed, timesteps played: 10344, return: 10, speed: 388.663776, epsilon: 0.896560
Mean return of 

frame_idx = 11093.000000, ts = 1619447721.436769, tsframe = 11052.000000, time = 1619447721.523999
Episode 484 completed, timesteps played: 11093, return: 41, speed: 469.697161, epsilon: 0.889070
Mean return of last 100 games: 22.030000
Start episode 485
float32
frame_idx = 11117.000000, ts = 1619447721.524169, tsframe = 11093.000000, time = 1619447721.569047
Episode 485 completed, timesteps played: 11117, return: 24, speed: 534.459431, epsilon: 0.888830
Mean return of last 100 games: 22.100000
Start episode 486
float32
frame_idx = 11163.000000, ts = 1619447721.569416, tsframe = 11117.000000, time = 1619447721.653447
Episode 486 completed, timesteps played: 11163, return: 46, speed: 547.237785, epsilon: 0.888370
Mean return of last 100 games: 22.430000
Start episode 487
float32
frame_idx = 11183.000000, ts = 1619447721.653556, tsframe = 11163.000000, time = 1619447721.689663
Episode 487 completed, timesteps played: 11183, return: 20, speed: 553.524471, epsilon: 0.888170
Mean return of 

float32
frame_idx = 11848.000000, ts = 1619447722.870175, tsframe = 11830.000000, time = 1619447722.906668
Episode 518 completed, timesteps played: 11848, return: 18, speed: 492.867685, epsilon: 0.881520
Mean return of last 100 games: 21.680000
Start episode 519
float32
frame_idx = 11874.000000, ts = 1619447722.907108, tsframe = 11848.000000, time = 1619447722.955407
Episode 519 completed, timesteps played: 11874, return: 26, speed: 538.047000, epsilon: 0.881260
Mean return of last 100 games: 21.760000
Start episode 520
float32
frame_idx = 11888.000000, ts = 1619447722.955809, tsframe = 11874.000000, time = 1619447722.982452
Episode 520 completed, timesteps played: 11888, return: 14, speed: 524.977032, epsilon: 0.881120
Mean return of last 100 games: 21.780000
Start episode 521
float32
frame_idx = 11896.000000, ts = 1619447722.982558, tsframe = 11888.000000, time = 1619447722.998411
Episode 521 completed, timesteps played: 11896, return: 8, speed: 503.842996, epsilon: 0.881040
Mean ret

frame_idx = 12540.000000, ts = 1619447724.145007, tsframe = 12527.000000, time = 1619447724.169201
Episode 551 completed, timesteps played: 12540, return: 13, speed: 536.619939, epsilon: 0.874600
Mean return of last 100 games: 22.190000
Start episode 552
float32
frame_idx = 12558.000000, ts = 1619447724.169686, tsframe = 12540.000000, time = 1619447724.202946
Episode 552 completed, timesteps played: 12558, return: 18, speed: 540.805089, epsilon: 0.874420
Mean return of last 100 games: 22.240000
Start episode 553
float32
frame_idx = 12591.000000, ts = 1619447724.203242, tsframe = 12558.000000, time = 1619447724.261499
Episode 553 completed, timesteps played: 12591, return: 33, speed: 566.208906, epsilon: 0.874090
Mean return of last 100 games: 22.470000
Start episode 554
float32
frame_idx = 12614.000000, ts = 1619447724.261792, tsframe = 12591.000000, time = 1619447724.301055
Episode 554 completed, timesteps played: 12614, return: 23, speed: 585.419829, epsilon: 0.873860
Mean return of 

frame_idx = 13322.000000, ts = 1619447725.536280, tsframe = 13292.000000, time = 1619447725.593585
Episode 590 completed, timesteps played: 13322, return: 30, speed: 523.239341, epsilon: 0.866780
Mean return of last 100 games: 20.590000
Start episode 591
float32
frame_idx = 13340.000000, ts = 1619447725.593937, tsframe = 13322.000000, time = 1619447725.628643
Episode 591 completed, timesteps played: 13340, return: 18, speed: 518.270306, epsilon: 0.866600
Mean return of last 100 games: 20.620000
Start episode 592
float32
frame_idx = 13355.000000, ts = 1619447725.628944, tsframe = 13340.000000, time = 1619447725.657112
Episode 592 completed, timesteps played: 13355, return: 15, speed: 532.047019, epsilon: 0.866450
Mean return of last 100 games: 20.430000
Start episode 593
float32
frame_idx = 13371.000000, ts = 1619447725.657550, tsframe = 13355.000000, time = 1619447725.687613
Episode 593 completed, timesteps played: 13371, return: 16, speed: 531.787028, epsilon: 0.866290
Mean return of 

frame_idx = 13946.000000, ts = 1619447726.659765, tsframe = 13925.000000, time = 1619447726.696562
Episode 626 completed, timesteps played: 13946, return: 21, speed: 570.217676, epsilon: 0.860540
Mean return of last 100 games: 19.580000
Start episode 627
float32
frame_idx = 13955.000000, ts = 1619447726.697009, tsframe = 13946.000000, time = 1619447726.712502
Episode 627 completed, timesteps played: 13955, return: 9, speed: 579.617302, epsilon: 0.860450
Mean return of last 100 games: 19.540000
Start episode 628
float32
frame_idx = 13978.000000, ts = 1619447726.712618, tsframe = 13955.000000, time = 1619447726.757742
Episode 628 completed, timesteps played: 13978, return: 23, speed: 509.423359, epsilon: 0.860220
Mean return of last 100 games: 19.630000
Start episode 629
float32
frame_idx = 14014.000000, ts = 1619447726.758250, tsframe = 13978.000000, time = 1619447726.830348
Episode 629 completed, timesteps played: 14014, return: 36, speed: 499.133739, epsilon: 0.859860
Mean return of l

float32
frame_idx = 14585.000000, ts = 1619447727.960351, tsframe = 14572.000000, time = 1619447727.986372
Episode 662 completed, timesteps played: 14585, return: 13, speed: 499.084246, epsilon: 0.854150
Mean return of last 100 games: 18.100000
Start episode 663
float32
frame_idx = 14596.000000, ts = 1619447727.986486, tsframe = 14585.000000, time = 1619447728.006687
Episode 663 completed, timesteps played: 14596, return: 11, speed: 543.860809, epsilon: 0.854040
Mean return of last 100 games: 17.810000
Start episode 664
float32
frame_idx = 14632.000000, ts = 1619447728.007131, tsframe = 14596.000000, time = 1619447728.072227
Episode 664 completed, timesteps played: 14632, return: 36, speed: 552.821660, epsilon: 0.853680
Mean return of last 100 games: 18.060000
Start episode 665
float32
frame_idx = 14649.000000, ts = 1619447728.072330, tsframe = 14632.000000, time = 1619447728.104744
Episode 665 completed, timesteps played: 14649, return: 17, speed: 524.064502, epsilon: 0.853510
Mean re

frame_idx = 15265.000000, ts = 1619447729.147874, tsframe = 15233.000000, time = 1619447729.204090
Episode 696 completed, timesteps played: 15265, return: 32, speed: 567.087608, epsilon: 0.847350
Mean return of last 100 games: 18.490000
Start episode 697
float32
frame_idx = 15276.000000, ts = 1619447729.204399, tsframe = 15265.000000, time = 1619447729.223589
Episode 697 completed, timesteps played: 15276, return: 11, speed: 572.444930, epsilon: 0.847240
Mean return of last 100 games: 18.480000
Start episode 698
float32
frame_idx = 15290.000000, ts = 1619447729.223697, tsframe = 15276.000000, time = 1619447729.250386
Episode 698 completed, timesteps played: 15290, return: 14, speed: 524.077433, epsilon: 0.847100
Mean return of last 100 games: 18.510000
Start episode 699
float32
frame_idx = 15306.000000, ts = 1619447729.250488, tsframe = 15290.000000, time = 1619447729.278807
Episode 699 completed, timesteps played: 15306, return: 16, speed: 564.495041, epsilon: 0.846940
Mean return of 

frame_idx = 16027.000000, ts = 1619447730.555310, tsframe = 16007.000000, time = 1619447730.593376
Episode 734 completed, timesteps played: 16027, return: 20, speed: 524.983603, epsilon: 0.839730
Mean return of last 100 games: 19.260000
Start episode 735
float32
frame_idx = 16040.000000, ts = 1619447730.593822, tsframe = 16027.000000, time = 1619447730.620219
Episode 735 completed, timesteps played: 16040, return: 13, speed: 491.996030, epsilon: 0.839600
Mean return of last 100 games: 18.970000
Start episode 736
float32
frame_idx = 16054.000000, ts = 1619447730.620651, tsframe = 16040.000000, time = 1619447730.644919
Episode 736 completed, timesteps played: 16054, return: 14, speed: 576.304639, epsilon: 0.839460
Mean return of last 100 games: 18.870000
Start episode 737
float32
frame_idx = 16087.000000, ts = 1619447730.645024, tsframe = 16054.000000, time = 1619447730.702438
Episode 737 completed, timesteps played: 16087, return: 33, speed: 574.512110, epsilon: 0.839130
Mean return of 

frame_idx = 16754.000000, ts = 1619447731.889243, tsframe = 16740.000000, time = 1619447731.914272
Episode 770 completed, timesteps played: 16754, return: 14, speed: 558.671221, epsilon: 0.832460
Mean return of last 100 games: 20.140000
Start episode 771
float32
frame_idx = 16766.000000, ts = 1619447731.914718, tsframe = 16754.000000, time = 1619447731.939318
Episode 771 completed, timesteps played: 16766, return: 12, speed: 487.303681, epsilon: 0.832340
Mean return of last 100 games: 20.050000
Start episode 772
float32
frame_idx = 16799.000000, ts = 1619447731.939440, tsframe = 16766.000000, time = 1619447731.998300
Episode 772 completed, timesteps played: 16799, return: 33, speed: 560.402094, epsilon: 0.832010
Mean return of last 100 games: 19.970000
Start episode 773
float32
frame_idx = 16815.000000, ts = 1619447731.998406, tsframe = 16799.000000, time = 1619447732.026934
Episode 773 completed, timesteps played: 16815, return: 16, speed: 560.337862, epsilon: 0.831850
Mean return of 

frame_idx = 17364.000000, ts = 1619447733.165551, tsframe = 17337.000000, time = 1619447733.213871
Episode 803 completed, timesteps played: 17364, return: 27, speed: 558.432530, epsilon: 0.826360
Mean return of last 100 games: 19.930000
Start episode 804
float32
frame_idx = 17377.000000, ts = 1619447733.214440, tsframe = 17364.000000, time = 1619447733.237591
Episode 804 completed, timesteps played: 17377, return: 13, speed: 560.931959, epsilon: 0.826230
Mean return of last 100 games: 19.850000
Start episode 805
float32
frame_idx = 17391.000000, ts = 1619447733.237693, tsframe = 17377.000000, time = 1619447733.262625
Episode 805 completed, timesteps played: 17391, return: 14, speed: 560.971531, epsilon: 0.826090
Mean return of last 100 games: 19.850000
Start episode 806
float32
frame_idx = 17419.000000, ts = 1619447733.262728, tsframe = 17391.000000, time = 1619447733.312328
Episode 806 completed, timesteps played: 17419, return: 28, speed: 564.235360, epsilon: 0.825810
Mean return of 

frame_idx = 18008.000000, ts = 1619447734.484108, tsframe = 17995.000000, time = 1619447734.517174
Episode 840 completed, timesteps played: 18008, return: 13, speed: 392.776015, epsilon: 0.819920
Mean return of last 100 games: 18.530000
Start episode 841
float32
frame_idx = 18028.000000, ts = 1619447734.517606, tsframe = 18008.000000, time = 1619447734.562472
Episode 841 completed, timesteps played: 18028, return: 20, speed: 445.510590, epsilon: 0.819720
Mean return of last 100 games: 18.570000
Start episode 842
float32
frame_idx = 18043.000000, ts = 1619447734.562579, tsframe = 18028.000000, time = 1619447734.600204
Episode 842 completed, timesteps played: 18043, return: 15, speed: 398.407751, epsilon: 0.819570
Mean return of last 100 games: 18.490000
Start episode 843
float32
frame_idx = 18062.000000, ts = 1619447734.600685, tsframe = 18043.000000, time = 1619447734.647504
Episode 843 completed, timesteps played: 18062, return: 19, speed: 405.601523, epsilon: 0.819380
Mean return of 

frame_idx = 18802.000000, ts = 1619447736.221911, tsframe = 18790.000000, time = 1619447736.253189
Episode 874 completed, timesteps played: 18802, return: 12, speed: 382.875374, epsilon: 0.811980
Mean return of last 100 games: 19.670000
Start episode 875
float32
frame_idx = 18815.000000, ts = 1619447736.253385, tsframe = 18802.000000, time = 1619447736.283093
Episode 875 completed, timesteps played: 18815, return: 13, speed: 437.193925, epsilon: 0.811850
Mean return of last 100 games: 19.650000
Start episode 876
float32
frame_idx = 18837.000000, ts = 1619447736.283203, tsframe = 18815.000000, time = 1619447736.324353
Episode 876 completed, timesteps played: 18837, return: 22, speed: 534.290773, epsilon: 0.811630
Mean return of last 100 games: 19.750000
Start episode 877
float32
frame_idx = 18852.000000, ts = 1619447736.324728, tsframe = 18837.000000, time = 1619447736.352144
Episode 877 completed, timesteps played: 18852, return: 15, speed: 546.603070, epsilon: 0.811480
Mean return of 

frame_idx = 19463.000000, ts = 1619447737.522945, tsframe = 19440.000000, time = 1619447737.565176
Episode 908 completed, timesteps played: 19463, return: 23, speed: 543.836560, epsilon: 0.805370
Mean return of last 100 games: 20.060000
Start episode 909
float32
frame_idx = 19481.000000, ts = 1619447737.565345, tsframe = 19463.000000, time = 1619447737.602887
Episode 909 completed, timesteps played: 19481, return: 18, speed: 478.607567, epsilon: 0.805190
Mean return of last 100 games: 20.030000
Start episode 910
float32
frame_idx = 19493.000000, ts = 1619447737.603486, tsframe = 19481.000000, time = 1619447737.628753
Episode 910 completed, timesteps played: 19493, return: 12, speed: 474.406168, epsilon: 0.805070
Mean return of last 100 games: 19.850000
Start episode 911
float32
frame_idx = 19503.000000, ts = 1619447737.629173, tsframe = 19493.000000, time = 1619447737.647974
Episode 911 completed, timesteps played: 19503, return: 10, speed: 531.173334, epsilon: 0.804970
Mean return of 

frame_idx = 20151.000000, ts = 1619447738.810927, tsframe = 20114.000000, time = 1619447738.883684
Episode 944 completed, timesteps played: 20151, return: 37, speed: 508.315557, epsilon: 0.798490
Mean return of last 100 games: 20.720000
Start episode 945
float32
frame_idx = 20165.000000, ts = 1619447738.884195, tsframe = 20151.000000, time = 1619447738.910071
Episode 945 completed, timesteps played: 20165, return: 14, speed: 540.493143, epsilon: 0.798350
Mean return of last 100 games: 20.660000
Start episode 946
float32
frame_idx = 20174.000000, ts = 1619447738.910404, tsframe = 20165.000000, time = 1619447738.928071
Episode 946 completed, timesteps played: 20174, return: 9, speed: 508.688228, epsilon: 0.798260
Mean return of last 100 games: 20.600000
Start episode 947
float32
frame_idx = 20211.000000, ts = 1619447738.928176, tsframe = 20174.000000, time = 1619447738.997188
Episode 947 completed, timesteps played: 20211, return: 37, speed: 535.928142, epsilon: 0.797890
Mean return of l

frame_idx = 20793.000000, ts = 1619447740.232926, tsframe = 20767.000000, time = 1619447740.299016
Episode 976 completed, timesteps played: 20793, return: 26, speed: 393.216473, epsilon: 0.792070
Mean return of last 100 games: 19.560000
Start episode 977
float32
frame_idx = 20813.000000, ts = 1619447740.299129, tsframe = 20793.000000, time = 1619447740.348570
Episode 977 completed, timesteps played: 20813, return: 20, speed: 404.317029, epsilon: 0.791870
Mean return of last 100 games: 19.610000
Start episode 978
float32
frame_idx = 20826.000000, ts = 1619447740.348678, tsframe = 20813.000000, time = 1619447740.373217
Episode 978 completed, timesteps played: 20826, return: 13, speed: 529.177806, epsilon: 0.791740
Mean return of last 100 games: 19.610000
Start episode 979
float32
frame_idx = 20838.000000, ts = 1619447740.373718, tsframe = 20826.000000, time = 1619447740.396403
Episode 979 completed, timesteps played: 20838, return: 12, speed: 528.394062, epsilon: 0.791620
Mean return of 

In [31]:
# Ask replay_memory for a sample of size 32 and, in one statement, keep the
# packed result as `batch` while also unpacking its five components.
# NOTE(review): presumably 32 is the training minibatch size -- confirm.
batch = (states, actions, rewards, dones, next_states) = replay_memory.sample(32)

In [32]:
# Display the `states` component unpacked from the sampled batch so it can be
# inspected in the notebook output.
states

array([[-0.23509048, -0.99674344,  0.14202823,  1.205893  ],
       [-0.01221185,  0.2033036 , -0.01137768, -0.39452153],
       [ 0.01403981,  0.38471097, -0.06356827, -0.6559813 ],
       [ 0.02471143,  0.00704753, -0.02086607,  0.0056784 ],
       [ 0.04553305, -0.03973223, -0.03497742,  0.04182764],
       [ 0.07546449,  0.22233449, -0.07402361, -0.33955365],
       [-0.02647018,  0.23083852, -0.04325013, -0.6108779 ],
       [ 0.01288328,  0.04091272,  0.06666889,  0.16054414],
       [-0.03464189,  0.19988637, -0.01580142, -0.389599  ],
       [-0.13472049, -0.57529694, -0.0236813 ,  0.26569486],
       [ 0.00225858, -0.4042985 , -0.0386311 ,  0.5520086 ],
       [-0.04573006,  0.195285  ,  0.0571992 , -0.18666917],
       [ 0.0150539 , -0.22190392,  0.03292352,  0.34754074],
       [ 0.13244607,  1.1784552 , -0.17124087, -1.856896  ],
       [ 0.02920668,  0.62964714,  0.01218806, -0.7824796 ],
       [-0.0029187 ,  0.21680176,  0.06969614,  0.00571806],
       [ 0.02098421,  0.

In [38]:
# Build a CartPole-v0 environment and gather what env.reset() returns over 32
# consecutive resets, so the reset outputs can be inspected side by side.
ENV_NAME = "CartPole-v0"
env = gym.make(ENV_NAME)
# The loop index was never used; a comprehension with `_` builds the same list
# without the manual append loop.
x = [env.reset() for _ in range(32)]
# Bare expression: the notebook renders the collected list as the cell output.
x

[array([-0.0243938 , -0.02700503, -0.00631201, -0.02593988]),
 array([ 3.97240209e-03, -2.10942985e-02, -7.00966233e-03, -7.57605349e-05]),
 array([ 0.02826306, -0.04461565,  0.028929  ,  0.04662189]),
 array([-0.04987784,  0.00602483,  0.04034786,  0.04221054]),
 array([-0.04907038,  0.00118571,  0.02718911, -0.01325019]),
 array([ 0.00278728,  0.02502537, -0.02910984,  0.04138255]),
 array([ 0.02125563, -0.01070709, -0.01323864, -0.01632808]),
 array([-0.02007005,  0.04879822,  0.03645241, -0.02047673]),
 array([-0.0234892 , -0.03666565, -0.04548653, -0.01133783]),
 array([ 0.0397614 ,  0.0072732 ,  0.02193817, -0.01645931]),
 array([ 0.01814969, -0.02026478,  0.03972562, -0.04069806]),
 array([-0.00868179,  0.01349155,  0.01389249, -0.00343418]),
 array([ 0.03027661,  0.0090223 ,  0.030169  , -0.00435507]),
 array([-0.02088356,  0.00168808,  0.0410512 , -0.03460227]),
 array([-0.047762  ,  0.01205452, -0.01435437, -0.04210863]),
 array([-0.02007547,  0.00981842,  0.01450585, -0.0070