In [1]:
import import_ipynb

In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import random, datetime
from pathlib import Path

import gymnasium
from gymnasium.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from metrics import MetricLogger
from agent import SpaceInvader
from wrappers import ResizeObservation, SkipFrame

importing Jupyter notebook from metrics.ipynb
importing Jupyter notebook from agent.ipynb
importing Jupyter notebook from neural.ipynb
importing Jupyter notebook from wrappers.ipynb


In [3]:
import gymnasium as gym

In [26]:
class SkipFrame(gym.Wrapper):
    """Environment wrapper that repeats each action for `skip` frames.

    Only the observation of the last executed frame is returned, together
    with the sum of the rewards collected over the repeated frames.

    NOTE: this definition rebinds the name `SkipFrame` that the import cell
    brought in from `wrappers`, so later cells use this class.
    """

    def __init__(self, env, skip):
        """Wrap `env` so that every `step` call advances up to `skip` frames.

        Args:
            env: the environment to wrap.
            skip: number of frames each action is repeated for (must be >= 1).

        Raises:
            ValueError: if `skip` is smaller than 1. With no inner step there
                would be nothing to return from `step`.
        """
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip!r}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat `action` for up to `skip` frames and sum the rewards.

        Returns:
            The 5-tuple (obs, total_reward, done, truncated, info) where
            `obs`, `done`, `truncated` and `info` come from the last inner
            step that was executed and `total_reward` is the accumulated
            reward of all executed inner steps.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, truncated, info = self.env.step(action)
            total_reward += reward
            # Stop as soon as the episode is over for either reason; stepping
            # an environment after termination or truncation is invalid.
            if done or truncated:
                break
        return obs, total_reward, done, truncated, info

In [27]:
# Create the base Space Invaders environment (ALE v5 id), selecting game mode 2.
# NOTE(review): what `mode=2` changes in gameplay is not visible in this
# notebook — confirm against the ALE environment documentation.
env = gymnasium.make("ALE/SpaceInvaders-v5", mode=2)

In [28]:
# Observation preprocessing: each line wraps the previous `env`, so the order
# below is the order in which the transformations are applied.
# NOTE: this cell rebinds `env` to a wrapped version of itself, so running it
# twice without re-creating the base environment stacks the wrappers twice.

# Repeat every action for 4 frames and sum the rewards (SkipFrame defined above).
env = SkipFrame(env, skip=4)
# Convert frames to grayscale; presumably keep_dim=False drops the channel axis — confirm.
env = GrayScaleObservation(env, keep_dim=False)
# Project-local resize wrapper; presumably yields 84x84 frames (the agent is given state_dim=(4,84,84)) — confirm in wrappers.ipynb.
env = ResizeObservation(env, shape=84)
# Divide every observation by 255.
env = TransformObservation(env, f=lambda x: x / 255.)
# Stack the 4 most recent observations (returned as LazyFrames, see the reset() output below).
env = FrameStack(env, num_stack=4)

In [29]:
# Smoke-test the wrapped environment: reset() returns an (observation, info) pair.
env.reset()

(<gymnasium.wrappers.frame_stack.LazyFrames at 0x28504d760>,
 {'lives': 3, 'episode_frame_number': 3, 'frame_number': 3})

In [30]:
# Per-run output directory under `checkpoints/`, named after the current
# timestamp (second resolution, with '-' instead of ':' in the time part).
save_dir = Path('checkpoints') / datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
# parents=True also creates `checkpoints/` when missing; exist_ok is not set,
# so this raises FileExistsError if the directory already exists.
save_dir.mkdir(parents=True)

In [31]:
# Number of discrete actions; this value is passed to the agent as action_dim.
env.action_space.n

6

In [32]:
# No checkpoint path is supplied for this run.
# NOTE(review): presumably SpaceInvader then starts from untrained weights —
# confirm in agent.ipynb.
checkpoint = None
# state_dim mirrors the preprocessing cell: 4 stacked frames (num_stack=4)
# resized with shape=84; action_dim is the size of the discrete action space.
spaceInvader = SpaceInvader(state_dim=(4,84,84), action_dim=env.action_space.n, save_dir=save_dir, checkpoint=checkpoint)

In [33]:
# Metric logger for this run; it is given the same save_dir as the agent.
# NOTE(review): presumably it writes its log files there — confirm in metrics.ipynb.
logger = MetricLogger(save_dir)


In [34]:
# Number of episodes (full games) the training loop runs.
episodes = 40000

In [35]:
# Fixed action index used by the manual env.step() checks in the next cells.
# NOTE(review): which game input index 1 maps to depends on the environment's
# action set — confirm before relying on it.
action = 1

In [36]:
# Manual sanity check: advance the environment once with the fixed action.
result = env.step(action)
# step() returns a 5-tuple (observation, reward, terminated, truncated, info).
# Index 3 is the `truncated` flag, not the info dict, so unpack by name
# instead of by position to keep every variable correctly labelled.
next_state, reward, done, truncated, info = result

In [37]:
# Step once more and display the raw tuple returned by env.step():
# (observation, reward, terminated, truncated, info).
result = env.step(action)
result

(<gymnasium.wrappers.frame_stack.LazyFrames at 0x2b01bd0d0>,
 0.0,
 False,
 False,
 {'lives': 3, 'episode_frame_number': 35, 'frame_number': 35})

In [38]:
# reset() returns an (observation, info) tuple, so `state` here holds the
# whole tuple rather than just the observation; the training loop takes
# element [0] before using it.
state = env.reset()
state

(<gymnasium.wrappers.frame_stack.LazyFrames at 0x2b0ba8fe0>,
 {'lives': 3, 'episode_frame_number': 3, 'frame_number': 38})

In [None]:
### Training loop: play the game `episodes` times, learning after every step
for e in range(episodes):

    # reset() returns (observation, info); only the observation is the state
    state = env.reset()[0]

    # Play the game!
    while True:

        # Show environment (the visual) [WIP]
        # env.render()

        # Run agent on the state
        action = spaceInvader.act(state)

        # Agent performs action. step() returns the 5-tuple
        # (observation, reward, terminated, truncated, info); unpack by name
        # so the truncation flag is not mistaken for the info dict.
        next_state, reward, done, truncated, info = env.step(action)

        # Remember (the stored flag is the termination flag only, as before)
        spaceInvader.cache(state, next_state, action, reward, done)

        # Learn
        q, loss = spaceInvader.learn()

        # Logging
        logger.log_step(reward, loss, q)

        # Update state
        state = next_state

        # End the episode when the game terminated or was truncated
        if done or truncated:
            break

    logger.log_episode()

    # Print/record aggregated metrics every 20 episodes
    if e % 20 == 0:
        logger.record(
            episode=e,
            epsilon=spaceInvader.exploration_rate,
            step=spaceInvader.curr_step
        )



Episode 0 - Step 124 - Epsilon 0.9999690004766157 - Mean Reward 115.0 - Mean Length 124.0 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 3.556 - Time 2023-05-11T17:10:20
Episode 20 - Step 2500 - Epsilon 0.9993751951936526 - Mean Reward 96.429 - Mean Length 119.048 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 12.862 - Time 2023-05-11T17:10:33
Episode 40 - Step 5241 - Epsilon 0.9986906078390085 - Mean Reward 112.683 - Mean Length 127.829 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 15.12 - Time 2023-05-11T17:10:48
Episode 60 - Step 8158 - Epsilon 0.9979625781122371 - Mean Reward 135.164 - Mean Length 133.738 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 15.692 - Time 2023-05-11T17:11:04
Episode 80 - Step 10788 - Epsilon 0.9973066333005305 - Mean Reward 134.506 - Mean Length 133.185 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 14.375 - Time 2023-05-11T17:11:18
Episode 100 - Step 13388 - Epsilon 0.9966585945432022 - Mean Reward 128.45 - Mean Length 132.64 - Mean Loss 0.0 - Mean