# Sonic The Hedgehog 2 RL

## ACT 1: Import Libraries

In [1]:
import retro
import time
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import torch



import math

%matplotlib inline

In [2]:

!python -m retro.import .

Importing SonicTheHedgehog2-Genesis
Imported 1 games


## ACT 2: Create custom environment

In [3]:
#Limit Posible Actions
possible_actions = {
    # No Operation
    0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # Left
    1: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
    # Right
    2: [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    # Left, Down
    3: [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
    # Right, Down
    4: [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
    # Down
    5: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    # Down, B
    6: [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    # B
    7: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
}

In [4]:
class SonicTheHedgehog2(Env):
    def __init__(self):
        super().__init__()
        #Specify action space and observation space
        self.observation_space = Box(low= 0, 
                                    high=255, 
                                    shape=(84, 84, 1), 
                                    dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Startup an instance of the game
        self.game = retro.make(game='SonicTheHedgehog2-Genesis', 
                                state='EmeraldHillZone.Act1', 
                                scenario='contest',
                                use_restricted_actions=retro.Actions.FILTERED)

    def reset(self):
        # Return the first frame
        obs = self.game.reset()
        #Current Frame - Previous Frame
        obs = self.preprocess(obs)
        self.previous_frame = obs

        # Create a placeholder attribute to hold the score delta
        self.score = 96
        return obs
    
    def preprocess(self, observation):
        # Grayscaling
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84, 84, 1))
        return channels


    def step(self, action):
        # Take a step
        # Recieve the unprocessed items
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # Frame Delta
        # Substract previous frame from the Current frame in order to see pixel changes
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        #Reshape the reward function TODO
        reward = (info['x'] - self.score) #+ info['level_end_bonus']
        #reward = info['x']
        self.score = info['x']

        return frame_delta, reward, done, info


    def render(self, *args, **kwargs):
        self.game.render()

    def close(self):
        self.game.close()

## ACT 3: Start the game environment

In [None]:
env.close()

In [5]:
env = SonicTheHedgehog2()
print("The size of frame is: ", env.observation_space.shape)
print("No. of Actions: ", env.action_space.n)

obs = env.reset()

done = False

The size of frame is:  (84, 84, 1)
No. of Actions:  12


In [None]:
plt.figure()
plt.imshow(env.reset())
plt.title('Original Frame')
plt.show

In [None]:
plt.figure()
plt.imshow(cv2.cvtColor(obs, cv2.COLOR_BGR2RGB))
plt.title('Pre Processed image')
plt.show()

In [None]:
score = 0
for game in range(1):
    while not done:
        env.render()
        action = possible_actions[np.random.randint(len(possible_actions))]
        #Takes random desicions
        obs, reward, done, info = env.step(action)
        #time.sleep(0.005)
        score += reward

        #if reward > 0:
        print(reward)
            
        if done:
            print("Your Score at the end of the game is: " + str(score))
            break

    env.reset()
    env.render(close=True)
    env.close()

## ACT 4: Hyperparameter Optimization


In [5]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

In [6]:
#Log Directory
LOG_DIR = './logs/'
#Where to save the models
OPT_DIR = './opt/'

In [7]:
# Function to return test hyperparameters - define the object function
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [8]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [9]:
# Run a training loop and return mean reward 
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial) 

        # Create environment 
        env = SonicTheHedgehog2()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create algo 
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=30000)
        #model.learn(total_timesteps=100000)

        # Evaluate model 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1000

In [None]:
# Creating the experiment 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

In [None]:
study.best_params

In [None]:
study.best_trial

In [10]:
model = PPO.load(os.path.join(OPT_DIR, 'trial_4_best_model.zip'))

## ACT 5: Setup Callback


In [11]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [12]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [13]:
CHECKPOINT_DIR = './train/'

In [13]:
#Check frequency means basically that every 10000 steps we are gonna save the model
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# ACT 6: Train Model

In [14]:
# Create environment 
env = SonicTheHedgehog2()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
model_params = study.best_params

In [15]:
model_params = {'n_steps': 4544,
 'gamma': 0.9324234706226547,
 'learning_rate': 6.071018543662078e-05,
 'clip_range': 0.3840304585833002,
 'gae_lambda': 0.8587815088803813}

In [None]:
125*64

In [None]:
model_params = study.best_params
model_params['n_steps'] = 2944  # set n_steps to 2944 or a factor of 64
model_params['learning_rate'] = 5e-7
model_params

In [16]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [17]:
# Reload previous weights from HPO
model.load(os.path.join(OPT_DIR, 'trial_4_best_model.zip'))

<stable_baselines3.ppo.ppo.PPO at 0x2665aafdac8>

In [None]:
# Kick off training 
model.learn(total_timesteps=100000, callback=callback)
# model.learn(total_timestep=30000) 
# model.learn(total_timestep=5000000) 

# ACT 7: Evaluate the Model


In [18]:
model.load(os.path.join(CHECKPOINT_DIR, 'best_model_100000A'))

<stable_baselines3.ppo.ppo.PPO at 0x2665b1925c8>

In [19]:
mean_reward, _ = evaluate_policy(model, env, render=False, n_eval_episodes=1)

In [20]:
mean_reward

960.0

# ACT 8: Test Model

In [21]:
obs = env.reset()

In [None]:
obs.shape

In [22]:
env.step(model.predict(obs)[0])

(array([[[[  0,   0,  41,   0],
          [  0,   0,  41,   0],
          [  0,   0,  41,   0],
          ...,
          [  0,   0,  41,   0],
          [  0,   0,  41,   0],
          [  0,   0,  41,   0]],
 
         [[  0,   0,  41,   0],
          [  0,   0,  41,   0],
          [  0,   0,  41,   0],
          ...,
          [  0,   0,  41,   0],
          [  0,   0,  41,   0],
          [  0,   0,  41,   0]],
 
         [[  0,   0,  41,   0],
          [  0,   0,  41,   0],
          [  0,   0,  41,   0],
          ...,
          [  0,   0,  41,   0],
          [  0,   0,  39,   0],
          [  0,   0,  41,   0]],
 
         ...,
 
         [[  0,   0,  58,   0],
          [  0,   0,  81,   0],
          [  0,   0,  81,   0],
          ...,
          [  0,   0, 110,   0],
          [  0,   0, 110,   0],
          [  0,   0,  84,   0]],
 
         [[  0,   0,  58,   0],
          [  0,   0,  40,   0],
          [  0,   0,  40,   0],
          ...,
          [  0,   0,  28,   0],
 

In [23]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)



[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[0.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.

[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[-2.]
[-1.]
[-1.]
[-2.]
[-1.]
[-1.]
[-1.]
[-2.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[-1.]
[-2.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]

[0.]
[-1.]
[1.]
[0.]
[0.]
[0.]
[-1.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]


[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[1.]
[0.]
[-1.]
[0.]
[0.]
[

[-2.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[-1.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[-1.]
[-1.]
[0.]
[-

[0.]
[0.]
[-1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[-1.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[-1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]


In [24]:
info


[{'prev_lives': 3,
  'offset_x': -96,
  'x': 659,
  'lives': 2,
  'screen_x_end': 10656,
  'xpos_last_x': 0,
  'game_mode': 12,
  'prev_progress': 0,
  'score': 0,
  'zone': 0,
  'level_end_bonus': 0,
  'rings': 0,
  'act': 0,
  'y': 1085,
  'screen_x': 503,
  'screen_y': 528,
  'episode': {'r': 563, 'l': 8399, 't': 333.1422},
  'terminal_observation': array([[[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]],
  
         ...,
  
         [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          ...,
          [0, 0, 0, 0],
          [0, 0, 0, 0],
      