In [1]:
import retro # The main library
import time # For timing learning, if needed
import pygame # For rendering the game


pygame 2.1.0 (SDL 2.0.16, Python 3.6.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
retro.data.list_games()
# There are actually many games provided by Gym Retro: this command let's you check them out!

['1942-Nes',
 '1943-Nes',
 '3NinjasKickBack-Genesis',
 '8Eyes-Nes',
 'AaahhRealMonsters-Genesis',
 'AbadoxTheDeadlyInnerWar-Nes',
 'AcceleBrid-Snes',
 'ActRaiser2-Snes',
 'ActionPachio-Snes',
 'AddamsFamily-GameBoy',
 'AddamsFamily-Genesis',
 'AddamsFamily-Nes',
 'AddamsFamily-Sms',
 'AddamsFamily-Snes',
 'AddamsFamilyPugsleysScavengerHunt-Nes',
 'AddamsFamilyPugsleysScavengerHunt-Snes',
 'AdvancedBusterhawkGleylancer-Genesis',
 'Adventure-Atari2600',
 'AdventureIsland-GameBoy',
 'AdventureIsland3-Nes',
 'AdventureIslandII-Nes',
 'AdventuresOfBatmanAndRobin-Genesis',
 'AdventuresOfBayouBilly-Nes',
 'AdventuresOfDinoRiki-Nes',
 'AdventuresOfDrFranken-Snes',
 'AdventuresOfKidKleets-Snes',
 'AdventuresOfMightyMax-Genesis',
 'AdventuresOfMightyMax-Snes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Genesis',
 'AdventuresOfRockyAndBullwinkleAndFriends-Nes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Snes',
 'AdventuresOfStarSaver-GameBoy',
 'AdventuresOfYogiBear-Snes',
 'AeroFighters-Snes',
 

In [3]:
# Command to create your environment
env = retro.make(game = "StreetFighterIISpecialChampionEdition-Genesis")

In [4]:
env.reset()
obs, reward, done, info=env.step(env.action_space.sample())


In [5]:
info

{'enemy_matches_won': 0,
 'score': 0,
 'matches_won': 0,
 'continuetimer': 0,
 'enemy_health': 176,
 'health': 176}

In [6]:
# This is the basic loop to be run. Here the sampling is random.
# Watch this run carefully. Do you realise that finishing the first level barely requires any skill?
# The juicy victory won by smashing keys randomly is an incentive to bring in more players to the arcade!
# Reset game to starting state
obs = env.reset()
# Set flag to false
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        time.sleep(0.01)
        
env.close()

In [6]:
env.close()

In [7]:
env.action_space.sample()

array([1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1], dtype=int8)

In [8]:
env.observation_space


Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [9]:
!pip install opencv-python



In [10]:
 
from gym import Env 
from gym.spaces import MultiBinary, Box 
import numpy as np
import cv2
from matplotlib import pyplot as plt

In [11]:
env.close()

In [12]:
class StreetFighter(Env): 
    def __init__(self):
        super().__init__()
        # Specify action space and observation space 
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Startup and instance of the game 
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
    
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs 
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def preprocess(self, observation): 
        # Grayscaling 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize 
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84,84,1))
        return channels 
    
    def step(self, action): 
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 
        
        # Frame delta 
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs 
        
        # Reshape the reward function
        reward = info['score'] - self.score 
        self.score = info['score'] 
        
        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
        
    def close(self):
        self.game.close()

In [13]:
env=StreetFighter()

In [33]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        time.sleep(0.01)
        if reward != 0: 
            print(reward)

36
51
-32
51
28
-30
2
-36
-31
9
-48
-24
-23
-23
-32
-23
-24
-18
-9
28
-1
149
-35
36
13
7
-5
-5
29
18
10
12
7
-24
31
-42


KeyboardInterrupt: 

In [14]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

# Create the "logs" directory if it doesn't exist
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

# Create the "opt" directory if it doesn't exist
if not os.path.exists(OPT_DIR):
    os.makedirs(OPT_DIR)

In [16]:
if os.path.exists(LOG_DIR):
    print("The 'logs' directory exists.")
else:
    print("The 'logs' directory does not exist.")

# Check if the "opt" directory exists
if os.path.exists(OPT_DIR):
    print("The 'opt' directory exists.")
else:
    print("The 'opt' directory does not exist.")

The 'logs' directory exists.
The 'opt' directory exists.


In [17]:
# Function to return test hyperparameters - define the object function
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 1024, 2048),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-8, 1e-6),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [18]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best9model'.format(1))

In [19]:
env.close()

In [20]:
# Run a training loop and return mean reward 
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial) 

        # Create environment 
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 10, channels_order='last')

        # Create algo 
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=30000)
        #model.learn(total_timesteps=100000)

        # Evaluate model 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best9model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1000

In [21]:
# Creating the experiment 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

[32m[I 2023-08-06 03:18:38,340][0m A new study created in memory with name: no-name-3787a826-d176-4584-83e2-181823503a1f[0m
  """
  
  import sys
  
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1258 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2023-08-06 03:53:16,984][0m Trial 0 finished with value: 27900.0 and parameters: {'n_steps': 1258, 'gamma': 0.956834901722174, 'learning_rate': 4.3552912090602685e-08, 'clip_range': 0.3852380210050572, 'gae_lambda': 0.8321901660621602}. Best is trial 0 with value: 27900.0.[0m
  """
  
  import sys
  
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1640 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2023-08-06 04:27:59,658][0m Trial 1 finished with value: 2000.0 and parameters: {'n_steps': 1640, 'gamma': 0.8436911984011691, 'learning_rate': 9.096416393073904e-07, 'clip_range': 0.3997431778

In [22]:
study.best_params


{'n_steps': 1258,
 'gamma': 0.956834901722174,
 'learning_rate': 4.3552912090602685e-08,
 'clip_range': 0.3852380210050572,
 'gae_lambda': 0.8321901660621602}

In [23]:
study.best_trial

FrozenTrial(number=0, values=[27900.0], datetime_start=datetime.datetime(2023, 8, 6, 3, 18, 38, 344689), datetime_complete=datetime.datetime(2023, 8, 6, 3, 53, 16, 983593), params={'n_steps': 1258, 'gamma': 0.956834901722174, 'learning_rate': 4.3552912090602685e-08, 'clip_range': 0.3852380210050572, 'gae_lambda': 0.8321901660621602}, distributions={'n_steps': IntDistribution(high=2048, log=False, low=1024, step=1), 'gamma': FloatDistribution(high=0.9999, log=True, low=0.8, step=None), 'learning_rate': FloatDistribution(high=1e-06, log=True, low=1e-08, step=None), 'clip_range': FloatDistribution(high=0.4, log=False, low=0.1, step=None), 'gae_lambda': FloatDistribution(high=0.99, log=False, low=0.8, step=None)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

In [42]:
#EVALUATE THE BEST MODEL NOW
model = PPO.load(os.path.join(OPT_DIR, 'trial_0_best9model.zip'))

In [43]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [44]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best9model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [45]:
CHECKPOINT_DIR = './train/'

if not os.path.exists(CHECKPOINT_DIR):
    os.makedirs(CHECKPOINT_DIR)

if os.path.exists(CHECKPOINT_DIR):
    print("The 'checkpoint' directory exists.")
else:
    print("The 'checkpiont' directory does not exist.")


The 'checkpoint' directory exists.


In [46]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [47]:
env.close()

In [48]:
# Create environment 
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 10, channels_order='last')

In [49]:
model_params = study.best_params
#model_params['n_steps'] = 5632 # set n_steps to 7488 or a factor of 64
# model_params['learning_rate'] = 5e-7
model_params

{'n_steps': 1258,
 'gamma': 0.956834901722174,
 'learning_rate': 4.3552912090602685e-08,
 'clip_range': 0.3852380210050572,
 'gae_lambda': 0.8321901660621602}

In [50]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [51]:
# Reload previous weights from HPO
model.load(os.path.join(OPT_DIR, 'trial_0_best9model.zip'))

<stable_baselines3.ppo.ppo.PPO at 0x1c73910c6d8>

In [52]:
# Kick off training 
model.learn(total_timesteps=100000, callback=callback)
# model.learn(total_timestep=5000000) 

Logging to ./logs/PPO_110
-----------------------------
| time/              |      |
|    fps             | 103  |
|    iterations      | 1    |
|    time_elapsed    | 12   |
|    total_timesteps | 1258 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 32            |
|    iterations           | 2             |
|    time_elapsed         | 77            |
|    total_timesteps      | 2516          |
| train/                  |               |
|    approx_kl            | 2.1464768e-09 |
|    clip_fraction        | 0             |
|    clip_range           | 0.385         |
|    entropy_loss         | -8.32         |
|    explained_variance   | -7.15e-06     |
|    learning_rate        | 4.36e-08      |
|    loss                 | 8.42          |
|    n_updates            | 10            |
|    policy_gradient_loss | -6.84e-07     |
|    value_loss           | 6.67e+03      |
----------------

<stable_baselines3.ppo.ppo.PPO at 0x1c73913af60>

In [25]:
model = PPO.load('./opt/trial_4_best_model.zip')

In [54]:
env.close()

In [55]:
env = StreetFighter()
        
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 10, channels_order='last')


In [56]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)



In [57]:
mean_reward

1200.0

In [27]:
env.close()

In [23]:
model = PPO.load('./opt/trial_4_best_model.zip')

In [24]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)



In [25]:
mean_reward

4900.0