## Setup Street Fighter

In [1]:
# Import gym-retro to load ROM
import retro
# Import time to set FPS
import time

In [2]:
# Import the ROMs found under the local `roms` path into gym-retro
!python -m retro.import roms

Importing StreetFighterIISpecialChampionEdition-Genesis
Imported 1 games


In [None]:
# List the names of all games gym-retro knows about
retro.data.list_games()

In [None]:
# Load the game. Only one emulator can be open at a time, so any previously
# created environment has to be closed before this cell is run again.
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')

In [None]:
# Release the emulator so that another environment can be created later
env.close()

In [None]:
# Observation space of the raw environment: a (200, 256, 3) image
env.observation_space

In [None]:
# Action space of the raw environment: 12 actions and all combinations of them
env.action_space

In [None]:
# Simple game loop: play two episodes with random actions.
for game in range(2):
    # Start every episode from a fresh state. `done` must be reset here,
    # otherwise only the first episode would ever be played.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        print(reward)
        # throttle the loop to roughly 30 steps per second
        time.sleep(1/30)

## Setup Custom environment

- Preprocess
    - grayscale
    - frame delta
    - resize image smaller
- Filter actions to fewer combinations
- Change reward function
    - Set to score

In [3]:
# Import wrapper base class
from gym import Env
# import space shapes  for the env
from gym.spaces import MultiBinary, Box
# For calculating change from one frame to the next
import numpy as np
# For grayscaling
import cv2 as cv
# For plotting images
from matplotlib import pyplot as plt

In [4]:
class StreetFighter(Env):
    """Gym environment wrapping Street Fighter II (Genesis) from gym-retro.

    Compared with the raw retro environment this wrapper:
    - converts frames to grayscale and resizes them to 84x84x1,
    - returns the difference between consecutive frames from `step`,
    - uses the change of the in-game score as the reward.
    """

    def __init__(self) -> None:
        super().__init__()

        # Specify action and observation spaces.
        # Raw frames are (200, 256, 3); preprocessed frames are (84, 84, 1).
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)

        # Start up the game, restricted to the filtered action set.
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis',
                               use_restricted_actions=retro.Actions.FILTERED)

    def step(self, action):
        """Advance the game by one step.

        Returns the tuple (frame_delta, reward, done, info), where
        `frame_delta` is the preprocessed frame minus the previous
        preprocessed frame and `reward` is the score gained in this step.
        `reset` must have been called first, because it initialises the
        previous frame and the score.
        """
        # take step
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # Frame delta.
        # NOTE(review): if the frames are uint8 (as the observation space
        # declares), this subtraction wraps around modulo 256 rather than
        # going negative - confirm this is intended.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # New reward function: the score gained since the previous step
        # replaces the reward reported by the emulator.
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the wrapped game; any arguments are accepted but ignored."""
        self.game.render()

    def reset(self):
        """Restart the game and return the first preprocessed frame."""
        obs = self.game.reset()

        # The first frame is needed to calculate the delta of the first step.
        obs = self.preprocess(obs)
        self.previous_frame = obs

        # Same for the score: the first reward is measured from zero.
        self.score = 0

        return obs

    def preprocess(self, observation):
        """Convert an RGB frame to a grayscale array of shape (84, 84, 1)."""
        obs = cv.cvtColor(observation, cv.COLOR_RGB2GRAY)
        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv.resize is `dst`, not `interpolation`.
        obs = cv.resize(obs, (84, 84), interpolation=cv.INTER_CUBIC)
        # add the channel axis expected by the observation space
        obs = np.expand_dims(obs, -1)
        return obs

    def close(self):
        """Close the wrapped emulator, then run the base-class clean-up."""
        self.game.close()
        return super().close()

In [None]:
# Clean-up before creating a new environment: close the current `env`
# and the emulator it wraps.
# NOTE(review): `env.game` only exists when `env` is a StreetFighter instance,
# and StreetFighter.close() already closes `self.game` - confirm the second
# call is still needed.
env.close()
env.game.close()

In [None]:
# Create the custom environment (its constructor starts the emulator)
env = StreetFighter()

In [None]:
# Simple game loop: play one episode with random actions in the custom env.
for game in range(1):
    # Reset the environment and the `done` flag at the start of every episode;
    # resetting inside `while not done` could never be reached.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        # the reward is the score change, so only print when points were scored
        if reward > 0:
            print(reward)
        time.sleep(1/60)

## Hyperparameter tuning

In [5]:
import os
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
# For logging (metrics can't be read from a vectorized env)
from stable_baselines3.common.monitor import Monitor
# For vectorizing
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
CHECKPOINT_DIR = './street_fighter/train' # model checkpoints saved by the training callback
LOG_DIR = './street_fighter/log' # Monitor and TensorBoard logs
OPT_DIR = './street_fighter/opt' # models saved by the hyperparameter-search trials

In [7]:
# Get hyperparameters from Optuna.
# Tells Optuna the hyperparameter space and samples one point from it.
def get_hyperparam_suggestion(trial):
    """Sample one set of PPO hyperparameters for the given trial.

    Args:
        trial: object offering Optuna's `suggest_int` / `suggest_float` API.

    Returns:
        dict with the keys 'n_steps', 'gamma', 'learning_rate', 'clip_range'
        and 'gae_lambda', ready to be passed to PPO as keyword arguments.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` and
    # `suggest_loguniform`; log=True keeps the log-uniform sampling.
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', .1, .4),
        'gae_lambda': trial.suggest_float('gae_lambda', .8, .99),
    }

In [8]:
# Objective function for Optuna: run a training loop and return the mean reward.
def optimize_agent(trial):
    """Train and evaluate one PPO model with hyperparameters sampled for `trial`.

    The trained model is saved as `trial_<number>` inside OPT_DIR.

    Returns:
        The mean reward over 5 evaluation episodes, or -1000 when anything
        in the trial raised an exception.
    """
    env = None
    try:
        model_params = get_hyperparam_suggestion(trial)

        # create env
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # create ppo
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=75000) # should use a bit more

        # evaluate
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        # make sure the target folder exists before saving the trial's model
        os.makedirs(OPT_DIR, exist_ok=True)
        SAVE_PATH = os.path.join(OPT_DIR, f'trial_{trial.number}')
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        # should set trial to failed rather than this
        print(f'Error on trial {trial.number}')
        print(e)
        return -1000

    finally:
        # Only one emulator can be open at a time, so the environment has to
        # be released even when the trial failed; otherwise every following
        # trial would fail as well.
        if env is not None:
            try:
                env.close()
            except Exception as e:
                print(f'Error closing env on trial {trial.number}')
                print(e)

In [9]:
# Keep the study in a local SQLite database so that it survives kernel restarts.
# SQLite does not create missing folders, so make sure the directory exists first.
os.makedirs(OPT_DIR, exist_ok=True)
storage = optuna.storages.RDBStorage(
    url="sqlite:///street_fighter/opt/info.db",
)

In [None]:
# Create a new study that maximizes the objective and is persisted in `storage`
study = optuna.create_study(direction='maximize', storage=storage)

In [None]:
# This code sets all still running trials to "FAIL".
# This is necessary if the code execution stops unexpectedly and a trial is left open.
# NOTE(review): assumes the study of interest has id 1 in the storage - confirm.
study_id = 1
for trial in storage.get_all_trials(study_id):
    print(trial.state)
    # TrialState lives in `optuna.trial`; `optuna.structs` is deprecated
    if trial.state == optuna.trial.TrialState.RUNNING:
        trial_id = storage.get_trial_id_from_study_id_trial_number(study_id, trial.number)
        storage.set_trial_state(trial_id, optuna.trial.TrialState.FAIL)

In [10]:
# Load the study from the storage to continue it. With study_name=None Optuna
# loads the only study found in the storage (see the log message below).
study = optuna.load_study(study_name=None, storage=storage)

[32m[I 2022-08-06 16:26:21,726][0m Study name was omitted but trying to load 'no-name-ce1b861a-8ac9-40dd-93aa-eb8212cd18d1' because that was the only study found in the storage.[0m


In [None]:
# Add 10 more trials to the study, run one at a time (n_jobs=1)
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

In [None]:
# Keep running trials until the study holds 25 "COMPLETE" trials.
# NOTE(review): optimize_agent returns -1000 instead of raising on errors, so
# errored trials presumably count as complete too - confirm.
study.optimize(optimize_agent,callbacks=[optuna.study.MaxTrialsCallback(25)], n_jobs=1)

In [None]:
# Plot the importance of each hyperparameter for the study's results
optuna.visualization.plot_param_importances(study)

In [None]:
# Show the best trial of the study
study.best_trial

## Setup Callback

In [11]:
from stable_baselines3.common.callbacks import BaseCallback

In [12]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every `check_freq` callback calls.

    Args:
        check_freq: number of callback calls between two saved checkpoints.
        save_path: folder the checkpoints are written to; when None, no
            folder is created and no checkpoint is saved.
        verbose: verbosity level forwarded to BaseCallback.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        # create the checkpoint folder up front so that saving cannot fail on it
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # Skip saving when no save_path was given, consistent with
        # _init_callback; os.path.join(None, ...) would raise otherwise.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # always True, as in the original (presumably so training continues)
        return True

In [13]:
# Save a checkpoint into CHECKPOINT_DIR every 33000 callback calls
callback = TrainAndLoggingCallback(check_freq=33000, save_path=CHECKPOINT_DIR)

## Continue training

In [14]:
# Hyperparameters of the study's best trial, reused for the longer training run
best_model_params = study.best_params
best_model_params

{'clip_range': 0.14495297449061462,
 'gae_lambda': 0.9799275931712156,
 'gamma': 0.8809880250755672,
 'learning_rate': 2.0103023114093075e-05,
 'n_steps': 3200}

In [15]:
# Build the training environment: custom env -> Monitor -> vectorized -> 4 stacked frames.
env = Monitor(StreetFighter(), LOG_DIR)
# The lambda is evaluated while DummyVecEnv is being constructed, i.e. before
# `env` is rebound, so it still returns the Monitor-wrapped environment.
env = VecFrameStack(DummyVecEnv([lambda: env]), 4, channels_order='last')

In [16]:
# Create the PPO model with the tuned hyperparameters.
# device='auto' uses the GPU when one is available and falls back to the CPU
# otherwise, instead of failing on machines without CUDA.
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, device='auto', **best_model_params)
# optimize_agent saved every trial as OPT_DIR/trial_<number>; load the weights of
# the trial the hyperparameters belong to instead of a hard-coded trial number.
model.set_parameters(os.path.join(OPT_DIR, f'trial_{study.best_trial.number}.zip'))
model.learn(total_timesteps=200000, callback=callback)

<stable_baselines3.ppo.ppo.PPO at 0x1d341ade848>

## Evaluate model

In [23]:
# Load the checkpoint named for 99000 callback calls from the training folder
model = PPO.load('./street_fighter/train/best_model_99000.zip')

In [24]:
# Simple game loop: let the trained model play two episodes.
for game in range(2):
    # Reset the environment and the `done` flag at the start of every episode;
    # without this only the first episode would be played.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # model.predict returns a tuple; its first element is the action
        obs, reward, done, info = env.step(model.predict(obs)[0])
        print(reward)
        time.sleep(1/100)

[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[500.]
[0.]
[0.

KeyboardInterrupt: 