In [1]:
#import dependencies
import retro
#import time to slow down the game with time.sleep
import time
import os

#import environment base class, import multibinary and box for the correct action space shapes
import gym
from gym import Env
from gym.spaces import MultiBinary, Box, Discrete

#numpy helps calculate frame delta
import numpy as np
#import opencv for greyscaling
import cv2

#import matplotlib for plotting the image
from matplotlib import pyplot as plt

#To pick hyperparameters we use Optuna, which lets us train and tune at the same time.
#Background: PPO (proximal policy optimization) is a model-free RL algorithm that searches in the space
#of policies rather than assigning values to state-action pairs.
#Typical PPO hyperparameters to tune: n_steps: batch size, gamma: discount rate for calculating returns,
#learning_rate: learning coefficient for optimizer, clip_range: clipping amount for advantage calc,
#gae_lambda: advantage smoothing parameter.
#Note: this notebook trains a DQN agent and only tunes gamma and learning_rate.

#importing the optimization frame -HPO
import optuna

#gives PPO algo RL
from stable_baselines3 import PPO, A2C, DQN

# evaluate policy allows you to calculate key performance indicators (KPIS) for the algo attached
# to the env. Evaluate policy allows you to see how a agent performs in a specific environment
from stable_baselines3.common.evaluation import evaluate_policy

#import the sb3 monitor for logging. wrap our env inside a vectorised wrapper (concept used to 
#wrap an object, data structure, or piece of code into a standardized interface or behaviour) 
#by default we are not able to access the mean ep reward or length. 
# using a monitor we can use sb3 to extract it out
from stable_baselines3.common.monitor import Monitor
import tensorboard
#import vec wrappers to vectorize and frame stack. dummyvecenv wraps the env inside a vectorized
#wrapper.
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
#discretizing the process
#from https://github.com/openai/retro/blob/master/cores/genesis.json we know that the buttons are
#buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
#https://github.com/openai/retro-baselines/blob/master/agents/sonic_util.py also useful!
#control manual https://manuals.sega.com/genesismini/pdf/STREET_FIGHTER_2.pdf
#can't press start OR mode, as well as any of 

combos = [["DOWN", "LEFT"],["DOWN", "LEFT", "A"],["DOWN", "LEFT", "B"],["DOWN", "LEFT", "C"],["DOWN", "LEFT", "X"],["DOWN", "LEFT", "Y"], ["DOWN", "LEFT", "Z"], 
          ["DOWN", "RIGHT"], ["DOWN", "RIGHT", "A"],["DOWN", "RIGHT", "B"],["DOWN", "RIGHT", "C"],["DOWN", "RIGHT", "X"],["DOWN", "RIGHT", "Y"], ["DOWN", "RIGHT", "Z"], 
          ["DOWN"], ["DOWN", "A"], ["DOWN", "B"], ["DOWN", "C"], ["DOWN", "X"], ["DOWN", "Y"], ["DOWN", "Z"],
          ["UP", "LEFT"],["UP", "LEFT", "A"],["UP", "LEFT", "B"],["UP", "LEFT", "C"],["UP", "LEFT", "X"],["UP", "LEFT", "Y"], ["UP", "LEFT", "Z"], 
          ["UP", "RIGHT"], ["UP", "RIGHT", "A"],["UP", "RIGHT", "B"],["UP", "RIGHT", "C"],["UP", "RIGHT", "X"],["UP", "RIGHT", "Y"], ["UP", "RIGHT", "Z"], 
          ["UP"],["UP", "A"], ["UP", "B"], ["UP", "C"], ["UP", "X"], ["UP", "Y"], ["UP", "Z"],
          ["LEFT"],["LEFT", "A"], ["LEFT", "B"], ["LEFT", "C"], ["LEFT", "X"], ["LEFT", "Y"], ["LEFT", "Z"],
          ["RIGHT"],["RIGHT", "A"], ["RIGHT", "B"], ["RIGHT", "C"], ["RIGHT", "X"], ["RIGHT", "Y"], ["RIGHT", "Z"],
          ["A"],["B"],["C"],["X"],["Y"],["Z"]]


In [2]:
class Discretizer(gym.ActionWrapper):
    """
    Expose a MultiBinary button environment through a Discrete action space.

    Each discrete action index selects one pre-built button mask; the mask has
    a 1 for every button named in the corresponding combo and 0 elsewhere.

    Args:
        env: environment whose action space is ``gym.spaces.MultiBinary`` and
            whose unwrapped form provides a ``buttons`` list of button names.
        combos: ordered list of lists of valid button combinations
    """

    def __init__(self, env, combos):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiBinary)
        button_names = env.unwrapped.buttons
        n_buttons = env.action_space.n

        def encode(combo):
            # Build the 0/1 mask for a single combination of button names.
            mask = np.array([0] * n_buttons)
            for name in combo:
                mask[button_names.index(name)] = 1
            return mask

        # Index i of this table is the mask sent to the wrapped env for action i.
        self._decode_discrete_action = [encode(combo) for combo in combos]
        self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

    def action(self, act):
        """Translate a discrete action index into a fresh copy of its button mask."""
        mask = self._decode_discrete_action[act]
        return mask.copy()


class StreetFighterDiscretizer(Discretizer):
    """Discretizer preconfigured with the notebook's module-level ``combos`` table."""

    def __init__(self, env):
        # ``combos`` is looked up in the global namespace when the wrapper is built.
        super().__init__(env, combos)

In [3]:
class StreetFighter(Env):
    """
    Gym environment around a gym-retro Street Fighter II emulator.

    Observations are 84x84x1 uint8 images: ``reset`` returns the preprocessed
    first frame and ``step`` returns the difference between the current and the
    previous preprocessed frame. The reward is the change in ``info['score']``
    since the previous step.

    Args:
        game: name of the gym-retro game to start. Defaults to the game this
            notebook was written for.
    """

    def __init__(self, game="StreetFighterIISpecialChampionEdition-Genesis"):
        super().__init__()
        #the observation space is a 84x84 box with each value corresponding to a grey level
        self.observation_space = Box(low=0, high=255,
                                     shape=(84,84,1), dtype=np.uint8)

        #12-long vector where each entry is 0 or 1 (button released / pressed)
        self.action_space = MultiBinary(12)
        self.buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]

        #episode state; filled in by reset(). Declared here so that every attribute
        #of the environment is visible in one place.
        self.previous_frame = None
        self.score = 0

        #start up an instance of the game
        self.game = retro.make(game=game)

    def preprocess(self, observation):
        """Convert a colour frame into an 84x84x1 greyscale image."""
        #turn to grey
        #NOTE(review): gym-retro frames are presumably RGB, in which case COLOR_BGR2GRAY
        #swaps the red/blue weights. Left unchanged so existing checkpoints still see
        #the same inputs - confirm before switching to COLOR_RGB2GRAY.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        #resize
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        #restore the trailing channel axis, which stable baselines expects
        channels = np.reshape(resize, (84,84,1))
        return channels

    def step(self,action):
        """
        Advance the emulator by one step.

        Returns:
            (frame_delta, reward, done, info) where ``frame_delta`` is the pixel
            change since the previous frame and ``reward`` is the score change.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.previous_frame is None:
            raise RuntimeError("reset() must be called before step()")

        #take a step
        obs, reward, done, info = self.game.step(action)

        #want to preprocess the observation
        obs = self.preprocess(obs)

        #frame delta: pixel change
        #NOTE(review): if the frames are uint8 (as the declared observation space
        #suggests) this subtraction wraps around modulo 256 for pixels that got darker.
        #Kept as-is for compatibility with already-trained models - confirm intent.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        #reshape the reward function: reward is the change in score since the last step
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the emulator screen; any arguments are accepted and ignored."""
        self.game.render()

    def reset(self):
        """Restart the game and return the preprocessed first frame."""
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs

        #score seen at the previous step, used to compute the score delta
        self.score = 0
        return obs

    def close(self):
        """Shut down the emulator instance."""
        self.game.close()

Hyperparameter tuning

In [4]:
#Output directories, set up so that models from any trial are accessible
#and training doesn't have to start from the beginning.
#LOG_DIR: monitor / tensorboard logs, OPT_DIR: models saved by the tuning trials
LOG_DIR, OPT_DIR = './DQNlogs/', './DQNopt/'

In [5]:
def objectiveDQN(trial):
    """
    Sample one set of DQN hyperparameters from an Optuna trial.

    Args:
        trial: object providing ``suggest_float(name, low, high, log=...)``.

    Returns:
        dict with keys ``'gamma'`` and ``'learning_rate'``, both sampled on a
        log scale from [0.8, 0.9999] and [1e-5, 1e-4] respectively.
    """
    gamma = trial.suggest_float('gamma', 0.8, 0.9999, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
    return {'gamma': gamma, 'learning_rate': learning_rate}

In [6]:
def optimize_agent(trial, total_timesteps=1, n_eval_episodes=2):
    """
    Optuna objective: train a DQN agent with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample hyperparameters and to name the saved model.
        total_timesteps: number of training timesteps per trial. The default of 1 is
            the notebook's smoke-test value; pass a larger number for real tuning.
        n_eval_episodes: number of episodes used to estimate the mean reward.

    Returns:
        Mean episode reward reported by ``evaluate_policy``.
    """
    #obtain set of hyperparameters
    model_params = objectiveDQN(trial)

    #create environment
    env = StreetFighter()
    try:
        env = StreetFighterDiscretizer(env)
        monitored_env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')
        print("made env")

        model = DQN('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=total_timesteps)
        print("model learned")

        mean_reward, _std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
    finally:
        #always release the emulator, even when training or evaluation is interrupted,
        #so that the next trial (or cell) can create a new environment
        env.close()

    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)

    return mean_reward
    

In [122]:
#release an environment left over from an earlier cell run, if there is one;
#on a fresh kernel no `env` exists yet and there is nothing to close
if "env" in globals():
    env.close()

In [123]:
#create an Optuna study whose goal is the greatest mean reward
study = optuna.create_study(direction='maximize')

#run the search: two trials, executed one at a time
study.optimize(optimize_agent, n_jobs=1, n_trials=2)

[I 2023-08-11 17:06:22,389] A new study created in memory with name: no-name-814f2f05-6d36-4786-86b9-40684404dc5f


MultiBinary(12) [1 0 1 1 1 1 0 0 0 1 0 0]
made env
made model
model learned


[I 2023-08-11 17:07:12,115] Trial 0 finished with value: 500.0 and parameters: {'gamma': 0.8015434396374793, 'learning_rate': 1.6928655043444773e-05}. Best is trial 0 with value: 500.0.


MultiBinary(12) [0 1 0 0 1 1 0 1 1 0 0 1]
made env
made model
model learned


[W 2023-08-11 17:07:16,751] Trial 1 failed with parameters: {'gamma': 0.9194186024101256, 'learning_rate': 6.0151352038621995e-05} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/Cheks/Desktop/Durham /Durham Part 2/Data Science/Project/mySFBot/.venv/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/1q/8vymbqss1bgdgr4kvck49gtw0000gn/T/ipykernel_1107/1824538200.py", line 22, in optimize_agent
    mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=2)
  File "/Users/Cheks/Desktop/Durham /Durham Part 2/Data Science/Project/mySFBot/.venv/lib/python3.8/site-packages/stable_baselines3/common/evaluation.py", line 89, in evaluate_policy
    observations, rewards, dones, infos = env.step(actions)
  File "/Users/Cheks/Desktop/Durham /Durham Part 2/Data Science/Project/mySFBot/.venv/lib/python3.8/site-packages/stable_baselines3/common/vec_env/base_ve

KeyboardInterrupt: 

In [69]:
#release an environment left over from an earlier cell run, if there is one;
#on a fresh kernel no `env` exists yet and there is nothing to close
if "env" in globals():
    env.close()

In [14]:
#a cell only displays its last expression, so return both results as one tuple
#(requires the tuning cell above to have been run in this kernel)
study.best_params, study.best_trial

NameError: name 'study' is not defined

In [None]:
#how to load a model: build the path of the model saved for trial 0, then restore it
trial_0_model_path = os.path.join(OPT_DIR, "trial_0_best_model")
model = DQN.load(trial_0_model_path)

In [None]:
#a callback is a function (or object) that is passed as an argument to another
#function or method, which then calls it at defined points

class TrainAndLoggingCallback(BaseCallback):
    """
    Save the model being trained every ``check_freq`` callback steps.

    Args:
        check_freq: number of callback steps between two saves.
        save_path: directory that receives the checkpoints, or None to disable saving.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        #make sure the checkpoint directory exists before the first save
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        #skip saving when no directory was given, matching the None check in _init_callback
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            #files are named after the step count; despite the 'best_model' prefix
            #they are periodic checkpoints, not the best-scoring model
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        #returning True lets training continue
        return True

In [None]:
#directory passed to TrainAndLoggingCallback as the place to save models during training
CHECKPOINT_DIR = "./DQNtrain/"

In [None]:
#checkpoint the model into CHECKPOINT_DIR once every 10,000 steps
callback = TrainAndLoggingCallback(save_path=CHECKPOINT_DIR, check_freq=10_000)

Training the Model

In [12]:
#release an environment left over from an earlier cell run, if there is one;
#on a fresh kernel no `env` exists yet and there is nothing to close
if "env" in globals():
    env.close()

In [13]:
#assemble the training environment step by step:
#game -> discrete actions -> episode logging -> vectorised -> 4 stacked frames
discrete_env = StreetFighterDiscretizer(StreetFighter())
print(discrete_env.action_space)
monitored_env = Monitor(discrete_env, LOG_DIR)
env = VecFrameStack(DummyVecEnv([lambda: monitored_env]), 4, channels_order='last')

Discrete(62)


In [None]:
#fetch the best hyperparameters found by the Optuna study; they are used to build the final model
model_params = study.best_params 
#NOTE(review): an earlier comment here described rounding n_steps to a multiple of 64,
#but this cell performs no such rounding - it only displays the chosen hyperparameters
model_params


In [None]:
#build a fresh DQN agent with the tuned hyperparameters
model = DQN('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

#warm-start from the model saved by the best Optuna trial.
#`load` is a classmethod that returns a NEW model, so calling it on an instance and
#discarding the result loads nothing; `set_parameters` loads the weights in place.
best_trial_model_path = os.path.join(
    OPT_DIR, 'trial_{}_best_model.zip'.format(study.best_trial.number))
model.set_parameters(best_trial_model_path)

#continue training, checkpointing through the callback
model.learn(total_timesteps=100000, callback=callback)

Evaluating the Model

In [7]:
#load a saved DQN model from disk for evaluation
#NOTE(review): this is an absolute path on one specific machine; the cell fails
#anywhere else unless the path is adjusted
model = DQN.load("/Users/Cheks/Desktop/Durham /Durham Part 2/Data Science/Project/mySFBot/trained_bots/DQN_1000000.zip")



In [None]:
#play a single rendered episode with the loaded model and keep its mean reward
#(the second return value, the reward standard deviation, is discarded)
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1, render=True)

Testing the Model

In [14]:
# Reset game to starting state
obs = env.reset()
for game in range(10):
    # Clear the flag at the start of EVERY game; otherwise it stays set after the
    # first game and the remaining games are skipped.
    done = False
    while not done:
        env.render()
        # Predict the next action with the model and apply it
        action = model.predict(obs)[0]
        # No explicit reset is needed between games: the vectorised env wrapper
        # resets a finished environment itself and returns the new first observation
        obs, reward, done, info = env.step(action)
        # Slow the game down so it can be watched
        time.sleep(0.005)
        print(reward)

#Can also have a separate testing notebook alongside the training notebook 
# so you can test and train at the same time and check you are headed in the right 
# direction - so don't need to wait until the end to see if the model is performing well

2023-08-13 01:17:04.811 Python[20707:1047150] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/1q/8vymbqss1bgdgr4kvck49gtw0000gn/T/org.python.python.savedState


[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]


: 