In [None]:
#Import dependencies
import retro
import time
import os
import gym
from gym import Env
from gym.spaces import MultiBinary, Box, Discrete
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import tensorboard
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
# Button combinations that may be pressed together on a single frame.
# Layout: each of the eight directional holds appears first on its own and
# then paired with exactly one attack button (A, B, C, X, Y, Z); the six
# attack buttons on their own come last. The position of a combination in
# this list is the discrete action index that selects it.
combos = [
    direction + attack
    for direction in (
        ["DOWN", "LEFT"], ["DOWN", "RIGHT"], ["DOWN"],
        ["UP", "LEFT"], ["UP", "RIGHT"], ["UP"],
        ["LEFT"], ["RIGHT"],
    )
    for attack in ([], ["A"], ["B"], ["C"], ["X"], ["Y"], ["Z"])
] + [[button] for button in "ABCXYZ"]
# NOTE: unfortunately the empty "do nothing" combination was forgotten, so the
# agent has no way to press no buttons at all.

In [None]:
class Discretizer(gym.ActionWrapper):
    """
    Expose a MultiBinary-action environment through a Discrete action space.

    Every discrete action index stands for one button combination, which is
    pre-encoded as a 0/1 vector over the wrapped environment's buttons.

    Args:
        env: environment whose action space is ``gym.spaces.MultiBinary`` and
            whose unwrapped environment provides a ``buttons`` list
        combos: ordered list of lists of valid button combinations
    """

    def __init__(self, env, combos):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiBinary)
        button_names = env.unwrapped.buttons
        n_buttons = env.action_space.n

        def encode(combo):
            # 1 at the position of every pressed button, 0 everywhere else
            mask = np.array([0] * n_buttons)
            for name in combo:
                mask[button_names.index(name)] = 1
            return mask

        # lookup table: discrete action index -> multi-binary button vector
        self._decode_discrete_action = [encode(combo) for combo in combos]
        self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

    def action(self, act):
        """Translate discrete action ``act`` into its button vector (a fresh copy)."""
        button_vector = self._decode_discrete_action[act]
        return button_vector.copy()


class StreetFighterDiscretizer(Discretizer):
    """Discretizer preconfigured with the notebook-level ``combos`` list."""

    def __init__(self, env):
        # same call as Discretizer.__init__(env=env, combos=combos), positionally
        super().__init__(env, combos)

In [None]:
class StreetFighter(Env):
    """
    Gym environment around the retro Street Fighter II game.

    Observations are declared as 84x84 single-channel uint8 images. ``reset``
    returns the first preprocessed frame; ``step`` returns the difference
    between the current and the previous preprocessed frame. The reward is the
    change in ``info['score']`` since the previous step.
    """

    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255,
                                     shape=(84,84,1), dtype=np.uint8)

        # 12-long binary vector: one entry per button, 1 = pressed
        self.action_space = MultiBinary(12)
        # button order, needed so a list of button names (see Discretizer) can
        # be converted into the corresponding multi-binary array
        self.buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]

        # start up an instance of the game
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")

    def preprocess(self, observation):
        """Convert a raw frame to greyscale, resize it to 84x84 and restore the channel axis."""
        # NOTE(review): BGR2GRAY assumes BGR channel order; if the emulator
        # returns RGB frames the red/blue weights are swapped. Left unchanged
        # because existing checkpoints were trained on this output - confirm.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels

    def step(self, action):
        """
        Advance the game by one step.

        Returns:
            (frame_delta, reward, done, info), where ``frame_delta`` is the
            current preprocessed frame minus the previous one and ``reward``
            is the score gained on this step.
        """
        # the emulator's own reward is discarded; it is recomputed from the score below
        obs, reward, done, info = self.game.step(action)

        # preprocess the observation
        obs = self.preprocess(obs)

        # NOTE(review): if both frames are uint8, this subtraction wraps
        # modulo 256 wherever a pixel got darker - confirm this is intended.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # reward function is score delta
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def reset(self):
        """Restart the game and return the first preprocessed frame (not a delta)."""
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs

        # Set score to zero at start
        self.score = 0
        return obs

    def render(self, *args, **kwargs):
        """
        Render the current frame by delegating to the emulator.

        Without this override, rendering falls through to the base ``Env``,
        which does not implement it, so ``env.render()`` on the wrapped
        environment fails. Arguments (e.g. ``mode``) are passed through as-is.
        """
        return self.game.render(*args, **kwargs)

    def close(self):
        """Shut down the emulator instance."""
        self.game.close()

Hyperparameter tuning

In [None]:
# output directories: tensorboard / monitor logs, and the models saved by
# each hyperparameter-tuning trial
LOG_DIR = "./DQNlogs/"
OPT_DIR = "./DQNopt/"

In [None]:
def objectiveDQN(trial):
    """
    Sample one candidate set of DQN hyperparameters from a trial.

    Args:
        trial: object exposing ``suggest_float(name, low, high, log=...)``

    Returns:
        dict with keys 'gamma' and 'learning_rate', each sampled with ``log=True``
    """
    # name -> (low, high) bounds; dict order is preserved, so gamma is
    # sampled before learning_rate
    search_space = {
        'gamma': (0.8, 0.9999),
        'learning_rate': (1e-5, 1e-4),
    }
    return {
        name: trial.suggest_float(name, low, high, log=True)
        for name, (low, high) in search_space.items()
    }

In [None]:
def optimize_agent(trial):
    """
    Optuna objective: train a DQN agent with sampled hyperparameters and score it.

    Builds a fresh wrapped StreetFighter environment, trains for 100,000
    timesteps, evaluates over 5 episodes and saves the model under OPT_DIR.

    Args:
        trial: the Optuna trial used to sample hyperparameters and name the save file

    Returns:
        The mean evaluation reward, or -1000 if anything in the trial raised,
        so that a failed trial ranks last instead of aborting the whole study.
    """
    env = None  # outermost environment wrapper built so far; closed in `finally`
    try:
        model_params = objectiveDQN(trial)

        env = StreetFighter()
        env = StreetFighterDiscretizer(env)
        env = Monitor(env, LOG_DIR)
        monitored_env = env  # stable name for the lambda, which must not see later rebinding
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        model = DQN('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=100000)
        print("model learned")

        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception as e:
        # keep the study going, but make the failure visible instead of
        # silently reporting a bad score
        print("trial {} failed: {!r}".format(trial.number, e))
        return -1000
    finally:
        # Always release the emulator, including after a failed trial.
        # NOTE(review): gym-retro is understood to allow only one emulator
        # instance per process, so a leaked env would make every later trial
        # fail as well - confirm against the retro docs.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print("trial {}: closing the environment failed: {!r}".format(trial.number, close_error))
    

In [None]:
# Build the Optuna study. optimize_agent returns a mean reward, so larger is
# better and the study is told to maximise it.
study = optuna.create_study(direction="maximize")
# run 10 trials, one at a time (n_jobs=1)
study.optimize(optimize_agent, n_jobs=1, n_trials=10)

In [None]:
# Display the best set of hyperparameters found by the study. In the recorded
# run this was trial 2; its values are copied by hand into model_params in the
# following cell.

study.best_params # recorded result: {'gamma': 0.8607026864367819, 'learning_rate': 1.240332072345838e-05}

In [None]:
# best hyperparameters recorded from the tuning study, hard-coded so that
# training can run without repeating the search
model_params = dict(
    gamma=0.8607026864367819,
    learning_rate=1.240332072345838e-05,
)

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """
    Save the model being trained every ``check_freq`` callback calls.

    Args:
        check_freq: a checkpoint is written whenever the number of callback
            calls is a multiple of this value
        save_path: directory the checkpoints are written to; created when the
            callback is initialised if it is not None
        verbose: forwarded to BaseCallback; when greater than 0 a message is
            printed each time a checkpoint is saved
        step_offset: added to the call count in the checkpoint file name, so
            that a resumed run continues the numbering of the earlier run.
            Defaults to 800000, the value that was previously hard-coded.
    """

    def __init__(self, check_freq, save_path, verbose=1, step_offset=800000):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.step_offset = step_offset

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls + self.step_offset))
            self.model.save(model_path)
            # announce the checkpoint when it is actually written (this used
            # to be printed once, at construction time)
            if self.verbose > 0:
                print("Checkpoint reached!")

        # returning True tells the training loop to continue
        return True

In [None]:
# directory that receives the periodic training checkpoints
CHECKPOINT_DIR = './DQNtrain/'

In [None]:
# checkpoint the model into CHECKPOINT_DIR once every 100,000 steps
callback = TrainAndLoggingCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=100000,
)

Training the Model

In [None]:
# Assemble the training environment, innermost layer first:
#   StreetFighter -> discrete actions -> Monitor logging to LOG_DIR
#   -> single-environment vector wrapper -> stack of 4 frames (channels last)
env = VecFrameStack(
    DummyVecEnv([
        lambda: Monitor(StreetFighterDiscretizer(StreetFighter()), LOG_DIR)
    ]),
    4,
    channels_order='last',
)

In [None]:
# DQN agent with a CNN policy on the wrapped environment, using the tuned
# hyperparameters and logging to tensorboard under LOG_DIR
model = DQN(
    policy='CnnPolicy',
    env=env,
    verbose=0,
    tensorboard_log=LOG_DIR,
    **model_params,
)

In [None]:
# Resume from the 800,000-step checkpoint and learn for 200,000 more steps
# (training was done in 200,000-step chunks, 5 in total, to reach 1M timesteps).
# load() returns the loaded model instead of updating the object it is called
# on, so its result has to be assigned; calling model.load(...) and discarding
# the result left the freshly initialised model in place.
model = DQN.load("./DQNtrain/best_model_800000.zip", env=env)
model.learn(total_timesteps=200000, callback= callback)

Evaluating the Model

In [None]:
# Load the fully trained model. load() returns the loaded model rather than
# updating the object it is called on, so the result must be assigned;
# otherwise the evaluation below would not use this checkpoint.
model = DQN.load("./DQNtrain/best_model_1000000.zip", env=env)

In [None]:
# mean episode reward of the model over 30 evaluation episodes
# (the second returned value is discarded)
mean_reward, _ = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=30,
)

In [None]:
# display the mean evaluation reward as the cell output
mean_reward

Testing the Model

In [None]:
# Let the model play, rendering every frame and printing each step's reward.
for game in range(1):
    # Start every game from a fresh episode with the flag cleared. Previously
    # `done` was set to False only once, before the loop, so any game after
    # the first would have been skipped, and the in-loop `if done:` reset
    # could never run because it sat inside `while not done:`.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # predict() returns (action, state); only the action is needed
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)
