In [1]:
#Import dependencies
import retro
import time
import os
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.evaluation import evaluate_policy
import tensorboard
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

Define the environment: observation space, action space, preprocessing steps, what happens every timestep, and resetting the game.

In [2]:
class StreetFighter(Env):

    def __init__(self):
        super().__init__() 
        #the observation space is a 84x84 box with each value correp to a colour
        self.observation_space = Box(low=0, high=255, 
                                     shape=(84,84,1), dtype=np.uint8)
        
        #action space of 12-long vectors where each action corresps to a 0 or a 1
        self.action_space = MultiBinary(12)
        
        #start up an instance of the game
        #use restricted actions ensures that only valid button combinations are chosen
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis",
                               use_restricted_actions = retro.Actions.FILTERED)

    def preprocess(self, observation):
        #turn to grey, resize, and regain the channels value
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels
    
    def step(self,action):
        obs, reward, done, info = self.game.step(action)

        obs = self.preprocess(obs)

        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        
        #reward function is score delta
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
        
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs 
        
        # Set score to dero 
        self.score = 0 
        return obs
    
    def close(self):
        self.game.close()

Hyperparameter tuning

In [3]:
#logs for tensorboard data and hyperparameter models
LOG_DIR = './A2Clogs/'
OPT_DIR = './A2Copt/'

In [12]:
def objectiveA2C(trial):
    #generates an example set of hyperparamaters
    return {
        'n_steps': trial.suggest_int('n_steps',2048,8192),
        'gamma': trial.suggest_float('gamma',0.8,0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate',1e-6,1e-5, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda',0.8,0.99)
    }

In [13]:
def optimize_agent(trial):
    #evaluate the agent's performance when it trains using different sets of hyperparameters
    try:
        model_params = objectiveA2C(trial)

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env =  DummyVecEnv([lambda:env])
        env = VecFrameStack(env, 4, channels_order='last')
        
        model = A2C('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("model_made")
        model.learn(total_timesteps=100000)
        print("model learned")

        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    
    except Exception as e:
       return -1000
    



In [15]:
#create the experiment/study. since returning a positive value, want to maximise the function. 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[I 2023-08-12 13:30:15,628] A new study created in memory with name: no-name-8aad12ad-e796-440e-8973-8f6d4433ad0c


model_made
model learned


[I 2023-08-12 13:35:12,965] Trial 0 finished with value: 4500.0 and parameters: {'n_steps': 7941, 'gamma': 0.8663412679242408, 'learning_rate': 1.3756970061853513e-06, 'gae_lambda': 0.9095683342511351}. Best is trial 0 with value: 4500.0.


model_made
model learned


[I 2023-08-12 13:41:04,588] Trial 1 finished with value: 18100.0 and parameters: {'n_steps': 5834, 'gamma': 0.9532514222680888, 'learning_rate': 3.8509210607144295e-06, 'gae_lambda': 0.9790912340563886}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 13:46:34,858] Trial 2 finished with value: 0.0 and parameters: {'n_steps': 4362, 'gamma': 0.98268551561208, 'learning_rate': 7.127695420229584e-06, 'gae_lambda': 0.8321319788402167}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 13:51:30,202] Trial 3 finished with value: 1000.0 and parameters: {'n_steps': 4252, 'gamma': 0.9806151632514849, 'learning_rate': 4.656764298235243e-06, 'gae_lambda': 0.9583989449522734}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 13:57:01,079] Trial 4 finished with value: 1900.0 and parameters: {'n_steps': 4149, 'gamma': 0.9531188114863924, 'learning_rate': 5.079551467056385e-06, 'gae_lambda': 0.9886142257354342}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 14:02:25,797] Trial 5 finished with value: 1000.0 and parameters: {'n_steps': 5082, 'gamma': 0.9200829751203636, 'learning_rate': 6.068525608623374e-06, 'gae_lambda': 0.8620870837237014}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 14:07:59,345] Trial 6 finished with value: 1800.0 and parameters: {'n_steps': 6588, 'gamma': 0.8530801768400269, 'learning_rate': 2.9679885414326437e-06, 'gae_lambda': 0.8528718224668003}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 14:13:23,528] Trial 7 finished with value: 200.0 and parameters: {'n_steps': 7167, 'gamma': 0.9900433416017995, 'learning_rate': 8.287108694225938e-06, 'gae_lambda': 0.9137101821077287}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 14:18:37,957] Trial 8 finished with value: 4600.0 and parameters: {'n_steps': 2054, 'gamma': 0.9502517312474695, 'learning_rate': 1.303061036066252e-06, 'gae_lambda': 0.9069686611891224}. Best is trial 1 with value: 18100.0.


model_made
model learned


[I 2023-08-12 14:23:48,818] Trial 9 finished with value: 4600.0 and parameters: {'n_steps': 5875, 'gamma': 0.9822658979737727, 'learning_rate': 1.5233019714238133e-06, 'gae_lambda': 0.9434599087519566}. Best is trial 1 with value: 18100.0.


In [4]:
#obtain the best set of hyperparameters (trial 1) and set this to model_params
"""study.best_params ={'n_steps': 5834,
  'gamma': 0.9532514222680888,
  'learning_rate': 3.8509210607144295e-06,
  'gae_lambda': 0.9790912340563886}"""

model_params = {'n_steps': 5834,
  'gamma': 0.9532514222680888,
  'learning_rate': 3.8509210607144295e-06,
  'gae_lambda': 0.9790912340563886}

In [5]:
#make the n_steps parameter a multiple of 64
5834/64 #=91.15625
91*64 #=5824
model_params['n_steps'] = 5824

In [6]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [7]:
#where we save the intermediate saved models
CHECKPOINT_DIR = "./A2Ctrain/"

In [31]:
#save the model every 100,000 steps at checkpoint_dir
callback = TrainAndLoggingCallback(check_freq=100000, save_path=CHECKPOINT_DIR)

Training the Model

In [8]:
#create env and process for stacking

env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda:env])
env = VecFrameStack(env, 4, channels_order='last')

In [34]:
#load and train the model
model = A2C('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
model.load(os.path.join(OPT_DIR, 'trial_1_best_model.zip'))
model.learn(total_timesteps=1000000, callback= callback)

<stable_baselines3.a2c.a2c.A2C at 0x7f22e63b9d30>

Evaluating the Model

In [11]:
#Load fully trained model
model.load("./A2Ctrain/best_model_1000000.zip")

<stable_baselines3.a2c.a2c.A2C at 0x7f5442a65580>

In [12]:
#obtain mean reward after 30 episodes
mean_reward,_ = evaluate_policy(model, env, n_eval_episodes=30)

In [14]:
mean_reward

4200.0

Testing the Model

In [None]:
# Reset game to starting state
obs = env.reset()
# Run the game, obtaining the next actions using the model.
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)