In [None]:
#import dependencies
import retro
import time
import os
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [None]:
#create a raw retro environment to check that the game is installed, then close it again.
#gym-retro refuses to start a second emulator in the same process, so leaving this one
#open would make every later StreetFighter() construction fail
env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")
env.close()

In [None]:
class StreetFighter(Env):
    """Gym environment wrapping the retro Street Fighter II game.

    step() returns the difference between consecutive preprocessed (84x84x1
    greyscale) frames as the observation, and the change in info['score']
    as the reward. reset() returns the preprocessed first frame itself.
    """

    def __init__(self):
        super().__init__()

        #the observation space is a 84x84 box with each value corresponding to a grey level
        self.observation_space = Box(low=0, high=255,
                                     shape=(84,84,1), dtype=np.uint8)

        #action space of 12-long vectors where each entry is a 0 or a 1
        self.action_space = MultiBinary(12)

        #start up an instance of the game
        #restricted (FILTERED) actions ensure that only valid button combinations are chosen
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis",
                               use_restricted_actions = retro.Actions.FILTERED)

        #episode state; defined here so that step() cannot fail with an
        #AttributeError if it is called before the first reset()
        self.previous_frame = np.zeros(self.observation_space.shape, dtype=np.uint8)
        self.score = 0

    def step(self,action):
        """Advance the game by one step.

        Returns a (frame_delta, reward, done, info) tuple. frame_delta is the
        current preprocessed frame minus the previous one; reward is the
        increase in info['score'] since the previous step.
        """
        #the emulator's own reward is discarded in favour of the score delta below
        obs, _, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        #NOTE(review): if the frames are uint8 (as the declared observation space is),
        #this subtraction wraps modulo 256 instead of going negative - confirm intended
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        #reward function is score delta
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game, forwarding any arguments (e.g. mode) to the emulator.

        The emulator's return value is passed back so that modes which
        produce a result hand it to the caller.
        """
        return self.game.render(*args, **kwargs)

    def reset(self):
        """Restart the game and return the preprocessed first frame."""
        first_frame = self.preprocess(self.game.reset())

        #the first frame becomes the reference for the next delta; score restarts at zero
        self.previous_frame = first_frame
        self.score = 0
        return first_frame

    def preprocess(self, observation):
        """Turn a colour frame into an 84x84x1 greyscale array."""
        #NOTE(review): BGR2GRAY treats the first channel as blue; if the emulator delivers
        #RGB frames the red/blue weights are swapped. Left unchanged so that previously
        #saved models keep seeing the same inputs - confirm before changing
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)

        #regain the channels dimension that the greyscale conversion dropped
        channels = np.reshape(resize, (84,84,1))
        return channels

    def close(self):
        """Shut down the underlying emulator."""
        self.game.close()

Hyperparameter Tuning

In [None]:
#logs for tensorboard data and hyperparameter models
LOG_DIR = './PPOlogs/'
OPT_DIR = './PPOopt/'

#create both directories up front so that nothing downstream depends on a
#library creating them (or not) on first write
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)


In [None]:
def objective(trial):
    """Sample one candidate set of PPO hyperparameters from an Optuna trial.

    trial: object exposing Optuna's suggest_int / suggest_float methods.
    Returns a dict of keyword arguments that can be unpacked into PPO(...).
    """

    return {
        #sampled in steps of 64 so the rollout length is always a multiple of PPO's
        #default mini-batch size and never needs rounding afterwards
        'n_steps': trial.suggest_int('n_steps',2048,8192, step=64),
        'gamma': trial.suggest_float('gamma',0.8,0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate',1e-6,1e-5, log=True),
        'clip_range': trial.suggest_float('clip_range',0.1,0.4),
        'gae_lambda': trial.suggest_float('gae_lambda',0.8,0.99)
    }

In [None]:
def optimize_agent(trial):
    """Train and score one PPO agent for an Optuna trial.

    Samples hyperparameters with objective(), trains for 100,000 timesteps,
    evaluates over 5 episodes, saves the model under OPT_DIR and returns the
    mean evaluation reward. Any failure is reported and scored as -1000 so
    that one bad trial does not abort the whole study.
    """
    #outermost environment wrapper built so far; closed in `finally` whatever happens
    env = None
    try:

        model_params = objective(trial)

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        #bind the monitored env to its own name so the lambda does not depend on
        #`env` being reassigned only after DummyVecEnv has called it
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        model = PPO('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=100000)
        print("model learned")

        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception as e:
        #still best-effort, but say what went wrong instead of hiding it behind -1000
        print("trial {} failed: {!r}".format(trial.number, e))
        return -1000
    finally:
        #always release the emulator; a leaked one would make every later trial fail
        if env is not None:
            env.close()

In [None]:
#build the Optuna study and run it; optimize_agent returns a reward, so a higher
#value is better and the study direction is 'maximize'
study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_agent,
    n_jobs=1,
    n_trials=10,
)

In [None]:
#record of the best hyperparameters found by the study (trial 8), kept as a string
#literal so the values stay visible without re-running the tuning; the same values
#are assigned to model_params in the next cell

"""study.best_params = {'n_steps': 5595,
 'gamma': 0.8157202903839094,
 'learning_rate': 1.154858774456118e-06,
 'clip_range': 0.26012333935931625,
 'gae_lambda': 0.879540718426021}"""

In [None]:
#hyperparameters used for the final training run
model_params = dict(
    n_steps=5595,
    gamma=0.8157202903839094,
    learning_rate=1.154858774456118e-06,
    clip_range=0.26012333935931625,
    gae_lambda=0.879540718426021,
)


In [None]:
#round n_steps down to the nearest multiple of 64 (5595 -> 5568); computed from the
#current value rather than hard-coded, so re-running the cell leaves it unchanged
model_params['n_steps'] = (model_params['n_steps'] // 64) * 64

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every `check_freq` calls into `save_path`."""

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        #nothing to prepare when no save location was given
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        checkpoint_due = self.n_calls % self.check_freq == 0
        if checkpoint_due:
            #file name carries the call count at which the checkpoint was taken
            checkpoint_name = 'best_model_{}'.format(self.n_calls)
            self.model.save(os.path.join(self.save_path, checkpoint_name))

        #returning True tells the caller to carry on
        return True

In [None]:
#directory handed to the training callback as the place to write its periodic model checkpoints
CHECKPOINT_DIR = "./PPOtrain"

In [None]:
#checkpoint the model into CHECKPOINT_DIR once every 100,000 callback steps
callback = TrainAndLoggingCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=100000,
)

Train Model

In [None]:
#training environment: game -> episode monitor -> single-env vector -> 4-frame stack
street_fighter = StreetFighter()
monitored = Monitor(street_fighter, LOG_DIR)
env = VecFrameStack(DummyVecEnv([lambda: monitored]), 4, channels_order='last')

In [None]:
#show the hyperparameters that are about to be passed to PPO
model_params

In [None]:
#build a fresh PPO agent on the training environment using the tuned hyperparameters
model = PPO(
    'CnnPolicy',
    env,
    verbose=0,
    tensorboard_log=LOG_DIR,
    **model_params,
)

In [None]:
#PPO.load is a classmethod that returns a NEW model, so calling model.load(...) and
#discarding the result leaves `model` untouched. set_parameters copies the saved
#weights into the existing model, keeping the env and hyperparameters it was built with
model.set_parameters(os.path.join(OPT_DIR, 'trial_8_best_model.zip'))

In [None]:
#train for one million timesteps, checkpointing through the callback
model.learn(callback=callback, total_timesteps=1000000)

Evaluating the Model

In [None]:
#Load fully trained model. PPO.load is a classmethod that returns the loaded model,
#so the result has to be assigned - model.load(...) on its own changes nothing
model = PPO.load('./PPOtrain/best_model_1000000.zip', env=env)

In [None]:
#mean reward of the current model over 30 evaluation episodes, without rendering
mean_reward, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=30,
    render=False,
)

In [None]:
#display the mean reward measured in the previous evaluation cell
mean_reward

In [None]:
#Load 800k trained model to check if reward is higher or lower. The result of PPO.load
#must be assigned: it returns a new model rather than updating `model` in place
#NOTE(review): ./PPOtrain3 is not written by any cell in this notebook (the callback
#saves to ./PPOtrain) - confirm this checkpoint exists
model = PPO.load('./PPOtrain3/best_model_800000.zip', env=env)

In [None]:
#mean reward of the current model over 30 evaluation episodes, without rendering
mean_reward2, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=30,
    render=False,
)


In [None]:
#display the mean reward measured in the previous evaluation cell
mean_reward2

In [None]:
#Load 2.8M trained model to check if reward is higher or lower. The result of PPO.load
#must be assigned: it returns a new model rather than updating `model` in place
#NOTE(review): ./PPOtrain3 is not written by any cell in this notebook (the callback
#saves to ./PPOtrain) - confirm this checkpoint exists
model = PPO.load('./PPOtrain3/best_model_2800000.zip', env=env)

In [None]:
#mean reward of the current model over 30 evaluation episodes, without rendering
mean_reward3, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=30,
    render=False,
)


In [None]:
#display the mean reward measured in the previous evaluation cell
mean_reward3

Testing the Model

In [None]:
#play rendered games with the current model, printing the reward of every step
for game in range(1):
    #start every game from a fresh observation with `done` cleared; previously `done`
    #was initialised once outside the loop, so any game after the first would be skipped
    #(and the `if done:` reset inside `while not done` could never run)
    obs = env.reset()
    done = False
    while not done:
        env.render()
        #predict returns (action, state); only the action is needed here
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)
