In [None]:
#import dependencies
import retro
#import time to slow down the game with time.sleep
import time
import os

#import environment base class, import multibinary and box for the correct action space shapes
from gym import Env
from gym.spaces import MultiBinary, Box

#numpy helps calculate frame delta
import numpy as np
#import opencv for greyscaling
import cv2

#import matplotlib for plotting the image
from matplotlib import pyplot as plt

#To tune our model we will be using optuna: every trial trains a model with a sampled set of
#hyperparameters, which lets us train and tune at the same time.
#PPO (proximal policy optimization) is a model-free RL algorithm that searches in the space
#of policies rather than assigning values to state-action pairs.
#PPO hyperparameters to tune: n_steps: batch size, gamma: discount rate for calculating returns,
#learning_rate: learning coefficient for optimizer, clip_range: clipping amount for advantage calc,
#gae_lambda: advantage smoothing parameter

#importing the optimization frame -HPO
import optuna

#gives PPO algo RL
from stable_baselines3 import PPO, A2C, DQN

# evaluate policy allows you to calculate key performance indicators (KPIS) for the algo attached
# to the env. Evaluate policy allows you to see how a agent performs in a specific environment
from stable_baselines3.common.evaluation import evaluate_policy

#import the sb3 Monitor for logging. A wrapper puts an object, data structure, or piece of
#code behind a standardized interface or behaviour.
#By default we are not able to access the mean episode reward or length;
#wrapping the env in a Monitor lets sb3 record them so we can extract them.
from stable_baselines3.common.monitor import Monitor
import tensorboard
#import vec wrappers to vectorize and frame stack. dummyvecenv wraps the env inside a vectorized
#wrapper.
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
class StreetFighter(Env):
    """Gym environment around the Street Fighter II (Genesis) gym-retro game.

    Observations are 84x84 single-channel frames: `reset` returns the first
    preprocessed frame and `step` returns the difference between the current
    and the previous preprocessed frame. The reward returned by `step` is the
    change in the game's score since the previous step.
    """

    def __init__(self):
        super().__init__()
        #the observation space is a 84x84 box with each value corresponding to a colour
        self.observation_space = Box(low=0, high=255,
                                     shape=(84,84,1), dtype=np.uint8)

        #12-long vector where each action corresponds to a 0 or a 1
        self.action_space = MultiBinary(12)

        #start up an instance of the game
        #use restricted actions ensures that only valid button combinations are chosen
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis",
                               use_restricted_actions = retro.Actions.FILTERED)

        #state maintained by reset()/step(); created here so the attributes always exist
        self.previous_frame = np.zeros((84,84,1), dtype=np.uint8)
        self.score = 0

    def preprocess(self, observation):
        """Convert a raw game frame to a greyscale array of shape (84, 84, 1)."""
        #turn to grey
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        #resize
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        #need to regain the channels value. need this for stable baselines (the RL package we use here)
        channels = np.reshape(resize, (84,84,1))
        return channels

    def step(self,action):
        """Advance the game by one step.

        Returns:
            (frame_delta, reward, done, info): the pixel change since the
            previous frame, the change in score, and the emulator's done flag
            and info dict.
        """
        #take a step
        obs, reward, done, info = self.game.step(action)

        #want to preprocess the observation
        obs = self.preprocess(obs)

        #frame delta: pixel change
        #NOTE(review): assuming the frames are uint8 (as declared in observation_space),
        #this subtraction wraps modulo 256 instead of going negative - confirm that is intended
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        #reshape the reward function. want the change in score, so we just subtract scores.
        #the emulator's own reward is replaced by this score delta
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the current game frame; arguments are forwarded to the emulator."""
        #without this method the gym base class render is used, which the rendered
        #evaluation and test cells of this notebook cannot work with
        return self.game.render(*args, **kwargs)

    def reset(self):
        """Restart the game and return the first preprocessed frame."""
        # Return the first frame
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs

        # Reset the running score used to compute the score delta
        self.score = 0
        return obs

    def close(self):
        """Shut down the emulator instance."""
        self.game.close()

Hyperparameter tuning

In [None]:
# Directories are set up so that models from any trial are accessible
# and training does not have to start again from the beginning.
LOG_DIR, OPT_DIR = './DQNlogs/', './DQNopt/'

In [None]:
def objectiveDQN(trial):
    """Sample one set of hyperparameters from an Optuna trial.

    Args:
        trial: object exposing `suggest_float(name, low, high, log=...)`.

    Returns:
        dict with the sampled 'gamma' and 'learning_rate', both drawn on a
        log scale from [0.8, 0.9999] and [1e-5, 1e-4] respectively.
    """
    discount = trial.suggest_float('gamma', 0.8, 0.9999, log=True)
    step_size = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
    return dict(gamma=discount, learning_rate=step_size)

In [None]:
def optimize_agent(trial):
    """Optuna objective: train and score one model for a sampled hyperparameter set.

    Args:
        trial: the Optuna trial. It is used to sample the hyperparameters and,
            through `trial.number`, to name the checkpoint saved under OPT_DIR.

    Returns:
        The mean episode reward reported by `evaluate_policy` over 2 episodes.
    """
    #obtain set of hyperparameters from the sampler defined in this notebook
    model_params = objectiveDQN(trial)

    #create environment
    env = StreetFighter()
    try:
        #a separate name is used for the monitored env so the lambda below does not
        #close over a variable that is rebound on the same line
        monitored_env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')
        print("made env")

        model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=1)
        print("model learned")

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=2)
    finally:
        #always release the emulator, even when training or evaluation fails
        #NOTE(review): gym-retro is understood to allow one emulator per process, so a
        #leaked env would also break the following trials - confirm
        env.close()

    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)

    return mean_reward
    

In [None]:
#create the experiment/study. The objective returns a mean reward, so we want to maximise it.
study = optuna.create_study(direction='maximize')

#n_trials is the number of hyperparameter sets to try (optimize_agent is called once per trial).
#n_jobs is the number of trials run in parallel; it is kept at 1 because
#gym-retro is not available for parallelization
study.optimize(optimize_agent, n_trials=2, n_jobs=1)

In [None]:
#a cell only displays its last expression, so both results are shown together as one tuple
study.best_params, study.best_trial

In [None]:
#how to load a model: use the same algorithm class that saved it.
#the trial checkpoints in OPT_DIR are written by the A2C model in optimize_agent
model = A2C.load(os.path.join(OPT_DIR, "trial_0_best_model"))

In [None]:
#a callback is a function (here, an object) that is passed as an argument to another function or method, which then calls it back

class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model being trained every `check_freq` calls.

    Args:
        check_freq: save a checkpoint whenever the callback's call counter
            (`n_calls`, inherited from BaseCallback) is a multiple of this value.
        save_path: directory the checkpoints are written to; when None,
            nothing is created or saved.
        verbose: verbosity level passed on to BaseCallback.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when due; always returns True."""
        #save_path=None is accepted by _init_callback, so it must not reach os.path.join here
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
#directory passed to the training callback as the place to write its periodic checkpoints
CHECKPOINT_DIR = "./DQNtrain/"

In [None]:
#save the model every 10,000 callback steps under CHECKPOINT_DIR
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

Training the Model

In [None]:
#build the training environment; `env` is rebound at every stage, so the cell must run top to bottom
env = StreetFighter()
#Monitor is given LOG_DIR as the place to log episode statistics
env = Monitor(env, LOG_DIR)
#wrap the single monitored env in a vectorised env
#NOTE(review): the lambda reads the global `env`, which this same line rebinds - this presumably
#only works because DummyVecEnv calls the lambda before the assignment completes; confirm
env = DummyVecEnv([lambda:env])
#stack 4 observations, with the stacking done on the last (channel) axis
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
#hyperparameters for the new model: the best ones found by the study, plus
#the number of steps, n_steps, set to 1920 (a multiple of 64)
model_params = dict(study.best_params, n_steps=1920)
model_params


In [None]:
#continue training from a tuning checkpoint with the best hyperparameters.
#`load` is a classmethod that returns a NEW model, so its result has to be assigned;
#calling it on an existing instance and ignoring the result leaves that instance untrained.
#NOTE(review): policy_kwargs now come from the checkpoint - SB3 is understood to reject a
#policy_kwargs override that differs from the stored one, so normalize_images=False is no
#longer passed here. Also confirm the checkpoint was saved by an algorithm PPO can load.
model = PPO.load(
    os.path.join(OPT_DIR, 'trial_1_best_model.zip'),
    env=env,
    tensorboard_log=LOG_DIR,
    verbose=0,
    **model_params,
)
model.learn(total_timesteps=100000, callback=callback)

Evaluating the Model

In [None]:
#load a checkpoint written by the training callback, which names its files
#'best_model_<number of calls>' inside CHECKPOINT_DIR; change the number to pick another one
model = PPO.load(os.path.join(CHECKPOINT_DIR, 'best_model_100000'))

In [None]:
#run one rendered evaluation episode; evaluate_policy returns two values here and
#only the first (the mean reward) is kept
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)

Testing the Model

In [None]:
# Play the game with the trained model, rendering every frame and printing each step's reward.
for game in range(1):
    # Start every game from a fresh state with the done flag cleared, so that
    # raising the number of games actually plays more than one
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # predict returns a tuple; its first element is the action
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        # slow the game down so it can be watched
        time.sleep(0.01)
        print(reward)

#Can also have a separate testing notebook alongside the training notebook 
# so you can test and train at the same time and check you are headed in the right 
# direction - so don't need to wait until the end to see if the model is performing well