In [None]:
"""This file is almost identical to A2C_SFBot except that the model is trained for 5M timesteps. 
Please refer to the comments of that file for an explanation of the code."""

#Import dependencies
import retro
import time
import os
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.evaluation import evaluate_policy
import tensorboard
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

#define the environment: observation space, action space


In [None]:
class StreetFighter(Env):
    """Gym environment wrapping the retro Street Fighter II game.

    Observations are 84x84x1 uint8 arrays, actions are 12-element binary
    vectors, and the reward for a step is the change in ``info['score']``
    since the previous step.
    """

    def __init__(self):
        """Declare the observation/action spaces and start the emulator."""
        super().__init__()

        # Observations: an 84x84 single-channel box of intensities in [0, 255].
        self.observation_space = Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8
        )

        # Actions: a 12-long vector where every entry is 0 or 1.
        self.action_space = MultiBinary(12)

        # Start an instance of the game. Restricting the actions is meant to
        # ensure that only valid button combinations are chosen.
        self.game = retro.make(
            game="StreetFighterIISpecialChampionEdition-Genesis",
            use_restricted_actions=retro.Actions.FILTERED,
        )

    def preprocess(self, observation):
        """Turn a raw colour frame into an 84x84x1 grayscale array."""
        # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order - confirm
        # which order the emulator actually returns.
        grayscale = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        shrunk = cv2.resize(grayscale, (84, 84), interpolation=cv2.INTER_CUBIC)
        return np.reshape(shrunk, (84, 84, 1))

    def step(self, action):
        """Advance the game one step.

        Returns ``(frame_delta, reward, done, info)`` where ``frame_delta`` is
        the current preprocessed frame minus the previous one and ``reward``
        is the score gained during this step. The emulator's own reward value
        is ignored.
        """
        raw_frame, _, done, info = self.game.step(action)
        current_frame = self.preprocess(raw_frame)

        # NOTE(review): if both frames are uint8 this subtraction wraps around
        # instead of going negative - confirm that this is intended.
        frame_delta = current_frame - self.previous_frame
        self.previous_frame = current_frame

        # Reward is the score delta; remember the new score for the next step.
        current_score = info['score']
        reward = current_score - self.score
        self.score = current_score

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game; any arguments are accepted but not forwarded."""
        self.game.render()

    def reset(self):
        """Restart the game and return the first preprocessed frame."""
        first_frame = self.preprocess(self.game.reset())

        # The first frame becomes the baseline for the next frame delta, and
        # the score tracker starts again from zero.
        self.previous_frame = first_frame
        self.score = 0
        return first_frame

    def close(self):
        """Shut down the underlying emulator."""
        self.game.close()

Hyperparameter tuning

In [None]:
# Reuse the best hyperparameters found for the model trained for 1M timesteps,
# but write this run's TensorBoard training logs to a new folder.
LOG_DIR = './A2C3logs/'  # passed to Monitor and used as tensorboard_log
OPT_DIR = './A2Copt/'  # where the model from each Optuna trial is saved

In [None]:
def objectiveA2C(trial):
    """Sample one set of A2C hyperparameters from an Optuna trial.

    Args:
        trial: object exposing ``suggest_int`` and ``suggest_float``.

    Returns:
        dict with the keys ``n_steps``, ``gamma``, ``learning_rate`` and
        ``gae_lambda``, ready to be passed as keyword arguments to the model.
    """
    # The suggestions are requested in the same order as the returned keys.
    n_steps = trial.suggest_int('n_steps', 2048, 8192)
    gamma = trial.suggest_float('gamma', 0.8, 0.9999, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-5, log=True)
    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 0.99)

    return dict(
        n_steps=n_steps,
        gamma=gamma,
        learning_rate=learning_rate,
        gae_lambda=gae_lambda,
    )

In [None]:
def optimize_agent(trial):
    """Train and evaluate one A2C model for an Optuna trial.

    Samples hyperparameters with ``objectiveA2C``, trains for 100,000
    timesteps, evaluates over 5 episodes and saves the model under
    ``OPT_DIR``.

    Args:
        trial: the Optuna trial; ``trial.number`` is used in the save path.

    Returns:
        The mean evaluation reward, or -1000 if anything in the trial raised.
    """
    env = None
    try:
        model_params = objectiveA2C(trial)

        # ``env`` always points at the outermost wrapper built so far, so the
        # ``finally`` clause can close whatever was created before a failure.
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        # Bind the monitored env to its own name so the factory lambda does
        # not depend on ``env`` being reassigned below.
        monitored = env
        env = DummyVecEnv([lambda: monitored])
        env = VecFrameStack(env, 4, channels_order='last')

        model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("model_made")
        model.learn(total_timesteps=100000)
        print("model learned")

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        # Keep the sentinel score so the study carries on, but report the
        # failure instead of hiding it.
        print("trial {} failed: {!r}".format(getattr(trial, 'number', '?'), e))
        return -1000

    finally:
        # Always release the environment, including on failure, so that a
        # broken trial does not leave it open for the following trials.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print("could not close env: {!r}".format(close_error))
    



In [None]:
# Create the Optuna study. optimize_agent returns the mean evaluation reward
# (higher is better), so the study is set up to maximise the objective.
study = optuna.create_study(direction='maximize')
# Run 10 trials with a single job; each trial trains a fresh model.
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

In [None]:
# Best hyperparameters as reported by study.best_params:
#   n_steps=5834, gamma=0.9532514222680888,
#   learning_rate=3.8509210607144295e-06, gae_lambda=0.9790912340563886
# They are hard-coded here and passed to the model as keyword arguments.
model_params = dict(
    n_steps=5834,
    gamma=0.9532514222680888,
    learning_rate=3.8509210607144295e-06,
    gae_lambda=0.9790912340563886,
)

In [None]:
# Round n_steps down to a multiple of 64:
#   5834 / 64 = 91.15625, and 91 * 64 = 5824.
model_params['n_steps'] = (5834 // 64) * 64

In [None]:
#a callback is a function that is passed as an argument to another function or method

class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Args:
        check_freq: number of callback calls between two saves.
        save_path: directory the checkpoints are written to.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        """Create the checkpoint directory when a save path was given."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when the call counter hits a multiple of check_freq.

        Always returns True.
        """
        checkpoint_due = self.n_calls % self.check_freq == 0
        if checkpoint_due:
            # The file name carries the call count at which it was written.
            filename = 'best_model_{}'.format(self.n_calls)
            self.model.save(os.path.join(self.save_path, filename))

        return True

In [None]:
# Directory handed to TrainAndLoggingCallback as its save_path.
CHECKPOINT_DIR = "./A2C3train/"

In [None]:
# Save the model into CHECKPOINT_DIR every 100,000 callback calls.
callback = TrainAndLoggingCallback(check_freq=100000, save_path=CHECKPOINT_DIR)

Training the Model

In [None]:
# Build the training environment: the game wrapped in a Monitor (logging to
# LOG_DIR), then vectorised, then stacked 4 frames deep on the last axis.
monitored_env = Monitor(StreetFighter(), LOG_DIR)
env = VecFrameStack(
    DummyVecEnv([lambda: monitored_env]),
    4,
    channels_order='last',
)

In [None]:
# Build the model with the tuned hyperparameters, load the saved trial
# weights into it, then train for 5M timesteps.
model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
# `load` is a classmethod that returns a NEW model, so calling it on this
# instance and discarding the result would leave `model` untrained.
# `set_parameters` copies the saved parameters into this instance instead.
# NOTE(review): assumes trial 1 is the trial that produced model_params - confirm.
model.set_parameters(os.path.join(OPT_DIR, 'trial_1_best_model.zip'))
model.learn(total_timesteps=5000000, callback=callback)

Evaluating the Model

In [None]:
# Load the fully trained model (the checkpoint written at 5,000,000 calls).
# `A2C.load` returns a new model, so the result must be assigned; calling
# `model.load(...)` and ignoring the return value changes nothing.
model = A2C.load(os.path.join(CHECKPOINT_DIR, 'best_model_5000000.zip'), env=env)

In [None]:
# Evaluate the model on env over 30 episodes; keep the first returned value
# (the mean reward) and discard the second.
mean_reward,_ = evaluate_policy(model, env, n_eval_episodes=30)

In [None]:
# Display the mean reward returned by evaluate_policy.
mean_reward

Testing the Model

In [None]:
# Watch the trained model play, printing the reward of every step.
NUM_GAMES = 1
for game in range(NUM_GAMES):
    # Reset the game and the `done` flag at the start of EVERY game so that
    # each iteration actually plays, not only the first one.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # predict() returns a tuple whose first element is the action.
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)