In [2]:
import retro
import time
import os
import gym
from gym import Env
from gym.spaces import MultiBinary, Box, Discrete
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import tensorboard
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

In [3]:
combos = [["DOWN", "LEFT"],["DOWN", "LEFT", "A"],["DOWN", "LEFT", "B"],["DOWN", "LEFT", "C"],["DOWN", "LEFT", "X"],["DOWN", "LEFT", "Y"], ["DOWN", "LEFT", "Z"], 
          ["DOWN", "RIGHT"], ["DOWN", "RIGHT", "A"],["DOWN", "RIGHT", "B"],["DOWN", "RIGHT", "C"],["DOWN", "RIGHT", "X"],["DOWN", "RIGHT", "Y"], ["DOWN", "RIGHT", "Z"], 
          ["DOWN"], ["DOWN", "A"], ["DOWN", "B"], ["DOWN", "C"], ["DOWN", "X"], ["DOWN", "Y"], ["DOWN", "Z"],
          ["UP", "LEFT"],["UP", "LEFT", "A"],["UP", "LEFT", "B"],["UP", "LEFT", "C"],["UP", "LEFT", "X"],["UP", "LEFT", "Y"], ["UP", "LEFT", "Z"], 
          ["UP", "RIGHT"], ["UP", "RIGHT", "A"],["UP", "RIGHT", "B"],["UP", "RIGHT", "C"],["UP", "RIGHT", "X"],["UP", "RIGHT", "Y"], ["UP", "RIGHT", "Z"], 
          ["UP"],["UP", "A"], ["UP", "B"], ["UP", "C"], ["UP", "X"], ["UP", "Y"], ["UP", "Z"],
          ["LEFT"],["LEFT", "A"], ["LEFT", "B"], ["LEFT", "C"], ["LEFT", "X"], ["LEFT", "Y"], ["LEFT", "Z"],
          ["RIGHT"],["RIGHT", "A"], ["RIGHT", "B"], ["RIGHT", "C"], ["RIGHT", "X"], ["RIGHT", "Y"], ["RIGHT", "Z"],
          ["A"],["B"],["C"],["X"],["Y"],["Z"]]


In [4]:
class Discretizer(gym.ActionWrapper):
    """
    Wrap a gym environment and make it use discrete actions.

    Args:
        combos: ordered list of lists of valid button combinations
    """

    def __init__(self, env, combos):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiBinary)
        buttons = env.unwrapped.buttons
        self._decode_discrete_action = []
        for combo in combos:
            arr = np.array([0] * env.action_space.n)
            for button in combo:
                arr[buttons.index(button)] = 1
            self._decode_discrete_action.append(arr)

        self.action_space = gym.spaces.Discrete(len(self._decode_discrete_action))

    def action(self, act):
        return self._decode_discrete_action[act].copy()


class StreetFighterDiscretizer(Discretizer):
    def __init__(self, env):
        super().__init__(env=env, combos=combos)

In [5]:
class StreetFighter(Env):

    def __init__(self):
        super().__init__() 
        #the observation space is a 84x84 box with each value correp to a colour
        self.observation_space = Box(low=0, high=255, 
                                     shape=(84,84,1), dtype=np.uint8)
        
        #12-long vector where each action corresps to a 0 or a 1
        self.action_space = MultiBinary(12)
        self.buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        #start up an instance of the game
        #use restricted actions ensures that only valid button combinations are chosen
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")
    
    def preprocess(self, observation):
        #turn to grey
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        #resize
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        #need to regain the channels value. need this for stable baselines (the RL package we use here)
        channels = np.reshape(resize, (84,84,1))
        return channels
    
    def step(self,action):
        #take a step
        obs, reward, done, info = self.game.step(action)

        #want to preprocess the observation
        obs = self.preprocess(obs)

        #frame delta: pixel change
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        #reshape the reward function. want the change in score, so we just subtract scores.
        #what other info can the game give us?
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info
    
    def reset(self):
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs 
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def close(self):
        self.game.close()

Hyperparameter tuning

In [6]:
"""setting up directories so that models from any trial are 
accessible and don't have to train from the beginninbg"""
LOG_DIR = './DQNlogs/'
OPT_DIR = './DQNopt/'

In [7]:
def objectiveDQN(trial):
    return {
        'gamma': trial.suggest_float('gamma',0.8,0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate',1e-5,1e-4, log=True),
    }

In [8]:
def optimize_agent(trial):
    #try:
        #training loop
        #obtain set of hyperparameters
        model_params = objectiveDQN(trial)

        #create environment
        env = StreetFighter()
        env = StreetFighterDiscretizer(env)
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda:env])
        env = VecFrameStack(env, 4, channels_order='last')
        print("made env")
        model = DQN('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=100000)
        print("model learned")


        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    #except Exception as e:
    #    return -1000
    

In [9]:
env.close()

NameError: name 'env' is not defined

In [30]:
study = optuna.create_study(direction='maximize')

#trying to obtain greatest mean reward
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[I 2023-08-12 17:14:13,472] A new study created in memory with name: no-name-b8b09996-348b-4f12-8ee9-def6263d1014


made env
made model
model learned


[I 2023-08-12 17:19:16,162] Trial 0 finished with value: 4300.0 and parameters: {'gamma': 0.8551141150554815, 'learning_rate': 3.5999933728822436e-05}. Best is trial 0 with value: 4300.0.


made env
made model
model learned


[I 2023-08-12 17:25:40,990] Trial 1 finished with value: 17000.0 and parameters: {'gamma': 0.9718310610932153, 'learning_rate': 6.31245955595259e-05}. Best is trial 1 with value: 17000.0.


made env
made model
model learned


[I 2023-08-12 17:30:35,731] Trial 2 finished with value: 24200.0 and parameters: {'gamma': 0.8607026864367819, 'learning_rate': 1.240332072345838e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:35:40,876] Trial 3 finished with value: 20700.0 and parameters: {'gamma': 0.9192585447351365, 'learning_rate': 3.106373951152704e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:40:20,675] Trial 4 finished with value: 3100.0 and parameters: {'gamma': 0.856085249635548, 'learning_rate': 2.7504609232089266e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:45:08,969] Trial 5 finished with value: 4600.0 and parameters: {'gamma': 0.8776840683001154, 'learning_rate': 9.351503604307526e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:49:59,701] Trial 6 finished with value: 3800.0 and parameters: {'gamma': 0.8539614240230364, 'learning_rate': 3.359381101894916e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:54:52,821] Trial 7 finished with value: 600.0 and parameters: {'gamma': 0.8670854734032942, 'learning_rate': 1.319833323208281e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 17:59:42,811] Trial 8 finished with value: 6000.0 and parameters: {'gamma': 0.9520029384884906, 'learning_rate': 2.3529303408603747e-05}. Best is trial 2 with value: 24200.0.


made env
made model
model learned


[I 2023-08-12 18:04:26,952] Trial 9 finished with value: 3200.0 and parameters: {'gamma': 0.9101693876396653, 'learning_rate': 3.8625176925141133e-05}. Best is trial 2 with value: 24200.0.


In [69]:
env.close()

In [10]:
study.best_params #= {'gamma': 0.8607026864367819, 'learning_rate': 1.240332072345838e-05}

NameError: name 'study' is not defined

In [9]:
model_params = {'gamma': 0.8607026864367819, 'learning_rate': 1.240332072345838e-05}

In [9]:
#how to load a model
model = DQN.load(os.path.join(OPT_DIR, "trial_2_best_model"))

In [10]:
#callbacks are functions that is passed as an argument to another function or method

class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        print("Checkpoint reached!")

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls+800000))
            self.model.save(model_path)

        return True

In [11]:
CHECKPOINT_DIR = "./DQNtrain/"

In [12]:
#save the model every 10,000 steps at checkpoint_dir
callback = TrainAndLoggingCallback(check_freq=100000, save_path=CHECKPOINT_DIR)

Checkpoint reached!


Training the Model

In [11]:
env.close()

NameError: name 'env' is not defined

In [13]:
env = StreetFighter()
env = StreetFighterDiscretizer(env)
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda:env])
env = VecFrameStack(env, 4, channels_order='last')

In [14]:
model = DQN('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
#model.load(os.path.join(OPT_DIR, 'trial_2_best_model.zip'))
model.load("./DQNtrain/best_model_800000.zip")
model.learn(total_timesteps=200000, callback= callback)

<stable_baselines3.dqn.dqn.DQN at 0x7f1f984fa4f0>

Evaluating the Model

In [15]:
model = DQN.load("./DQNtrain/best_model_1000000.zip")

In [16]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

In [17]:
mean_reward

15500.0

Testing the Model

In [None]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase. Then predict using the model.
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        print(reward)

#Can also have a separate testing notebook alongside the training notebook 
# so you can test and train at the same time and check you are headed in the right 
# direction - so don't need to wait until the end to see if the model is performing well