In [2]:
"""This file is identical to PPO_SFBot except that the model is trained for 5M timesteps. 
Please refer to the comments of that file for an explanation of the code."""

#install dependencies
import retro
import time
import os
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [8]:
env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")

In [3]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__() 
        self.observation_space = Box(low=0, high=255, 
                                     shape=(84,84,1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis",
                               use_restricted_actions = retro.Actions.FILTERED)

    def step(self,action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        self.game.render()

    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score=0
        return obs

    def preprocess(self, observation):
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels

    def close(self):
        self.game.close()

Hyperparameter Tuning

In [4]:
#Intend to use same best hyperparamaters as we did for 1M trained model
#but training tensorboard logs will be in a new folder
LOG_DIR = './PPOlogs3/'
OPT_DIR = './PPOopt/'

In [20]:
def objective(trial):
    return {

        'n_steps': trial.suggest_int('n_steps',2048,8192),
        'gamma': trial.suggest_float('gamma',0.8,0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate',1e-6,1e-5, log=True),
        'clip_range': trial.suggest_float('clip_range',0.1,0.4),
        'gae_lambda': trial.suggest_float('gae_lambda',0.8,0.99)
    }

In [25]:
def optimize_agent(trial):
    try:

        model_params = objective(trial)

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env =  DummyVecEnv([lambda:env])
        env = VecFrameStack(env, 4, channels_order='last')
        print("made env")
        model = PPO('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=100000)
        print("model learned")

        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    except Exception as e:
        return -1000

In [32]:
#create the experiment/study. since returning a positive value, want to maximise the function. 
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[I 2023-08-12 01:47:44,605] A new study created in memory with name: no-name-b27f5974-be7d-4751-b88c-63b419cb8dfd
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7223 and n_envs=1)


made env
made model
model learned


[I 2023-08-12 01:56:19,940] Trial 0 finished with value: 0.0 and parameters: {'n_steps': 7223, 'gamma': 0.8597738092524785, 'learning_rate': 1.1119985900035e-06, 'clip_range': 0.3030221233562785, 'gae_lambda': 0.8964788387717223}. Best is trial 0 with value: 0.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7256 and n_envs=1)


model learned


[I 2023-08-12 02:04:38,151] Trial 1 finished with value: 2000.0 and parameters: {'n_steps': 7256, 'gamma': 0.8099543624492647, 'learning_rate': 5.330761863556447e-06, 'clip_range': 0.36752556272775394, 'gae_lambda': 0.9300076682403225}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2161 and n_envs=1)


model learned


[I 2023-08-12 02:13:08,272] Trial 2 finished with value: 0.0 and parameters: {'n_steps': 2161, 'gamma': 0.9696700551363613, 'learning_rate': 2.3856354938769044e-06, 'clip_range': 0.2962566757048234, 'gae_lambda': 0.8878275307611758}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6037 and n_envs=1)


model learned


[I 2023-08-12 02:21:21,853] Trial 3 finished with value: 2000.0 and parameters: {'n_steps': 6037, 'gamma': 0.9225925607306555, 'learning_rate': 4.873246617719623e-06, 'clip_range': 0.3984094530817629, 'gae_lambda': 0.872715215283243}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2128 and n_envs=1)


model learned


[I 2023-08-12 02:28:49,737] Trial 4 finished with value: 1000.0 and parameters: {'n_steps': 2128, 'gamma': 0.9377849869946006, 'learning_rate': 3.219485200321302e-06, 'clip_range': 0.20730405370079533, 'gae_lambda': 0.8278776465180212}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8149 and n_envs=1)


model learned


[I 2023-08-12 02:37:07,504] Trial 5 finished with value: 1000.0 and parameters: {'n_steps': 8149, 'gamma': 0.8822822145338179, 'learning_rate': 7.271807282504287e-06, 'clip_range': 0.12416690166331842, 'gae_lambda': 0.9510671299595304}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3630 and n_envs=1)


model learned


[I 2023-08-12 02:45:08,541] Trial 6 finished with value: 2100.0 and parameters: {'n_steps': 3630, 'gamma': 0.8234369637268758, 'learning_rate': 9.063651462172827e-06, 'clip_range': 0.3020869434870336, 'gae_lambda': 0.851760832369112}. Best is trial 6 with value: 2100.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3729 and n_envs=1)


model learned


[I 2023-08-12 02:53:03,628] Trial 7 finished with value: 2000.0 and parameters: {'n_steps': 3729, 'gamma': 0.9007280153377641, 'learning_rate': 2.3914816011014576e-06, 'clip_range': 0.21240491368324968, 'gae_lambda': 0.8466024609724703}. Best is trial 6 with value: 2100.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5595 and n_envs=1)


model learned


[I 2023-08-12 03:00:44,030] Trial 8 finished with value: 3000.0 and parameters: {'n_steps': 5595, 'gamma': 0.8157202903839094, 'learning_rate': 1.154858774456118e-06, 'clip_range': 0.26012333935931625, 'gae_lambda': 0.879540718426021}. Best is trial 8 with value: 3000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2521 and n_envs=1)


model learned


[I 2023-08-12 03:08:50,044] Trial 9 finished with value: 2500.0 and parameters: {'n_steps': 2521, 'gamma': 0.8639640543263564, 'learning_rate': 3.567881682657694e-06, 'clip_range': 0.15732607934744744, 'gae_lambda': 0.851311492805658}. Best is trial 8 with value: 3000.0.


In [None]:
study.best_params ="""{'n_steps': 5595,
 'gamma': 0.8157202903839094,
 'learning_rate': 1.154858774456118e-06,
 'clip_range': 0.26012333935931625,
 'gae_lambda': 0.879540718426021}"""

In [10]:
#get the parameters from the best model
model_params = {'n_steps': 5595,
 'gamma': 0.8157202903839094,
 'learning_rate': 1.154858774456118e-06,
 'clip_range': 0.26012333935931625,
 'gae_lambda': 0.879540718426021}


In [11]:
#change n_steps to be a multiple of 64
5595/64 #=87.421875
87*64 #=5568
model_params['n_steps'] = 5568

In [14]:
model = PPO.load(os.path.join(OPT_DIR, "trial_8_best_model"))

In [12]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls+1300000))
            self.model.save(model_path)

        return True

In [13]:
CHECKPOINT_DIR = "./PPOtrain3"

In [None]:
#save the model every 100,000 steps at checkpoint_dir
callback = TrainAndLoggingCallback(check_freq=100000, save_path=CHECKPOINT_DIR)

Train Model

In [8]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env =  DummyVecEnv([lambda:env])
env = VecFrameStack(env, 4, channels_order='last')

In [14]:
#load and train the model for 5M timesteps total
model = PPO('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

In [29]:
#here training was accidentally cut at 1.3M timesteps so this model is loaded
#and then training is continued for 3.7M timesteps
model.load(os.path.join(CHECKPOINT_DIR, 'best_model_1300000.zip'))
model.learn(total_timesteps=3700000, callback= callback)

<stable_baselines3.ppo.ppo.PPO at 0x7fac800bf6d0>

Evaluating the Model

In [15]:
#load fully trained model
model.load('./PPOtrain3/best_model_5000000.zip')

<stable_baselines3.ppo.ppo.PPO at 0x7fb17188cbe0>

In [17]:
mean_reward,_ = evaluate_policy(model, env, render=False, n_eval_episodes=30)

In [19]:
mean_reward

13700.0

Testing the model

In [None]:
obs = env.reset()
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        print(reward)
