In [1]:
#install dependencies
import retro
import time
import os
import gym
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")

In [12]:
env.close()

In [2]:
#Now we build our custom environment
class StreetFighter(Env):
    def __init__(self):
        super().__init__() 
        self.observation_space = Box(low=0, high=255, 
                                     shape=(84,84,1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis",
                               use_restricted_actions = retro.Actions.FILTERED)

    def step(self,action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        self.game.render()

    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.score=0
        return obs

    def preprocess(self, observation):
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation= cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84,84,1))
        return channels

    def close(self):
        self.game.close()

Hyperparameter Tuning

In [3]:
LOG_DIR = './PPOlogs/'
OPT_DIR = './PPOopt/'


In [5]:
def objective(trial):
    return {

        'n_steps': trial.suggest_int('n_steps',2048,8192),
        'gamma': trial.suggest_float('gamma',0.8,0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate',1e-5,1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range',0.1,0.4),
        'gae_lambda': trial.suggest_float('gae_lambda',0.8,0.99)
    }

In [6]:
env.close()

In [7]:
def optimize_agent(trial):
    try:

        model_params = objective(trial)

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env =  DummyVecEnv([lambda:env])
        env = VecFrameStack(env, 4, channels_order='last')
        print("made env")
        model = PPO('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        print("made model")

        model.learn(total_timesteps=30000)
        print("model learned")

        mean_reward, __ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    except Exception as e:
        return -1000

In [17]:
study = optuna.create_study(direction='maximize')

study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[I 2023-08-10 20:17:26,265] A new study created in memory with name: no-name-8ac8d1a6-3b8e-429a-982a-bf9fbd09898b
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6355 and n_envs=1)


made env
made model
model learned


[I 2023-08-10 20:20:47,715] Trial 0 finished with value: 0.0 and parameters: {'n_steps': 6355, 'gamma': 0.9944096998535085, 'learning_rate': 7.177443405274055e-05, 'clip_range': 0.3630473712972051, 'gae_lambda': 0.9849584791264125}. Best is trial 0 with value: 0.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4821 and n_envs=1)


model learned


[I 2023-08-10 20:23:49,209] Trial 1 finished with value: 2000.0 and parameters: {'n_steps': 4821, 'gamma': 0.9623231774508424, 'learning_rate': 1.4337157808746458e-05, 'clip_range': 0.12908770095937713, 'gae_lambda': 0.9582207432706459}. Best is trial 1 with value: 2000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4432 and n_envs=1)


model learned


[I 2023-08-10 20:27:01,039] Trial 2 finished with value: 2600.0 and parameters: {'n_steps': 4432, 'gamma': 0.842748361381151, 'learning_rate': 2.5549272069169798e-05, 'clip_range': 0.3379494829981735, 'gae_lambda': 0.8063512879820152}. Best is trial 2 with value: 2600.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7480 and n_envs=1)


model learned


[I 2023-08-10 20:30:52,018] Trial 3 finished with value: 1200.0 and parameters: {'n_steps': 7480, 'gamma': 0.9073625341023731, 'learning_rate': 5.857180446639393e-05, 'clip_range': 0.17847678501779857, 'gae_lambda': 0.9327790387959478}. Best is trial 2 with value: 2600.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4344 and n_envs=1)


model learned


[I 2023-08-10 20:34:12,001] Trial 4 finished with value: 4700.0 and parameters: {'n_steps': 4344, 'gamma': 0.9219164403334561, 'learning_rate': 2.9990805091277332e-05, 'clip_range': 0.10914199716008474, 'gae_lambda': 0.9796462010029485}. Best is trial 4 with value: 4700.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4232 and n_envs=1)


model learned


[I 2023-08-10 20:37:45,801] Trial 5 finished with value: 4900.0 and parameters: {'n_steps': 4232, 'gamma': 0.9914730269222318, 'learning_rate': 9.578763968483479e-05, 'clip_range': 0.2317825988110016, 'gae_lambda': 0.9479627545499079}. Best is trial 5 with value: 4900.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6736 and n_envs=1)


model learned


[I 2023-08-10 20:41:24,227] Trial 6 finished with value: 2700.0 and parameters: {'n_steps': 6736, 'gamma': 0.9547090896821083, 'learning_rate': 3.143635355820494e-05, 'clip_range': 0.23887918823001422, 'gae_lambda': 0.891704217842593}. Best is trial 5 with value: 4900.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2167 and n_envs=1)


model learned


[I 2023-08-10 20:45:40,232] Trial 7 finished with value: 47000.0 and parameters: {'n_steps': 2167, 'gamma': 0.9621655852263234, 'learning_rate': 1.802672235052777e-05, 'clip_range': 0.13188020139190582, 'gae_lambda': 0.8252202115040619}. Best is trial 7 with value: 47000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2173 and n_envs=1)


model learned


[I 2023-08-10 20:48:39,091] Trial 8 finished with value: 2100.0 and parameters: {'n_steps': 2173, 'gamma': 0.9119230880651862, 'learning_rate': 8.994699307729342e-05, 'clip_range': 0.23720051306203996, 'gae_lambda': 0.8214252694180043}. Best is trial 7 with value: 47000.0.


made env
made model


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4119 and n_envs=1)


model learned


[I 2023-08-10 20:52:01,482] Trial 9 finished with value: 4300.0 and parameters: {'n_steps': 4119, 'gamma': 0.8668257909037713, 'learning_rate': 6.570940174033798e-05, 'clip_range': 0.31314901687531166, 'gae_lambda': 0.8809252197018208}. Best is trial 7 with value: 47000.0.


In [19]:
""""study.best_params = 'n_steps': 2167,
  'gamma': 0.9621655852263234,
  'learning_rate': 1.802672235052777e-05,
  'clip_range': 0.13188020139190582,
  'gae_lambda': 0.8252202115040619}"""

({'n_steps': 2167,
  'gamma': 0.9621655852263234,
  'learning_rate': 1.802672235052777e-05,
  'clip_range': 0.13188020139190582,
  'gae_lambda': 0.8252202115040619},
 FrozenTrial(number=7, state=TrialState.COMPLETE, values=[47000.0], datetime_start=datetime.datetime(2023, 8, 10, 20, 41, 24, 229502), datetime_complete=datetime.datetime(2023, 8, 10, 20, 45, 40, 231553), params={'n_steps': 2167, 'gamma': 0.9621655852263234, 'learning_rate': 1.802672235052777e-05, 'clip_range': 0.13188020139190582, 'gae_lambda': 0.8252202115040619}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_steps': IntDistribution(high=8192, log=False, low=2048, step=1), 'gamma': FloatDistribution(high=0.9999, log=True, low=0.8, step=None), 'learning_rate': FloatDistribution(high=0.0001, log=True, low=1e-05, step=None), 'clip_range': FloatDistribution(high=0.4, log=False, low=0.1, step=None), 'gae_lambda': FloatDistribution(high=0.99, log=False, low=0.8, step=None)}, trial_id=7, value=None))

In [15]:
#get the parameters from the best model
model_params = {'n_steps': 2167,
  'gamma': 0.9621655852263234,
  'learning_rate': 1.802672235052777e-05,
  'clip_range': 0.13188020139190582,
  'gae_lambda': 0.8252202115040619}

In [8]:
model = PPO.load(os.path.join(OPT_DIR, "trial_7_best_model"))

In [9]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [10]:
CHECKPOINT_DIR = "./PPOtrain"

In [11]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

Train Model

In [6]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env =  DummyVecEnv([lambda:env])
env = VecFrameStack(env, 4, channels_order='last')

In [21]:
model_params

{'n_steps': 2112,
 'gamma': 0.9621655852263234,
 'learning_rate': 1.802672235052777e-05,
 'clip_range': 0.13188020139190582,
 'gae_lambda': 0.8252202115040619}

In [20]:
2167/64 #=33.86
33*64 #=2112
model_params['n_steps'] = 2112

In [22]:
model = PPO('CnnPolicy',env, tensorboard_log=LOG_DIR, verbose=0, **model_params)

In [23]:
model.load(os.path.join(OPT_DIR, 'trial_7_best_model.zip'))

<stable_baselines3.ppo.ppo.PPO at 0x7f7f6271ff70>

In [24]:
model.learn(total_timesteps=100000, callback= callback)

<stable_baselines3.ppo.ppo.PPO at 0x7f7f6259ddc0>

Evaluating the Model

In [4]:
model = PPO.load('/Users/Cheks/Desktop/Durham /Durham Part 2/Data Science/Project/mySFBot/trained_bots/PPO_1000000.zip')

In [26]:
mean_reward, _ = evaluate_policy(model, env, render=False, n_eval_episodes=10)


In [27]:
mean_reward

3400.0

Testing the model

In [8]:
obs = env.reset()
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        print(reward)


[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[500.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.

KeyboardInterrupt: 

: 