## python -m pip install pip==23.2
^ this version worked for me

## pip install gym-retro

## pip install setuptools==65.5.0 "wheel<0.40.0"

## pip install gym==0.21.0

In [1]:
%pip install gym gym-retro

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Take note of my versions^

In [2]:
# Import retro to play Street Fighter using a ROM
import retro
# Import time to slow down game
import time

In [3]:
# python -m retro.import . # Run this from the roms folder, or where you have your game roms 

In [4]:
# Closes the game environment - important given we can only run one at a time 
# env.close()

# Setup Environment
## What we are going to do! FUNNN

- Observation Preprocess - grayscale (DONE), frame delta, resize the frame so we have less pixels (DONE)
- Filter the action - parameter DONE
- Reward function - set this to the score


In [5]:
%pip install opencv-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Import environment base class for a wrapper 
from gym import Env 
# Import the space shapes for the environment
from gym.spaces import Discrete, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import opencv for grayscaling
import cv2
# Import matplotlib for plotting the image
from matplotlib import pyplot as plt

In [7]:
# 1. frame
# 2. preprocess 200x256x3 -> 84x84x1
# 3. change in pixels: current_frame-last_frame

In [8]:
# Create custom environment 
class StreetFighter(Env): # pass in basic env from above to preprocessing
    def __init__(self):
        super().__init__() # inherit from base env
        # Specify action space and observation space 
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) # grayscaled frame, smaller amt of pixels
        self.action_space = Discrete(12) # type of actions that can be taken
        # Startup and instance of the game 
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.DISCRETE) # used to get valid button combos
    
    def reset(self): # restart
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs # sets previous frame to current frame
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def preprocess(self, observation): # grayscale, resize
        # Grayscaling 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize 
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84,84,1))
        return channels 
    
    def step(self, action): # how do we process action
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 
        
        # Frame delta 
        frame_delta = obs - self.previous_frame # change in pixels (was dropped in final model of tutorial)
        self.previous_frame = obs 
        
        # Reshape the reward function
        reward = info['score'] - self.score 
        self.score = info['score'] 
        
        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs): # unpack any args and kwargs from stable baseline
        self.game.render()
        
    def close(self):
        self.game.close()

# Hyperparameter Tune

https://pytorch.org/get-started/locally/  <- use this site to download pytorch

In [9]:
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
%pip install stable-baselines3[extra]==1.3.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO, A2C, DQN
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [14]:
# alternative version to use later to bypass factor 64 error

# PPO optimization parameters
PPO_PARAMS = {
    'n_steps_range': (2048, 8192),
    'gamma_range': (0.8, 0.9999),
    'learning_rate_range': (1e-5, 1e-4),
    'clip_range_range': (0.1, 0.4),
    'gae_lambda_range': (0.8, 0.99),
}

A2C_PARAMS = {
    'n_steps_range': (5, 20),                  # Small steps, more frequent updates
    'gamma_range': (0.8, 0.99),                # General discount range
    'learning_rate_range': (1e-5, 1e-3),       # Higher learning rates typically better
}

DQN_PARAMS = {
    'buffer_size_range': (50000, 100000),      # Smaller buffer size for quicker access
    'gamma_range': (0.9, 0.99),                # High gamma for long-term rewards
    'learning_rate_range': (1e-5, 1e-4),       # Lower learning rate for stability
    'batch_size_range': (32, 128),             # Standard DQN batch sizes
    'train_freq_range': (4, 16),               # Frequent updates to maintain stability
}

# Define the optimization function for PPO
def optimize_ppo(trial): 
    n_steps = trial.suggest_categorical('n_steps', range(PPO_PARAMS['n_steps_range'][0], PPO_PARAMS['n_steps_range'][1], 64))  # Steps of 64
    return {
        'n_steps': n_steps,
        'gamma': trial.suggest_loguniform('gamma', *PPO_PARAMS['gamma_range']),
        'learning_rate': trial.suggest_loguniform('learning_rate', *PPO_PARAMS['learning_rate_range']),
        'clip_range': trial.suggest_uniform('clip_range', *PPO_PARAMS['clip_range_range']),
        'gae_lambda': trial.suggest_uniform('gae_lambda', *PPO_PARAMS['gae_lambda_range']),
    }

def optimize_a2c(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', *A2C_PARAMS['n_steps_range']),
        'gamma': trial.suggest_loguniform('gamma', *A2C_PARAMS['gamma_range']),
        'learning_rate': trial.suggest_loguniform('learning_rate', *A2C_PARAMS['learning_rate_range']),
    }

def optimize_dqn(trial):
    return {
        'buffer_size': trial.suggest_int('buffer_size', *DQN_PARAMS['buffer_size_range']),
        'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
        'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
        'batch_size': trial.suggest_categorical('batch_size', range(*DQN_PARAMS['batch_size_range'])),
        'train_freq': trial.suggest_categorical('train_freq', range(*DQN_PARAMS['train_freq_range'])),
    }


In [15]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [16]:
ALGORITHMS = {
    'PPO': (PPO, optimize_ppo),
    'A2C': (A2C, optimize_a2c),
    'DQN': (DQN, optimize_dqn),
}

def optimize_agent(trial, algo_name='PPO'):
    try:
        # Select algorithm and get hyperparameters
        ModelClass, optimize_fn = ALGORITHMS[algo_name]
        model_params = optimize_fn(trial)

        # Create environment
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Initialize and train model
        model = ModelClass('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params) # would recommend looking into other cnnpolicy's if they are compatible
        model.learn(total_timesteps=10000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=2)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1000

In [17]:
# Creating the experiment 
study = optuna.create_study(direction='maximize') # since mean reward is positive we maximize, otherwise minimize
study.optimize(lambda trial: optimize_agent(trial, algo_name='DQN'), n_trials=5) # for prod used n_trials=100 (change algo name to change algos)

[I 2024-11-05 14:33:31,301] A new study created in memory with name: no-name-e666023a-1304-41d1-91ac-e75e9b79a73e
  'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
  'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
[I 2024-11-05 14:34:14,976] Trial 0 finished with value: 3900.0 and parameters: {'buffer_size': 83400, 'gamma': 0.9841654381927392, 'learning_rate': 4.288856918915396e-05, 'batch_size': 48, 'train_freq': 9}. Best is trial 0 with value: 3900.0.
  'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
  'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
[I 2024-11-05 14:34:50,932] Trial 1 finished with value: 2000.0 and parameters: {'buffer_size': 58034, 'gamma': 0.9059318651401091, 'learning_rate': 2.3440341823828695e-05, 'batch_size': 83, 'train_freq': 14}. Best is trial 0 with value: 3900.0.
[I 2024-11-05 14:35:35,918] Trial 2 finished with va

In [18]:
study.best_params

{'buffer_size': 83400,
 'gamma': 0.9841654381927392,
 'learning_rate': 4.288856918915396e-05,
 'batch_size': 48,
 'train_freq': 9}

In [19]:
study.best_trial

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[3900.0], datetime_start=datetime.datetime(2024, 11, 5, 14, 33, 31, 301143), datetime_complete=datetime.datetime(2024, 11, 5, 14, 34, 14, 975679), params={'buffer_size': 83400, 'gamma': 0.9841654381927392, 'learning_rate': 4.288856918915396e-05, 'batch_size': 48, 'train_freq': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'buffer_size': IntDistribution(high=100000, log=False, low=50000, step=1), 'gamma': FloatDistribution(high=0.99, log=True, low=0.9, step=None), 'learning_rate': FloatDistribution(high=0.0001, log=True, low=1e-05, step=None), 'batch_size': CategoricalDistribution(choices=(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 10

In [21]:
model = DQN.load(os.path.join(OPT_DIR, 'trial_0_best_model.zip'))

  th_object = th.load(file_content, map_location=device)


# Setup Callback

In [22]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [23]:
class TrainAndLoggingCallback(BaseCallback): # continuously learn by starting from best parameters done above

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [24]:
CHECKPOINT_DIR = './train/'

In [25]:
callback = TrainAndLoggingCallback(check_freq=1000, save_path=CHECKPOINT_DIR)

# Train Model

In [33]:
# Create environment 
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [34]:
model_params = study.best_params
# model_params['n_steps'] = 7488  # set n_steps to 7488 or a factor of 64 (ONLY NEEDED FOR PPO, CHECK YOUR STEPS AND CHANGE TO FAC OF 64)
# model_params['learning_rate'] = 5e-7 -> if really slow at training
model_params

{'buffer_size': 83400,
 'gamma': 0.9841654381927392,
 'learning_rate': 4.288856918915396e-05,
 'batch_size': 48,
 'train_freq': 9}

In [35]:
# model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params) # verbose 1 shows results as training
model = DQN('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params) # verbose 1 shows results as training

Using cpu device
Wrapping the env in a VecTransposeImage.




In [36]:
# Reload previous weights from HPO
model.load(os.path.join(OPT_DIR, 'trial_0_best_model.zip'))



<stable_baselines3.dqn.dqn.DQN at 0x1f610939580>

In [37]:
# Kick off training 
model.learn(total_timesteps=500000, callback=callback) # timestep 5000000 recommended

Logging to ./logs/DQN_7
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 9.59e+03 |
|    ep_rew_mean      | 2.35e+04 |
|    exploration_rate | 0.271    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1318     |
|    time_elapsed     | 29       |
|    total_timesteps  | 38353    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.99e+03 |
|    ep_rew_mean      | 1.85e+04 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 433      |
|    time_elapsed     | 165      |
|    total_timesteps  | 71949    |
| train/              |          |
|    learning_rate    | 4.29e-05 |
|    loss             | 0.000336 |
|    n_updates        | 2439     |
----------------------------------
----------------------------------
| rollout/            |        

KeyboardInterrupt: 

In [45]:
# tensorboard --logdir=. 
# cd to logs
# ^ use to visually see learning progress

# Evaluate Model

In [None]:
# model = PPO.load('./train/best_model_37000.zip')
# model = A2C.load('./train/best_model_66000.zip')
model = DQN.load('./train/best_model_66000.zip')

In [70]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)

In [88]:
mean_reward

4800.0

# Testing Model

In [71]:
obs = env.reset()

In [90]:
obs.shape

(1, 84, 84, 4)

In [72]:
env.step(model.predict(obs)[0])

(array([[[[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  33,   0],
          ...,
          [  0,   0,  37,   0],
          [  0,   0,  34,   0],
          [  0,   0,  36,   0]],
 
         [[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  35,   0],
          ...,
          [  0,   0,  36,   0],
          [  0,   0,  40,   0],
          [  0,   0,  40,   0]],
 
         [[  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  36,   0],
          ...,
          [  0,   0,  36,   0],
          [  0,   0,  36,   0],
          [  0,   0,  36,   0]],
 
         ...,
 
         [[  0,   0, 162,   0],
          [  0,   0, 159,   0],
          [  0,   0, 159,   0],
          ...,
          [  0,   0, 159,   0],
          [  0,   0, 159,   0],
          [  0,   0, 159,   0]],
 
         [[  0,   0, 162,   0],
          [  0,   0, 162,   0],
          [  0,   0, 162,   0],
          ...,
          [  0,   0, 162,   0],
 

In [None]:
# Reset game to starting state
obs = env.reset()
# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        print(action)
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        # print(reward)

1
3
3
5
7
6
5
10
13
11
2
11
12
13
10
1
4
2
7
1
11
14
9
7
7
14
14
3
9
4
7
8
1
4
7
2
5
6
8
0
10
4
9
6
11
3
8
9
11
3
10
2
6
3
13
14
12
12
14
10
4
4
10
6
9
3
1
9
3
0
6
8
9
3
8
12
7
11
3
13
7
6
0
12
3
1
14
3
14
1
5
11
11
2
9
14
2
5
10
4
14
12
12
13
12
10
0
4
2
9
1
1
4
11
1
5
3
6
7
9
8
2
9
13
1
6
4
2
0
1
8
4
11
13
8
2
3
4
4
5
11
14
8
6
14
12
13
14
7
14
8
1
0
12
7
9
0
6
8
3
8
12
13
5
3
8
11
4
2
5
1
13
8
11
0
9
9
14
11
3
14
11
4
0
5
5
7
9
3
13
1
7
2
1
12
9
14
6
3
6
4
10
5
8
9
6
10
1
6
7
11
12
12
4
14
7
8
6
12
5
8
6
4
11
12
6
9
9
14
12
6
12
12
1
12
12
6
2
12
8
12
5
4
14
14
12
8
11
10
8
13
12
4
10
7
5
7
4
13
3
9
12
10
6
12
12
7
4
3
10
6
12
4
14
1
6
7
2
3
14
10
12
1
7
0
13
0
1
12
7
9
1
2
6
9
14
5
2
12
14
2
7
12
12
2
4
11
0
14
14
7
12
11
13
7
10
14
9
3
4
2
6
0
3
2
7
14
5
5
8
12
14
4
7
13
1
6
0
2
8
6
13
5
9
10
13
9
13
13
8
14
11
11
14
6
14
12
11
6
3
3
2
8
14
7
14
10
13
6
6
4
9
0
0
6
11
10
4
9
8
7
8
11
2
14
12
7
4
14
5
1
10
2
3
7
6
13
4
10
4
13
9
4
14
13
2
4
7
12
11
11
6
14
5
6
6
4
11
10
6
1
9
4
0
1

: 

In [23]:
env = StreetFighter()

In [20]:
import numpy as np # for calculating frame changes

In [32]:
env.close()

In [None]:
# pip list

In [None]:
# was on protobuf 5.28.2

In [None]:
# please note that pyglet 1.3.2 was for tensorboard, the rendering was for newesrt version