In [1]:
# Import retro to play Street Fighter using a ROM
import retro
# Import time to slow down game
import time

In [2]:
# python -m retro.import . # Run this from the roms folder, or where you have your game roms 

In [2]:
# Import environment base class for a wrapper 
from gym import Env 
# Import the space shapes for the environment
from gym.spaces import MultiBinary, Discrete, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import opencv for grayscaling
import cv2
# Import matplotlib for plotting the image
from matplotlib import pyplot as plt

In [3]:
# B, A, _, _, UP, DOWN, LEFT, RIGHT, C, Y, X, Z
# Every possible action for a given step in Street Fighter II
possible_actions = {
    # Idle
    0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # Left
    1 : [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
    # Right
    2 : [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    # Up
    3 : [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
    # Down
    4 : [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    # Light Kick
    5 : [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # Medium Kick
    6 : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # Hard Kick
    7 : [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    # Light Punch
    8 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
    # Medium Punch
    9 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
    # Hard Punch
    10 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    # Down Left
    11 : [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
    # Down Right
    12 : [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
    # Up Left
    13 : [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
    # Up Right
    14 : [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
}

In [4]:
# Class based on github
class StreetFighter(Env): # pass in basic env from above to preprocessing
    def __init__(self):
        super().__init__() # inherit from base env
        # Specify action space and observation space 
        self.observation_space = Box(
            low=0,
            high=255,
            shape=(84, 84, 3),  # Keep 3 channels for RGB
            dtype=np.uint8
        )
        self.action_space = Discrete(12) # type of actions that can be taken
        self.health = 144
        self.enemy_health = 144
        self.score = 0
        self.matches_won = 0
        self.continue_timer = 100
        self.enemy_matches_won = 0
        # self.previous_action = np.zeros(12)
        # self.combo_scaler = 1
        # self.last_damage_instance = 0
        # Startup and instance of the game 
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.DISCRETE) # used to get valid button combos
    
    def reset(self): # restart
        # Return the first frame 
        obs = self.game.reset()
        obs = self.preprocess(obs) 
        self.previous_frame = obs # sets previous frame to current frame
        
        # Create a attribute to hold the score delta 
        self.score = 0 
        return obs
    
    def preprocess(self, observation):
        # Resize first to reduce computation
        resized = cv2.resize(observation, (84, 84), interpolation=cv2.INTER_AREA)
        
        # Simple color quantization using bitwise operations
        # Reduce to 3 bits per channel (8 values per channel)
        quantized = resized & 0b11100000
        
        # Optional: Create more distinct colors by increasing contrast
        # This helps make different elements more distinguishable
        quantized = cv2.convertScaleAbs(quantized, alpha=1.2, beta=10)
        
        # Method 1: Simple bitwise quantization
        return quantized
    
    def reward_function(self, state):
        # Extract variables
        continuetimer = state['continuetimer']
        enemy_matches_won = state['enemy_matches_won']
        enemy_health = state['enemy_health']
        health = state['health']
        matches_won = state['matches_won']
        score = state['score']

        # Initialize reward
        reward = 0

        # Reward for increasing score each frame (scaled down to avoid excessively large rewards)
        reward += score * 0.001  

        enemy_health_diff = self.enemy_health - enemy_health
        health_diff = self.health - health

        # catching edge cases to make sure no reward is being earned outside of a fight (i.e. in between rounds)
        if (self.enemy_health != 0 and state['enemy_health'] == 0 and self.health != 0 and state['health'] == 0) or (enemy_health_diff == 0 and health_diff == 0) or (self.health == 0 and self.enemy_health == 0):
            reward += 0
        else:
            if enemy_health_diff > health_diff:
                reward += ((enemy_health_diff) - (health_diff)) * 10
            else:
                reward += ((enemy_health_diff) - (health_diff))

        # Update previous states to enable frame-by-frame comparison
        self.enemy_health = enemy_health
        self.health = health
        self.matches_won = matches_won
        self.enemy_matches_won = enemy_matches_won
        self.continue_timer = continuetimer
        self.score = score
        # self.last_damage_instance += 1

        return reward
    
    def step(self, action): # how do we process action
        # Take a step 
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs) 

        self.previous_action = action
        
        # Frame delta 
        frame_delta = obs
        self.previous_frame = obs 
        
        # Reshape the reward function
        reward = self.reward_function(info)

        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs): # unpack any args and kwargs from stable baseline
        self.game.render()
        
    def close(self):
        self.game.close()

In [5]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO, A2C, DQN
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'

In [7]:
# alternative version to use later to bypass factor 64 error

# PPO optimization parameters
PPO_PARAMS = {
    # 1. Core Parameters
    'n_steps_range': (2048, 4096),            
    # How many steps to run before updating the policy
    # - Larger values (4096): More stable learning, better for complex patterns
    # - Smaller values (2048): Faster updates, but potentially less stable
    # - Must be factor of 64 for efficiency
    
    'gamma_range': (0.95, 0.9999),             
    # Discount factor for future rewards
    # - Higher values (0.995): Values future rewards more (good for long-term strategy)
    # - Lower values (0.95): More focus on immediate rewards
    # - Street Fighter needs high gamma for combo learning
    
    'learning_rate_range': (5e-8, 1e-6),      
    # How big steps to take when updating the policy
    # - Higher values (5e-5): Faster learning but more unstable
    # - Lower values (1e-5): More stable but slower learning
    # - Lowered from original due to training instability
    
    'clip_range_range': (0.1, 0.3),           
    # How much to limit policy updates
    # - Higher values (0.3): Allows bigger policy changes
    # - Lower values (0.1): More conservative updates
    # - Centered around PPO standard of 0.2
    
    'gae_lambda_range': (0.9, 0.98),          
    # Controls advantage estimation smoothing
    # - Higher values (0.98): Better for long-term credit assignment
    # - Lower values (0.9): More focus on immediate advantages
    # - Important for linking moves into combos
    
    # 2. Advanced Parameters
    'ent_coef_range': (1e-8, 1e-3),          
    # Controls exploration vs exploitation
    # - Higher values (1e-3): More random actions/exploration
    # - Lower values (1e-8): More focused on best known moves
    # - Critical for discovering new combat strategies
    
    'vf_coef_range': (0.5, 1.0),             
    # Balances value function vs policy learning
    # - Higher values (1.0): More emphasis on state value estimation
    # - Lower values (0.5): More emphasis on action selection
    # - Helps agent understand positioning and health advantages
    
    'n_epochs_range': (5, 15),                
    # How many times to reuse each batch of data
    # - Higher values (15): More thorough learning from each experience
    # - Lower values (5): Less chance of overfitting to recent experiences
    # - Balances learning efficiency vs stability
    
    'batch_size_range': (64, 256)             
    # Size of chunks for processing training data
    # - Larger sizes (256): More stable updates but slower
    # - Smaller sizes (64): Faster but potentially noisier updates
    # - Must be power of 2 for efficiency
}

A2C_PARAMS = {
    # Your current parameters
    'n_steps_range': (5, 30),                  # Small steps, more frequent updates
    'gamma_range': (0.95, 0.9999),               # General discount range
    'learning_rate_range': (5e-8, 1e-6),       # Higher learning rates typically better
    'ent_coef_range': (1e-8, 1e-3),           # Entropy coefficient for exploration
    'vf_coef_range': (0.5, 1.0),              # Value function coefficient   
    'gae_lambda_range': (0.9, 0.98),           # Generalized Advantage Estimation lambda           
}

DQN_PARAMS = {
    'buffer_size_range': (50000, 100000),      # Smaller buffer size for quicker access
    'gamma_range': (0.9, 0.99),                # High gamma for long-term rewards
    'learning_rate_range': (1e-5, 1e-4),       # Lower learning rate for stability
    'batch_size_range': (32, 128),             # Standard DQN batch sizes
    'train_freq_range': (4, 16),               # Frequent updates to maintain stability
}

# Define the optimization function for PPO
def optimize_ppo(trial):
    return {
        # Dynamically Optimized Parameters
        'n_steps': trial.suggest_categorical('n_steps', range(PPO_PARAMS['n_steps_range'][0], PPO_PARAMS['n_steps_range'][1], 64)),

        'gamma': trial.suggest_loguniform('gamma', *PPO_PARAMS['gamma_range']),

        'learning_rate': trial.suggest_loguniform('learning_rate', *PPO_PARAMS['learning_rate_range']),

        'clip_range': trial.suggest_uniform('clip_range', *PPO_PARAMS['clip_range_range']),

        'gae_lambda': trial.suggest_uniform('gae_lambda', *PPO_PARAMS['gae_lambda_range']),

        'ent_coef': trial.suggest_loguniform('ent_coef', *PPO_PARAMS['ent_coef_range']),

        'vf_coef': trial.suggest_uniform('vf_coef', *PPO_PARAMS['vf_coef_range']),

        'n_epochs': trial.suggest_int('n_epochs', *PPO_PARAMS['n_epochs_range']),

        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        
        # Fixed Parameters (Stability Controls)
        'max_grad_norm': 0.5,          # Prevents explosive gradients
        'clip_range_vf': None,         # Uses same clipping as policy
        'target_kl': None,             # No KL divergence target
    }

def optimize_a2c(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', *A2C_PARAMS['n_steps_range']),

        'gamma': trial.suggest_loguniform('gamma', *A2C_PARAMS['gamma_range']),

        'learning_rate': trial.suggest_loguniform('learning_rate', *A2C_PARAMS['learning_rate_range']),

        'ent_coef': trial.suggest_loguniform('ent_coef', *A2C_PARAMS['ent_coef_range']),

        'vf_coef': trial.suggest_uniform('vf_coef', *A2C_PARAMS['vf_coef_range']),
        
        'gae_lambda': trial.suggest_uniform('gae_lambda', *A2C_PARAMS['gae_lambda_range']),

        # Fixed Parameters (Stability Controls)
        'max_grad_norm': 0.5,          # Prevents explosive gradients
    }

def optimize_dqn(trial):
    return {
        'buffer_size': trial.suggest_int('buffer_size', *DQN_PARAMS['buffer_size_range']),
        'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
        'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
        'batch_size': trial.suggest_categorical('batch_size', range(*DQN_PARAMS['batch_size_range'])),
        'train_freq': trial.suggest_categorical('train_freq', range(*DQN_PARAMS['train_freq_range'])),
    }


In [8]:
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

In [9]:
ALGORITHMS = {
    'PPO': (PPO, optimize_ppo),
    'A2C': (A2C, optimize_a2c),
    'DQN': (DQN, optimize_dqn),
}

def optimize_agent(trial, algo_name='PPO'):
    try:
        # Select algorithm and get hyperparameters
        ModelClass, optimize_fn = ALGORITHMS[algo_name]
        model_params = optimize_fn(trial)

        # Create environment
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Initialize and train model
        model = ModelClass('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params) # would recommend looking into other cnnpolicy's if they are compatible
        model.learn(total_timesteps=100000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1000

In [11]:
# Creating the experiment 
study = optuna.create_study(direction='maximize') # since mean reward is positive we maximize, otherwise minimize
study.optimize(lambda trial: optimize_agent(trial, algo_name='DQN'), n_trials=25) # for prod used n_trials=100 (change algo name to change algos)

[I 2024-11-10 19:21:53,743] A new study created in memory with name: no-name-9674381e-9d79-42d6-b593-ef3525b1e988
  'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
  'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
[I 2024-11-10 19:39:12,447] Trial 0 finished with value: 8490.099999999999 and parameters: {'buffer_size': 53292, 'gamma': 0.9387479312094781, 'learning_rate': 1.9629879304410023e-05, 'batch_size': 112, 'train_freq': 7}. Best is trial 0 with value: 8490.099999999999.
  'gamma': trial.suggest_loguniform('gamma', *DQN_PARAMS['gamma_range']),
  'learning_rate': trial.suggest_loguniform('learning_rate', *DQN_PARAMS['learning_rate_range']),
[I 2024-11-10 19:55:49,063] Trial 1 finished with value: 3012.5 and parameters: {'buffer_size': 97357, 'gamma': 0.9168646825796313, 'learning_rate': 2.1138786372168886e-05, 'batch_size': 122, 'train_freq': 8}. Best is trial 0 with value: 8490.099999999999.
[I 2024-11-10 20:

In [None]:
study.best_params

In [10]:
best_params = {'buffer_size': 51607, 'gamma': 0.9394297700138379, 'learning_rate': 4.1994307627873816e-05, 'batch_size': 56, 'train_freq': 12}

In [None]:
study.best_trial

In [11]:
# model = A2C.load(os.path.join(OPT_DIR, 'trial_4_best_model.zip'))
model = DQN.load(os.path.join(OPT_DIR, 'trial_2_best_model.zip'))

  th_object = th.load(file_content, map_location=device)


# Setup Callback

In [12]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [13]:
class TrainAndLoggingCallback(BaseCallback): # continuously learn by starting from best parameters done above

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [14]:
CHECKPOINT_DIR = './train/'

In [15]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [16]:
# Create environment 
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [18]:
model_params = best_params
# model_params = hyperparams

In [19]:
# model_params['n_steps'] = model_params['n_steps'] - (model_params['n_steps'] % 64)  # set n_steps to 7488 or a factor of 64 (ONLY NEEDED FOR PPO, CHECK YOUR STEPS AND CHANGE TO FAC OF 64)
# model_params['learning_rate'] = 5e-7 -> if really slow at training
model_params

{'buffer_size': 51607,
 'gamma': 0.9394297700138379,
 'learning_rate': 4.1994307627873816e-05,
 'batch_size': 56,
 'train_freq': 12}

In [20]:
model = DQN('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params) # verbose 1 shows results as training
# model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params) # verbose 1 shows results as training

Using cpu device
Wrapping the env in a VecTransposeImage.




In [21]:
# Reload previous weights from HPO
model.load(os.path.join(OPT_DIR, 'trial_2_best_model.zip'))

  th_object = th.load(file_content, map_location=device)


<stable_baselines3.dqn.dqn.DQN at 0x24b9972c6a0>

In [22]:
# Kick off training 
model.learn(total_timesteps=5000000, callback=callback) # timestep 5000000 recommended

Logging to ./logs/DQN_16
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.06e+04 |
|    ep_rew_mean      | 2.5e+05  |
|    exploration_rate | 0.92     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 714      |
|    time_elapsed     | 59       |
|    total_timesteps  | 42291    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 9.67e+03 |
|    ep_rew_mean      | 1.95e+05 |
|    exploration_rate | 0.853    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 306      |
|    time_elapsed     | 252      |
|    total_timesteps  | 77351    |
| train/              |          |
|    learning_rate    | 4.2e-05  |
|    loss             | 12.7     |
|    n_updates        | 2279     |
----------------------------------
----------------------------------
| rollout/            |       

KeyboardInterrupt: 

In [37]:
# tensorboard --logdir=. 
# cd to logs
# ^ use to visually see learning progress

In [None]:
model = DQN.load('./train/best_model_500000.zip')
# model = A2C.load('./train/best_model_7000000.zip')

In [39]:
obs = env.reset()

In [27]:
env.close()

In [None]:
# Reset game to starting state
try:
    obs = env.reset()
except:
    pass

# Set flag to flase
done = False
for game in range(1): 
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        print(action)
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        # print(reward)