# Setup StreetFighter

In [None]:
#Code must be run with Python 3.7
%pip install setuptools==66
%pip install wheel==0.38.4
%pip install gym==0.21.0
%pip install importlib-metadata==4.8.1
%pip install pyglet==1.3.2

In [None]:
# Import retro to play Street Fighter using a ROM
# in cmd go to rom path and write: py -3.7 -m retro.import .
# %pip install gym-retro
import retro

Gym Retro is an open-source platform that combines artificial intelligence (AI) with retro video games. It's built on top of OpenAI's Gym toolkit, providing a way to train and test AI agents using classic games from platforms like Atari, Nintendo, and Sega. The platform serves as a bridge between reinforcement learning algorithms and vintage video games, allowing researchers and enthusiasts to develop and test AI models within these familiar gaming environments. 

In [None]:
# Import time to slow down game
import time

In [None]:
# Starts up the game environment
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')

In [None]:
# Inspect the observation space (nothing is sampled here)
env.observation_space

In [None]:
# Sample a random action from the available action space - MultiBinary
env.action_space.sample()

In [None]:
#To see the game played by itself

# # # Reset game to starting state
# obs = env.reset()
# # Set flag to false
# done = False
# for game in range(1): 
#     while not done: 
#         if done: 
#             obs = env.reset()
#         env.render()
#         obs, reward, done, info = env.step(env.action_space.sample())
#         time.sleep(0.01)
#         print(reward)

In [None]:
env.close()

In [None]:
# info

# Setup Environment

- Observation preprocessing - grayscale, frame delta, and resizing the frame so there are fewer pixels
- Filter the actions - `use_restricted_actions` parameter
- Reward function - set this to the score (the change in score at each step)

In [None]:
%pip install numpy==1.20.3
%pip install opencv-python
%pip install matplotlib

In [None]:
# Import environment base class for a wrapper 
from gym import Env 
# Import the space shapes for the environment
from gym.spaces import MultiBinary, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import matplotlib for plotting the image
from matplotlib import pyplot as plt
# Import opencv for grayscaling
import cv2

## 1. frame
## 2. preprocess 200x256x3 -> 84x84x1
## 3. change in pixels current_frame - the last frame

In [None]:
# Create custom environment 
class StreetFighter(Env):
    """Gym environment wrapping the Street Fighter II retro game.

    Observations are the emulator frames converted to a single-channel
    84x84 ``uint8`` image. The reward returned by ``step`` is the change in
    ``info['score']`` since the previous step.
    """

    def __init__(self):
        super().__init__()
        # Specify action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Start up an instance of the game
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        # Per-episode state. Defined here as well as in reset() so the
        # attributes always exist on a constructed environment.
        self.previous_frame = None
        self.health = 176
        self.enemy_health = 176
        self.score = 0

    def reset(self):
        """Reset the game and the per-episode state.

        Returns:
            The first frame of the new episode, preprocessed to (84, 84, 1).
        """
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.health = 176
        self.enemy_health = 176

        # Last score seen; step() uses it to compute the score delta
        self.score = 0
        return obs

    def preprocess(self, observation):
        """Convert a colour frame to an (84, 84, 1) grayscale image.

        Args:
            observation: colour frame as returned by the underlying game.

        Returns:
            The frame grayscaled, resized to 84x84 and given a channel axis.
        """
        # Grayscaling
        # NOTE(review): the conversion treats the frame as BGR - confirm the
        # channel order of the frames produced by the emulator.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84, 84, 1))
        return channels

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: action forwarded unchanged to the underlying game.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is the
            preprocessed frame and ``reward`` is the score gained during
            this step; ``done`` and ``info`` come from the underlying game.
        """
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # The preprocessed frame is returned as-is: no difference against
        # self.previous_frame is taken, and previous_frame is not updated here.
        frame_delta = obs

        # Reshape the reward function: reward is the score delta.
        # A health-based alternative that was tried:
        #   (self.enemy_health - info['enemy_health']) * 2 + (info['health'] - self.health)
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the game, forwarding all arguments to the underlying game.

        Returns:
            Whatever the underlying game's ``render`` returns.
        """
        return self.game.render(*args, **kwargs)

    def close(self):
        """Close the underlying game instance."""
        self.game.close()

In [None]:
env = StreetFighter()

In [None]:
env.observation_space.shape

In [None]:
env.action_space.shape

In [None]:
# # # Reset game to starting state
# obs = env.reset()
# # Set flag to false
# done = False
# for game in range(1): 
#     while not done: 
#         if done: 
#             obs = env.reset()
#         env.render()
#         obs, reward, done, info = env.step(env.action_space.sample())
#         time.sleep(0.01)
#         if reward > 0: 
#             print(reward)

In [None]:
obs = env.reset()

In [None]:
obs, reward, done, info = env.step(env.action_space.sample())

In [None]:
# obs comes out of StreetFighter.preprocess as a single-channel (84, 84, 1)
# frame, so expand gray -> RGB for display (a BGR->RGB conversion expects
# a 3- or 4-channel image).
plt.imshow(cv2.cvtColor(obs, cv2.COLOR_GRAY2RGB))

In [None]:
env.close()

# Hyperparameter Tuning

In [None]:
%pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
%pip install optuna
%pip install stable-baselines3[extra]==1.2.0

In [None]:
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

import os

In [None]:
LOG_DIR = './logs/'
OPT_DIR = './opt/'
PARAM_DIR = './param/'

def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    This defines the search space (not the objective itself).

    Args:
        trial: Optuna trial used to draw the suggestions.

    Returns:
        Dict of keyword arguments (``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range``, ``gae_lambda``) to be passed to ``PPO``.
    """
    return {
        # step=64 keeps n_steps a multiple of 64, so the rollout buffer
        # splits evenly into mini-batches of 64.
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        # suggest_float(log=True) replaces suggest_loguniform and
        # suggest_float replaces suggest_uniform.
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-7, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }

# Run a training loop and return mean reward 
def optimize_agent(trial):
    """Objective for Optuna: train and evaluate one PPO configuration.

    Samples hyperparameters with ``optimize_ppo``, trains a PPO agent on a
    fresh ``StreetFighter`` environment, evaluates it, then saves the model
    under ``OPT_DIR`` and the parameters under ``PARAM_DIR``.

    Args:
        trial: Optuna trial supplying the hyperparameter suggestions.

    Returns:
        Mean evaluation reward, or -1000 if anything raised during the trial
        (the traceback is printed so the failure is visible).
    """
    env = None
    try:
        # Make sure every output directory exists before anything writes to
        # it; otherwise a missing folder only surfaces after training.
        for directory in (LOG_DIR, OPT_DIR, PARAM_DIR):
            os.makedirs(directory, exist_ok=True)

        model_params = optimize_ppo(trial)

        # Create environment
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        monitored_env = env  # separate name so the factory below is unambiguous
        env = DummyVecEnv([lambda: monitored_env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Create algo
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        # Short budget to keep the search fast (100000 was also tried)
        model.learn(total_timesteps=10000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=25)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        BEST_PARAM_PATH = os.path.join(PARAM_DIR,  'trial_{}_best_params.txt'.format(trial.number))
        with open(BEST_PARAM_PATH, 'w') as f:
            for key, value in model_params.items():
                f.write('{}: {}\n'.format(key, value))
            f.write('mean reward : {}'.format(mean_reward))

        return mean_reward

    except Exception as e:
        import traceback
        print("Exception occurred:", e)
        print(traceback.format_exc())
        return -1000

    finally:
        # Always release the game, including when training or evaluation
        # failed, so a failed trial does not leave an open emulator behind
        # for the next trial.
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print("Exception occurred while closing env:", close_error)

In [None]:
# Creating the experiment: a study that maximizes the value returned by optimize_agent
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=5, n_jobs=1)  # n_jobs > 1 would run trials in parallel, which doesn't work for gym retro
#study.optimize(optimize_agent, n_trials=10, n_jobs=1)

In [None]:
# Write the best trial's hyperparameters and objective value to its
# parameter file, in the same "key: value" layout optimize_agent uses.
best_trial_number = study.best_trial.number
# best_value is the scalar objective value, so the file shows a plain number
# rather than a one-element list.
best_mean = study.best_value
os.makedirs(PARAM_DIR, exist_ok=True)
BEST_PARAM_PATH = os.path.join(PARAM_DIR,  'trial_{}_best_params.txt'.format(best_trial_number))
with open(BEST_PARAM_PATH, 'w') as f:
    for key, value in study.best_params.items():
        f.write('{}: {}\n'.format(key, value))
    f.write('mean reward : {}'.format(best_mean))



In [None]:
best_params = study.best_params

In [None]:
best_params

In [None]:
study.best_trial
# optimize_agent saves each model as 'trial_{trial.number}_best_model', so the
# best model's file index is the trial number itself (no -1 offset).
study_best_trial_log = study.best_trial.number

In [None]:
study.get_trials()

In [None]:
# How to reload the best model: load the checkpoint in OPT_DIR whose trial
# index is study_best_trial_log
model = PPO.load(os.path.join(OPT_DIR, f'trial_{study_best_trial_log}_best_model.zip'))

# Setup Callbacks

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model at a fixed call interval.

    Args:
        check_freq: a checkpoint is written whenever the callback's call
            counter (``self.n_calls``, not set in this class) is a multiple
            of this value.
        save_path: directory the checkpoints are written to.
        verbose: forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        """Create the checkpoint directory when a save path was given."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when due; always returns True."""
        checkpoint_due = self.n_calls % self.check_freq == 0
        if checkpoint_due:
            # The file name carries the call count at which it was written
            checkpoint_name = 'best_model_{}'.format(self.n_calls)
            self.model.save(os.path.join(self.save_path, checkpoint_name))

        return True

In [None]:
CHECKPOINT_DIR = './train/'

In [None]:
callback = TrainAndLoggingCallback(check_freq=50000, save_path=CHECKPOINT_DIR)

# Train Model

In [None]:
# Create the training environment by wrapping the game step by step

# Base game environment
env = StreetFighter()
# Monitor wrapper, given LOG_DIR as its log location
env = Monitor(env, LOG_DIR)
# DummyVecEnv takes a list of environment factories.
# NOTE(review): the lambda closes over the name `env`, which is rebound on
# this same line; this relies on the factory being called during
# construction - confirm.
env = DummyVecEnv([lambda: env])
# Stack 4 observations, channels last
env = VecFrameStack(env, 4, channels_order='last')

In [None]:

# Start from the tuned hyperparameters, then override two of them by hand
model_params = study.best_params
# n_steps is set to a multiple of 64: 7488 was used for runs train to train3;
# 2368 (close to 2386) for train4, and the same for train5
model_params['n_steps'] = 2368
# Learning-rate history: from 5.91e-05 to 5e-9 to 5e-8 to 5e-7 to 1e-7 to 3e-7 to 2.5e-7 to 2.3e-7 to 2.3e-7
model_params['learning_rate'] = 2.3e-7
model_params

In [None]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

In [None]:
# Reload previous weights from HPO.
# PPO.load is a classmethod: calling it on an instance returns a NEW model and
# leaves `model` untouched. set_parameters copies the saved parameters into
# the existing model in place, keeping the hyperparameters set above.
model.set_parameters(os.path.join(OPT_DIR, f'trial_{1}_best_model.zip'))

In [None]:
# Kick off training; the callback saves checkpoints along the way
model.learn(total_timesteps=1000000, callback=callback)
# Longer alternative without checkpointing:
# model.learn(total_timesteps=5000000)

# Evaluate Model

In [None]:
# start_value = 3000000
# stop_value = 4010000
# step_size = 10000
# max=0
# imax=3000000

# for i in range(start_value, stop_value, step_size):
#     model = PPO.load(f'./train/best_model_{i}.zip')
#     mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=3)
#     if mean_reward > max:
#         max = mean_reward
#         imax = i

In [None]:
model = PPO.load('./train7/best_model_5800000.zip')

In [None]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)

In [None]:
mean_reward

# Test out the Model

In [None]:
obs = env.reset()

In [None]:
obs.shape

In [None]:
env.step(model.predict(obs)[0])

In [None]:
# Let the trained model play, rendering each frame and printing the reward.
for game in range(1):
    # Reset game to starting state and clear the flag for every game, so
    # raising the range above actually plays more than one game.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        # Slow the loop down so the game is watchable
        time.sleep(0.005)
        print(reward)