In [1]:
import retro
import time
import os

In [None]:
# List the games reported by retro.data; the result is shown as the cell output.
retro.data.list_games()

In [None]:
# Import the ROM into gym-retro before using it: run `python3 -m retro.import .` from the folder that contains the ROM file.

In [None]:
# Play one episode with random actions on the raw retro environment.
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')
try:
    for game in range(1):
        # Reset the emulator and the `done` flag per episode; otherwise every
        # episode after the first would be skipped because `done` stays True.
        obs = env.reset()
        done = False
        while not done:
            env.render()
            # Sample a random action from the env's action space and apply it.
            obs, reward, done, info = env.step(env.action_space.sample())
            time.sleep(0.01)  # brief pause between steps
            print(reward)
finally:
    # Always release the emulator, even if the loop raises or is interrupted.
    env.close()

In [None]:
# Display the `info` value returned by the most recent env.step() call.
info

In [2]:
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [None]:
# Render the most recent observation `obs` as an image.
plt.imshow(obs)

In [3]:
class StreetFighter(Env):
    """Gym environment wrapping the gym-retro Street Fighter II game.

    Frames are converted to 84x84 single-channel uint8 images. ``reset``
    returns the preprocessed first frame; ``step`` returns the difference
    between the current and the previous preprocessed frame, and a reward equal
    to the change in ``info['score']`` since the previous step.
    """

    def __init__(self):
        """Declare the observation/action spaces and start the emulator."""
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)

        # Per-episode state. Defined here as well as in reset() so that calling
        # step() before reset() does not fail with AttributeError.
        self.previous_frame = np.zeros((84, 84, 1), dtype=np.uint8)
        self.score = 0

    def reset(self):
        """Reset the game and return the preprocessed first frame.

        Also re-initialises the frame used for the next delta and the score
        baseline used for the reward.
        """
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs

        self.score = 0
        return obs

    def preprocess(self, observation):
        """Convert a raw frame to an (84, 84, 1) grayscale array.

        NOTE(review): the conversion uses COLOR_BGR2GRAY; if the emulator
        delivers RGB frames the red/blue weights are swapped. Left unchanged
        because it affects the observations existing checkpoints were trained
        on -- confirm before changing.
        """
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84, 84, 1))
        return channels

    def step(self, action):
        """Advance the game by one step.

        Returns:
            (frame_delta, reward, done, info) where ``frame_delta`` is the
            current preprocessed frame minus the previous one and ``reward`` is
            the increase of ``info['score']`` since the last step.
        """
        # The emulator's own reward is ignored; the reward is derived from the
        # score reported in `info` below.
        obs, _, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # NOTE(review): if both frames are uint8 (as declared in
        # observation_space) this subtraction wraps modulo 256 for negative
        # differences. Kept as-is so observations stay inside the declared
        # space and match previously trained models.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the underlying game; any arguments are accepted but ignored."""
        self.game.render()

    def close(self):
        """Close the underlying retro environment."""
        self.game.close()

In [None]:
# Play one episode with random actions on the custom StreetFighter environment.
env = StreetFighter()
try:
    for game in range(1):
        # Reset the environment and the `done` flag per episode; otherwise
        # every episode after the first would be skipped because `done` stays True.
        obs = env.reset()
        done = False
        while not done:
            env.render()
            # Sample a random action from the env's action space and apply it.
            obs, reward, done, info = env.step(env.action_space.sample())
            time.sleep(0.01)  # brief pause between steps
            print(reward)
finally:
    # Always release the emulator, even if the loop raises or is interrupted.
    env.close()

In [None]:
# Create a fresh StreetFighter environment and keep its first observation for inspection.
env = StreetFighter()
obs = env.reset()

In [None]:
# Render the current `obs` (the value returned by env.reset()) as an image.
plt.imshow(obs)

In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Args:
        check_freq: Save whenever ``n_calls`` is a multiple of this value.
        save_path: Directory in which checkpoints are written. If ``None``,
            no directory is created and no checkpoint is saved.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call; always returns True."""
        # Mirror the None guard of _init_callback: without it os.path.join
        # would raise TypeError when no save path was configured.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [6]:
# Directory passed to the checkpoint callback as its save path.
CHECKPOINT_DIR = './train/'
# Directory passed to Monitor and to PPO's tensorboard_log.
LOG_DIR = './logs'

In [7]:
# Save a checkpoint into CHECKPOINT_DIR every 10000 callback calls.
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [None]:
# Close the current environment before a new StreetFighter() is created below.
# NOTE(review): gym-retro is understood to allow only one emulator instance per process -- confirm.
env.close()

In [8]:
# Build the training environment: StreetFighter -> Monitor (given LOG_DIR)
# -> single-env DummyVecEnv -> VecFrameStack of 4 with channels last.
monitored_env = Monitor(StreetFighter(), LOG_DIR)
env = VecFrameStack(
    DummyVecEnv([lambda: monitored_env]),
    4,
    channels_order='last',
)

In [9]:
# PPO settings for this run, collected in one place for readability.
ppo_hyperparams = dict(
    tensorboard_log=LOG_DIR,
    verbose=1,
    n_steps=8192,
    gamma=0.9,
    learning_rate=1e-04,
    clip_range=0.45,
    gae_lambda=0.8,
)
model = PPO('CnnPolicy', env, **ppo_hyperparams)
# Train for 500000 timesteps; `callback` is invoked during training.
model.learn(total_timesteps=500000, callback=callback)

Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to ./logs/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 6.31e+03 |
|    ep_rew_mean     | 6.6e+03  |
| time/              |          |
|    fps             | 571      |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 6.88e+03    |
|    ep_rew_mean          | 6.6e+03     |
| time/                   |             |
|    fps                  | 250         |
|    iterations           | 2           |
|    time_elapsed         | 65          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.046882972 |
|    clip_fraction        | 0.0725      |
|    clip_range           | 0.45        |
|    entropy_loss         | -8.2

<stable_baselines3.ppo.ppo.PPO at 0x7f376e109a90>

In [None]:
# Each `!` line runs in its own subshell, so a separate `!cd` does not affect
# the next command. Chain both commands so the import runs inside the ROM folder.
!cd ~/RL_Games/StreetFighter/roms && python3 -m retro.import .

In [10]:
# Close the current environment; the evaluation cell below creates a new StreetFighter().
env.close()

In [12]:
import gym
import retro
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import cv2
import time

# Load a saved checkpoint and play one episode with it.
# NOTE(review): machine-specific absolute path; adjust to wherever the
# checkpoints were written on your machine.
model_path = "/home/sumukh/RL_Games/StreetFighter/train/best_model_280000.zip"
model = PPO.load(model_path)

# Wrap the environment in a single-env DummyVecEnv and a 4-frame
# channels-last VecFrameStack.
env = StreetFighter()
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

try:
    obs = env.reset()
    done = False
    total_reward = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)

        obs, reward, done, info = env.step(action)
        total_reward += reward

        env.render()
        time.sleep(0.001)

    print("Total reward:", total_reward)
finally:
    # Always release the emulator, even if loading a frame, predicting or
    # rendering fails, or the cell is interrupted.
    env.close()


Total reward: [13200.]
