In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.optim.lr_scheduler import ExponentialLR
import numpy as np
import random
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium.wrappers import FrameStackObservation, TimeLimit, ResizeObservation, RecordVideo, MaxAndSkipObservation
from collections import deque
import retro
import io
import time

In [42]:
# Run configuration shared by the cells below.
RENDER_ENV = False  # not read by any cell visible in this notebook
RESIZE_ENV = True  # when True, make_env() wraps the env in ResizeObservation(new_size)
LOAD_MODEL = False  # when True, the training cell loads model_load_path into the agent first
Render_Frame_rate=4  # not read by any cell visible in this notebook
new_size = (84,120) # Resized frame size; original note: "Original Size 320, 224". Used in the same axis order as the (224, 320) fallback in make_env()
batch_size = 32  # minibatch size handed to agent.replay() on every environment step
num_episodes = 46  # number of training episodes
max_steps_per_episode = 5400  # TimeLimit cap; also embedded in checkpoint/video file names
num_stacked_frames = 4  # stack_size for FrameStackObservation
num_frame_skip = 2  # `n` for StochasticFrameSkip (emulator steps per agent action)
version = 3  # tag embedded in checkpoint/video file names
Model = "DQN"  # sub-directory of ../Saved_Models searched for the newest checkpoint

In [3]:
import os

def get_last_modified_file(directory_path):
    """Return the path of the most recently modified regular file in a directory.

    Args:
        directory_path: Directory to scan (not recursive; sub-directories are ignored).

    Returns:
        The joined path of the file with the largest modification time, or
        ``None`` when the directory does not exist (an error line is printed in
        that case) or contains no regular files.
    """
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return None

    candidates = []
    for entry in os.listdir(directory_path):
        full_path = os.path.join(directory_path, entry)
        if os.path.isfile(full_path):
            candidates.append(full_path)

    # max() keeps the first of several equally-new files, exactly like taking
    # element 0 of a stable descending sort.
    return max(candidates, key=os.path.getmtime) if candidates else None

# Directory holding the checkpoints of the architecture selected by `Model`.
target_directory = f"../Saved_Models/{Model}"  # Replace with your directory path
# Newest file in that directory by modification time, or None if the directory
# is missing/empty.
# NOTE(review): the lookup does not filter by extension; the logged run below
# picked an "SB3-...zip" file, which is presumably not a ConvDQN state_dict —
# confirm before enabling LOAD_MODEL.
model_load_path = get_last_modified_file(target_directory)

# Report which checkpoint (if any) later cells will try to load.
if model_load_path:
    print(f"The last modified file is: {model_load_path}")
else:
    print("No files found in the directory or directory does not exist.")

The last modified file is: ../Saved_Models/DQN/SB3-DQN-Sonic-V3-E200-S5400.zip


In [4]:
class ButtonActionWrapper(gym.Wrapper):
    """Expose a ``Discrete(len(buttons))`` action space with one button per action.

    Action ``i`` is forwarded to the wrapped env as row ``i`` of an identity
    matrix, i.e. an int8 vector of length ``len(buttons)`` with a single 1.

    Args:
        env: Environment to wrap.
        buttons: Sequence of button names; only its length and order are used here.
    """

    def __init__(self, env, buttons):
        super().__init__(env)
        self.buttons = buttons
        n_buttons = len(buttons)
        self.action_space = gym.spaces.Discrete(n_buttons)
        # Row i is the one-hot button vector sent for discrete action i.
        self._actions = np.eye(n_buttons, dtype=np.int8)

    def step(self, action):
        """Translate the discrete ``action`` to its one-hot vector and step the env."""
        button_vector = self._actions[action]
        return self.env.step(button_vector)

    def reset(self, **kwargs):
        """Reset the wrapped env and return its ``(obs, info)`` pair unchanged."""
        first_obs, reset_info = self.env.reset(**kwargs)
        return first_obs, reset_info

In [6]:
class CustomRewardWrapper(gym.RewardWrapper):
    """Shape the reward from step-to-step changes in the emulator's game variables.

    The variables ``x``, ``score``, ``lives``, ``rings`` and ``level_end_bonus``
    are read through ``env.unwrapped.data.lookup_all()`` and compared with the
    values seen on the previous call.

    Args:
        env: Environment to wrap.
        mov_rew: Added when ``x`` increased; half of it is subtracted otherwise.
        score_rew: Multiplier applied to any score increase.
        hp_rew: Multiplier per life gained; half of it per life lost.
        ring_rew: Multiplier per ring gained; half of it per ring lost.
        end_bonus: Added when ``level_end_bonus`` increased.
    """

    def __init__(self, env, mov_rew=0.01, score_rew=0.05, hp_rew=4, ring_rew=1, end_bonus=100):
        super().__init__(env)
        self.mov_rew = mov_rew
        self.score_rew = score_rew
        self.hp_rew = hp_rew
        self.ring_rew = ring_rew
        self.end_bonus = end_bonus

    def reset(self, **kwargs):
        """Reset the env and record the starting game variables as the baseline."""
        obs, info = self.env.reset(**kwargs)
        snapshot = self.env.unwrapped.data.lookup_all()

        self.previous_pos_x = snapshot['x']
        self.previous_score = snapshot['score']
        self.previous_lives = snapshot['lives']
        self.previous_rings = snapshot['rings']
        self.previous_end_bonus = snapshot['level_end_bonus']

        return obs, info

    def reward(self, reward):
        """Return ``reward`` plus the shaping terms for the latest step."""
        emulator_data = self.env.unwrapped.data
        if not emulator_data:
            # No game data available: pass the reward through untouched.
            return reward

        current = emulator_data.lookup_all()
        pos_x = current['x']
        score = current['score']
        lives = current['lives']
        rings = current['rings']
        level_bonus = current['level_end_bonus']

        shaped = reward

        # Moving right is rewarded; anything else (including standing still) is penalised.
        if pos_x > self.previous_pos_x:
            shaped += self.mov_rew
        else:
            shaped -= (self.mov_rew/2)

        # Reward for score gained.
        if score > self.previous_score:
            shaped += self.score_rew*(score-self.previous_score)

        # Reward for gaining lives, penalty (negative difference) for losing them.
        if lives > self.previous_lives:
            shaped += self.hp_rew*(lives-self.previous_lives)
        elif lives < self.previous_lives:
            shaped += (self.hp_rew/2)*(lives-self.previous_lives)

        # Reward for collecting rings, penalty (negative difference) for losing them.
        if rings > self.previous_rings:
            shaped += self.ring_rew*(rings-self.previous_rings)
        elif rings < self.previous_rings:
            shaped += (self.ring_rew/2)*(rings-self.previous_rings)

        # Bonus for completing the level.
        if level_bonus > self.previous_end_bonus:
            shaped += self.end_bonus

        # The current values become the baseline for the next step.
        self.previous_pos_x = pos_x
        self.previous_score = score
        self.previous_lives = lives
        self.previous_rings = rings
        self.previous_end_bonus = level_bonus

        return shaped

In [7]:
class StochasticFrameSkip(gym.Wrapper):
    """Repeat each action for ``n`` env steps with a randomly delayed switch-over.

    Args:
        env: Environment to wrap.
        n: Number of env steps performed per call to :meth:`step`.
        stickprob: Probability that the previous action is kept for the first
            sub-step of a call.
    """

    def __init__(self, env, n, stickprob):
        super().__init__(env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None  # action currently applied; None right after reset
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        """Forget the held action and reset the wrapped env."""
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        """Apply ``ac`` for up to ``n`` sub-steps and return the summed reward."""
        terminated = truncated = False
        totrew = 0
        last_substep = self.n - 1
        for substep in range(self.n):
            if self.curac is None or substep == 1:
                # First step after reset, or second sub-step: the new action takes over.
                self.curac = ac
            elif substep == 0 and self.rng.rand() > self.stickprob:
                # First sub-step: switch immediately unless the old action "sticks".
                self.curac = ac

            if self.supports_want_render and substep < last_substep:
                # Only the final sub-step of the call is stepped without want_render=False.
                ob, rew, terminated, truncated, info = self.env.step(
                    self.curac,
                    want_render=False,
                )
            else:
                ob, rew, terminated, truncated, info = self.env.step(self.curac)

            totrew += rew
            if terminated or truncated:
                break
        return ob, totrew, terminated, truncated, info

In [8]:
class ConvDQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by a two-layer MLP head.

    Args:
        input_shape: ``(channels, height, width)`` of a single input sample.
        num_actions: Number of Q-values produced per sample.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        in_channels = input_shape[0]
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=2, stride=1),
            nn.LeakyReLU()
        )
        # The size of the first linear layer depends on the conv stack's output.
        flat_features = self.calc_conv_output(input_shape)
        self.fc_layers = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.LeakyReLU(),
            nn.Linear(512, num_actions)
        )

    def calc_conv_output(self, shape):
        """Return the number of features the conv stack emits for one ``shape`` input."""
        probe = torch.zeros(1, *shape)
        return self.conv_layers(probe).numel()

    def forward(self, x):
        """Map a ``(batch, channels, height, width)`` tensor to ``(batch, num_actions)``."""
        features = self.conv_layers(x)
        flat = features.view(x.size(0), -1)
        return self.fc_layers(flat)

In [None]:
class ConvDQNAgent:
    """Epsilon-greedy DQN agent with a uniform replay buffer.

    Observations are expected as stacks of channel-last frames with shape
    ``(stack, height, width, channels)``; they are converted to the
    ``(stack * channels, height, width)`` layout described by ``input_shape``.

    Args:
        input_shape: ``(channels, height, width)`` fed to the network.
        num_actions: Size of the discrete action space.
        lr: Adam learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Factor applied to ``epsilon`` after every replay update
            while ``epsilon`` is above 0.01.
        buffer_size: Maximum number of stored transitions.
    """

    def __init__(self, input_shape, num_actions, lr, gamma, epsilon, epsilon_decay, buffer_size):
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)
        # Fall back to the CPU instead of crashing on machines without a GPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = ConvDQN(input_shape, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def preprocess(self, state):
        """Convert one ``(stack, H, W, C)`` observation to a ``(stack*C, H, W)`` float tensor."""
        state = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=self.device)
        return state.permute(0, 3, 1, 2).reshape(-1, self.input_shape[1], self.input_shape[2])

    def _to_batch(self, states):
        """Stack several observations into one ``(B, stack*C, H, W)`` float tensor."""
        batch = torch.as_tensor(np.stack(states), dtype=torch.float32, device=self.device)
        # (B, stack, H, W, C) -> (B, stack, C, H, W) -> (B, stack*C, H, W):
        # the same channel ordering preprocess() produces for a single sample.
        return batch.permute(0, 1, 4, 2, 3).reshape(
            batch.shape[0], -1, self.input_shape[1], self.input_shape[2]
        )

    def act(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.num_actions)
        with torch.no_grad():  # inference only, no autograd graph needed
            q_values = self.model(self.preprocess(state).unsqueeze(0))
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Sample ``batch_size`` transitions and take one batched gradient step.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        The whole minibatch goes through the network in a single forward/backward
        pass instead of one optimizer step per sample. The loss is the MSE
        between the predicted Q-values and a copy of them in which only the
        taken action's entry is replaced by the TD target, so it is still
        averaged over every Q-value entry.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        if minibatch:
            states, actions, rewards, next_states, dones = zip(*minibatch)
            state_batch = self._to_batch(states)
            next_state_batch = self._to_batch(next_states)
            action_idx = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
            reward_batch = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
            done_mask = torch.as_tensor(np.asarray(dones, dtype=np.bool_), device=self.device)

            # Bootstrapped targets are constants with respect to the parameters.
            with torch.no_grad():
                next_q_max = self.model(next_state_batch).max(dim=1).values
            targets = torch.where(done_mask, reward_batch, reward_batch + self.gamma * next_q_max)

            q_pred = self.model(state_batch)
            target_q = q_pred.detach().clone()
            row_idx = torch.arange(len(minibatch), device=self.device)
            target_q[row_idx, action_idx] = targets

            loss = F.mse_loss(q_pred, target_q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        if self.epsilon > 0.01:
            self.epsilon *= self.epsilon_decay

In [10]:
# Save model
def save_model(agent, episode):
    """Write ``agent.model``'s state_dict to ``../Saved_Models/DQN``.

    The file name embeds the notebook-level ``version`` and
    ``max_steps_per_episode`` together with ``episode``. Failures are reported
    with a printed message instead of being raised.

    Args:
        agent: Object exposing the network as ``agent.model``.
        episode: Episode number to embed in the file name.
    """
    model_save_path = f'../Saved_Models/DQN/DQN-Sonic-V{version}-E{episode}-S{max_steps_per_episode}.pth' # ppt for jit, pth for state dict
    try:
        # torch.save() does not create missing directories; without this every
        # checkpoint is lost on a fresh checkout.
        os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
        torch.save(agent.model.state_dict(), model_save_path)
        print(f'Modelo exitosamente guardado en {model_save_path}')
    except Exception as e:
        print(f'Error guardando el modelo error: {e}')

In [43]:
# Close an environment left over from a previous (possibly interrupted) run
# before a new one is created.
try:
    env.close()
except NameError:
    # Fresh kernel: `env` has not been created yet.
    print('No Enviroment to close')
except Exception as close_error:
    # Stay best-effort, but report real failures instead of mislabelling them;
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f'Could not close the environment cleanly: {close_error}')

In [21]:
def make_env(*, game, state=None, max_episode_steps=4500, **kwargs):
    """Build the wrapped retro environment and report the network input shape.

    Wrapper order: ButtonActionWrapper -> CustomRewardWrapper ->
    StochasticFrameSkip -> ResizeObservation (only when ``RESIZE_ENV``) ->
    TimeLimit (only when ``max_episode_steps`` is not None) ->
    FrameStackObservation.

    Args:
        game: Game identifier passed to ``retro.make``.
        state: Emulator start state; ``retro.State.DEFAULT`` when None.
        max_episode_steps: Step cap per episode, or None for no cap.
        **kwargs: Forwarded to ``retro.make``.

    Returns:
        ``(env, input_shape)`` where ``input_shape`` is
        ``(num_stacked_frames * 3, height, width)``.
    """
    initial_state = retro.State.DEFAULT if state is None else state
    wrapped = retro.make(game, initial_state, **kwargs)
    wrapped = ButtonActionWrapper(wrapped, buttons=['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT', 'C', 'Y', 'X', 'Z']) #['LEFT', 'RIGHT', 'A']
    wrapped = CustomRewardWrapper(wrapped)
    wrapped = StochasticFrameSkip(wrapped, n=num_frame_skip, stickprob=0.25)

    # Frame size seen by the network: the resize target, or the hard-coded
    # native frame size when resizing is disabled.
    frame_size = new_size if RESIZE_ENV else (224, 320)
    if RESIZE_ENV:
        wrapped = ResizeObservation(wrapped, new_size)

    if max_episode_steps is not None:
        wrapped = TimeLimit(wrapped, max_episode_steps=max_episode_steps)
    wrapped = FrameStackObservation(wrapped, stack_size=num_stacked_frames)

    input_shape = (num_stacked_frames*3, *frame_size)
    return wrapped, input_shape

In [44]:
# Build the environment and a fresh agent, optionally resuming from a checkpoint.
env, input_shape = make_env(game="SonicTheHedgehog-Genesis", render_mode='rgb_array', max_episode_steps=max_steps_per_episode) #rgb_array
action_dim = env.action_space.n
print(action_dim)
agent = ConvDQNAgent(input_shape, action_dim, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.99, buffer_size=10000)
if LOAD_MODEL:
    if model_load_path is None:
        raise FileNotFoundError("LOAD_MODEL is set but no checkpoint file was found")
    # load_state_dict() copies the checkpoint into the network; state_dict()
    # only returns the current weights, so the old call never loaded anything.
    agent.model.load_state_dict(torch.load(model_load_path, map_location=agent.device))

step_log_every = 5        # print the accumulated reward every N environment steps
episode_log_every = 2     # print a summary every N episodes
checkpoint_every = 12     # save the model every N episodes

temp_reward = 0           # reward accumulated since the last step log
window_reward = 0         # reward accumulated since the last episode summary
frame_count_prev = 0
frame_count = 0
total_reward = 0          # defined up front so the final print works with 0 episodes
try:
    for episode in range(num_episodes):
        state, info = env.reset()
        total_reward = 0
        done = False
        while not done:
            frame_count += 1
            action = agent.act(state = state)
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            agent.remember(state, action, reward, observation, done)
            state = observation
            total_reward += reward
            temp_reward += reward
            window_reward += reward
            if frame_count % step_log_every == 0:
                print(f'step n={frame_count} with reward {temp_reward}')
                temp_reward = 0
            agent.replay(batch_size)

        if (episode+1) % episode_log_every == 0:
            # Averages over the episodes in this window. The summary uses its own
            # accumulator because temp_reward is zeroed by the step log above.
            print(f'Episode {episode+1} \nstep n={(frame_count-frame_count_prev)/episode_log_every}\nreward {window_reward/episode_log_every}\n')
            window_reward = 0
            frame_count_prev=frame_count
        if (episode+1) % checkpoint_every == 0:
            # Name the checkpoint after the number of completed episodes (1-based).
            save_model(agent, episode+1)
finally:
    # Always release the emulator, including on KeyboardInterrupt.
    env.close()
print(f"Episode finished with total reward: {total_reward}")

12
step n=5 with reward -0.05
step n=10 with reward -0.05
step n=15 with reward -0.05
step n=20 with reward -0.05
step n=25 with reward -0.05
step n=30 with reward -0.05
step n=35 with reward -0.05
step n=40 with reward 0.025
step n=45 with reward 0.1
step n=50 with reward 0.1
step n=55 with reward 0.085
step n=60 with reward 0.025
step n=65 with reward -0.05
step n=70 with reward -0.019999999999999997
step n=75 with reward -0.005
step n=80 with reward -0.05
step n=85 with reward -0.05
step n=90 with reward -0.05
step n=95 with reward -0.05
step n=100 with reward -0.05
step n=105 with reward -0.05
step n=110 with reward -0.05
step n=115 with reward -0.05
step n=120 with reward -0.05
step n=125 with reward -0.05
step n=130 with reward -0.05
step n=135 with reward -0.05
step n=140 with reward -0.05
step n=145 with reward -0.05
step n=150 with reward -0.05
step n=155 with reward -0.05
step n=160 with reward -0.05
step n=165 with reward -0.05
step n=170 with reward -0.05
step n=175 with re

KeyboardInterrupt: 

In [None]:
# Build a video-recording evaluation environment and a greedy agent that uses
# the newest checkpoint.
env, input_shape = make_env(game="SonicTheHedgehog-Genesis", render_mode='rgb_array', max_episode_steps=max_steps_per_episode) #rgb_array
env = RecordVideo(
    env,
    video_folder='../Video',    # Folder to save videos
    # Tagged with the configured episode count instead of the training loop's
    # leftover `episode` variable, so this cell also runs on a fresh kernel.
    name_prefix=f'eval-V{version}-E{num_episodes}-S{max_steps_per_episode}',               # Prefix for video filenames
    episode_trigger=lambda x: True    # Record every episode
)
# Take the action count from this environment rather than from a variable left
# behind by the training cell.
action_dim = env.action_space.n
print(action_dim)
# epsilon=0 makes act() purely greedy during evaluation.
agent = ConvDQNAgent(input_shape, action_dim, lr=0.001, gamma=0.99, epsilon=0, epsilon_decay=0.9955, buffer_size=10000)
target_directory = f"../Saved_Models/{Model}"  # Replace with your directory path
model_load_path = get_last_modified_file(target_directory)
if model_load_path is None:
    raise FileNotFoundError(f"No checkpoint found in {target_directory}")
# load_state_dict() copies the checkpoint into the network; state_dict() only
# returns the current weights, so the old call evaluated an untrained model.
agent.model.load_state_dict(torch.load(model_load_path, map_location=agent.device))

In [None]:
episode = 10  # number of evaluation episodes to play
try:
    for temp_episode in range(episode):
        obs, info = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = agent.act(state = obs)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            #print(f"Reward: {reward}")
            total_reward += reward

        print(f"Episode: {temp_episode} Reward: {total_reward}")
finally:
    # Closing the env lets RecordVideo finish the recording in progress and
    # releases the emulator; re-run the setup cell before evaluating again.
    env.close()