In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.optim.lr_scheduler import ExponentialLR
import numpy as np
import random
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium.wrappers import TimeLimit, ResizeObservation, RecordVideo, MaxAndSkipObservation
from collections import deque
import retro
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
import io
import time

In [2]:
# --- Switches -------------------------------------------------------------
# RENDER_ENV, LOAD_MODEL and Render_Frame_rate are not referenced by the cells
# visible in this notebook; they are kept for code that may still read them.
RENDER_ENV = False
LOAD_MODEL = False
Render_Frame_rate = 4
RESIZE_ENV = True  # when True the env is wrapped in ResizeObservation(new_size)

# --- Observation pipeline -------------------------------------------------
new_size = (84, 120)    # (height, width) after resizing; the native frame is 320 wide x 224 tall
num_stacked_frames = 4  # frames concatenated by FrameStackObservation
num_frame_skip = 2      # `skip` argument of MaxAndSkipObservation

# --- Training budget ------------------------------------------------------
num_episodes = 200        # total_timesteps = num_episodes * max_episode_steps
max_episode_steps = 5400  # TimeLimit truncation point
batch_size = 32           # not referenced by the visible cells

# --- Checkpoint naming ----------------------------------------------------
version = 3  # embedded in the saved-model file names
prev_model = 'DQN-Sonic-V1-E15-S5400.pth'  # not referenced by the visible cells

In [3]:
class ButtonActionWrapper(gym.Wrapper):
    """
    Expose a ``Discrete`` action space in which every action presses exactly one button.

    Action ``i`` presses ``buttons[i]`` and releases everything else.

    Args:
        env: Environment to wrap. If ``env.unwrapped`` exposes a ``buttons``
            list/tuple, it is used to place each requested button in the column
            the wrapped environment expects.
        buttons: Names of the buttons the agent may press, one per discrete action.

    The previous implementation always sent a one-hot vector of length
    ``len(buttons)``, which is only valid when ``buttons`` is the environment's
    complete button list in the environment's own order. Passing a subset or a
    different order produced an array of the wrong width or pressed the wrong
    button. When ``buttons`` does match the environment's list exactly, the
    table built here is the same identity matrix as before.
    """
    def __init__(self, env, buttons):
        super().__init__(env)
        self.buttons = buttons
        # Row i is the button array sent to the wrapped env for discrete action i.
        self._actions = self._build_action_table(env, buttons)
        self.action_space = gym.spaces.Discrete(len(buttons))

    @staticmethod
    def _build_action_table(env, buttons):
        """Return an int8 array with one row per entry of ``buttons``."""
        env_buttons = getattr(env.unwrapped, "buttons", None)
        layout_known = (
            isinstance(env_buttons, (list, tuple))
            and all(name in env_buttons for name in buttons)
        )
        if not layout_known:
            # Unknown layout: keep the original positional one-hot encoding.
            return np.identity(len(buttons), dtype=np.int8)

        env_buttons = list(env_buttons)
        table = np.zeros((len(buttons), len(env_buttons)), dtype=np.int8)
        for row, name in enumerate(buttons):
            table[row, env_buttons.index(name)] = 1
        return table

    def step(self, action):
        """Translate the discrete ``action`` into a button array and step the wrapped env."""
        return self.env.step(self._actions[action])

    def reset(self, **kwargs):
        """Reset the wrapped env; observations and info pass through unchanged."""
        return self.env.reset(**kwargs)

In [4]:
class CustomRewardWrapper(gym.RewardWrapper):
    """
    Reward shaping layered on top of the reward returned by the wrapped env.

    After every step the game variables ``x``, ``score``, ``lives``, ``rings``
    and ``level_end_bonus`` are read from ``env.unwrapped.data`` and compared
    with the values seen on the previous step (or at reset):

    * ``x`` increased: ``+mov_rew``; otherwise (including standing still): ``-mov_rew / 2``
    * score increased: ``+score_rew`` per point gained
    * lives: ``+hp_rew`` per life gained, ``-hp_rew / 2`` per life lost
    * rings: ``+ring_rew`` per ring gained, ``-ring_rew / 2`` per ring lost
    * ``level_end_bonus`` increased: ``+end_bonus`` (once per increase)
    """
    def __init__(self, env, mov_rew=0.01, score_rew=0.05, hp_rew=4, ring_rew=1, end_bonus=100):
        super(CustomRewardWrapper, self).__init__(env)
        self.mov_rew, self.score_rew = mov_rew, score_rew
        self.hp_rew, self.ring_rew = hp_rew, ring_rew
        self.end_bonus = end_bonus

    def reset(self, **kwargs):
        """Reset the wrapped env and record the starting values of the tracked variables."""
        obs, info = self.env.reset(**kwargs)
        snapshot = self.env.unwrapped.data.lookup_all()

        self.previous_pos_x = snapshot['x']
        self.previous_score = snapshot['score']
        self.previous_lives = snapshot['lives']
        self.previous_rings = snapshot['rings']
        self.previous_end_bonus = snapshot['level_end_bonus']

        return obs, info

    def reward(self, reward):
        """Return ``reward`` plus the shaping terms described in the class docstring."""
        game_state = self.env.unwrapped.data
        if not game_state:
            # No game data available: hand the reward back untouched.
            return reward

        snapshot = game_state.lookup_all()
        pos_x, score, lives, rings, bonus = (
            snapshot[key] for key in ('x', 'score', 'lives', 'rings', 'level_end_bonus')
        )

        shaped = reward

        # Horizontal progress: reward moving right, penalise everything else.
        if pos_x > self.previous_pos_x:
            shaped += self.mov_rew
        else:
            shaped -= (self.mov_rew/2)

        # Score gained since the previous step.
        if score > self.previous_score:
            shaped += self.score_rew*(score-self.previous_score)

        # Lives: a gain is rewarded; a loss is penalised at half the rate
        # (the difference is negative, so the term subtracts).
        if lives > self.previous_lives:
            shaped += self.hp_rew*(lives-self.previous_lives)
        elif lives < self.previous_lives:
            shaped += (self.hp_rew/2)*(lives-self.previous_lives)

        # Rings: same scheme as lives.
        if rings > self.previous_rings:
            shaped += self.ring_rew*(rings-self.previous_rings)
        elif rings < self.previous_rings:
            shaped += (self.ring_rew/2)*(rings-self.previous_rings)

        # Flat bonus whenever the end-of-level bonus counter goes up.
        if bonus > self.previous_end_bonus:
            shaped += self.end_bonus

        # Remember this step's values for the next comparison.
        self.previous_pos_x = pos_x
        self.previous_score = score
        self.previous_lives = lives
        self.previous_rings = rings
        self.previous_end_bonus = bonus

        return shaped

In [5]:
class ConvDQN(nn.Module):
    """
    Convolutional Q-network: three conv layers followed by a two-layer linear head.

    Args:
        input_shape: ``(channels, height, width)`` of a single network input.
        num_actions: Number of Q-values produced per input.
    """
    def __init__(self, input_shape, num_actions):
        super().__init__()
        in_channels = input_shape[0]

        feature_extractor = [
            nn.Conv2d(in_channels, 16, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=2, stride=1),
            nn.LeakyReLU(),
        ]
        self.conv_layers = nn.Sequential(*feature_extractor)

        # The head's input width depends on the spatial size left after the convolutions.
        flat_features = self.calc_conv_output(input_shape)
        head = [
            nn.Linear(flat_features, 512),
            nn.LeakyReLU(),
            nn.Linear(512, num_actions),
        ]
        self.fc_layers = nn.Sequential(*head)

    def calc_conv_output(self, shape):
        """Return the number of features the conv stack emits for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(self.conv_layers(probe).numel())

    def forward(self, x):
        """Map a batch ``(N, C, H, W)`` to Q-values of shape ``(N, num_actions)``."""
        features = self.conv_layers(x)
        flattened = features.view(x.size(0), -1)
        return self.fc_layers(flattened)

In [6]:
class FrameStackObservation(gym.Wrapper):
    """
    Keep the last ``stack_size`` observations and return them concatenated on the channel axis.

    The wrapped env must produce ``(H, W, C)`` observations; this wrapper emits
    ``(H, W, C * stack_size)``. With the 3-channel frames used in this notebook
    and a stack of 4 that means 12 channels.
    """
    def __init__(self, env, stack_size):
        super().__init__(env)
        self.stack_size = stack_size
        # Bounded deque: appending a new frame silently drops the oldest one.
        self.frames = deque(maxlen=stack_size)

        source_space = env.observation_space
        height, width, channels = source_space.shape
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(height, width, channels * stack_size),
            dtype=source_space.dtype,
        )
        print(f"MODIFIED FrameStack (RGB): Final shape is (H, W, C*T) = {self.observation_space.shape}")

    def _get_observation(self):
        """Concatenate the buffered frames along the last axis, oldest first."""
        assert len(self.frames) == self.stack_size
        return np.concatenate(list(self.frames), axis=-1)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the whole stack with the first observation."""
        observation, info = self.env.reset(**kwargs)
        self.frames.extend(observation for _ in range(self.stack_size))
        return self._get_observation(), info

    def step(self, action):
        """Step the wrapped env, push the new frame and return the stacked observation."""
        observation, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(observation)
        return self._get_observation(), reward, terminated, truncated, info

In [7]:
class ConvDQNAgent:
    """
    Epsilon-greedy DQN agent with a uniform replay buffer and a single online network.

    States handed to ``act``/``remember`` are expected in the layout
    ``(Stack, Height, Width, Channels)``; they are converted to the
    ``(Stack * Channels, Height, Width)`` layout consumed by ``ConvDQN``.

    NOTE(review): the ``FrameStackObservation`` wrapper defined in this notebook
    emits ``(H, W, C*T)`` observations, which is not the layout expected here —
    confirm the observation format before pairing the two.

    Args:
        input_shape: ``(Stack, Height, Width)`` (3 colour channels are then
            assumed) or ``(Stack, Height, Width, Channels)``.
        num_actions: Size of the discrete action space.
        lr: Adam learning rate.
        gamma: Discount factor used in the bootstrap target.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after every
            replay call while ``epsilon > 0.01``.
        buffer_size: Maximum number of transitions kept in the replay buffer.
    """
    def __init__(self, input_shape, num_actions, lr, gamma, epsilon, epsilon_decay, buffer_size):
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)

        if len(input_shape) == 4:
            # (Stack, Height, Width, Channels) -> (Stack * Channels, Height, Width)
            conv_input_shape = (input_shape[0] * input_shape[3], input_shape[1], input_shape[2])
        else:
            # (Stack, Height, Width) with RGB frames assumed.
            num_channels = 3
            conv_input_shape = (input_shape[0] * num_channels, *input_shape[1:])

        # Fall back to the CPU instead of crashing on machines without CUDA.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = ConvDQN(conv_input_shape, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-5)

    def preprocess(self, state):
        """Convert one ``(Stack, H, W, C)`` state into a float32 ``(Stack * C, H, W)`` tensor."""
        state = torch.as_tensor(state, dtype=torch.float32)
        # (Stack, H, W, C) -> (Stack, C, H, W) -> (Stack * C, H, W)
        state = state.permute(0, 3, 1, 2)
        return state.reshape(-1, state.shape[2], state.shape[3])

    def preprocess_vectorized(self, state):
        """
        Batched counterpart of ``preprocess``.

        Accepts either one state ``(Stack, H, W, C)`` or a batch
        ``(Batch, Stack, H, W, C)``, as an array-like or a tensor, and returns
        ``(Stack * C, H, W)`` or ``(Batch, Stack * C, H, W)`` respectively.
        The channel ordering matches ``preprocess``.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        if state.ndim == 5:
            batch, stack, height, width, channels = state.shape
            # (B, S, H, W, C) -> (B, S, C, H, W) -> (B, S * C, H, W)
            return state.permute(0, 1, 4, 2, 3).reshape(batch, stack * channels, height, width)
        return self.preprocess(state)

    def act(self, state):
        """Pick a random action with probability ``epsilon``, otherwise the greedy one."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.num_actions)
        state = self.preprocess(state).unsqueeze(0).to(self.device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def _decay_epsilon(self):
        """Apply one multiplicative decay step while epsilon is above its 0.01 floor."""
        if self.epsilon > 0.01:
            self.epsilon *= self.epsilon_decay

    def replay(self, batch_size):
        """
        Sample ``batch_size`` transitions and take one optimiser step per transition.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        loss_fn = nn.MSELoss()
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state = self.preprocess(next_state).unsqueeze(0).to(self.device)
                # The bootstrap value is a constant target; skip graph construction.
                with torch.no_grad():
                    target = reward + self.gamma * torch.max(self.model(next_state)).item()
            state = self.preprocess(state).unsqueeze(0).to(self.device)
            current_q_values = self.model(state)
            # Only the taken action's entry differs from the prediction, so only it contributes to the loss.
            target_f = current_q_values.clone().detach()
            target_f[0][action] = target

            self.optimizer.zero_grad()
            loss = loss_fn(current_q_values, target_f)
            loss.backward()
            self.optimizer.step()
        self._decay_epsilon()

    def replay_vectorized(self, batch_size):
        """
        Sample ``batch_size`` transitions and take a single optimiser step on the whole batch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        Like ``replay``, it decays ``epsilon`` once per call.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stack on the host first: building a tensor from a tuple of arrays is very slow.
        states_tensor = self.preprocess_vectorized(np.stack(states)).to(self.device)
        next_states_tensor = self.preprocess_vectorized(np.stack(next_states)).to(self.device)
        actions_tensor = torch.as_tensor(actions, dtype=torch.long, device=self.device)
        rewards_tensor = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        dones_tensor = torch.as_tensor(dones, dtype=torch.bool, device=self.device)

        current_q_values = self.model(states_tensor)

        # Bellman target. It must be a constant: without no_grad the loss would
        # also back-propagate through the next-state Q-values.
        with torch.no_grad():
            max_next_q = self.model(next_states_tensor).max(dim=1)[0]
            not_done = (~dones_tensor).to(max_next_q.dtype)
            target_q_values = rewards_tensor + self.gamma * max_next_q * not_done

        # Only the taken actions' entries differ from the prediction.
        target_f = current_q_values.clone().detach()
        batch_index = torch.arange(batch_size, device=self.device)
        target_f[batch_index, actions_tensor] = target_q_values

        self.optimizer.zero_grad()
        loss = nn.MSELoss()(current_q_values, target_f)
        loss.backward()
        self.optimizer.step()

        self._decay_epsilon()

In [8]:
# Save model
def save_model(agent, episode):
    """
    Write ``agent.model``'s ``state_dict`` to ``../Saved_Models/DQN/``.

    The file name embeds the notebook-level ``version`` and ``max_episode_steps``
    together with ``episode``. The target directory is created when missing.
    Saving is best-effort: any failure is printed, not raised.

    Args:
        agent: Object exposing a ``model`` attribute with a ``state_dict()`` method.
        episode: Episode number embedded in the file name.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # .pth holds a state_dict (the original note reserves .ppt for JIT exports).
    model_save_path = f'../Saved_Models/DQN/DQN-Sonic-V{version}-E{episode}-S{max_episode_steps}.pth'
    try:
        # torch.save does not create missing parent directories.
        Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)
        torch.save(agent.model.state_dict(), model_save_path)
        print(f'Modelo exitosamente guardado en {model_save_path}')
    except Exception as e:
        print(f'Error guardando el modelo error: {e}')

In [9]:
# Close an environment left over from an earlier run of this notebook, if any.
try:
    env.close()
except NameError:
    # `env` has not been defined yet in this kernel session.
    print('no enviroment to close')
except Exception as exc:
    # Closing is best-effort, but report what went wrong instead of hiding it.
    # KeyboardInterrupt/SystemExit are deliberately not caught.
    print(f'failed to close environment: {exc}')

no enviroment to close


In [10]:
# Build the training environment, train an SB3 DQN on it and save the result.
env = retro.make(game="SonicTheHedgehog-Genesis", render_mode='rgb_array')
try:
    # One discrete action per button. Alternative reduced set noted by the author: ['LEFT', 'RIGHT', 'A']
    env = ButtonActionWrapper(env, buttons=['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT', 'C', 'Y', 'X', 'Z'])
    env = CustomRewardWrapper(env)
    env = MaxAndSkipObservation(env, skip=num_frame_skip)
    if RESIZE_ENV:
        input_shape = (num_stacked_frames, *new_size)
        env = ResizeObservation(env, new_size)
    else:
        input_shape = (num_stacked_frames, 224, 320)
    # NOTE: input_shape is not consumed by the SB3 model below; it is still
    # assigned in case other cells read it.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = FrameStackObservation(env, stack_size=num_stacked_frames)
    action_dim = env.action_space.n
    print(action_dim)

    model = DQN("CnnPolicy", env, verbose=1, buffer_size=10000)
    model.learn(total_timesteps=num_episodes*max_episode_steps)
    model.save(f'../Saved_Models/DQN/SB3-DQN-Sonic-V{version}-E{num_episodes}-S{max_episode_steps}')
finally:
    # Always release the emulator, even if training fails or is interrupted:
    # stable-retro refuses to create a second emulator in the same process
    # while one is still open.
    env.close()

MODIFIED FrameStack (RGB): Final shape is (H, W, C*T) = (84, 120, 12)
12
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.4e+03  |
|    ep_rew_mean      | 158      |
|    exploration_rate | 0.81     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 158      |
|    time_elapsed     | 136      |
|    total_timesteps  | 21600    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000616 |
|    n_updates        | 5374     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.4e+03  |
|    ep_rew_mean      | 182      |
|    exploration_rate | 0.62     |
| time/               |          |
|    episodes         | 8        |
|    fps             

In [19]:
# Close the current environment before a new one is created in a later cell.
env.close()

In [12]:
# `env` is the outermost wrapper; `buttons` lives on ButtonActionWrapper further
# down the stack. Look it up explicitly instead of relying on implicit attribute
# forwarding through wrappers (deprecated in Gymnasium 0.29; believed removed in 1.x).
print(env.get_wrapper_attr('buttons'))

['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT', 'C', 'Y', 'X', 'Z']


In [20]:
# Build a human-rendered environment and load the trained SB3 DQN for evaluation.
episode = 0
env = retro.make(game="SonicTheHedgehog-Genesis", render_mode='human')
try:
    # One discrete action per button. Alternative reduced set noted by the author: ['LEFT', 'RIGHT', 'A']
    env = ButtonActionWrapper(env, buttons=['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT', 'C', 'Y', 'X', 'Z'])
    env = CustomRewardWrapper(env)
    env = MaxAndSkipObservation(env, skip=num_frame_skip)
    if RESIZE_ENV:
        input_shape = (num_stacked_frames, *new_size)
        env = ResizeObservation(env, new_size)
    else:
        input_shape = (num_stacked_frames, 224, 320)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = FrameStackObservation(env, stack_size=num_stacked_frames)
    action_dim = env.action_space.n
    print(action_dim)

    # DQN.load builds the model from the checkpoint, so no throwaway
    # DQN(...) instance (and its replay buffer) is created first.
    model = DQN.load(f'../Saved_Models/DQN/SB3-DQN-Sonic-V{version}-E{num_episodes}-S{max_episode_steps}', env=env)
except BaseException:
    # Setup failed: release the emulator so the cell can be re-run, then re-raise.
    env.close()
    raise

MODIFIED FrameStack (RGB): Final shape is (H, W, C*T) = (84, 120, 12)
12
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [23]:
# Play 10 evaluation episodes with the greedy (deterministic) policy and
# print each episode's total shaped reward.
for episode in range(1, 11):
    obs, info = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        # An episode ends on game termination or on the TimeLimit truncation.
        done = terminated or truncated
        env.render()

    print(f"Episode: {episode} Reward: {total_reward}")

Episode: 1 Reward: -29.03000000000139
Episode: 2 Reward: -29.03000000000139
Episode: 3 Reward: -29.03000000000139
Episode: 4 Reward: -29.03000000000139
Episode: 5 Reward: -29.03000000000139
Episode: 6 Reward: -29.03000000000139
Episode: 7 Reward: -29.03000000000139
Episode: 8 Reward: -29.03000000000139
Episode: 9 Reward: -29.03000000000139
Episode: 10 Reward: -29.03000000000139


In [None]:
# Load model (TODO: this cell is an empty placeholder; nothing is implemented yet)
