In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.optim.lr_scheduler import ExponentialLR
from torch.distributions import MultivariateNormal, Categorical
import numpy as np
import random
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium.wrappers import FrameStackObservation, TimeLimit, ResizeObservation, RecordVideo, MaxAndSkipObservation
from collections import deque
import retro
import io
import time
import os
import gc
import pandas as pd

In [2]:
RENDER_ENV = False
RESIZE_ENV = True
LOAD_MODEL = False
Render_Frame_rate=4
new_size = (84,120) #Original Size 320, 224
num_episodes = 1000
max_steps_per_episode = 1800
num_stacked_frames = 4
num_frame_skip = 2
Model = "DQN"
save_interval = 100
episode_p_interval = 4
rew_p_interval = 5

#Version
version = 5

#Hiperparametros
LR = 2e-5
GAMMA = 0.999

#DQN y D3QN Params
EPSILON = 1.0
EPSILON_DECAY = 0.99
BUFFER_SIZE = 20000
batch_size = 256
EPSILON_END = 0.01

#D3QN
UPDATE_TARGET_FREQ = 10000

#PPO Params
N_STEPS = 2048
N_UPDATES_PER_ITERATION = 5
CLIP = 0.1
ENTROPY_COEF = 0.01



In [3]:

def get_last_modified_file(directory_path):
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return None
    files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
    if not files:
        return None
    files.sort(key=os.path.getmtime, reverse=True)
    return files[0]

target_directory = f"../Saved_Models/{Model}"  # Replace with your directory path
model_load_path = get_last_modified_file(target_directory)

if model_load_path:
    print(f"The last modified file is: {model_load_path}")
else:
    print("No files found in the directory or directory does not exist.")

The last modified file is: ../Saved_Models/DQN/DQN-Sonic-V1-E20-S1800.pth


In [4]:
try:
    os.mkdir("../Saved_Models")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Saved_Models/PPO")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Saved_Models/PPO/Actor")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Saved_Models/PPO/Critic")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Saved_Models/DQN")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Saved_Models/D3QN")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Video")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Video/PPO")
except Exception as e:
    print(f"Error: {e}")
    
try:
    os.mkdir("../Video/DQN")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Video/D3QN")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Logs/PPO")
except Exception as e:
    print(f"Error: {e}")
    
try:
    os.mkdir("../Logs/DQN")
except Exception as e:
    print(f"Error: {e}")

try:
    os.mkdir("../Logs/D3QN")
except Exception as e:
    print(f"Error: {e}")

Error: [Errno 17] File exists: '../Saved_Models'
Error: [Errno 17] File exists: '../Saved_Models/PPO'
Error: [Errno 17] File exists: '../Saved_Models/PPO/Actor'
Error: [Errno 17] File exists: '../Saved_Models/PPO/Critic'
Error: [Errno 17] File exists: '../Saved_Models/DQN'
Error: [Errno 17] File exists: '../Saved_Models/D3QN'
Error: [Errno 17] File exists: '../Video'
Error: [Errno 17] File exists: '../Video/PPO'
Error: [Errno 17] File exists: '../Video/DQN'
Error: [Errno 17] File exists: '../Video/D3QN'
Error: [Errno 17] File exists: '../Logs/PPO'
Error: [Errno 17] File exists: '../Logs/DQN'
Error: [Errno 17] File exists: '../Logs/D3QN'


In [5]:
#Guardar Modelo
def save_model(agent, episode):
    model_save_path = f'../Saved_Models/{Model}' #ppt para jit, pth para statedict
    model_file_name = f'/{Model}-Sonic-V{version}-E{episode}-S{max_steps_per_episode}.pth'
    try:
        if Model == "DQN":
            torch.save(agent.model.state_dict(), model_save_path+model_file_name)
        if Model == "D3QN":
            torch.save(agent.model_online.state_dict(), model_save_path+model_file_name)
        if Model == "PPO":
            torch.save(agent.model_actor.state_dict(), model_save_path+"/Actor"+model_file_name)
            torch.save(agent.model_critic.state_dict(), model_save_path+"/Critic"+model_file_name)
        #torch.save(agent.model, model_save_path)
        print(f'Modelo exitosamente guardado en {model_save_path}')
    except Exception as e:
        print(f'Error guardando el modelo error: {e}')

In [6]:
class ButtonActionWrapper(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(ButtonActionWrapper, self).__init__(env)
        buttons = env.unwrapped.buttons
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        for action in actions:
            arr = np.array([False] * env.action_space.n)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()

In [7]:
class CustomRewardWrapper(gym.RewardWrapper):
    def __init__(self, env, mov_rew=1, score_rew=1, hp_rew=1, ring_rew=0.5, end_bonus=100):
        super(CustomRewardWrapper, self).__init__(env)
        self.mov_rew = mov_rew
        self.score_rew = score_rew
        self.hp_rew = hp_rew
        self.ring_rew = ring_rew
        self.end_bonus = end_bonus

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        game_variables = self.env.unwrapped.data.lookup_all()

        self.previous_pos_x = game_variables['x']
        self.previous_score = game_variables['score']
        self.previous_lives = game_variables['lives']
        self.previous_rings = game_variables['rings']
        self.previous_end_bonus = game_variables['level_end_bonus']

        return obs, info

    def reward(self, reward):
        #print(f"Reward original: {reward}")
        custom_reward = reward
        game_state = self.env.unwrapped.data

        if game_state:
            game_variables = game_state.lookup_all()
            current_pos_x = game_variables['x']
            current_score = game_variables['score']
            current_lives = game_variables['lives']
            current_rings = game_variables['rings']
            current_end_bonus = game_variables['level_end_bonus']

            # moverse hacia la derecha
            if current_pos_x > self.previous_pos_x:
                #Recompensa
                custom_reward += self.mov_rew
            else:
                #Penalizacion
                custom_reward -= self.mov_rew

            #Recompensa por puntaje
            if current_score > self.previous_score:
                custom_reward += self.score_rew*(current_score-self.previous_score)
            
            #Recompensa por ganar vida
            if current_lives > self.previous_lives:
                custom_reward += self.hp_rew*(current_lives-self.previous_lives)

            #Penalizacion por perder vida
            if current_lives < self.previous_lives:
                custom_reward += (self.hp_rew/2)*(current_lives-self.previous_lives)

            #Recompensa por conseguir anillos
            if current_rings > self.previous_rings:
                custom_reward += self.ring_rew*(current_rings-self.previous_rings)
            
            #Penalizacion por perder anillos
            if current_rings < self.previous_rings:
                custom_reward += (self.ring_rew/2)*(current_rings-self.previous_rings)

            #Recompensa por completar nivel
            if current_end_bonus > self.previous_end_bonus:
                custom_reward += self.end_bonus

            self.previous_pos_x = current_pos_x
            self.previous_score = current_score
            self.previous_lives = current_lives
            self.previous_rings = current_rings
            self.previous_end_bonus = current_end_bonus


        return custom_reward

In [8]:
class StochasticFrameSkip(gym.Wrapper):
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        terminated = False
        truncated = False
        totrew = 0
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i == 0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i == 1:
                self.curac = ac
            if self.supports_want_render and i < self.n - 1:
                ob, rew, terminated, truncated, info = self.env.step(
                    self.curac,
                    want_render=False,
                )
            else:
                ob, rew, terminated, truncated, info = self.env.step(self.curac)
            totrew += rew
            if terminated or truncated:
                break
        return ob, totrew, terminated, truncated, info

In [9]:
class ConvDQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(ConvDQN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.LeakyReLU()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(self.calc_conv_output(input_shape), 512),
            nn.LeakyReLU(),
            nn.Linear(512, num_actions)
        )

    def calc_conv_output(self, shape):
        dummy_input = torch.zeros(1, *shape)
        dummy_output = self.conv_layers(dummy_input)
        return int(np.prod(dummy_output.size()))

    def forward(self, x):
        conv_out = self.conv_layers(x).view(x.size()[0], -1)
        return self.fc_layers(conv_out)

In [10]:
class ConvDQNAgent:
    def __init__(self, env, input_shape, num_actions, lr = 1e-4, gamma = 0.99, epsilon = 1.0, epsilon_decay = 0.99, buffer_size = 10000, epsilon_end=0.01):
        self.env = env
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)
        self.device = 'cuda'
        self.model = ConvDQN(input_shape, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.epsilon_end = epsilon_end

    def preprocess(self, state):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        transform = T.Lambda(lambda x: x.permute(0,3,1,2).reshape(-1, self.input_shape[1], self.input_shape[2]))
        return transform(state)
    
    def preprocess_wv(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32, device=self.device)
        state_tensor = state_tensor / 255.0
        state_tensor = state_tensor.permute(0, 3, 1, 2) 
        C_out = self.input_shape[0]
        H_out = self.input_shape[1] 
        W_out = self.input_shape[2] 
        state_tensor = state_tensor.contiguous().view(C_out, H_out, W_out)
        return state_tensor
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.num_actions)
        state = self.preprocess_wv(state)
        with torch.no_grad():
            q_values = self.model(state.unsqueeze(0))
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state = self.preprocess_wv(next_state)
                target = reward + self.gamma * torch.max(self.model(next_state.unsqueeze(0))).item()
            state = self.preprocess_wv(state)
            target_f = self.model(state.unsqueeze(0)).to("cpu").detach().numpy()
            target_f[0][action] = target
            self.optimizer.zero_grad()
            loss = nn.MSELoss()(torch.tensor(target_f).to(self.device), self.model(state.unsqueeze(0)))
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_end:
            self.epsilon *= self.epsilon_decay

    def replay_vect(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states_tensor = torch.stack([self.preprocess_wv(s) for s in states])
        next_states_tensor = torch.stack([self.preprocess_wv(ns) for ns in next_states])
        actions_tensor = torch.tensor(actions, dtype=torch.long, device=self.device)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        dones_tensor = torch.tensor(dones, dtype=torch.bool, device=self.device)
        with torch.no_grad():
            next_q_values = self.model(next_states_tensor)
            max_next_q = torch.max(next_q_values, dim=1)[0]
        target_q_values = rewards_tensor + self.gamma * max_next_q * (~dones_tensor)
        current_q_values = self.model(states_tensor)
        current_q_for_actions = current_q_values.gather(1, actions_tensor.unsqueeze(1)).squeeze()
        self.optimizer.zero_grad()
        loss = nn.MSELoss()(current_q_for_actions, target_q_values)
        loss.backward()
        self.optimizer.step()
        if self.epsilon > 0.05:
            self.epsilon *= self.epsilon_decay

    def learn(self, total_timesteps):
        temp_reward = 0
        frame_count_prev = 0
        frame_count = 0
        ep_count = 0
        while frame_count < total_timesteps:
            state, _ = self.env.reset()
            total_reward = 0
            done = False
            ep_count += 1
            while not done:
                frame_count += 1
                action = self.act(state = state)
                observation, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                self.remember(state, action, reward, observation, done)
                state = observation
                total_reward += reward
                temp_reward += reward
                if frame_count % rew_p_interval == 0:
                    print(f'step n={frame_count} with reward {temp_reward}')
                    temp_reward = 0
                self.replay_vect(batch_size)

            if (ep_count+1) % episode_p_interval == 0:
                print(f'Episode {ep_count+1} \nstep n={(frame_count-frame_count_prev)/episode_p_interval}\nreward {temp_reward/episode_p_interval}\n')
                temp_reward = 0
                frame_count_prev=frame_count
            if (ep_count+1) % save_interval == 0:
                save_model(self, ep_count)
            print(f"Episode finished with total reward: {total_reward}")

In [11]:
class ConvD3QN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(ConvD3QN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=2, stride=1),
            nn.LeakyReLU()
        )
        self.advance_stream = nn.Sequential(
            nn.Linear(self.calc_conv_output(input_shape), 512),
            nn.LeakyReLU(),
            nn.Linear(512, num_actions)
        )
        self.value_stream = nn.Sequential(
            nn.Linear(self.calc_conv_output(input_shape), 512),
            nn.LeakyReLU(),
            nn.Linear(512, 1)
        )

    def calc_conv_output(self, shape):
        dummy_input = torch.zeros(1, *shape)
        dummy_output = self.conv_layers(dummy_input)
        return int(np.prod(dummy_output.size()))

    def forward(self, x):
        conv_out = self.conv_layers(x).view(x.size()[0], -1)
        advantages = self.advance_stream(conv_out)
        value = self.value_stream(conv_out)
        q_values = value + (advantages - advantages.mean(dim=1, keepdim=True))
        return q_values

In [12]:
class ConvD3QNAgent:
    def __init__(self, env, input_shape, num_actions, lr = 1e-4, gamma = 0.99, epsilon = 1.0, epsilon_decay = 0.99, buffer_size = 10000, update_target_freq=10000, epsilon_end = 0.01):
        self.env = env
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=buffer_size)
        self.device = 'cuda'
        self.model_online = ConvD3QN(input_shape, num_actions).to(self.device)
        self.model_target = ConvD3QN(input_shape, num_actions).to(self.device)
        self.model_target.load_state_dict(self.model_online.state_dict())
        self.optimizer = optim.Adam(self.model_online.parameters(), lr=lr)
        self.update_target_freq = update_target_freq
        self.step_counter = 0
        self.epsilon_end = epsilon_end

    def preprocess(self, state):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        transform = T.Lambda(lambda x: x.permute(0,3,1,2).reshape(-1, self.input_shape[1], self.input_shape[2]))
        return transform(state)
    
    def preprocess_wv(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32, device=self.device)
        state_tensor = state_tensor / 255.0
        state_tensor = state_tensor.permute(0, 3, 1, 2) 
        C_out = self.input_shape[0]
        H_out = self.input_shape[1] 
        W_out = self.input_shape[2] 
        state_tensor = state_tensor.contiguous().view(C_out, H_out, W_out)
        return state_tensor
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.num_actions)
        state = self.preprocess_wv(state)
        with torch.no_grad():
            q_values = self.model_online(state.unsqueeze(0))
        return torch.argmax(q_values).item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay_vect(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states_tensor = torch.stack([self.preprocess_wv(s) for s in states])
        next_states_tensor = torch.stack([self.preprocess_wv(ns) for ns in next_states])
        actions_tensor = torch.tensor(actions, dtype=torch.long, device=self.device)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        dones_tensor = torch.tensor(dones, dtype=torch.bool, device=self.device)
        with torch.no_grad():
            next_q_values_online = self.model_online(next_states_tensor)
            best_action_online_indices = torch.argmax(next_q_values_online, dim=1).unsqueeze(1)
            max_Q_next = self.model_target(next_states_tensor).gather(1, best_action_online_indices).squeeze()
        target_q_values = rewards_tensor + self.gamma * max_Q_next * (~dones_tensor)
        current_q_values = self.model_online(states_tensor)
        current_q_for_actions = current_q_values.gather(1, actions_tensor.unsqueeze(1)).squeeze()
        self.optimizer.zero_grad()
        loss = nn.MSELoss()(current_q_for_actions, target_q_values)
        loss.backward()
        self.optimizer.step()
        self.step_counter += 1
        self.update_target_network()
        if self.epsilon > self.epsilon_end:
            self.epsilon *= self.epsilon_decay
    
    def update_target_network(self):
        if self.step_counter % self.update_target_freq == 0:
            self.model_target.load_state_dict(self.model_online.state_dict())

    def learn(self, total_timesteps):
        temp_reward = 0
        frame_count_prev = 0
        frame_count = 0
        ep_count = 0
        while frame_count < total_timesteps:
            state, _ = self.env.reset()
            total_reward = 0
            done = False
            ep_count += 1
            while not done:
                frame_count += 1
                action = self.act(state = state)
                observation, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                self.remember(state, action, reward, observation, done)
                state = observation
                total_reward += reward
                temp_reward += reward
                if frame_count % rew_p_interval == 0:
                    print(f'step n={frame_count} with reward {temp_reward}')
                    temp_reward = 0
                self.replay_vect(batch_size)

            if (ep_count+1) % episode_p_interval == 0:
                print(f'Episode {ep_count+1} \nstep n={(frame_count-frame_count_prev)/episode_p_interval}\nreward {temp_reward/episode_p_interval}\n')
                temp_reward = 0
                frame_count_prev=frame_count
            if (ep_count+1) % save_interval == 0:
                save_model(self, ep_count)
            print(f"Episode finished with total reward: {total_reward}")

In [13]:
class ConvPPOActor(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(ConvPPOActor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.LeakyReLU()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(self.calc_conv_output(input_shape), 512),
            nn.LeakyReLU(),
            nn.Linear(512, num_actions),
            nn.Softmax(dim=1)
        )

    def calc_conv_output(self, shape):
        dummy_input = torch.zeros(1, *shape)
        dummy_output = self.conv_layers(dummy_input)
        return int(np.prod(dummy_output.size()))

    def forward(self, x):
        conv_out = self.conv_layers(x).view(x.size()[0], -1)
        conv_out = self.fc_layers(conv_out)
        dist = Categorical(conv_out)
        return dist
    
class ConvPPOCritic(nn.Module):
    def __init__(self, input_shape):
        super(ConvPPOCritic, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.LeakyReLU()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(self.calc_conv_output(input_shape), 512),
            nn.LeakyReLU(),
            nn.Linear(512, 1)
        )

    def calc_conv_output(self, shape):
        dummy_input = torch.zeros(1, *shape)
        dummy_output = self.conv_layers(dummy_input)
        return int(np.prod(dummy_output.size()))

    def forward(self, x):
        conv_out = self.conv_layers(x).view(x.size()[0], -1)
        return self.fc_layers(conv_out)

In [14]:
class ConvPPOAgent:
    def __init__(self, env, input_shape, clip=0.2, learning_rate=1e-4, gamma=0.99, n_steps=2048, n_updates_per_iteration=5, entropy_coef=0.01, minibatch_size = 64, max_grad_norm=0.5, lam = 0.95):
        self.env = env
        self.input_shape = input_shape
        self.num_actions = env.action_space.n
        self.lr = learning_rate
        self.gamma = gamma
        self.n_steps = n_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_actor = ConvPPOActor(input_shape, self.num_actions).to(self.device)
        #self.model_actor.half()
        self.model_critic = ConvPPOCritic(input_shape).to(self.device)
        #self.model_critic.half()
        self.actor_optimizer = optim.Adam(self.model_actor.parameters(), lr=self.lr)
        self.critic_optimizer = optim.Adam(self.model_critic.parameters(), lr=self.lr)
        self.n_updates = n_updates_per_iteration
        self.clip = clip
        self.entropy_coef = entropy_coef
        self.mb_size = minibatch_size
        self.max_grad_norm = max_grad_norm
        self.lam = lam
        self.csv_file = f'../Logs/{Model}/PPO.csv'

    def get_action(self, obs):
        dist = self.model_actor(obs.unsqueeze(0))
        #dist = MultivariateNormal(mean, self.cov_mat)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.detach().squeeze()
    
    def preprocess_wv(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32, device=self.device)
        state_tensor = state_tensor / 255.0
        state_tensor = state_tensor.permute(0, 3, 1, 2) 
        C_out = self.input_shape[0]
        H_out = self.input_shape[1] 
        W_out = self.input_shape[2] 
        state_tensor = state_tensor.contiguous().view(C_out, H_out, W_out)
        return state_tensor
    
    def rollout(self):
        batch_obs = []      
        batch_acts = []            
        batch_log_probs = []       
        batch_rews = []          
        batch_lens = []     
        batch_vals = []
        batch_dones = []
        ep_rews = []
        ep_vals = []
        ep_dones = []
        t = 0 
        if not hasattr(self, 'current_obs'):
            obs, _ =self.env.reset()
            self.current_obs = obs
        obs = self.current_obs
        with torch.no_grad():
            while t < self.n_steps:
                t+=1
                obs_tensor = self.preprocess_wv(obs)
                batch_obs.append(obs_tensor)
                action, log_prob = self.get_action(obs_tensor)
                val = self.model_critic(obs_tensor.unsqueeze(0)).detach().cpu().item()#.numpy().flatten()[0]
                obs, reward, terminated, truncated, _ = self.env.step(action)
                df_log = pd.DataFrame([{'rew': reward,'action':action,'logprob': log_prob}])
                if not os.path.exists(self.csv_file):
                    df_log.to_csv(self.csv_file, index=False)
                else:
                    # Append data without writing the header again
                    df_log.to_csv(self.csv_file, mode='a', header=False, index=False)
                done = terminated or truncated
                ep_dones.append(done)
                ep_rews.append(reward)
                ep_vals.append(val)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)
                if done:
                    batch_lens.append(len(ep_rews))
                    batch_rews.append(ep_rews)
                    batch_vals.append(ep_vals)
                    batch_dones.append(ep_dones)
                    obs, _ = self.env.reset()
                    ep_rews = []
                    ep_vals = []
                    ep_dones = []
                    self.current_obs = obs
        if len(ep_rews) > 0:
            batch_lens.append(len(ep_rews))
            batch_rews.append(ep_rews)
            batch_vals.append(ep_vals)
            batch_dones.append(ep_dones)
        final_obs_tensor = self.preprocess_wv(obs)
        with torch.no_grad():
            self.final_val = self.model_critic(final_obs_tensor.unsqueeze(0)).squeeze().cpu().item()

        self.current_obs = obs
        batch_obs = torch.stack(batch_obs)
        #batch_obs = torch.tensor(batch_obs, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.long, device=self.device)
        batch_log_probs = torch.stack(batch_log_probs).float()
        #batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float32, device=self.device)
        return batch_obs, batch_acts, batch_log_probs, batch_rews, batch_lens, batch_vals, batch_dones
    
    # def compute_rtgs(self, batch_rews):
    #     batch_rtgs = []
    #     for ep_rews in reversed(batch_rews):
    #         discounted_reward = 0 
    #         for rew in reversed(ep_rews):
    #             discounted_reward = rew + discounted_reward * self.gamma
    #             batch_rtgs.insert(0, discounted_reward)
    #     batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float, device=self.device)
    #     return batch_rtgs

    def calculate_gae(self, rewards, values, dones):
        batch_advantages = []
        for i, (ep_rews, ep_vals, ep_dones) in enumerate(zip(rewards, values, dones)):
            is_last_segment = (i == len(rewards) - 1)
            advantages = []
            last_advantage = 0
            for t in reversed(range(len(ep_rews))):
                is_terminal = ep_dones[t]
                if is_terminal:
                    next_val = 0
                elif t == len(ep_rews) - 1 and is_last_segment:
                    next_val = self.final_val
                else:
                    next_val = ep_vals[t+1]
                delta = ep_rews[t] + self.gamma * next_val - ep_vals[t]
                advantage = delta + self.gamma * self.lam * (1 - is_terminal) * last_advantage
                last_advantage = advantage
                advantages.insert(0, advantage)
            batch_advantages.extend(advantages)
        return torch.tensor(batch_advantages, dtype=torch.float32, device=self.device)

    def evaluate(self, batch_obs, batch_acts):
        V = self.model_critic(batch_obs).view(-1) #.squeeze()
        dist = self.model_actor(batch_obs)
        #dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)
        entropy_loss = dist.entropy().mean()
        return V, log_probs, entropy_loss
    
    def learn(self, total_timesteps):
        act_t = 0
        while act_t < total_timesteps:
            batch_obs, batch_acts, batch_log_probs, batch_rews, batch_lens, batch_vals, batch_dones = self.rollout()
            act_t += np.sum(batch_lens)
            V_old_flat = [val for ep_vals in batch_vals for val in ep_vals]
            V_old_flat = torch.tensor(V_old_flat, dtype=torch.float32, device=self.device).detach()
            A_k = self.calculate_gae(batch_rews, batch_vals, batch_dones)
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)
            # batch_acts = torch.tensor(batch_acts, dtype=torch.long, device=self.device)
            # batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float, device=self.device)
            batch_rtgs = A_k + V_old_flat
            step = len(batch_obs)
            inds = np.arange(step)
            print(f"Iteration {act_t}/{total_timesteps}, collected {np.sum(batch_lens)} steps")
            assert len(V_old_flat) == len(batch_obs), f"Value mismatch: {len(V_old_flat)} vs {len(batch_obs)}"
            for _ in range(self.n_updates):
                np.random.shuffle(inds)
                for start in range(0, step, self.mb_size):
                    end = start + self.mb_size
                    idx = inds[start:end]
                    mini_obs = batch_obs[idx]
                    mini_acts = batch_acts[idx]
                    mini_log_probs = batch_log_probs[idx]
                    mini_advantage = A_k[idx]
                    mini_rtgs = batch_rtgs[idx]
                    V, curr_log_probs, curr_entropy_loss = self.evaluate(mini_obs, mini_acts)
                    ratios = torch.exp(curr_log_probs - mini_log_probs)
                    surr1 = ratios * mini_advantage
                    surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * mini_advantage
                    actor_loss = (-torch.min(surr1, surr2)).mean() - self.entropy_coef*curr_entropy_loss
                    critic_loss = nn.MSELoss()(V, mini_rtgs)
                    self.actor_optimizer.zero_grad()
                    actor_loss.backward()
                    nn.utils.clip_grad_norm_(self.model_actor.parameters(), self.max_grad_norm)
                    self.actor_optimizer.step()
                    self.critic_optimizer.zero_grad()    
                    critic_loss.backward()
                    nn.utils.clip_grad_norm_(self.model_critic.parameters(), self.max_grad_norm)
                    self.critic_optimizer.step()

In [15]:
try:
    env.close()
except:
    print('No Enviroment to close')

No Enviroment to close


In [16]:
def make_env(*, game, state=None, max_episode_steps=4500, **kwargs):
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    env = ButtonActionWrapper(env)
    #env = CustomRewardWrapper(env)
    env = StochasticFrameSkip(env, n=num_frame_skip, stickprob=0.25)
    if RESIZE_ENV:
        input_shape = (num_stacked_frames*3, *new_size)
        env = ResizeObservation(env, new_size)
    else:
        input_shape = (num_stacked_frames*3, 224, 320)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = FrameStackObservation(env, stack_size=num_stacked_frames)
    return env, input_shape

In [17]:
env, input_shape = make_env(game="SonicTheHedgehog-Genesis", render_mode='rgb_array', scenario = 'contest', max_episode_steps=max_steps_per_episode) #rgb_array, scenario = 'contest'
action_dim = env.action_space.n
print(action_dim)
#venv = VecTransposeImage(VecFrameStack(SubprocVecEnv([make_env] * 8), n_stack=4))
if Model == "DQN":
  agent = ConvDQNAgent(env=env, input_shape=input_shape, num_actions=action_dim, lr=LR, gamma=GAMMA, epsilon=EPSILON, epsilon_decay=EPSILON_DECAY, buffer_size=BUFFER_SIZE)
  if LOAD_MODEL:
    agent.model.state_dict(torch.load(model_load_path, map_location=agent.device))
if Model == "D3QN":
  agent = ConvD3QNAgent(env=env, input_shape=input_shape, num_actions=action_dim, lr=LR, gamma=GAMMA, epsilon=EPSILON, epsilon_decay=EPSILON_DECAY, buffer_size=BUFFER_SIZE, update_target_freq=UPDATE_TARGET_FREQ)
  if LOAD_MODEL:
    agent.model_online.state_dict(torch.load(model_load_path, map_location=agent.device))
    agent.model_target.state_dict(torch.load(model_load_path, map_location=agent.device))
if Model == "PPO":
  agent = ConvPPOAgent(env=env, input_shape=input_shape, learning_rate=LR, gamma=GAMMA, n_steps=N_STEPS, clip=CLIP, n_updates_per_iteration=N_UPDATES_PER_ITERATION, entropy_coef=ENTROPY_COEF)
  if LOAD_MODEL:
    agent.model.state_dict(torch.load(model_load_path, map_location=agent.device))

agent.learn(num_episodes*max_steps_per_episode)
save_model(agent, num_episodes)
env.close()

7
step n=5 with reward 0.0
step n=10 with reward 0.0
step n=15 with reward 0.0
step n=20 with reward 0.0
step n=25 with reward 0.0
step n=30 with reward 0.0
step n=35 with reward 0.0
step n=40 with reward 28.456998467445374
step n=45 with reward 15.1770658493042
step n=50 with reward 0.0
step n=55 with reward 3.79426646232605
step n=60 with reward 1.897133231163025
step n=65 with reward 5.691399693489075
step n=70 with reward 0.9485666155815125
step n=75 with reward -14.228499233722687
step n=80 with reward -14.228499233722687
step n=85 with reward -5.691399693489075
step n=90 with reward -18.97133231163025
step n=95 with reward -2.8456998467445374
step n=100 with reward 0.0
step n=105 with reward 0.0
step n=110 with reward 0.0
step n=115 with reward 0.0
step n=120 with reward 0.0
step n=125 with reward 0.0
step n=130 with reward 0.0
step n=135 with reward 0.0
step n=140 with reward 5.691399693489075
step n=145 with reward 38.89123111963272
step n=150 with reward 35.09696465730667
step

KeyboardInterrupt: 

In [None]:
env, input_shape = make_env(game="SonicTheHedgehog-Genesis", render_mode='rgb_array', scenario = 'contest', max_episode_steps=max_steps_per_episode) #rgb_array
env = RecordVideo(
    env,
    video_folder=f'../Video/{Model}',    # Folder to save videos
    name_prefix=f'eval-V{version}-S{max_steps_per_episode*num_episodes}',               # Prefix for video filenames
    episode_trigger=lambda x: True    # Record every episode
)
action_dim = env.action_space.n
print(action_dim)
target_directory = f"../Saved_Models/{Model}"  # Replace with your directory path
if Model == "DQN":
    model_load_path = get_last_modified_file(target_directory)
    agent = ConvDQNAgent(env=env, input_shape=input_shape, num_actions=action_dim, lr=0.001, gamma=0.99, epsilon=0, epsilon_decay=0.9955, buffer_size=10000)
    agent.model.state_dict(torch.load(model_load_path, map_location=agent.device))
if Model == "D3QN":  
    model_load_path = get_last_modified_file(target_directory)
    agent = ConvD3QNAgent(env=env, input_shape=input_shape, num_actions=action_dim, lr=0.001, gamma=0.99, epsilon=0, epsilon_decay=0.9955, buffer_size=10000)
    agent.model_online.state_dict(torch.load(model_load_path, map_location=agent.device))
    agent.model_target.state_dict(torch.load(model_load_path, map_location=agent.device))
if Model == "PPO":
    agent = ConvPPOAgent(env=env, input_shape=input_shape)
    model_load_path = get_last_modified_file(target_directory+"/Actor")
    agent.model_actor.state_dict(torch.load(model_load_path, map_location=agent.device))
    agent.model_actor.eval()
    model_load_path = get_last_modified_file(target_directory+"/Critic")
    agent.model_critic.state_dict(torch.load(model_load_path, map_location=agent.device))
    agent.model_critic.eval()

In [None]:
episode = 10
for temp_episode in range(episode):
    obs, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        if Model == "PPO":
            obs = agent.preprocess_wv(state=obs)
            action = agent.get_action(obs = obs)[0]
        else:
            action = agent.act(state = obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        #print(f"Reward: {reward}")
        total_reward += reward

    print(f"Episode: {temp_episode} Reward: {total_reward}")