# imports

In [None]:
# imports from model.py
import torch
import torch.nn as nn

# imports from memory.py
import numpy as np

# imports from wrappers.py
import gymnasium as gym
from PIL import Image
from ale_py import ALEInterface

# imports from agent.py 
import random
import torch.optim as optim
import torch.nn.functional as F
import copy

# imports from train.py
import matplotlib.pyplot as plt
import seaborn as sns

# Misc.
import os
import time

# Model

In [2]:
class DQNNet(nn.Module):
    """Convolutional Q-network: a stack of frames in, one Q-value per action out.

    The shape notes below hold for inputs of shape (batch, 4, 84, 84); the
    first linear layer is hard-wired to the resulting 64 * 7 * 7 = 3136
    flattened features. Frames are expected to be preprocessed already
    (luminance channel only), as done by the phi function of the DQN paper.

    :param num_of_actions: Size of the output layer (number of Q-values)
    """

    def __init__(self, num_of_actions):
        super().__init__()
        self.num_of_actions = num_of_actions

        # Feature extractor; a ReLU follows every convolution in forward().
        # (4, 84, 84) -> (32, 20, 20)
        self.conv1 = nn.Conv2d(4, 32, kernel_size=(8, 8), stride=4)
        # (32, 20, 20) -> (64, 9, 9)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(4, 4), stride=2)
        # (64, 9, 9) -> (64, 7, 7)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=1)

        # Head; the conv output is flattened to (3136,) before fc1.
        # (3136,) -> (512,), followed by a ReLU
        self.fc1 = nn.Linear(3136, 512)
        # (512,) -> (num_of_actions,), raw Q-values without activation
        self.fc2 = nn.Linear(512, num_of_actions)

    def forward(self, x):
        """Return the Q-values for a batch of stacked frames."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = torch.relu(conv(x))

        # Keep the batch dimension, collapse everything else.
        features = torch.flatten(x, start_dim=1)
        hidden = torch.relu(self.fc1(features))
        return self.fc2(hidden)

# Memory

In [3]:
class Memory:
    """Fixed-capacity replay buffer with uniform random mini-batch sampling.

    Transitions are stored as ``(curr_state, action, reward, next_state, done)``
    tuples. Once ``max_num_transitions`` entries are stored, every new
    transition overwrites the oldest one.

    :param max_num_transitions: Maximum number of transitions kept
    :param mini_batch_size: Number of transitions returned by ``sample_mini_batch``
    :param device: Torch device the sampled tensors are moved to
    """

    def __init__(self, max_num_transitions, mini_batch_size, device="cpu"):
        self.lst = []
        self.max_num_transitions = max_num_transitions
        self.mini_batch_size = mini_batch_size
        self.device = device
        # Slot that holds the oldest transition once the buffer is full.
        self._write_idx = 0

    # this function returns the number of stored transitions
    def __len__(self):
        return len(self.lst)

    def append(self, curr_state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one when the buffer is full."""
        trans = (curr_state, action, reward, next_state, done)

        if len(self.lst) < self.max_num_transitions:
            self.lst.append(trans)
            return

        # Ring-buffer overwrite: O(1) instead of the O(n) shift of list.pop(0).
        # Sampling is uniform, so the storage order of transitions is irrelevant.
        self.lst[self._write_idx] = trans
        self._write_idx = (self._write_idx + 1) % self.max_num_transitions

    def sample_mini_batch(self):
        """Sample ``mini_batch_size`` transitions uniformly (with replacement).

        :return: list of five tensors on ``self.device``, one per transition
            field in storage order: states (float32), actions (int64),
            rewards (float32), next states (float32), done flags (float32).
        :raises Exception: if fewer than ``mini_batch_size`` transitions are stored
        """
        if len(self.lst) < self.mini_batch_size:
            raise Exception('Don\'t try to sample mini-batches while number of stored transitions < mini_batch_size')

        idxs = np.random.randint(0, len(self.lst), self.mini_batch_size)
        samples = [self.lst[idx] for idx in idxs]

        # Transpose the list of transitions into one column per field.
        columns = zip(*samples)

        # Rewards must stay real-valued (casting them to bool would turn -1
        # into +1), and actions use int64 so they can index tensors directly.
        # Fields 4 and 5 are both float32, so the result is well-typed no
        # matter in which of those two slots a caller stores the done flag.
        dtypes = (torch.float32, torch.int64, torch.float32, torch.float32, torch.float32)

        mini_batch = [
            torch.tensor(np.array(column), dtype=dtype).to(self.device)
            for column, dtype in zip(columns, dtypes)
        ]

        return mini_batch

# Wrappers

In [4]:
class AtariImage(gym.Wrapper):
    """
    Gym wrapper to preprocess the environment's observations (frames).

    Every call to ``step`` repeats the given action for up to ``frame_skip``
    frames of the wrapped environment. Each frame is converted to grayscale,
    resized and scaled to [0, 1]; the processed frames are stacked into one
    observation of shape ``(frame_skip, image_shape[0], image_shape[1])``.

    :param env: Environment to wrap
    :param image_shape: (height, width) of every processed frame
    :param frame_skip: Number of frames stacked per observation; the same
        action is applied on each of them
    :raises ValueError: if ``frame_skip`` is smaller than 1
    """
    def __init__(self, env, image_shape=(84, 84), frame_skip=4):
        super().__init__(env)
        if frame_skip < 1:
            raise ValueError('frame_skip must be at least 1')
        self.image_shape = image_shape
        self.frame_skip = frame_skip

        obs_shape = (frame_skip, self.image_shape[0], self.image_shape[1])
        self.observation_space = gym.spaces.Box(shape=obs_shape, low=0, high=1, dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped environment and build the first frame stack.

        ``seed`` and ``options`` are forwarded unchanged so the wrapper honours
        the Gymnasium ``reset`` signature (needed when another wrapper sits on
        top of this one). The remaining ``frame_skip - 1`` frames are collected
        by stepping with action 0 (used here as the "do nothing" action).

        :return: ``(observation, info)``
        """
        obs, info = self.env.reset(seed=seed, options=options)
        frames = [self._process_observations(obs)]

        for _ in range(self.frame_skip - 1):
            obs, _reward, terminated, truncated, info = self.env.step(0)  # Do nothing
            frames.append(self._process_observations(obs))
            if terminated or truncated:
                # Never step an environment that has already ended.
                break

        return self._stack_frames(frames), info

    def step(self, action):
        """Repeat ``action`` for up to ``frame_skip`` frames.

        The loop stops as soon as the wrapped environment reports
        ``terminated`` or ``truncated``; the stack is then padded with the last
        frame so the observation shape stays constant.

        :return: ``(observation, summed_reward, terminated, truncated, info)``
        """
        frames = []
        total_reward = 0
        for _ in range(self.frame_skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            frames.append(self._process_observations(obs))
            total_reward += reward
            if terminated or truncated:
                break

        return self._stack_frames(frames), total_reward, terminated, truncated, info

    def _stack_frames(self, frames):
        """Stack processed frames, repeating the last one up to ``frame_skip``."""
        missing = self.frame_skip - len(frames)
        if missing > 0:
            frames = frames + [frames[-1]] * missing
        return np.stack(frames)

    def _process_observations(self, obs):
        """Convert one raw frame to a float32 grayscale image in [0, 1]."""
        image = Image.fromarray(obs)
        image = image.convert('L')
        # PIL expects (width, height) while image_shape is (height, width).
        image = image.resize((self.image_shape[1], self.image_shape[0]))
        image_array = np.array(image).astype(np.float32)
        image_array /= 255
        return image_array


class ClipReward(gym.Wrapper):
    """
    Gym wrapper that limits every reward to a fixed interval.

    :param env: Environment to wrap
    :param min_reward: Lower bound of the returned reward
    :param max_reward: Upper bound of the returned reward
    """
    def __init__(self, env, min_reward=-1, max_reward=1):
        super().__init__(env)
        self.min_reward, self.max_reward = min_reward, max_reward

    def step(self, action):
        """Step the wrapped environment and return its reward clipped to the bounds."""
        obs, raw_reward, terminated, truncated, info = self.env.step(action)
        clipped = np.clip(raw_reward, self.min_reward, self.max_reward)
        # Return a plain Python float rather than a NumPy scalar.
        return obs, float(clipped), terminated, truncated, info


# Agent

In [5]:
class Agent:
    """DQN agent: online network, target network, replay memory and epsilon-greedy policy.

    :param num_of_actions: Number of discrete actions
    :param network: Q-network to train; a ``DQNNet`` is created when omitted
    :param lr: Learning rate of the RMSprop optimizer
    :param gamma: Discount factor
    :param eps: Initial exploration rate
    :param eps_fframe: Number of learning steps over which ``eps`` decays linearly
    :param eps_final: Final exploration rate
    :param minibatch_size: Number of transitions per learning step
    :param min_training_step: Minimum number of stored transitions before learning starts
    :param max_num_transitions: Capacity of the replay memory
    :param target_interval: Number of learning steps between target-network syncs
    :param device: Torch device used for the networks and the sampled batches
    """

    def __init__(self, num_of_actions=7, network=None, lr=0.00025, gamma=0.99, eps=1.0,
        eps_fframe=1e6, eps_final=0.1, minibatch_size=32, min_training_step=1000,
        max_num_transitions=50000, target_interval=10000, device="cpu"):

        self.num_of_actions = num_of_actions
        if network is None:
            network = DQNNet(num_of_actions)
        self.network = network.to(device)
        self.target_network = copy.deepcopy(self.network)
        self.target_interval = target_interval
        self.learn_count = 0
        # Hyperparameters taken from the paper
        self.optim = torch.optim.RMSprop(self.network.parameters(), lr=lr, alpha=0.95, eps=0.01, momentum=0.95)
        self.minibatch_size = minibatch_size

        self.eps = eps
        self.eps_final = eps_final
        self.eps_step = (eps - eps_final) / eps_fframe
        self.gamma = gamma
        self.min_training_step = min_training_step

        # The replay memory must sample batches of the configured size.
        self.memory = Memory(max_num_transitions=max_num_transitions, mini_batch_size=minibatch_size, device=device)
        self.device = device

    def load_model(self, model_path):
        """Load the online network's weights and copy them to the target network."""
        self.network.load_state_dict(torch.load(model_path, map_location=self.device))
        # Without this the target network would keep its stale, unrelated weights.
        self.target_network.load_state_dict(self.network.state_dict())

    def save_model(self, model_path):
        """Save the online network's weights to ``model_path``."""
        torch.save(self.network.state_dict(), model_path)

    def store_transition(self, obs, action, reward, done, next_obs):
        """Store one transition in the replay memory."""
        # Keywords make sure every value lands in the matching Memory field.
        self.memory.append(curr_state=obs, action=action, reward=reward, next_state=next_obs, done=done)

    def choose_action(self, obs, eps=None):
        """Pick an action epsilon-greedily.

        :param obs: Batched observation passed straight to the online network
        :param eps: Exploration rate; the agent's current ``eps`` when omitted
        :return: Index of the chosen action (int)
        """
        if eps is None:
            eps = self.eps

        if random.random() < eps:
            return random.randint(0, self.num_of_actions - 1)

        with torch.no_grad():
            action_values = self.network(obs)
            return torch.argmax(action_values).item()

    def learn(self):
        """Run one optimisation step on a sampled mini-batch.

        :return: The MSE loss as a float, or ``None`` while the replay memory
            still holds too few transitions.
        """
        # Also wait for a full mini-batch, otherwise sampling would raise.
        if len(self.memory) < max(self.min_training_step, self.minibatch_size):
            return None

        # Same field order as used in store_transition / Memory.append.
        obss, actions, rewards, next_obss, dones = self.memory.sample_mini_batch()

        with torch.no_grad():
            next_qvals = self.target_network(next_obss)
            max_next_q = next_qvals.max(dim=1).values
            # Bootstrap from the next state only when the episode went on.
            bootstrap = torch.where(dones == 0, self.gamma * max_next_q, torch.zeros_like(max_next_q))
            ys = rewards.to(max_next_q.dtype) + bootstrap

        qvals = self.network(obss)
        # Q-value of the action actually taken in every sampled transition.
        ys_p = qvals.gather(1, actions.long().unsqueeze(1)).squeeze(1)

        loss = F.mse_loss(ys_p, ys)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        self.eps = max(self.eps - self.eps_step, self.eps_final)

        self.learn_count += 1

        if self.learn_count % self.target_interval == 0:
            # Copy the weights in place instead of deep-copying the whole module.
            self.target_network.load_state_dict(self.network.state_dict())
            print("Updated target network")

        return loss.item()

# Plotting Function

In [6]:
def plot_logs(game_id, total_interactions, episode_cnt, history_of_total_losses, history_of_total_rewards):
    """Show per-episode total loss and total reward as two side-by-side line plots.

    ``game_id`` and ``total_interactions`` are accepted but not used in the figure.

    :param episode_cnt: Number of episodes; the x axis runs from 1 to this value
    :param history_of_total_losses: Total loss of every episode
    :param history_of_total_rewards: Total reward of every episode
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    axes = axes.flatten()

    episodes = np.arange(1, episode_cnt + 1)

    # One (data, title, y-label) entry per panel, drawn left to right.
    panels = (
        (history_of_total_losses, 'Total Loss over Different Episodes', 'Total MSE Loss'),
        (history_of_total_rewards, 'Total Rewards over each Episodes', 'Total Reward'),
    )
    for ax, (series, title, y_label) in zip(axes, panels):
        sns.lineplot(x=episodes, y=series, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Episodes')
        ax.set_ylabel(y_label)

    plt.suptitle('Total Loss & Reward over each Episodes \n ')

    plt.tight_layout()
    plt.show()

In [None]:
# configuration of the environment
game_id = 'ALE/Breakout-v5'
max_total_interactions = 5000000
frame_skip = 4
# frameskip=1: the emulator advances one frame per step; AtariImage repeats the action frame_skip times itself
env = gym.make(game_id, frameskip=1)
# NOTE: ClipReward sits below AtariImage, so rewards are clipped per emulator frame and then
# summed over the stacked frames; a single agent step can therefore return up to +/- frame_skip.
clip_reward_wrapper = ClipReward(env)
atari_image_wrapper = AtariImage(clip_reward_wrapper, frame_skip=frame_skip)
# add other wrappers if needed
# ...
wrapped_env = atari_image_wrapper # set to the last applied wrapper for more convenient naming

print(f'The Environment for the Game {game_id} has been Initialized.')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# configuration of the agent (the remaining arguments keep their defaults)
agent = Agent(num_of_actions=int(wrapped_env.action_space.n), device=device)


# parameters of the training loop
total_interactions = 0 # total number of the interactions, that the agent had so far (each stack of the frames is counted once).

# periodic checkpoints are written here; create the directory before the first save
checkpoint_dir = 'saved_models'
os.makedirs(checkpoint_dir, exist_ok=True)


# logging variables (accumulated over all episodes)
history_of_total_losses = []
history_of_total_rewards = []
episode_cnt = 0
num_of_last_episodes_to_avg = 100
log_display_step = 10000
start_time = time.time()

print(f'Starting the Training...')
while total_interactions < max_total_interactions:
    episode_finished = False
    episode_total_loss = 0.0
    episode_total_reward = 0.0

    # initializing a new episode
    obs, info = wrapped_env.reset()
    obs = torch.tensor(obs)

    while not episode_finished:
        # choosing action - observing the outcome - storing in replay buffer - learning
        action = agent.choose_action(obs.unsqueeze(0).to(device))
        next_obs, reward, terminated, truncated, info = wrapped_env.step(action)
        next_obs, action = torch.tensor(next_obs), torch.tensor(action)
        episode_finished = terminated or truncated

        agent.store_transition(obs, action, reward, episode_finished, next_obs)
        loss = agent.learn()

        # The bookkeeping below must also run while the replay buffer is still warming up
        # (loss is None): otherwise the stored observation never advances and the
        # environment keeps being stepped after the episode has ended.
        obs = next_obs

        # logging (accumulated over each episode)
        total_interactions += 1
        episode_total_reward += reward
        if loss is not None:
            episode_total_loss += loss

        # display logs every log_display_step + saving
        if (total_interactions % log_display_step) == 0 and (episode_cnt >= num_of_last_episodes_to_avg):
            end_time = time.time()
            avg_loss_of_last_episodes = np.average(history_of_total_losses[-num_of_last_episodes_to_avg:])
            avg_reward_of_last_episodes = np.average(history_of_total_rewards[-num_of_last_episodes_to_avg:])
            print(f'Displaying Logs at the Frame {total_interactions}, Episode {episode_cnt}, Delta Time: {end_time - start_time}')
            print(f'Avg Loss Across {num_of_last_episodes_to_avg} Last Episodes = {avg_loss_of_last_episodes:.4f}')
            print(f'Avg Reward Across {num_of_last_episodes_to_avg} Last Episodes = {avg_reward_of_last_episodes:.4f}')
            start_time = end_time

            agent.save_model(os.path.join(checkpoint_dir, f'agent_it_{total_interactions}.pt'))


    # logging (accumulated over all episodes)
    history_of_total_losses.append(episode_total_loss)
    history_of_total_rewards.append(episode_total_reward)
    episode_cnt += 1

wrapped_env.close()

print(f'Training has been Finished!')

print(f'Storing the Model...')
dir_path = "/kaggle/working/saved_models"

if not os.path.exists(dir_path):
    os.makedirs(dir_path)
    print(f"Directory '{dir_path}' created.")
else:
    print(f"Directory '{dir_path}' already exists.")
# game_id contains a '/', which would otherwise point into a sub-directory that does not exist
safe_game_id = game_id.replace('/', '_')
agent.save_model(os.path.join(dir_path, f'agent_{safe_game_id}.pt'))

print(f'Plotting the Logs...')
plot_logs(game_id, total_interactions, episode_cnt, history_of_total_losses, history_of_total_rewards)

A.L.E: Arcade Learning Environment (version 0.11.2+ecc1138)
[Powered by Stella]


The Environment for the Game ALE/Breakout-v5 has been Initialized.
Starting the Training...


KeyboardInterrupt: 