In [1]:
!pip install --no-cache-dir certifi==2022.12.7 charset-normalizer==2.1.1 cloudpickle==2.2.1 cmake==3.25.0 colorama==0.4.6 contourpy==1.1.0 cycler==0.11.0 decorator==4.4.2 filelock==3.9.0 fonttools==4.42.1 fsspec==2023.4.0 gym==0.26.2 gym-notices==0.0.8 gym-super-mario-bros==7.4.0 idna==3.4 imageio==2.31.1 imageio-ffmpeg==0.4.8 Jinja2==3.1.2 kiwisolver==1.4.5 lit==15.0.7 lz4==4.3.2 MarkupSafe==2.1.2 matplotlib==3.7.2 moviepy==1.0.3 mpmath==1.2.1 nes-py==8.2.1 networkx==3.0 numpy==1.23.5 opencv-python==4.8.0.76 packaging==23.1 Pillow==9.3.0 proglog==0.1.10 pyglet==1.5.21 pyparsing==3.0.9 python-dateutil==2.8.2 requests==2.28.1 six==1.16.0 sympy==1.11.1 tensordict==0.2.1 torchrl==0.2.1 tqdm==4.66.1 typing_extensions==4.4.0 urllib3==1.26.13


Collecting certifi==2022.12.7
  Downloading certifi-2022.12.7-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting charset-normalizer==2.1.1
  Downloading charset_normalizer-2.1.1-py3-none-any.whl (39 kB)
Collecting cmake==3.25.0
  Downloading cmake-3.25.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorama==0.4.6
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting contourpy==1.1.0
  Downloading contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.7/300.7 kB[0m [31m200.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cycler==0.11.0
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fi

In [1]:
!pip install torchvision

Collecting torch==2.2.1 (from torchvision)
  Downloading torch-2.2.1-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting typing-extensions>=4.8.0 (from torch==2.2.1->torchvision)
  Downloading typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch==2.2.1->torchvision)
  Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
Collecting triton==2.2.0 (from torch==2.2.1->torchvision)
  Downloading triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.9/167.9 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: typing-extensions, triton, nvidia-nccl-cu12, torch
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.4.0
    Uninstalling typing

In [8]:
%cd drive/MyDrive

/content/drive/MyDrive


In [2]:
import torch
from torch import nn
import numpy as np
from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

from gym import Wrapper
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY

from nes_py.wrappers import JoypadSpace
import os
from PIL import Image
import time
import datetime
import matplotlib.pyplot as plt

  _register_pytree_node(


In [3]:

class AgentNN(nn.Module):
    """Convolutional network that maps an image-like observation to one value per action."""

    def __init__(self, input_shape, n_actions, freeze=False):
        """
        Build the convolutional feature extractor and the fully connected head.

        Args:
            input_shape (tuple): Shape of a single input (channels, height, width).
            n_actions (int): Number of outputs, one per available action.
            freeze (bool): When True, gradients are disabled for every parameter.
        """
        super().__init__()

        in_channels = input_shape[0]

        # Three conv + ReLU stages; kept as its own attribute so the flattened
        # feature size can be probed before the linear head is created.
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        flat_features = self._get_conv_out(input_shape)

        # Full model: feature extractor followed by a two-layer head.
        head = (
            nn.Flatten(),
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )
        self.network = nn.Sequential(self.conv_layers, *head)

        if freeze:
            self._freeze()

        # Prefer the GPU when one is visible to torch.
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.to(self.device)

    def forward(self, x):
        """
        Run the full network on a batch.

        Args:
            x (torch.Tensor): Batch of inputs.

        Returns:
            torch.Tensor: Network output for the batch.
        """
        return self.network(x)

    def _get_conv_out(self, shape):
        """
        Measure how many features the convolutional stack emits for one input.

        Args:
            shape (tuple): Shape of a single input (channels, height, width).

        Returns:
            int: Number of elements produced by the conv stack for a batch of one.
        """
        probe = torch.zeros(1, *shape)
        return self.conv_layers(probe).numel()

    def _freeze(self):
        """Disable gradient tracking for every parameter of the network."""
        for param in self.network.parameters():
            param.requires_grad_(False)


In [4]:
class Agent:
    """Epsilon-greedy Q-learning agent with an online/target network pair and a replay buffer."""

    def __init__(self,
                 input_dims,
                 num_actions,
                 lr=0.00025,
                 gamma=0.9,
                 epsilon=1.0,
                 eps_decay=0.99999975,
                 eps_min=0.1,
                 replay_buffer_capacity=10_000,
                 batch_size=32,
                 sync_network_rate=10000):
        """
        Initialize the agent.

        Args:
            input_dims (tuple): Dimensions of the input observation.
            num_actions (int): Number of possible actions.
            lr (float): Learning rate for the optimizer.
            gamma (float): Discount factor for future rewards.
            epsilon (float): Initial value of exploration rate.
            eps_decay (float): Multiplicative decay applied to epsilon after each learn step.
            eps_min (float): Minimum value of exploration rate.
            replay_buffer_capacity (int): Capacity of the replay buffer.
            batch_size (int): Batch size for training.
            sync_network_rate (int): Number of learn steps between target-network syncs.
        """
        self.num_actions = num_actions
        self.learn_step_counter = 0

        # Hyperparameters
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.batch_size = batch_size
        self.sync_network_rate = sync_network_rate

        # Networks
        self.online_network = AgentNN(input_dims, num_actions)
        self.target_network = AgentNN(input_dims, num_actions, freeze=True)
        # Start the target as an exact copy of the online network; otherwise the
        # bootstrap targets come from unrelated random weights until the first
        # periodic sync.
        self.target_network.load_state_dict(self.online_network.state_dict())

        # Optimizer and loss
        self.optimizer = torch.optim.Adam(self.online_network.parameters(), lr=self.lr)
        self.loss = torch.nn.MSELoss()
        # self.loss = torch.nn.SmoothL1Loss() # Try this loss function instead!

        # Replay buffer
        storage = LazyMemmapStorage(replay_buffer_capacity)
        self.replay_buffer = TensorDictReplayBuffer(storage=storage)

    def choose_action(self, observation):
        """
        Choose an action with an epsilon-greedy policy.

        Args:
            observation: Current observation (anything `np.array` can convert).

        Returns:
            int: A uniformly random action with probability `epsilon`, otherwise
            the argmax of the online network's output.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)

        observation = torch.tensor(np.array(observation), dtype=torch.float32) \
                        .unsqueeze(0) \
                        .to(self.online_network.device)

        # Pure inference: no autograd graph is needed to pick an action.
        with torch.no_grad():
            return self.online_network(observation).argmax().item()

    def decay_epsilon(self):
        """Multiply epsilon by `eps_decay`, never going below `eps_min`."""
        self.epsilon = max(self.epsilon * self.eps_decay, self.eps_min)

    def store_in_memory(self, state, action, reward, next_state, done):
        """
        Store one transition in the replay buffer.

        Args:
            state: Current state.
            action: Action taken.
            reward: Reward received.
            next_state: Next state.
            done: Flag indicating if the episode is done.
        """
        self.replay_buffer.add(TensorDict({
            "state": torch.tensor(np.array(state), dtype=torch.float32),
            "action": torch.tensor(action),
            # Fixed dtype so the TD target matches the float32 network output
            # regardless of the scalar type the environment returns.
            "reward": torch.tensor(reward, dtype=torch.float32),
            "next_state": torch.tensor(np.array(next_state), dtype=torch.float32),
            "done": torch.tensor(done)
        }, batch_size=[]))

    def sync_networks(self):
        """Copy the online weights into the target network every `sync_network_rate` learn steps."""
        if self.learn_step_counter % self.sync_network_rate == 0 and self.learn_step_counter > 0:
            self.target_network.load_state_dict(self.online_network.state_dict())

    def save_model(self, path):
        """Save the online network's parameters to `path`."""
        torch.save(self.online_network.state_dict(), path)

    def load_model(self, path):
        """
        Load parameters from `path` into both the online and the target network.

        The checkpoint is read once and mapped onto the device the networks live
        on, so a file saved on a GPU machine also loads on a CPU-only runtime.
        """
        state_dict = torch.load(path, map_location=self.online_network.device)
        self.online_network.load_state_dict(state_dict)
        self.target_network.load_state_dict(state_dict)

    def learn(self):
        """
        Perform one optimization step on a sampled mini-batch.

        Does nothing until the replay buffer holds at least `batch_size`
        transitions. Otherwise: syncs the target network when due, regresses the
        online network's value for the taken action onto
        `reward + gamma * max_a target(next_state)` (zeroed for terminal
        transitions), then increments the step counter and decays epsilon.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        self.sync_networks()

        self.optimizer.zero_grad()

        samples = self.replay_buffer.sample(self.batch_size).to(self.online_network.device)

        keys = ("state", "action", "reward", "next_state", "done")

        states, actions, rewards, next_states, dones = [samples[key] for key in keys]

        # Value predicted by the online network for the action actually taken.
        predicted_q_values = self.online_network(states)
        predicted_q_values = predicted_q_values[np.arange(self.batch_size), actions.squeeze()]

        # The target is a constant for this step; never backpropagate through it.
        with torch.no_grad():
            target_q_values = self.target_network(next_states).max(dim=1)[0]
            target_q_values = rewards + self.gamma * target_q_values * (1 - dones.float())

        loss = self.loss(predicted_q_values, target_q_values)
        loss.backward()
        self.optimizer.step()

        self.learn_step_counter += 1
        self.decay_epsilon()


In [5]:
class SkipFrame(Wrapper):
    def __init__(self, env, skip):
        """
        A wrapper that repeats each action for several frames.

        Args:
            env (gym.Env): The environment to wrap.
            skip (int): Number of consecutive frames each action is applied to.

        Raises:
            ValueError: If `skip` is smaller than 1, since `step` must advance
                the environment at least once to have anything to return.
        """
        super().__init__(env)
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        self.skip = skip

    def step(self, action):
        """
        Apply `action` for up to `skip` frames and sum the rewards.

        Args:
            action: Action to take.

        Returns:
            tuple: (next_state, total_reward, done, trunc, info), where the
            state, flags and info come from the last frame actually executed.
        """
        total_reward = 0.0
        for _ in range(self.skip):
            next_state, reward, done, trunc, info = self.env.step(action)
            total_reward += reward
            # Stop on either end-of-episode signal: stepping an environment
            # after it reported termination or truncation is undefined.
            if done or trunc:
                break
        return next_state, total_reward, done, trunc, info


def apply_wrappers(env):
    """
    Wrap the environment with the preprocessing pipeline used for training.

    The wrappers are applied innermost-first in the order listed below.

    Args:
        env (gym.Env): The environment to wrap.

    Returns:
        gym.Env: The wrapped environment.
    """
    pipeline = (
        lambda e: SkipFrame(e, skip=4),              # repeat each action for 4 frames
        lambda e: ResizeObservation(e, shape=84),    # resize each frame to 84x84
        GrayScaleObservation,                        # convert frames to grayscale
        # Stack the 4 most recent frames; set lz4_compress=False if it causes issues.
        lambda e: FrameStack(e, num_stack=4, lz4_compress=True),
    )
    for wrap in pipeline:
        env = wrap(env)
    return env


In [6]:
import datetime
import time

def get_current_date_time_string():
    """
    Format the current local date and time for use in file and folder names.

    Returns:
        str: Timestamp in the form YYYY-MM-DD-HH_MM_SS.
    """
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d-%H_%M_%S}"

class Timer():
    """Stopwatch that reports elapsed time and can accumulate samples for averaging."""

    def __init__(self):
        """Initialize the timer; the clock starts now and can be reset with `start()`."""
        self.times = []
        # Set a reference point immediately so get/print/store work even if
        # start() was never called. perf_counter is monotonic, so elapsed
        # values are not affected by system clock adjustments.
        self.t = time.perf_counter()

    def start(self):
        """Start (or restart) the timer."""
        self.t = time.perf_counter()

    def print(self, msg=''):
        """
        Print the time taken since the timer was started.

        Args:
            msg (str): Additional message to print along with the time taken.
        """
        print(f"Time taken: {msg}", self.get())

    def get(self):
        """
        Get the time taken since the timer was started.

        Returns:
            float: Elapsed seconds.
        """
        return time.perf_counter() - self.t

    def store(self):
        """Append the time taken since the timer was started to `times`."""
        self.times.append(self.get())

    def average(self):
        """
        Calculate the average of the stored times.

        Returns:
            float: Mean of the stored samples, or 0.0 when nothing has been stored.
        """
        if not self.times:
            return 0.0
        return sum(self.times) / len(self.times)


In [None]:
# ---- Configuration -------------------------------------------------------
ENV_NAME = 'SuperMarioBros-1-1-v0'
SHOULD_TRAIN = False
DISPLAY = True
CKPT_SAVE_INTERVAL = 500           # save a checkpoint every this many episodes
NUM_OF_EPISODES = 5000             # total number of episodes to run
CKPT_PATH = 'model_1500_iter.pt'   # checkpoint loaded when not training
PLOT_PATH = '1500.png'             # where the reward plot is written
EVAL_EPSILON = 0.2                 # fixed exploration rate used when not training

if torch.cuda.is_available():
    print("Using CUDA device:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available")

# Checkpoints are only written while training, so only create the run
# directory in that case instead of leaving an empty folder per evaluation run.
model_path = os.path.join("models", get_current_date_time_string())
if SHOULD_TRAIN:
    os.makedirs(model_path, exist_ok=True)

# Create and wrap the environment ('rgb_array' is the off-screen render mode).
env = gym_super_mario_bros.make(
    ENV_NAME,
    render_mode='human' if DISPLAY else 'rgb_array',
    apply_api_compatibility=True,
)
env = JoypadSpace(env, RIGHT_ONLY)
env = apply_wrappers(env)

# Create the agent
agent = Agent(input_dims=env.observation_space.shape, num_actions=env.action_space.n)

if not SHOULD_TRAIN:
    # Evaluation: load trained weights and keep a small, constant exploration rate.
    agent.load_model(CKPT_PATH)
    agent.epsilon = EVAL_EPSILON
    agent.eps_min = 0.0
    agent.eps_decay = 0.0

# ---- Episode loop --------------------------------------------------------
rewards = []
episodes = []
try:
    for i in range(NUM_OF_EPISODES):
        print("Episode:", i)
        state, _ = env.reset()
        done = False
        truncated = False
        total_reward = 0
        # End the episode on either signal; continuing to step after a
        # truncation is undefined for the environment.
        while not (done or truncated):
            action = agent.choose_action(state)                            # choose an action
            new_state, reward, done, truncated, info = env.step(action)    # apply it
            total_reward += reward

            if SHOULD_TRAIN:
                agent.store_in_memory(state, action, reward, new_state, done)  # store in replay buffer
                agent.learn()

            state = new_state

        print("Total reward:", total_reward, "Epsilon:", agent.epsilon, "Size of replay buffer:", len(agent.replay_buffer), "Learn step counter:", agent.learn_step_counter)

        rewards.append(total_reward)
        episodes.append(i)
        if SHOULD_TRAIN and (i + 1) % CKPT_SAVE_INTERVAL == 0:
            agent.save_model(os.path.join(model_path, "model_" + str(i + 1) + "_iter.pt"))
finally:
    # Release the emulator/window even if the run is interrupted or fails.
    env.close()

# ---- Plot rewards --------------------------------------------------------
fig, ax = plt.subplots()
ax.plot(episodes, rewards, label='Total reward')
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.set_title('Total reward per episode')
ax.legend()

# Save before showing: once plt.show() returns the figure may already be
# released, which makes a later savefig write an empty image.
fig.savefig(PLOT_PATH)
plt.show()

Total reward: 858.0 Epsilon: 0.933428876898244 Size of replay buffer: 10000 Learn step counter: 275562
Total reward: 858.0
Episode: 957
Total reward: 651.0 Epsilon: 0.9332041809168768 Size of replay buffer: 10000 Learn step counter: 276525
Total reward: 651.0
Episode: 958
Total reward: 785.0 Epsilon: 0.9331362927736807 Size of replay buffer: 10000 Learn step counter: 276816
Total reward: 785.0
Episode: 959
Total reward: 218.0 Epsilon: 0.9331253284852838 Size of replay buffer: 10000 Learn step counter: 276863
Total reward: 218.0
Episode: 960
Total reward: 813.0 Epsilon: 0.9330901036645983 Size of replay buffer: 10000 Learn step counter: 277014
Total reward: 813.0
Episode: 961
Total reward: 696.0 Epsilon: 0.933013826656944 Size of replay buffer: 10000 Learn step counter: 277341
Total reward: 696.0
Episode: 962
Total reward: 974.0 Epsilon: 0.9329018717011671 Size of replay buffer: 10000 Learn step counter: 277821
Total reward: 974.0
Episode: 963
Total reward: 635.0 Epsilon: 0.932864556367