# Silence Warnings

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Imports

In [2]:
import os
import numpy as np
import random
import csv

import torch
from torch import nn
from torch.serialization import add_safe_globals

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace

from agent import Agent
from wrappers import apply_wrappers

In [3]:
# Allow-list the project's Agent class for torch's restricted (weights_only) unpickler.
# NOTE(review): the checkpoint load later in this notebook passes weights_only=False,
# which presumably does not consult this allow-list — confirm whether this
# registration is still required.
add_safe_globals([Agent])

# Configs

In [4]:
# When True, training environments are created with render_mode='human'.
DISPLAY = False
# Number of episodes the training loop runs in this session.
NUM_OF_EPISODES = 100_000
# A checkpoint save and a pass over the test levels happen every this many episodes.
CKPT_SAVE_INTERVAL = 50     # previously 25

In [5]:
# LOAD = True resumes from the checkpoint and CSV logs stored under PATH.
# LOAD = False builds a fresh agent; PATH is then overwritten with a newly
# created models/model_v<N> directory, so the value below is only used when resuming.
LOAD = False
PATH = 'models/model_v2'

In [8]:
# Config if LOAD = False
# Everything below is only evaluated for a fresh run; when resuming, the same
# names are restored from the checkpoint instead.
if not LOAD:
    # Environment Configuration
    # Worlds 1-6, stages 1-4 -> 24 training levels.
    TRAIN_LEVELS = [
        f"SuperMarioBros-{world}-{level}-v0"
        for world in range(1, 7)
        for level in range(1, 5)
    ]

    # Worlds 7-8, stages 1-4 -> 8 held-out test levels.
    TEST_LEVELS = [
        f"SuperMarioBros-{world}-{level}-v0"
        for world in range(7, 9)
        for level in range(1, 5)
    ]

    NUM_EVAL_EPISODES = 1   # episodes averaged per test level
    # The next three values are forwarded to apply_wrappers(); FRAME_STACK and
    # RESIZE also fix the network's input shape (FRAME_STACK, RESIZE, RESIZE).
    SKIP_FRAME = 4
    RESIZE = 84
    FRAME_STACK = 4

    # Hyperparameter Configuration
    # These are handed positionally to Agent(...) below; their exact semantics
    # live in the agent module (not visible here).
    LR = 0.00025
    GAMMA = 0.9
    EPSILON = 1.0
    EPS_DECAY = 0.99999975
    EPS_MIN = 0.1
    REPLAY_BUFFER_CAPACITY = 100_000
    BATCH_SIZE = 32
    SYNC_NETWORK_RATE = 10_000

    # Network Architecture Configuration
    conv_layers = nn.Sequential(
        nn.Conv2d(FRAME_STACK, 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=1),
        nn.ReLU(),
    )

    # Push one all-zero dummy input through the conv stack to discover how many
    # features the first linear layer receives. This is shape inference only, so
    # it runs under no_grad to avoid building (and keeping alive) an autograd graph.
    with torch.no_grad():
        o = conv_layers(torch.zeros(1, FRAME_STACK, RESIZE, RESIZE))
    conv_out_size = int(np.prod(o.size()))

    # One output per action in RIGHT_ONLY.
    network = nn.Sequential(
        conv_layers,
        nn.Flatten(),
        nn.Linear(conv_out_size, 512),
        nn.ReLU(),
        nn.Linear(512, len(RIGHT_ONLY))
    )

    # Create Agent
    agent = Agent(
        network,
        len(RIGHT_ONLY),
        LR,
        GAMMA,
        EPSILON,
        EPS_DECAY,
        EPS_MIN,
        REPLAY_BUFFER_CAPACITY,
        BATCH_SIZE,
        SYNC_NETWORK_RATE
    )

# Setup

In [11]:
# Set paths and load specified model if LOAD = TRUE
# Resuming: point the checkpoint/CSV paths at the existing run directory and
# restore the environment settings and the agent object saved there.
if LOAD:
    CKPT_PATH, TEST_CSV_PATH, TRAIN_CSV_PATH = (
        os.path.join(PATH, filename)
        for filename in ("checkpoint.pt", "test.csv", "train_log.csv")
    )

    # NOTE(review): weights_only=False performs a full pickle load, which can
    # execute arbitrary code — only load checkpoints produced by this notebook.
    checkpoint = torch.load(CKPT_PATH, weights_only=False)

    # Same keys that the training loop writes when it saves a checkpoint.
    (
        TRAIN_LEVELS,
        TEST_LEVELS,
        NUM_EVAL_EPISODES,
        SKIP_FRAME,
        RESIZE,
        FRAME_STACK,
        agent,
    ) = (
        checkpoint[key]
        for key in (
            'train_levels',
            'test_levels',
            'num_eval_episodes',
            'skip_frame',
            'resize',
            'frame_stack',
            'agent',
        )
    )

In [14]:
# Set paths if LOAD = False
# Fresh run: create the next unused models/model_v<N> directory and start both
# CSV logs with their header rows.
if not LOAD:
    base = "models"
    os.makedirs(base, exist_ok=True)

    # Find the version numbers already taken. Only the leading "model_v" is
    # stripped (str.replace would also remove later occurrences), and
    # isdecimal() is used because isdigit() also accepts characters such as "²"
    # that int() cannot parse — such entries are skipped instead of crashing.
    prefix = "model_v"
    existing = [d for d in os.listdir(base) if d.startswith(prefix)]
    suffixes = [d[len(prefix):] for d in existing]
    nums = [int(suffix) for suffix in suffixes if suffix.isdecimal()]
    next_version = max(nums) + 1 if nums else 1

    PATH = os.path.join(base, f"model_v{next_version}")
    os.makedirs(PATH, exist_ok=True)

    CKPT_PATH = os.path.join(PATH, "checkpoint.pt")
    TEST_CSV_PATH = os.path.join(PATH, "test.csv")
    TRAIN_CSV_PATH = os.path.join(PATH, "train_log.csv")

    # Header row for the per-checkpoint evaluation log.
    with open(TEST_CSV_PATH, "w", newline="") as f:
        csv.writer(f).writerow(["episode", "level", "reward"])

    # Header row for the per-episode training log.
    with open(TRAIN_CSV_PATH, "w", newline="") as f:
        csv.writer(f).writerow(["episode", "learn_step", "level", "reward", "epsilon", "replay_buffer_size"])

# Train

In [15]:
def run_test_episode(level):
    """Evaluate the current agent on one level and return its mean episode reward.

    Runs NUM_EVAL_EPISODES episodes, each in a freshly created and wrapped
    environment, picking actions with ``agent.choose_action_test``. Nothing is
    stored in the replay buffer and ``agent.learn`` is not called.

    Args:
        level: Environment id passed to ``gym_super_mario_bros.make``.

    Returns:
        The mean of the per-episode total rewards.

    Raises:
        ZeroDivisionError: If NUM_EVAL_EPISODES is 0 (no episodes to average).
    """
    rewards = []

    for _ in range(NUM_EVAL_EPISODES):
        env = gym_super_mario_bros.make(level, render_mode='rgb', apply_api_compatibility=True)
        env = JoypadSpace(env, RIGHT_ONLY)
        env = apply_wrappers(env, SKIP_FRAME, RESIZE, FRAME_STACK)

        try:
            state, _ = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = agent.choose_action_test(state)
                state, reward, terminated, truncated, info = env.step(action)
                total_reward += reward
                # With the 5-tuple step API an episode ends on termination OR
                # truncation; stepping past a truncated episode is undefined
                # and could otherwise keep this loop running.
                done = terminated or truncated

            rewards.append(total_reward)

        finally:
            # Always release the emulator, even if the episode raised.
            env.close()

    return sum(rewards) / len(rewards)

In [16]:
def run_training_episode(level):
    """Play one training episode on ``level`` and return its total reward.

    At every step the agent picks an action with ``agent.choose_action``, the
    transition is stored via ``agent.store_in_memory`` and ``agent.learn()`` is
    called once. The environment is rendered with render_mode='human' when
    DISPLAY is True.

    Args:
        level: Environment id passed to ``gym_super_mario_bros.make``.

    Returns:
        The sum of the rewards received during the episode.
    """
    env = gym_super_mario_bros.make(level, render_mode='human' if DISPLAY else 'rgb', apply_api_compatibility=True)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = apply_wrappers(env, SKIP_FRAME, RESIZE, FRAME_STACK)

    try:
        state, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = agent.choose_action(state)
            new_state, reward, terminated, truncated, info = env.step(action)
            total_reward += reward

            # The replay buffer keeps receiving the termination flag only (the
            # same value as before); truncation merely ends the loop below.
            agent.store_in_memory(state, action, reward, new_state, terminated)
            agent.learn()

            state = new_state
            # An episode is over on termination OR truncation; previously a
            # truncated episode would have been stepped again.
            done = terminated or truncated

        return total_reward

    finally:
        # Always release the emulator, even if the episode raised.
        env.close()

In [None]:
# Main training loop: each episode is played on a randomly chosen training
# level, logged to the training CSV, and every CKPT_SAVE_INTERVAL episodes the
# agent is checkpointed and evaluated on every test level.
for i in range(NUM_OF_EPISODES):
    # episode_counter lives on the agent, so it keeps counting across resumed
    # runs, whereas `i` restarts at 0 in every session.
    agent.episode_counter += 1

    level = random.choice(TRAIN_LEVELS)
    train_reward = run_training_episode(level)

    print("Current Episode Number:", i + 1)
    print("Total Episode Number", agent.episode_counter)
    print("Learn step counter:", agent.learn_step_counter)
    print("Total reward:", train_reward)
    print("Epsilon:", agent.epsilon)
    print("Size of replay buffer:", len(agent.replay_buffer))
    print()

    # Save training data at every episode
    with open(TRAIN_CSV_PATH, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            agent.episode_counter,
            agent.learn_step_counter,
            level,
            train_reward,
            agent.epsilon,
            len(agent.replay_buffer)
        ])

    if (i + 1) % CKPT_SAVE_INTERVAL == 0:
        # Save model
        # Write to a temporary file first and swap it in with os.replace, so an
        # interrupted save (e.g. stopping the kernel mid-write) cannot leave a
        # truncated checkpoint behind in place of the last good one.
        tmp_ckpt_path = CKPT_PATH + ".tmp"
        torch.save(
            {
                "agent": agent,
                "train_levels": TRAIN_LEVELS,
                "test_levels": TEST_LEVELS,
                "num_eval_episodes": NUM_EVAL_EPISODES,
                "skip_frame": SKIP_FRAME,
                "resize": RESIZE,
                "frame_stack": FRAME_STACK
            },
            tmp_ckpt_path
        )
        os.replace(tmp_ckpt_path, CKPT_PATH)

        # Save testing rewards
        with open(TEST_CSV_PATH, "a", newline="") as f:
            writer = csv.writer(f)

            for test_level in TEST_LEVELS:
                reward = run_test_episode(test_level)
                writer.writerow([agent.episode_counter, test_level, reward])

Current Episode Number: 1
Total Episode Number 1
Learn step counter: 16
Total reward: 384.0
Epsilon: 0.9999960000074994
Size of replay buffer: 47

Current Episode Number: 2
Total Episode Number 2
Learn step counter: 53
Total reward: 169.0
Epsilon: 0.9999867500861227
Size of replay buffer: 84

Current Episode Number: 3
Total Episode Number 3
Learn step counter: 1548
Total reward: 81.0
Epsilon: 0.9996130748264299
Size of replay buffer: 1579

Current Episode Number: 4
Total Episode Number 4
Learn step counter: 1733
Total reward: 342.0
Epsilon: 0.9995668437850347
Size of replay buffer: 1764

Current Episode Number: 5
Total Episode Number 5
Learn step counter: 1776
Total reward: 243.0
Epsilon: 0.9995560984978754
Size of replay buffer: 1807

Current Episode Number: 6
Total Episode Number 6
Learn step counter: 1826
Total reward: 330.0
Epsilon: 0.9995436041231707
Size of replay buffer: 1857

Current Episode Number: 7
Total Episode Number 7
Learn step counter: 1883
Total reward: 408.0
Epsilon: 