# Silence Warnings

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Imports

In [2]:
import os
import random
import csv

import torch

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace

from agent import Agent
from wrappers import apply_wrappers

# Configs

In [3]:
# General run settings
# DISPLAY: when True the training environment is created with
# render_mode='human', otherwise with 'rgb'.
DISPLAY = False
# NUM_OF_EPISODES: number of iterations of the training loop in this session.
NUM_OF_EPISODES = 50_000
# CKPT_SAVE_INTERVAL: a checkpoint is saved and the test levels are evaluated
# every this many training episodes.
CKPT_SAVE_INTERVAL = 25

In [4]:
# Loading
# LOAD: True resumes the agent stored under PATH; False starts a new agent.
LOAD = True
# PATH: model directory to resume from. Only used as given when LOAD is True;
# a fresh run overwrites it with a newly numbered directory.
PATH = "models/model_v3"

In [5]:
# These values are only defined for a fresh run. When resuming, the environment
# settings come from the checkpoint and the hyperparameters are passed straight
# from the checkpoint into the Agent constructor.
if not LOAD:
    # Environment Configuration
    TRAIN_LEVELS = ['SuperMarioBros-1-1-v0']  # one is picked at random per training episode
    TEST_LEVELS = ['SuperMarioBros-1-1-v0']   # each is evaluated at every checkpoint
    NUM_EVAL_EPISODES = 1                     # evaluation episodes averaged per test level
    SKIP_FRAME = 4                            # forwarded to apply_wrappers
    RESIZE = 84                               # forwarded to apply_wrappers; also the agent's input height/width
    FRAME_STACK = 4                           # forwarded to apply_wrappers; also the agent's first input dimension

    # Hyperparameters (passed unchanged to the Agent constructor)
    LR = 0.00025
    GAMMA = 0.9
    EPSILON = 1.0
    EPS_DECAY = 0.99999975
    EPS_MIN = 0.1
    REPLAY_BUFFER_CAPACITY = 100_000
    BATCH_SIZE = 32
    SYNC_NETWORK_RATE = 10_000

# Create or Load Agent

In [6]:
# Resume: rebuild the agent exactly as it was saved by save_checkpoint().
if LOAD:
    # Locations of the saved state inside the model directory
    CHECKPOINT_PATH = os.path.join(PATH, "checkpoint.pt")
    BUFFER_PATH = os.path.join(PATH, "buffer")

    # Deserialize onto the CPU so that a checkpoint written on a GPU machine
    # can also be restored on a CPU-only one. The load_state_dict() calls below
    # copy the values into the agent's own tensors, so the agent keeps whatever
    # device it was constructed on.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")

    # Environment Configuration
    TRAIN_LEVELS = checkpoint["train_levels"]
    TEST_LEVELS = checkpoint["test_levels"]
    NUM_EVAL_EPISODES = checkpoint["num_eval_episodes"]
    SKIP_FRAME = checkpoint["skip_frame"]
    RESIZE = checkpoint["resize"]
    FRAME_STACK = checkpoint["frame_stack"]

    # Hyperparameters: epsilon is restored from the checkpoint as well, so
    # exploration continues from where the previous session stopped.
    agent = Agent(
        input_dims=checkpoint["input_dims"],
        num_actions=checkpoint["num_actions"],
        lr=checkpoint["lr"],
        gamma=checkpoint["gamma"],
        epsilon=checkpoint["epsilon"],
        eps_decay=checkpoint["eps_decay"],
        eps_min=checkpoint["eps_min"],
        replay_buffer_capacity=checkpoint["replay_buffer_capacity"],
        batch_size=checkpoint["batch_size"],
        sync_network_rate=checkpoint["sync_network_rate"],
    )

    # Agent Attributes
    agent.online_network.load_state_dict(checkpoint["online_network"])
    agent.target_network.load_state_dict(checkpoint["target_network"])
    agent.optimizer.load_state_dict(checkpoint["optimizer"])
    agent.learn_step_counter = checkpoint["learn_step_counter"]
    agent.episode_counter = checkpoint["episode_counter"]

    # The replay buffer is stored separately from the checkpoint file.
    agent.replay_buffer.loads(BUFFER_PATH)

In [7]:
# Fresh run: build a new agent from the configuration constants.
if not LOAD:
    # Keyword arguments (the same names the resume path uses) keep this call
    # correct even if the constructor's parameter order changes.
    agent = Agent(
        input_dims=(FRAME_STACK, RESIZE, RESIZE),
        num_actions=len(RIGHT_ONLY),
        lr=LR,
        gamma=GAMMA,
        epsilon=EPSILON,
        eps_decay=EPS_DECAY,
        eps_min=EPS_MIN,
        replay_buffer_capacity=REPLAY_BUFFER_CAPACITY,
        batch_size=BATCH_SIZE,
        sync_network_rate=SYNC_NETWORK_RATE,
    )

# Setup Paths

In [8]:
# Resume: keep appending evaluation results to the existing model's CSV.
if LOAD:
    TEST_CSV_PATH = os.path.join(PATH, "test.csv")

    # The training loop opens this file in append mode, which would silently
    # create a header-less CSV if the file were missing. Write the header row
    # first in that case.
    if not os.path.exists(TEST_CSV_PATH):
        with open(TEST_CSV_PATH, "w", newline="") as f:
            csv.writer(f).writerow(["episode", "level", "reward"])

In [9]:
# Fresh run: allocate models/model_v<N+1> and initialise its files.
if not LOAD:
    # Find the next available model number
    base = "models"
    os.makedirs(base, exist_ok=True)

    prefix = "model_v"
    nums = []
    for entry in os.listdir(base):
        if not entry.startswith(prefix):
            continue
        # Strip only the leading prefix; str.replace() would also remove later
        # occurrences and misread names such as "model_v1model_v2".
        suffix = entry[len(prefix):]
        # isdecimal() accepts exactly the digit characters int() can parse
        # (isdigit() also accepts e.g. superscripts, which make int() raise).
        if suffix.isdecimal():
            nums.append(int(suffix))
    next_version = max(nums, default=0) + 1

    PATH = os.path.join(base, f"model_v{next_version}")
    os.makedirs(PATH, exist_ok=True)

    # Set paths
    CHECKPOINT_PATH = os.path.join(PATH, "checkpoint.pt")
    BUFFER_PATH = os.path.join(PATH, "buffer")
    TEST_CSV_PATH = os.path.join(PATH, "test.csv")

    # Create CSV with proper headers
    with open(TEST_CSV_PATH, "w", newline="") as f:
        csv.writer(f).writerow(["episode", "level", "reward"])

# Helper Functions

In [10]:
def save_checkpoint():
    """Persist the run configuration, the agent state and the replay buffer.

    Reads the module-level environment settings and the global ``agent``.
    Writes the checkpoint dictionary to ``CHECKPOINT_PATH`` and dumps the
    replay buffer to ``BUFFER_PATH``. The dictionary keys are the ones the
    resume cell reads back.
    """
    checkpoint = {
        # Environment config
        "train_levels": TRAIN_LEVELS,
        "test_levels": TEST_LEVELS,
        "num_eval_episodes": NUM_EVAL_EPISODES,
        "skip_frame": SKIP_FRAME,
        "resize": RESIZE,
        "frame_stack": FRAME_STACK,

        # Agent parameters
        "input_dims": agent.input_dims,
        "num_actions": agent.num_actions,
        "lr": agent.lr,
        "gamma": agent.gamma,
        "epsilon": agent.epsilon,
        "eps_decay": agent.eps_decay,
        "eps_min": agent.eps_min,
        "replay_buffer_capacity": agent.replay_buffer_capacity,
        "batch_size": agent.batch_size,
        "sync_network_rate": agent.sync_network_rate,

        # Agent state
        "online_network": agent.online_network.state_dict(),
        "target_network": agent.target_network.state_dict(),
        "optimizer": agent.optimizer.state_dict(),
        "learn_step_counter": agent.learn_step_counter,
        "episode_counter": agent.episode_counter,
    }

    # Save checkpoint. Write to a sibling temp file and swap it into place so
    # that interrupting the kernel mid-write cannot leave a truncated
    # checkpoint.pt behind (the previous checkpoint stays intact until the new
    # one is complete).
    tmp_checkpoint_path = CHECKPOINT_PATH + ".tmp"
    torch.save(checkpoint, tmp_checkpoint_path)
    os.replace(tmp_checkpoint_path, CHECKPOINT_PATH)

    # Save replay buffer
    agent.replay_buffer.dumps(BUFFER_PATH)

In [11]:
def run_test_episode(level):
    """Evaluate the global ``agent`` on ``level`` and return the mean reward.

    Runs ``NUM_EVAL_EPISODES`` episodes, each in a freshly created environment,
    choosing actions with ``agent.choose_action_test``. Nothing is stored in
    the replay buffer and ``agent.learn`` is not called.

    Args:
        level: environment id passed to ``gym_super_mario_bros.make``.

    Returns:
        The arithmetic mean of the per-episode total rewards.

    Raises:
        ZeroDivisionError: if ``NUM_EVAL_EPISODES`` is 0.
    """
    rewards = []

    for _ in range(NUM_EVAL_EPISODES):
        env = gym_super_mario_bros.make(level, render_mode='rgb', apply_api_compatibility=True)
        env = JoypadSpace(env, RIGHT_ONLY)
        env = apply_wrappers(env, SKIP_FRAME, RESIZE, FRAME_STACK)

        try:
            state, _ = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = agent.choose_action_test(state)
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                # The five-value step API reports the end of an episode through
                # either flag; honouring only the first would keep stepping an
                # environment that has already been truncated.
                done = terminated or truncated

            rewards.append(total_reward)

        finally:
            # Always release the emulator, even if the episode raised.
            env.close()

    return sum(rewards) / len(rewards)

In [12]:
def run_training_episode(level):
    """Play one training episode on ``level`` and return its total reward.

    After every environment step the transition is stored through
    ``agent.store_in_memory`` and ``agent.learn()`` is called. The environment
    is rendered in 'human' mode when ``DISPLAY`` is true.

    Args:
        level: environment id passed to ``gym_super_mario_bros.make``.

    Returns:
        The sum of the rewards received during the episode.
    """
    env = gym_super_mario_bros.make(level, render_mode='human' if DISPLAY else 'rgb', apply_api_compatibility=True)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = apply_wrappers(env, SKIP_FRAME, RESIZE, FRAME_STACK)

    try:
        state, _ = env.reset()
        episode_over = False
        total_reward = 0
        while not episode_over:
            action = agent.choose_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            # Store the termination flag only, exactly as before: the value
            # written to the replay buffer is unchanged by this fix.
            agent.store_in_memory(state, action, reward, new_state, terminated)
            agent.learn()

            state = new_state
            # Stop on truncation as well, so a truncated environment is never
            # stepped again.
            episode_over = terminated or truncated

        return total_reward

    finally:
        # Always release the emulator, even if the episode raised.
        env.close()

# Train

In [13]:
# Main loop: one training episode per iteration, with a checkpoint and an
# evaluation pass every CKPT_SAVE_INTERVAL episodes of this session.
for episode_number in range(1, NUM_OF_EPISODES + 1):
    # episode_counter spans sessions (it is restored from the checkpoint);
    # episode_number only counts episodes of the current run.
    agent.episode_counter += 1

    train_level = random.choice(TRAIN_LEVELS)
    train_reward = run_training_episode(train_level)

    progress = (
        ("Current Episode Number:", episode_number),
        ("Total Episode Number", agent.episode_counter),
        ("Learn step counter:", agent.learn_step_counter),
        ("Total reward:", train_reward),
        ("Epsilon:", agent.epsilon),
        ("Size of replay buffer:", len(agent.replay_buffer)),
    )
    for label, value in progress:
        print(label, value)
    print()

    if episode_number % CKPT_SAVE_INTERVAL != 0:
        continue

    # Save model
    save_checkpoint()

    # Test model and save results: one CSV row per test level
    with open(TEST_CSV_PATH, "a", newline="") as results_file:
        results_writer = csv.writer(results_file)
        for test_level in TEST_LEVELS:
            mean_test_reward = run_test_episode(test_level)
            results_writer.writerow([agent.episode_counter, test_level, mean_test_reward])

Current Episode Number: 1
Total Episode Number 26
Learn step counter: 7444
Total reward: 232.0
Epsilon: 0.9981407303543394
Size of replay buffer: 7475

Current Episode Number: 2
Total Episode Number 27
Learn step counter: 7626
Total reward: 1045.0
Epsilon: 0.9980953159786099
Size of replay buffer: 7657

Current Episode Number: 3
Total Episode Number 28
Learn step counter: 7661
Total reward: 234.0
Epsilon: 0.9980865826817106
Size of replay buffer: 7692

Current Episode Number: 4
Total Episode Number 29
Learn step counter: 7769
Total reward: 598.0
Epsilon: 0.9980596347044051
Size of replay buffer: 7800

Current Episode Number: 5
Total Episode Number 30
Learn step counter: 7864
Total reward: 623.0
Epsilon: 0.9980359310665963
Size of replay buffer: 7895

Current Episode Number: 6
Total Episode Number 31
Learn step counter: 7960
Total reward: 614.0
Epsilon: 0.9980119784886851
Size of replay buffer: 7991

Current Episode Number: 7
Total Episode Number 32
Learn step counter: 8057
Total reward

KeyboardInterrupt: 