# Setup

In [1]:
import gymnasium as gym
env = gym.make('MountainCar-v0')
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from matplotlib import pyplot as plt
import random

In [2]:
from dqnv2 import DQNAgentV2

In [3]:
# hyperparameters
learning_rate = 1e-3
n_episodes = 3_000
start_epsilon = 0.9
final_epsilon = 0.05
epsilon_decay = 0.95
# reduce the exploration over time
batch_size = 64
discount_factor = 0.99
replay_size = 10_000
logging_interval = 10
hidden_size=64
dropout_rate=0.0
weight_decay=1e-4
target_network = False
target_network_update = int(1e4)
alpha = 0
seed=42
np.random.seed(seed)    
torch.manual_seed(seed)
random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
amsgrad = False
reward_hidden_size=64
reward_factor=1/5
running_window = 32

In [4]:
agent = DQNAgentV2(
    learning_rate=learning_rate,
    state_size=2,
    action_size=3,
    discount_factor=discount_factor,
    final_epsilon=final_epsilon,
    hidden_size=hidden_size,
    epsilon_decay=epsilon_decay,
    initial_epsilon=start_epsilon,
    replay_size=replay_size,
    dropout_rate=dropout_rate,
    target_network=target_network,
    weight_decay=weight_decay,
    target_network_update=target_network_update,
    alpha=alpha,
    amsgrad=amsgrad,
    reward_hidden_size=reward_hidden_size,
    reward_factor=reward_factor,
    running_window=running_window
)

In [5]:
run = wandb.init(project='ANN', config={"learning_rate": learning_rate, "n_episodes": n_episodes, "start_epsilon": start_epsilon, "final_epsilon": final_epsilon, "epsilon_decay": epsilon_decay, "batch_size": batch_size, "discount_factor": discount_factor, "replay_size": replay_size, "hidden_size": hidden_size, "dropout_rate": dropout_rate, "weight_decay":weight_decay, "target_network":target_network, "alpha":alpha,"target_network_update":target_network_update, "reward_factor":reward_factor, "reward_hidden_size":reward_hidden_size, "amsgrad":amsgrad}, name='DQNv2')


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnreateguir[0m ([33mreategui[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
with tqdm(total=n_episodes, desc=f"Episode 0/{n_episodes}") as pbar:
    losses = []
    rewards = []
    target_count = 0
    finished = []
    episode_steps = []
    empty = True
    intrinsic_losses = []
    RND_rewards = []
    for episode in tqdm(range(n_episodes)):
        obs, info = env.reset()
        done = False

        t = 0
        episode_reward = 0
        qnetwork_loss = 0
        intrinsic_loss = 0
        episode_RND_reward = 0

        while not done:
            action = agent.get_action(obs, env)
            next_obs, reward, terminated, truncated, info = env.step(action)

            # update if the environment is done and the current obs
            done = terminated or truncated
            if terminated:
                next_obs = (None, None)

            loss, target_count, RND,intrinsic_loss = agent.update(obs, action, reward, next_obs, batch_size=batch_size, target_count=target_count)
            if loss is not None:
                episode_reward += reward + RND
                qnetwork_loss+=loss
                intrinsic_loss+=intrinsic_loss
                episode_RND_reward+=RND

            obs = next_obs
            t+=1
        pbar.set_description(f"Episode {episode + 1}/{n_episodes}")
        pbar.set_postfix(train_loss=qnetwork_loss, epsilon=agent.epsilon, target_count=target_count, intrinsic_loss=intrinsic_loss, episode_steps=t, reward=episode_reward, episode_RND_reward=episode_RND_reward)
        pbar.update(1)
        pbar.refresh() 
        if not empty:
            finished.append(terminated)
            episode_steps.append(t)
            rewards.append(episode_reward)
            losses.append(qnetwork_loss)
            intrinsic_losses.append(intrinsic_loss)
            agent.decay_epsilon()
            RND_rewards.append(episode_RND_reward)
            if episode % logging_interval == 0 :
                wandb.log({"train_loss": np.mean(losses), "predicted_intrinsic_losses":np.mean(intrinsic_losses), "epsilon": agent.epsilon, "episode_steps": np.mean(episode_steps), "finished": np.sum(finished), "mean_reward": np.mean(rewards), "episode_RND_reward": np.mean(RND_rewards)})
                losses = []
                rewards = []
                finished = []
                episode_steps = []
                RND_rewards = []
                intrinsic_losses = []
        if loss is not None:
            empty = False


Episode 0/3000:   0%|          | 0/3000 [00:00<?, ?it/s]

 20%|█▉        | 585/3000 [07:22<30:26,  1.32it/s]<38:48,  1.04it/s, episode_RND_reward=3.94, episode_steps=200, epsilon=0.05, intrinsic_loss=1.14e-8, reward=-196, target_count=107001, train_loss=0.0534] 
Episode 585/3000:  20%|█▉        | 585/3000 [07:22<30:26,  1.32it/s, episode_RND_reward=3.94, episode_steps=200, epsilon=0.05, intrinsic_loss=1.14e-8, reward=-196, target_count=107001, train_loss=0.0534]


KeyboardInterrupt: 

In [None]:
wandb.finish()