# D3QN (Dueling Double Deep Q-Network)

In [None]:
import matplotlib
import random
import sys
import torch

import numpy as np
import gymnasium as gym
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils

from collections import deque, namedtuple
from itertools import count
from torch.distributions import Categorical
from time import time
import matplotlib.pyplot as plt

sys.path.append(r"C:\Users\takat\PycharmProjects\machine-learning")
import flowdata
import flowenv

# True when the active matplotlib backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # Placeholder: no extra setup is performed for the inline backend.
    pass

In [None]:
# Pick the compute device: the first available accelerator in priority order,
# falling back to the CPU when none is usable.
device_name = "cpu"

# Probe order matches the original chain: cuda, then mps, mtia, xpu.
# NOTE: "hip" was left disabled in the original probe order and stays excluded.
# The getattr guards keep this cell working on PyTorch builds that do not ship
# one of these backend modules (or its `is_available` function) at all.
for _candidate in ("cuda", "mps", "mtia", "xpu"):
    _probe = getattr(getattr(torch, _candidate, None), "is_available", None)
    if callable(_probe) and _probe():
        device_name = _candidate
        break

device = torch.device(device_name)
print(f"device: {device_name}")

## Constants

In [None]:
# Number of environment steps between target-network updates.
UPDATE_TARGET_STEPS = 500
# Number of transitions sampled from replay memory per optimization step.
BATCH_SIZE = 128
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Epsilon-greedy exploration: epsilon starts at EPS_START and decays
# exponentially towards EPS_END, with EPS_DECAY (in steps) as the time constant.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1_000_000
# Weight of the policy network when blending it into the target network.
TAU = 0.005
# Learning rate handed to the optimizer.
LR = 0.0001

## Using data

In [None]:
# Fetch the train and test splits from the project-local flowdata helper.
# NOTE(review): the structure of the returned objects is not visible here —
# confirm against flowdata.flow_data.using_data.
raw_data_train, raw_data_test = flowdata.flow_data.using_data()

# One Gymnasium environment per split; each receives its split via `data`.
train_env = gym.make("flowenv/FlowTrain-v0", data=raw_data_train)
test_env = gym.make("flowenv/FlowTest-v0", data=raw_data_test)

## Memory class

In [None]:
# One stored step of experience: (state, action, next_state, reward).
Transaction = namedtuple(
    'Transaction',
    ('state', 'action', 'next_state', 'reward'),
)


class ReplayMemory:
    """Fixed-capacity experience buffer that drops the oldest entries first."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transactions."""
        # A bounded deque evicts from the left once `maxlen` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transaction; ``args`` are the ``Transaction`` fields in order."""
        entry = Transaction(*args)
        self.memory.append(entry)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transactions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transactions currently stored."""
        return len(self.memory)

## Dueling DQN Module

In [None]:
class DuelingDQN(nn.Module):
    """Dueling Q-network: a shared trunk feeding value and advantage heads.

    The trunk is two fully connected layers of 128 units with ReLU
    activations. The value head emits one scalar per input and the advantage
    head emits one number per action.
    """

    def __init__(self, n_inputs, n_outputs):
        """Build the network.

        Args:
            n_inputs: Size of the observation vector (last input dimension).
            n_outputs: Number of discrete actions.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_inputs, 128)
        self.fc2 = nn.Linear(128, 128)

        self.value_stream = nn.Sequential(
            nn.Linear(128, 1)
        )
        self.advantage_stream = nn.Sequential(
            nn.Linear(128, n_outputs)
        )

    def forward(self, x):
        """Return Q-values with the same leading shape as ``x`` and ``n_outputs`` last.

        Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        value = self.value_stream(x)
        advantage = self.advantage_stream(x)
        # Centre the advantages per sample, over the action dimension only.
        # A global mean would mix every sample of the batch together, making
        # one state's Q-values depend on which other states share its batch.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)

In [None]:
def select_action(state):
    """Choose an action for ``state`` with a decaying epsilon-greedy rule.

    Reads the module-level ``steps_done`` counter: the exploration
    probability decays exponentially from ``EPS_START`` towards ``EPS_END``
    with ``EPS_DECAY`` as the time constant.

    Returns:
        A ``(1, 1)`` long tensor on ``device`` holding the action index.
    """
    draw = random.random()
    decay = np.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay

    # Explore: uniformly random action from the training env's action space.
    if draw <= eps_threshold:
        random_action = random.randrange(train_env.action_space.n)
        return torch.tensor([[random_action]], device=device, dtype=torch.long)

    # Exploit: action with the highest Q-value under the policy network.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1).indices.view(1, 1)

def optimize_model():
    """Run one double-DQN gradient step on a batch sampled from replay memory.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transactions.
    Uses the module-level ``memory``, ``policy_net``, ``target_net`` and
    ``optimizer``. The policy network picks the next action and the target
    network evaluates it.
    """
    if len(memory) < BATCH_SIZE:
        return

    transactions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transactions into one Transaction of tuples.
    batch = Transaction(*zip(*transactions))

    # Terminal transitions are stored with next_state=None; they keep a
    # bootstrapped value of zero.
    non_final_next = [s for s in batch.next_state if s is not None]
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device=device,
        dtype=torch.bool
    )

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Guard: torch.cat raises on an empty list, which happens when every
    # sampled transition is terminal. In that case all targets are just the
    # rewards and the zero initialisation above is already correct.
    if non_final_next:
        non_final_next_states = torch.cat(non_final_next)
        with torch.no_grad():
            # Double DQN: select with the policy net, evaluate with the target net.
            next_actions = policy_net(non_final_next_states).max(1).indices.unsqueeze(1)
            next_state_values[non_final_mask] = (
                target_net(non_final_next_states).gather(1, next_actions).squeeze(1)
            )

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    criterion = nn.MSELoss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()

    # Clip each gradient element to [-100, 100] before the update.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

def get_h_m_s(seconds: float):
    """Split a duration in seconds into ``(hours, minutes, seconds)``.

    Hours and minutes are whole numbers (``int``); the returned seconds keep
    whatever fractional part the input had.
    """
    whole_hours = int(seconds // 3600)
    after_hours = seconds - whole_hours * 3600
    whole_minutes = int(after_hours // 60)
    leftover = after_hours - whole_minutes * 60
    return whole_hours, whole_minutes, leftover

def loading_bar(episode, total_episodes, interval):
    """Redraw a one-line progress bar with an estimate of the time remaining.

    Args:
        episode: Zero-based index of the episode that has just finished.
        total_episodes: Total number of episodes in the run.
        interval: Seconds elapsed since the run started.
    """
    finished = episode + 1
    pro_size_float = finished / total_episodes * 20
    show = pro_size_float * 5  # 20 bar cells -> percentage
    pro_size = int(pro_size_float)

    # `finished` episodes took `interval` seconds, so the episodes still to
    # run are expected to take interval * remaining / finished. Counting the
    # just-finished episode as done makes the estimate reach zero at the end.
    last_time = interval * (total_episodes - finished) / finished
    hours, minutes, seconds = get_h_m_s(last_time)
    # Width 6 zero-pads the seconds to "SS.mmm", matching the HH:MM fields.
    print(f"\r[{'#' * pro_size}{' ' * (20 - pro_size)}] {show:3.02f}%, last={hours:02d}:{minutes:02d}:{seconds:06.3f}", end="")

In [None]:
# Build the online (policy) and target networks with identical weights.
n_observations = train_env.observation_space.shape[0]
n_actions = train_env.action_space.n

policy_net = DuelingDQN(n_observations, n_actions).to(device)
target_net = DuelingDQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = torch.optim.Adam(policy_net.parameters(), lr=LR)
memory = ReplayMemory(int(1e6))

# Global environment-step counter; drives epsilon decay and target updates.
steps_done = 0

num_episodes = 10000
episode_rewards = []

start_time = time()
for i_episode in range(num_episodes):
    state, info = train_env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    sum_reward = 0

    for t in count():
        action = select_action(state)
        next_state, reward, terminated, truncated, _ = train_env.step(action.item())

        # Explicit float32 keeps the TD target in the network's dtype and
        # avoids float64 tensors, which some backends (e.g. MPS) reject.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        # The episode ends on termination OR truncation; ignoring truncation
        # would keep stepping an environment that has already finished.
        done = bool(terminated or truncated)
        steps_done += 1

        if terminated:
            # Only true termination zeroes the bootstrapped value; a truncated
            # episode still has a valid next state to bootstrap from.
            next_state = None
        else:
            next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)

        memory.push(state, action, next_state, reward)

        state = next_state
        sum_reward += reward.item()

        optimize_model()

        # Blend the policy weights into the target network.
        # NOTE(review): this applies a soft update (weight TAU) only once every
        # UPDATE_TARGET_STEPS steps, so the target moves very slowly. Confirm
        # whether a hard copy at this interval, or a soft update every step,
        # was intended.
        if steps_done % UPDATE_TARGET_STEPS == 0:
            target_net_state_dict = target_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()
            for key in target_net_state_dict:
                target_net_state_dict[key] = TAU * policy_net_state_dict[key] + (1 - TAU) * target_net_state_dict[key]
            target_net.load_state_dict(target_net_state_dict)

        if done:
            episode_rewards.append(sum_reward)
            break

    end_time = time()
    loading_bar(i_episode, num_episodes, end_time - start_time)


In [None]:
# Running mean of the episode rewards: entry i averages episodes 0..i
# inclusive. A cumulative sum avoids both the empty-slice mean at i == 0
# (NaN plus a RuntimeWarning) and re-averaging the whole prefix each time.
_rewards = np.asarray(episode_rewards, dtype=float)
mean_rewards = (np.cumsum(_rewards) / np.arange(1, len(_rewards) + 1)).tolist()

# Raw per-episode rewards with the running mean overlaid in red.
plt.figure(figsize=(10, 5))
plt.plot(episode_rewards)
plt.plot(mean_rewards, color="red")
plt.show()