In [None]:
#run this cell
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
import time

import gymnasium as gym


import collections
import argparse

import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import seaborn as sns
import pandas as pd
from collections import defaultdict

from time import time

In [None]:
# CUDA debugging switches. They are set here, before the cell that imports
# torch, so they are already in the environment when CUDA is initialised.
# CUDA_LAUNCH_BLOCKING=1 is the documented switch for synchronous kernel
# launches (errors then surface at the call that caused them).
# NOTE(review): TORCH_USE_CUDA_DSA is presumably meant to enable device-side
# assertions; confirm it has any effect when set at runtime.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_USE_CUDA_DSA': '1',
})

In [None]:
import torch
import torch.utils.data as utils
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

import torch.optim as optim


from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Every tensor and network created in later cells is placed on `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
from MainCode.env import NetworkEnv

In [None]:
# Single shared environment instance (class imported from MainCode.env).
# Later cells drive it through reset(), step(), action_space and
# observation_space.
env = NetworkEnv()

In [None]:
class Agent:
    """Linear epsilon schedule for epsilon-greedy exploration.

    Epsilon starts at ``EPS_START`` and is lowered by a constant amount on
    every call to :meth:`adjust_epsilon` until ``base_ep_steps`` calls have
    been made; from then on it is pinned to ``EPS_END``.

    Args:
        EPS_START: Initial exploration probability.
        EPS_END: Final exploration probability once the decay phase is over.
        EPS_DECAY: Accepted for interface compatibility only; the linear
            schedule implemented here does not use it.
        grand_length: Total number of schedule steps planned for the run
            (the notebook passes the number of training episodes).
        training_rate: Fraction of ``grand_length - 2`` steps over which
            epsilon is decayed.
    """

    def __init__(self, EPS_START, EPS_END, EPS_DECAY, grand_length, training_rate=0.8):
        self.epsilon_start = EPS_START
        self.epsilon_end = EPS_END

        self.epsilon = EPS_START
        self.grand_length = grand_length
        self.epsilon_decay_steps = self.grand_length - 2

        # Number of adjust_epsilon() calls that actually decay epsilon.
        self.base_ep_steps = int(self.epsilon_decay_steps * training_rate)

        # Per-call decrement. A non-positive step count means there is no
        # decay phase at all, so avoid dividing by zero and let the first
        # adjust_epsilon() call jump straight to epsilon_end.
        if self.base_ep_steps > 0:
            self.epsilon_decay = (self.epsilon_start - self.epsilon_end) / self.base_ep_steps
        else:
            self.epsilon_decay = 0.0

        # Count of adjust_epsilon() calls made so far.
        self.total_steps = 0

    def adjust_epsilon(self):
        """Advance the schedule by one step and update ``self.epsilon``."""
        if self.total_steps < self.base_ep_steps:
            self.epsilon -= self.epsilon_decay
        else:
            # Use the configured floor rather than a hard-coded 0.01 so that
            # EPS_END is honoured for every configuration.
            self.epsilon = self.epsilon_end

        self.total_steps += 1

In [None]:
# Number of training episodes; also passed to Agent as `grand_length`, so it
# determines the length of the epsilon schedule.
max_episodes = 2000
# Upper bound on environment steps per episode in the warm-up and training loops.
max_episode_steps = 20

# Fraction of (max_episodes - 2) schedule steps over which Agent decays
# epsilon linearly before pinning it to its final value.
training_rate = 0.7

In [None]:
# One environment step as stored in the replay buffer.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of ``Transition`` records.

    Once ``capacity`` items are stored, appending a new transition drops the
    oldest one.
    """

    def __init__(self, capacity):
        # A bounded deque evicts from the left automatically when full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition built from ``(state, action, next_state, reward)``."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DQNConv1D(nn.Module):
    """Dueling Q-network with a two-layer 1-D convolutional feature extractor.

    The convolutional features feed two fully connected heads: a scalar state
    value ``V(s)`` and a per-action advantage ``A(s, a)``. The forward pass
    combines them as ``Q = V + A - mean_a(A)``.

    Args:
        shape: Shape of a single observation as ``(channels, length)``;
            ``shape[0]`` is used as the number of input channels.
        actions_n: Number of discrete actions (width of the advantage head).
        conv_channels: Output channels of both convolution layers.
        hidden_size: Width of the hidden layer in both heads.
        kernel_size: Kernel size of both convolution layers.

    The defaults reproduce the original fixed architecture (1600 / 3200 / 5),
    and the sub-module names ``conv``, ``fc_val`` and ``fc_adv`` are unchanged,
    so existing state dicts remain loadable.
    """

    def __init__(self, shape, actions_n, conv_channels=1600, hidden_size=3200, kernel_size=5):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(shape[0], conv_channels, kernel_size),
            nn.ReLU(),
            nn.Conv1d(conv_channels, conv_channels, kernel_size),
            nn.ReLU(),
        )

        # Flattened feature size produced by the conv stack for this input shape.
        out_size = self._get_conv_out(shape)

        # State-value head: V(s).
        self.fc_val = nn.Sequential(
            nn.Linear(out_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

        # Advantage head: A(s, a) for every action.
        self.fc_adv = nn.Sequential(
            nn.Linear(out_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, actions_n)
        )

    def _get_conv_out(self, shape):
        """Return the number of conv features for one observation of ``shape``."""
        # Only the output size is needed, so skip building an autograd graph.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(o.numel())

    def forward(self, x):
        """Compute Q-values of shape ``(batch, actions_n)`` for a batch ``x``."""
        conv_out = self.conv(x).flatten(start_dim=1)
        val = self.fc_val(conv_out)
        adv = self.fc_adv(conv_out)
        # Subtract the mean advantage so that V and A are identifiable.
        return val + adv - adv.mean(dim=1, keepdim=True)

In [None]:
# Not referenced anywhere else in the visible cells.
input_neurons = 32

# Replay mini-batch size; optimize_model() does nothing until the replay
# memory holds at least this many transitions.
# (Previously tried values: 512, 6000, 8000.)
BATCH_SIZE = 16

# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Start / end values of the exploration probability.
EPS_START = 0.99
EPS_END = 0.01
# Time constant of the exponential schedule in select_action_his(); it is
# also passed to Agent, which does not use it.
EPS_DECAY = 100

# Period, in steps, at which the training loop copies policy_net's weights
# into target_net (a hard update, not a soft-update coefficient).
# (Previously tried soft-update values: 0.0005, 0.005.)
TAU = 100

# Optimizer learning rate. (Previously tried: 0.0001.)
LR = 0.001

# Get number of actions from gym action space
n_actions = env.action_space.n
# Reset once up front; `state` and `info` are overwritten by env.reset() in
# the warm-up loop before they are used.
state, info = env.reset()
# Last dimension of the environment's simulator array; not referenced
# elsewhere in the visible cells.
n_observations = env.simulator.shape[-1]

# Online network (trained by the optimizer) and target network (used for
# bootstrapped targets), built with the same architecture.
policy_net = DQNConv1D(env.observation_space.shape, env.action_space.n).to(device)
target_net = DQNConv1D(env.observation_space.shape, env.action_space.n).to(device)

# Checkpoint id used by the commented-out load calls below.
file_number = 2444

# policy_net.load_state_dict(torch.load(f'special_{file_number}_net.pt'))
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())


optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

# optimizer.load_state_dict(torch.load(f'special_opt_{file_number}_net.pt'))


# Replay buffer holding at most 10000 transitions.
memory = ReplayMemory(10000)


# Step counter consumed only by select_action_his().
steps_done = 0

# Linear epsilon schedule; the training loop advances it once per episode.
agent = Agent(EPS_START=EPS_START, EPS_END=EPS_END,
              EPS_DECAY=EPS_DECAY, grand_length=max_episodes, training_rate=training_rate)


def select_action(state):
    """Pick an action epsilon-greedily using the shared ``agent`` schedule.

    With probability ``agent.epsilon`` a random action is sampled from the
    environment's action space; otherwise the action with the highest
    Q-value under ``policy_net`` is chosen.

    Args:
        state: Batched observation tensor accepted by ``policy_net``.

    Returns:
        A ``(1, 1)`` ``torch.long`` tensor holding the chosen action index.
    """
    explore = np.random.rand() <= agent.epsilon
    if not explore:
        # Greedy branch: no gradients are needed for action selection.
        with torch.no_grad():
            q_values = policy_net(state)
        return q_values.max(1)[1].view(1, 1)

    random_action = env.action_space.sample()
    return torch.tensor([[random_action]], device=device, dtype=torch.long)

def select_action_his(state):
    """Epsilon-greedy selection with an exponentially decaying threshold.

    The threshold decays from ``EPS_START`` towards ``EPS_END`` as
    ``exp(-steps_done / EPS_DECAY)``. Each call updates the module-level
    ``eps_threshold`` and increments the module-level ``steps_done``.

    Args:
        state: Batched observation tensor accepted by ``policy_net``.

    Returns:
        A ``(1, 1)`` ``torch.long`` tensor holding the chosen action index.
    """
    global steps_done
    global eps_threshold

    draw = random.random()
    decay_factor = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay_factor
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: uniformly random action from the environment.
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: action with the largest predicted Q-value.
    with torch.no_grad():
        return policy_net(state).max(1)[1].view(1, 1)


# Step count of each episode that ended (terminated or truncated); the
# training loop appends to this list.
episode_durations = []


In [None]:
def optimize_model():
    """Run one gradient step on ``policy_net`` from a sampled replay batch.

    Does nothing while the replay memory holds fewer than ``BATCH_SIZE``
    transitions. Otherwise it samples a batch, builds one-step TD targets
    ``reward + GAMMA * max_a Q_target(next_state, a)`` (the bootstrap term is
    zero for transitions whose ``next_state`` is ``None``), and minimises the
    mean squared error between those targets and ``policy_net``'s Q-values
    for the actions that were taken.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # a detailed explanation): a list of Transitions becomes one Transition
    # whose fields are tuples over the batch.
    batch = Transition(*zip(*transitions))

    # Split out the transitions that have a successor state. A final state is
    # stored as None and contributes no bootstrapped value.
    non_final_next_states = [s for s in batch.next_state if s is not None]
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a_t): evaluate all actions with policy_net, then pick the column
    # of the action that was actually taken in each transition.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network: max over actions for non-final
    # states, 0 for final ones. torch.cat raises on an empty list, so the
    # target network is skipped when every sampled transition is final.
    next_state_values = torch.zeros(BATCH_SIZE, dtype=torch.float32, device=device)
    if non_final_next_states:
        with torch.no_grad():
            next_q_values = target_net(torch.cat(non_final_next_states))
            next_state_values[non_final_mask] = next_q_values.max(1)[0]

    # One-step TD target.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    expected_state_action_values = expected_state_action_values.to(torch.float32)

    # Mean squared error between predicted and target Q-values.
    criterion = nn.MSELoss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping: each gradient element is limited to [-100, 100].
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [None]:
def format_time(t):
    """Format a duration in seconds as ``HH:MM:SS``.

    Fractional seconds are truncated before splitting into fields, so a
    field can never be rounded up to ``60`` (e.g. 59.7 s is ``00:00:59``).

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        The zero-padded ``HH:MM:SS`` string; hours grow beyond two digits
        when needed.
    """
    whole_seconds = int(t)
    minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

In [None]:
def outcome(episode, reward, epsilon, total):
    """Print a one-line training progress summary.

    Args:
        episode: Current episode number.
        reward: Sequence of per-episode rewards recorded so far.
        epsilon: Current exploration probability.
        total: Elapsed time in seconds; rendered via ``format_time``.

    The line shows the mean reward over the last 50 entries and, in
    parentheses, the mean over the last 10.
    """
    elapsed = format_time(total)
    mean_last_50 = np.mean(reward[-50:])
    mean_last_10 = np.mean(reward[-10:])

    line = '{:>4d} | {}  | Reward: {:>8.3f} ({:>7.3f}) | Epsilon | {:.3f}'.format(
        episode, elapsed, mean_last_50, mean_last_10, epsilon
    )
    print(line)

In [None]:
# Warm-up: play episodes with the current epsilon-greedy policy, without any
# optimisation, until the replay memory holds at least BATCH_SIZE transitions.
while len(memory) < BATCH_SIZE:

    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    for episode_step in range(max_episode_steps):
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], dtype=torch.float32, device=device)

        done = terminated or truncated

        # An episode-ending step is stored with no successor state.
        next_state = (
            None
            if done
            else torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        )

        # Store the transition, then advance to the next state.
        memory.push(state, action, next_state, reward)
        state = next_state

        if done:
            break

In [None]:
# NOTE: `num_episodes` and `results` are not used by this cell; they are kept
# so that any other cell relying on them keeps working.
if torch.cuda.is_available():
    num_episodes = 20
else:
    num_episodes = 5

# Reward of the final step of every episode that ended (terminated/truncated).
total_rewards = []
# Reward of the most recent step of the current episode (see agent_reward).
total = 0
start = time()

# One float per episode: the reward of that episode's last executed step.
agent_reward = []

results = []

# Environment steps taken across all episodes; drives the target-network sync.
global_step = 0

for episode in range(1, max_episodes + 1):

    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    # Reset per episode so an episode that runs out of steps never reports
    # the previous episode's value.
    total = 0.0

    for episode_step in range(max_episode_steps):
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())

        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        # Stored as a plain float so np.mean() in outcome() always sees a
        # homogeneous list.
        total = reward.item()

        done = terminated or truncated

        # An episode-ending step is stored with no successor state, exactly
        # as in the warm-up loop.
        if done:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory. This happens before the `done`
        # check so that episode-ending transitions are learned from too.
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Hard update of the target network every TAU environment steps,
        # counted across episodes. A per-episode step index never reaches a
        # period longer than max_episode_steps.
        global_step += 1
        if global_step % TAU == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if done:
            total_rewards.append(reward.cpu().numpy())
            episode_durations.append(episode_step + 1)
            break

    agent_reward.append(total)

    # Advance the exploration schedule once per episode.
    agent.adjust_epsilon()

    # Progress report every 10 episodes.
    if episode % 10 == 0:
        outcome(episode, agent_reward, agent.epsilon, time() - start)
