In [1]:

import matplotlib
import random

from collections import deque, namedtuple
from torch.distributions import Categorical

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils as utils
import torch.optim as optim
import pandas as pd

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    pass

## train and test data

In [2]:
TRAIN_DATA_PATH = "../../DNP3_Intrusion_Detection_Dataset_Final/Training_Testing_Balanced_CSV_Files/CICFlowMeter/CICFlowMeter_Training_Balanced.csv"
TEST_DATA_PATH = "../../DNP3_Intrusion_Detection_Dataset_Final/Training_Testing_Balanced_CSV_Files/CICFlowMeter/CICFlowMeter_Testing_Balanced.csv"

# CSVファイルの読み込み
raw_data_train = pd.read_csv(TRAIN_DATA_PATH).replace([np.inf, -np.inf], np.nan).dropna(how="all").dropna(how="all", axis=1)
raw_data_test = pd.read_csv(TEST_DATA_PATH).dropna(how="all").replace([np.inf, -np.inf], np.nan).dropna(how="all", axis=1)

raw_data_train["Binary Label"] = raw_data_train["Label"] == "NORMAL"
raw_data_test["Binary Label"] = raw_data_test["Label"] == "NORMAL"

train_env = gym.make("flowenv/FlowTrain-v0", data=raw_data_train)
# test_env = gym.make("flowenv/FlowTest-v0", data=raw_data_test)

## Agent

In [63]:
class A2C(nn.Module):
    def __init__(self, env, hidden_size=128, gamma=0.99, random_seed=None):
        super().__init__()

        if random_seed:
            env.seed(random_seed)
            torch.manual_seed(random_seed)

        self.env = env
        self.gamma = gamma
        self.hidden_size = hidden_size
        self.in_size = len(env.observation_space.sample().flatten())
        self.out_size = env.action_space.n

    def forward(self, state):
        pass

class Actor(A2C):
    def __init__(self, env, hidden_size=128, gamma=0.99, random_seed=None):
        super().__init__(env, hidden_size, gamma, random_seed)

        self.fc1 = nn.Linear(self.in_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.out_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state.clone().detach()))
        x = self.fc2(x)

        return x

class Critic(A2C):
    def __init__(self, env, hidden_size=128, gamma=0.99, random_seed=None):
        super().__init__(env, hidden_size, gamma, random_seed)

        self.fc1 = nn.Linear(self.in_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, 1)

    def forward(self, state):
        x = torch.relu(self.fc1(state.clone().detach()))
        x = self.fc2(x)

        return x

In [64]:
Transaction = namedtuple('Transaction', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        # self.capacity = capacity
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transaction(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [65]:
def optimizer_step():
    pass

def select_action(arg_state):
    state_tensor = torch.tensor(arg_state, dtype=torch.float64)
    if torch.isnan(state_tensor).any():
        state_tensor = torch.nan_to_num(state_tensor, nan=0.0)

    action_logits = actor(state_tensor)
    action = Categorical(logits=action_logits).sample()

    return action

### Loss

$$
\text{Actor Loss} = - \mathbb{E}[\log \pi(a_t|s_t) \cdot A_t]
$$

$$
\text{Critic Loss} = \frac{1}{2} \mathbb{E}[(V(s_t) - R_t)^2]
$$

$$
\text{Total Loss} = \text{Actor Loss} + \lambda \cdot \text{Critic Loss}
$$

In [73]:
# const values
BATCH_SIZE = 64
LAMBDA = 0.5

# variables
actor = Actor(train_env)
critic = Critic(train_env)
actor_optimizer = optim.Adam(actor.parameters(), lr=1e3)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e3)

action_lp_vals = []
critic_vals = []
rewards = []
memory = ReplayMemory(10000)

def optimizer_model():
    if len(memory) < BATCH_SIZE:
        return
    critic_optimizer.zero_grad()
    actor_optimizer.zero_grad()

    transactions = memory.sample(BATCH_SIZE)
    batch = Transaction(*zip(*transactions))

    state_batch = torch.stack(batch.state)
    action_batch = torch.stack(batch.action).unsqueeze(1)
    reward_batch = torch.tensor(batch.reward, dtype=torch.float32)
    next_state_batch = torch.stack([torch.tensor(ns, dtype=torch.float32) for ns in batch.next_state])

    values = critic(state_batch).squeeze()
    next_values = critic(next_state_batch).squeeze()
    targets = reward_batch + critic.gamma * next_values
    advantages = targets - values

    action_probs = actor(state_batch)
    action_log_props = torch.log(action_probs.gather(1, action_batch))
    actor_loss = -torch.mean(action_log_props * advantages.detach())

    critic_loss = nn.MSELoss()(values, targets.detach())

    total_loss = actor_loss + LAMBDA * critic_loss

    total_loss.backward()

    utils.clip_grad_norm_(actor.parameters(), 1.0)
    utils.clip_grad_norm_(critic.parameters(), 1.0)

    actor_optimizer.step()
    critic_optimizer.step()

for i_episode in range(1000):
    state, _ = train_env.reset()
    done = False
    sum_rewards = 0

    while not done:
        state = torch.tensor(state, dtype=torch.float32)
        if torch.isnan(state).any():
            state = torch.nan_to_num(state, nan=1e-6)
            print("nan detected")
        action_logits = actor(state)
        if torch.isnan(action_logits).any():
            print(state)
            print(action_logits)
            # action_logits = torch.nan_to_num(action_logits, nan=0.0)
        action = Categorical(logits=action_logits).sample()

        action_log_prob = action_logits[action]
        pred = torch.squeeze(critic(state).view(-1))

        action_lp_vals.append(action_log_prob)
        critic_vals.append(pred)

        next_state, reward, terminated, truncated, _ = train_env.step(action)
        memory.push(state, action, next_state, reward)
        done = terminated or truncated
        sum_rewards += reward

        state = next_state

    rewards.append(sum_rewards)
    print(f"Episode {i_episode:4} finished with reward {sum_rewards:+2}, length of memory {len(memory)}")
    optimizer_model()


Episode    0 finished with reward +78, length of memory 100
Episode    1 finished with reward +12, length of memory 200
Episode    2 finished with reward +82, length of memory 300
Episode    3 finished with reward +76, length of memory 400
Episode    4 finished with reward +78, length of memory 500
Episode    5 finished with reward +80, length of memory 600
Episode    6 finished with reward -6, length of memory 700
Episode    7 finished with reward -12, length of memory 800
Episode    8 finished with reward -2, length of memory 900
Episode    9 finished with reward -14, length of memory 1000
Episode   10 finished with reward -8, length of memory 1100
Episode   11 finished with reward -4, length of memory 1200
Episode   12 finished with reward -8, length of memory 1300
Episode   13 finished with reward +0, length of memory 1400
Episode   14 finished with reward +4, length of memory 1500
Episode   15 finished with reward -8, length of memory 1600
Episode   16 finished with reward -10, le

ValueError: Expected parameter logits (Tensor of shape (2,)) of distribution Categorical(logits: torch.Size([2])) to satisfy the constraint IndependentConstraint(Real(), 1), but found invalid values:
tensor([nan, nan], grad_fn=<SubBackward0>)