# DRL Framework Training - Production Grade\n\nThis notebook trains a Deep Q-Network (DQN) agent for cybersecurity malware detection.\n\n**Features:**\n- Real telemetry data ingestion\n- GPU acceleration\n- Experience replay\n- Target network updates\n- ONNX export for C++ inference\n- Comprehensive logging and metrics

In [None]:
# Install dependencies\n!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n!pip install numpy pandas matplotlib tensorboard onnx onnxruntime scikit-learn

In [None]:
import torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom collections import deque\nimport random\nimport json\nimport os\nfrom datetime import datetime\nfrom torch.utils.tensorboard import SummaryWriter\n\n# Check GPU availability\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\nprint(f'Using device: {device}')

## 1. Define DQN Architecture

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: one output value per action.

    The network is a stack of ``Linear -> ReLU -> Dropout`` stages, one per
    entry of ``hidden_dims``, followed by a final ``Linear`` layer with
    ``action_dim`` outputs.

    Args:
        state_dim: Number of input features.
        action_dim: Number of outputs of the final layer.
        hidden_dims: Width of each hidden layer, in order. Any iterable of
            ints works (list or tuple); an empty iterable yields a single
            linear layer from ``state_dim`` to ``action_dim``.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, state_dim, action_dim, hidden_dims=(256, 256, 128),
                 dropout=0.2):
        super().__init__()

        layers = []
        input_dim = state_dim
        for hidden_dim in hidden_dims:
            # A Dropout module is inserted even when dropout == 0 so that the
            # Sequential indices (and therefore the state_dict keys) do not
            # depend on the chosen rate.
            layers.extend((
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ))
            input_dim = hidden_dim

        # Output head: no activation, raw values per action.
        layers.append(nn.Linear(input_dim, action_dim))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.network(x)

## 2. Experience Replay Buffer

In [None]:
class ReplayBuffer:
    """Bounded store of transitions with uniform random sampling.

    Transitions are kept as ``(state, action, reward, next_state, done)``
    tuples in a ``deque``; once ``capacity`` entries are held, appending a
    new one discards the oldest.
    """

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions at random.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards,
            next_states, dones)``, each built from the corresponding field
            of the sampled transitions.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        fields = (states, actions, rewards, next_states, dones)
        return tuple(np.array(field) for field in fields)

## 3. DQN Agent

In [None]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a policy network, target network and replay.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of discrete actions.
        lr: Learning rate for the Adam optimizer on the policy network.
        gamma: Discount factor used in the bootstrapped target.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound that ``decay_epsilon`` never goes below.
        epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
    """

    def __init__(self, state_dim, action_dim, lr=0.0001, gamma=0.99,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.995):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        # Capture the module-level ``device`` once so every tensor created by
        # this agent lands on the same device as its networks.
        self.device = device

        # Networks: the target network starts as an exact copy of the policy
        # network and is kept in eval mode (it is never optimized directly).
        self.policy_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer()

    def select_action(self, state, training=True):
        """Pick an action for ``state``.

        With ``training=True`` a uniformly random action is returned with
        probability ``self.epsilon``; otherwise the action with the highest
        policy-network output is returned.

        Args:
            state: A single state (anything ``torch.as_tensor`` accepts).
            training: Enables epsilon-greedy exploration when true.

        Returns:
            The selected action index as a Python int.
        """
        if training and random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)

        # The policy network contains Dropout; evaluating it in train mode
        # would randomly perturb the Q-values and make the "greedy" choice
        # non-deterministic. Switch to eval for the forward pass and restore
        # whatever mode the network was in afterwards.
        was_training = self.policy_net.training
        self.policy_net.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    state, dtype=torch.float32, device=self.device
                ).unsqueeze(0)
                q_values = self.policy_net(state_tensor)
                return q_values.argmax().item()
        finally:
            self.policy_net.train(was_training)

    def train_step(self, batch_size=64):
        """Run one optimization step on a batch sampled from the replay buffer.

        Args:
            batch_size: Number of transitions to sample.

        Returns:
            The scalar loss as a float, or ``None`` when the buffer holds
            fewer than ``batch_size`` transitions (no update is performed).
        """
        if len(self.replay_buffer) < batch_size:
            return None

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Current Q values of the actions actually taken. squeeze(1) removes
        # only the gather dimension, so the result stays 1-D (batch,) even
        # when batch_size == 1 and matches the target's shape.
        current_q_values = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Target Q values: reward plus discounted best next value from the
        # target network; the bootstrap term is zeroed where done == 1.
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        # Compute loss
        loss = nn.MSELoss()(current_q_values, target_q_values)

        # Optimize, clipping the gradient norm to 1.0 before the step.
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_end``."""
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)