In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
import networkx as nx
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import time
import numpy as np
from torch_geometric.nn import Linear, BatchNorm, GINEConv, global_mean_pool
from torch_geometric.loader import DataLoader
import pandas as pd

In [3]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [None]:
class StarGIN(torch.nn.Module):
    """Graph-level classifier built from a stack of GINEConv layers.

    Node and edge features are first projected to ``n_hidden`` dimensions by
    two linear embeddings. The node representation then goes through
    ``num_gnn_layers`` GINEConv layers, a small per-node MLP, a per-graph mean
    pooling and a final linear layer that produces ``n_classes`` logits.

    Args:
        num_features: Size of each input node feature vector.
        num_gnn_layers: Number of stacked GINEConv layers.
        n_classes: Number of logits produced per graph.
        n_hidden: Width of every hidden representation.
        edge_updates: Stored on the instance; not read anywhere in this class.
        edge_dim: Size of each input edge feature vector.
        dropout: Dropout probability used inside each GINEConv MLP and in the
            per-node MLP.
        final_dropout: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, num_features = 5, num_gnn_layers = 1, n_classes=36, n_hidden=64, edge_updates=True, edge_dim=77, dropout=0.0, final_dropout=0.5):
        super().__init__()

        self.n_hidden = n_hidden
        self.num_gnn_layers = num_gnn_layers
        self.edge_updates = edge_updates
        self.dropout = dropout
        self.final_dropout = final_dropout

        # Input projections: raw node / edge features -> n_hidden.
        self.node_emb = nn.Linear(num_features, n_hidden)
        self.edge_emb = nn.Linear(edge_dim, n_hidden)

        self.convs = nn.ModuleList()

        for _ in range(self.num_gnn_layers):
            # Each GINEConv wraps a two-layer MLP; edge_dim matches the width
            # produced by edge_emb.
            conv = GINEConv(nn.Sequential(
                nn.Linear(self.n_hidden, self.n_hidden),
                nn.BatchNorm1d(n_hidden),
                nn.Dropout(self.dropout),
                nn.ReLU(),
                nn.Linear(self.n_hidden, self.n_hidden),
                nn.BatchNorm1d(n_hidden),
                nn.Dropout(self.dropout),
                nn.ReLU(),
                ), edge_dim=self.n_hidden)
            self.convs.append(conv)

        # Per-node refinement applied after message passing, then the
        # graph-level classification head.
        self.mlp = nn.Sequential(Linear(n_hidden, n_hidden), nn.ReLU(), nn.Dropout(self.dropout))
        self.layer_pool = Linear(n_hidden, n_classes)

    def forward(self, x, edge_index, edge_attr, batch):
        """Compute per-graph logits for a (batched) graph.

        Args:
            x: Node features whose last dimension is ``num_features``.
            edge_index: Edge connectivity, passed unchanged to every GINEConv.
            edge_attr: Edge features whose last dimension is ``edge_dim``.
            batch: Node-to-graph assignment handed to ``global_mean_pool``.

        Returns:
            Tensor of logits with ``n_classes`` columns, one row per pooled
            graph.
        """
        # The embeddings already produce tensors on the device of their
        # inputs and parameters, so no explicit device transfer is needed.
        x = self.node_emb(x)
        edge_attr = self.edge_emb(edge_attr)

        for conv in self.convs:
            x = conv(x, edge_index, edge_attr)

        x = self.mlp(x)

        # Average node representations per graph, then classify.
        x_mean = global_mean_pool(x, batch)
        return self.layer_pool(x_mean)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def train(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(x, edge_index, edge_attr, batch)``.
        train_loader: Iterable of batches exposing ``to(device)``, ``x``,
            ``edge_index``, ``edge_attr``, ``batch`` and ``y``.
        criterion: Loss callable taking ``(output, target)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        device: Device the model and every batch are moved to.

    Returns:
        float: Mean of the per-batch losses (each batch counts equally,
        regardless of its size).

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.to(device)
    model.train()
    total_loss = 0.0
    num_batches = 0

    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.edge_attr, data.batch)
        # Flatten the labels so an (N, 1) column behaves like an (N,) vector.
        loss = criterion(output, data.y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Count the batches actually seen instead of relying on len(), so
    # iterables without __len__ are supported as well.
    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")
    return total_loss / num_batches

def evaluate(model, loader, criterion, device, labels=None):
    """Evaluate ``model`` on ``loader`` without updating its parameters.

    Args:
        model: Module called as ``model(x, edge_index, edge_attr, batch)``.
        loader: Iterable of batches exposing ``to(device)``, ``x``,
            ``edge_index``, ``edge_attr``, ``batch`` and ``y``.
        criterion: Loss callable taking ``(output, target)``.
        device: Device the model and every batch are moved to.
        labels: Optional list of class labels forwarded to
            ``confusion_matrix`` to fix the matrix size and ordering. The
            default ``None`` lets scikit-learn derive the labels from the
            data.

    Returns:
        tuple: ``(metrics_df, cm, average_loss)`` where ``metrics_df`` is a
        one-row DataFrame with accuracy and macro / micro / weighted
        precision, recall and F1; ``cm`` is the confusion matrix computed
        with the true labels as first argument; ``average_loss`` is the mean
        of the per-batch losses.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.to(device)
    model.eval()

    pred_chunks = []
    label_chunks = []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.edge_attr, data.batch)
            # Flatten once and use the same target for metrics and loss.
            target = data.y.reshape(-1)

            pred_chunks.append(output.argmax(dim=1).cpu())
            label_chunks.append(target.cpu())

            total_loss += criterion(output, target).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("loader yielded no batches")
    average_loss = total_loss / num_batches

    # Concatenate once at the end instead of re-allocating on every batch.
    all_preds_np = torch.cat(pred_chunks, dim=0).numpy()
    all_labels_np = torch.cat(label_chunks, dim=0).numpy()
    cm = confusion_matrix(all_labels_np, all_preds_np, labels=labels)

    # Column order: accuracy, then precision / recall / f1 for each averaging mode.
    metrics = {"accuracy": [accuracy_score(all_labels_np, all_preds_np)]}
    for average in ("macro", "micro", "weighted"):
        metrics[f"{average}_precision"] = [
            precision_score(all_labels_np, all_preds_np, average=average, zero_division=0)
        ]
        metrics[f"{average}_recall"] = [
            recall_score(all_labels_np, all_preds_np, average=average, zero_division=0)
        ]
        metrics[f"{average}_f1"] = [
            f1_score(all_labels_np, all_preds_np, average=average, zero_division=0)
        ]
    metrics_df = pd.DataFrame(metrics)

    return metrics_df, cm, average_loss

In [None]:
# Path of the serialized graph list consumed by the rest of the notebook.
pyg_file = r'C:\Users\LEENT\Desktop\CICandMal17\Graph\cat_full_graph_data.pt'

# weights_only=False lets torch.load unpickle arbitrary Python objects, so
# this must only ever be pointed at a trusted file.
list_data = torch.load(pyg_file, weights_only=False)

# Flatten every sample's label tensor into one flat Python list.
labels = []
for data in list_data:
    labels += data.y.tolist()

print('Prepare data input model successful !')

Prepare data input model successful !


In [None]:
from collections import Counter

# Samples per class label, plus a label-sorted view used for weights and reporting.
label_counts = Counter(labels)
sorted_counts = dict(sorted(label_counts.items()))

total_samples = len(labels)

# Inverse-frequency weights (total / count), ordered by ascending label.
# NOTE(review): the weight at position i is applied to class index i by the
# loss, so this assumes labels are exactly 0..K-1 - confirm for new datasets.
class_weights = torch.tensor(
    [total_samples / count for count in sorted_counts.values()],
    dtype=torch.float
).to(device)

# Pair each weight with the label it was computed for rather than with its
# position, so the report stays correct for non-contiguous labels.
for label, weight in zip(sorted_counts, class_weights):
    print(f"Class {label}: count = {sorted_counts[label]}, weight = {weight:.4f}")

# Hyper-parameters shared by the training cells below.
lr = 0.001
h_layers = 64
epochs = 1000

In [None]:
import random
from collections import defaultdict

def split_data(data_list, ratio=0.2, seed=42):
    """Build a class-balanced (test, train) split by undersampling.

    Samples are grouped by their label ``data.y``. Each class is shuffled
    with a private ``random.Random(seed)`` and truncated to the size of the
    smallest class. Of the kept samples of every class, the first
    ``int(min_count * ratio)`` go to the test split (at least one when the
    smallest class has two or more samples) and the rest to the train split.
    Samples beyond the smallest class size are dropped.

    Args:
        data_list: Iterable of samples whose ``y`` is either an object with
            ``item()`` or something ``int()`` accepts.
        ratio: Fraction of the smallest class size placed in the test split.
        seed: Seed of the private random generator used for shuffling.

    Returns:
        tuple: ``(test_samples, train_samples)``; both are empty lists when
        ``data_list`` is empty.
    """
    rng = random.Random(seed)

    # Group samples by class label, preserving first-seen class order.
    family_dict = defaultdict(list)
    for data in data_list:
        fam = data.y.item() if hasattr(data.y, "item") else int(data.y)
        family_dict[fam].append(data)

    # Nothing to split: min() below would fail on an empty sequence.
    if not family_dict:
        return [], []

    min_count = min(len(samples) for samples in family_dict.values())

    k_test = int(min_count * ratio)
    if k_test == 0 and min_count >= 2:
        # Guarantee at least one test sample per class when possible.
        k_test = 1

    test_split, train_split = [], []
    for samples in family_dict.values():
        rng.shuffle(samples)

        # Undersample to the smallest class, then cut test / train.
        chosen = samples[:min_count]
        test_split.extend(chosen[:k_test])
        train_split.extend(chosen[k_test:min_count])

    return test_split, train_split


In [None]:

import os, time
import psutil
import pandas as pd
import pynvml

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader  

def get_resource_info(gpu_index=0):
    """Take a snapshot of CPU, process RAM and GPU usage.

    Args:
        gpu_index: NVML index of the GPU to query.

    Returns:
        tuple: ``(cpu_percent, ram_process_kb, vram_used_kb, gpu_util_percent)``.
        ``cpu_percent`` is whatever ``psutil.cpu_percent(interval=None)``
        reports (utilisation since the previous call, per the psutil
        documentation). ``ram_process_kb`` is this process's RSS divided by
        1024. ``vram_used_kb`` is the ``used`` field of the NVML memory info
        for the device divided by 1024. ``gpu_util_percent`` is the NVML GPU
        utilisation rate. When NVML cannot be initialised (for example on a
        CPU-only machine) the two GPU values are reported as ``0`` and
        ``0.0``.
    """
    cpu_percent = psutil.cpu_percent(interval=None)

    process = psutil.Process(os.getpid())
    ram_process_kb = int(process.memory_info().rss // 1024)

    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        # No usable NVIDIA driver / NVML library: keep the CPU and RAM
        # figures and report neutral GPU values so CPU-only runs still work.
        return cpu_percent, ram_process_kb, 0, 0.0

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        vram_used_kb = int(mem_info.used // 1024)

        gpu_util_percent = float(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
    finally:
        # Always release NVML, even if one of the queries above raised.
        pynvml.nvmlShutdown()

    return cpu_percent, ram_process_kb, vram_used_kb, gpu_util_percent



def shorten_pyg_name(path: str) -> str:
    """Return the final component of ``path`` (the file name without its directories)."""
    _, file_name = os.path.split(path)
    return file_name


def save_csv(dataframe: pd.DataFrame, filepath: str):
    """Write ``dataframe`` to ``filepath`` as CSV, appending if the file exists.

    A new file is written with a header row; rows added to an existing file
    are appended without one. The index is never written.
    """
    if os.path.isfile(filepath):
        # Existing file: add the rows only, keeping the header already there.
        dataframe.to_csv(filepath, mode="a", header=False, index=False)
        return

    dataframe.to_csv(filepath, index=False)


def append_row_csv(row_dict: dict, filepath: str):
    """Append one record to the CSV at ``filepath``, creating it if needed.

    ``row_dict`` becomes a single-row table. The header is written only when
    the file does not exist yet; the index is never written.
    """
    file_exists = os.path.isfile(filepath)
    pd.DataFrame([row_dict]).to_csv(
        filepath,
        mode="a" if file_exists else "w",
        header=not file_exists,
        index=False,
    )



# Output folder for this learning-rate / hidden-size combination; created on demand.
results_dir = f"C:/Users/LEENT/Desktop/CICandMal17/Results/2_layer/CAT_LR{lr}and{h_layers}/"
os.makedirs(results_dir, exist_ok=True)

# Best-epoch metric tables, one file per F1 flavour used for model selection.
mac_results     = f"{results_dir}classification_5d_macro_LR{lr}and{h_layers}.csv"
mic_results     = f"{results_dir}classification_5d_micro_LR{lr}and{h_layers}.csv"
w_results       = f"{results_dir}classification_5d_weight_LR{lr}and{h_layers}.csv"

# Confusion matrices that go with each of the tables above.
mac_cm_results  = f"{results_dir}classification_5d_macro_cm_LR{lr}and{h_layers}.csv"
mic_cm_results  = f"{results_dir}classification_5d_micro_cm_LR{lr}and{h_layers}.csv"
w_cm_results    = f"{results_dir}classification_5d_weight_cm_LR{lr}and{h_layers}.csv"

# Per-epoch resource usage log.
resource_csv    = f"{results_dir}resource_usage_LR{lr}and{h_layers}.csv"

# File name of the dataset, recorded in every CSV row.
pyg_name_for_csv = shorten_pyg_name(pyg_file)

# Class-balanced hold-out split; the remainder is the pool training data is drawn from.
test_data, remain_data = split_data(list_data, 0.2)

# Flat label list of the remaining pool, used for stratified subsampling.
remain_labels = []
for d in remain_data:
    remain_labels += d.y.tolist()

# Number of target classes; shared by the model head and the confusion-matrix frame.
n_classes = 5

# The stratified subsample uses a fixed random_state, so it is identical for
# every run and only needs to be computed once. The 20% in not_use_data is
# left unused.
# NOTE(review): if each run was meant to train on a different subsample, vary
# random_state per run instead.
temp_data, not_use_data = train_test_split(remain_data, test_size=0.2, random_state=42, stratify=remain_labels)

for i in range(2):
    # Every run starts from a freshly initialised model and optimizer.
    model = StarGIN(num_features=127, n_classes=n_classes, num_gnn_layers = 2,  n_hidden=h_layers, edge_updates=False, dropout=0.5)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # NOTE(review): class_weights are derived from the label counts of the
    # full dataset, while split_data() keeps the same number of samples per
    # class - confirm the weighting is still intended.
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    train_loader = DataLoader(temp_data, batch_size=32, shuffle=True)
    # Evaluation does not depend on sample order, so the test loader is not shuffled.
    test_loader  = DataLoader(test_data, batch_size=32, shuffle=False)

    total_loss_epochs = 0.0

    train_metrics = pd.DataFrame()
    test_metrics  = pd.DataFrame()

    # Best-so-far trackers start below any reachable F1 so the first epoch is
    # always recorded; otherwise a run whose F1 never exceeds 0 would leave
    # empty frames and break the summary below.
    mac_f1_final = -1.0
    mac_final_cm = pd.DataFrame()
    mac_final_metrics = pd.DataFrame()

    mic_f1_final = -1.0
    mic_final_cm = pd.DataFrame()
    mic_final_metrics = pd.DataFrame()

    w_f1_final = -1.0
    w_final_cm = pd.DataFrame()
    w_final_metrics = pd.DataFrame()

    train_time = 0.0
    test_time  = 0.0

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        # ---- training phase -------------------------------------------------
        # The timed window covers the "before" resource probe, the training
        # pass and the evaluation on the training loader.
        start = time.perf_counter()
        cpu_b, ram_proc_b, vram_b, gpu_b = get_resource_info(gpu_index=0)

        train_loss = train(model, train_loader, criterion, optimizer, device)
        train_metrics, _, _ = evaluate(model, train_loader, criterion, device)

        end = time.perf_counter()
        train_time += int(1000 * (end - start))  # ms

        cpu_a, ram_proc_a, vram_a, gpu_a = get_resource_info(gpu_index=0)

        # Differences between the two snapshots taken around the phase.
        cpu_delta = cpu_a - cpu_b                     # %
        ram_kb_delta = ram_proc_a - ram_proc_b        # KB
        vram_kb_delta = vram_a - vram_b               # KB
        gpu_delta = gpu_a - gpu_b                     # %

        print(f"[TRAIN]  CPU: {cpu_delta:.2f} % | RAM: {ram_kb_delta:.0f} KB | "
            f"VRAM: {vram_kb_delta:.0f} KB | GPU Usage: {gpu_delta:.2f} %")

        # Train and test rows go to the same CSV, whose header is written only
        # once, so both rows must use exactly the same keys in the same order.
        # The "phase" column tells which cumulative timer the last column holds.
        row_train = {
            "run_index": i,
            "epoch": epoch + 1,
            "phase": "train",
            "file_name": pyg_name_for_csv,
            "cpu_used_percent_delta": cpu_delta,
            "ram_used_process_kb_delta": ram_kb_delta,
            "vram_used": vram_kb_delta,
            "gpu_used_percent_delta": gpu_delta,
            "elapsed_time_ms_cum": train_time,
        }
        append_row_csv(row_train, resource_csv)

        # ---- test phase -----------------------------------------------------
        start = time.perf_counter()
        cpu_b_t, ram_proc_b_t, vram_b_t, gpu_b_t = get_resource_info(gpu_index=0)

        test_metrics, cm, test_loss = evaluate(model, test_loader, criterion, device)

        end = time.perf_counter()
        test_time += int(1000 * (end - start))  # ms

        cpu_a_t, ram_proc_a_t, vram_a_t, gpu_a_t = get_resource_info(gpu_index=0)

        cpu_delta_t = cpu_a_t - cpu_b_t                     # %
        ram_kb_delta_t = ram_proc_a_t - ram_proc_b_t        # KB
        vram_kb_delta_t = vram_a_t - vram_b_t               # KB
        gpu_delta_t = gpu_a_t - gpu_b_t                     # %

        print(f"[TEST]  CPU: {cpu_delta_t:.2f} % | RAM: {ram_kb_delta_t:.0f} KB | "
            f"VRAM: {vram_kb_delta_t:.0f} KB | GPU Usage: {gpu_delta_t:.2f} %")

        row_test = {
            "run_index": i,
            "epoch": epoch + 1,
            "phase": "test",
            "file_name": pyg_name_for_csv,
            "cpu_used_percent_delta": cpu_delta_t,
            "ram_used_process_kb_delta": ram_kb_delta_t,
            "vram_used": vram_kb_delta_t,
            "gpu_used_percent_delta": gpu_delta_t,
            "elapsed_time_ms_cum": test_time,
        }
        append_row_csv(row_test, resource_csv)

        # Requires cm to be n_classes x n_classes, i.e. every class must occur
        # among the true or predicted test labels.
        df_cm = pd.DataFrame(cm, index=range(n_classes), columns=range(n_classes))
        total_loss_epochs += test_loss

        # Keep the metrics and confusion matrix of the best epoch per F1 flavour.
        # NOTE(review): the best epoch is chosen on the test set itself, so the
        # reported numbers are optimistic; a separate validation split would
        # avoid that.
        if mac_f1_final < test_metrics["macro_f1"].iloc[0]:
            mac_f1_final = test_metrics["macro_f1"].iloc[0]
            mac_final_metrics = test_metrics
            mac_final_cm = df_cm

        if w_f1_final < test_metrics["weighted_f1"].iloc[0]:
            w_f1_final = test_metrics["weighted_f1"].iloc[0]
            w_final_metrics = test_metrics
            w_final_cm = df_cm

        if mic_f1_final < test_metrics["micro_f1"].iloc[0]:
            mic_f1_final = test_metrics["micro_f1"].iloc[0]
            mic_final_metrics = test_metrics
            mic_final_cm = df_cm

    # Attach run-level bookkeeping to each best-epoch table before saving.
    for m in [mac_final_metrics, mic_final_metrics, w_final_metrics]:
        m["elapsed_time"] = train_time
        m["test_time"] = test_time
        m["experiment"] = "GINv11"
        m["file_name"] = pyg_name_for_csv

    save_csv(mac_final_metrics, mac_results)
    save_csv(mic_final_metrics, mic_results)
    save_csv(w_final_metrics, w_results)
    save_csv(mac_final_cm, mac_cm_results)
    save_csv(mic_final_cm, mic_cm_results)
    save_csv(w_final_cm, w_cm_results)


    # "Loss average" is the mean test loss over all epochs of this run.
    print(f"Loss average: {total_loss_epochs / epochs:.6f}")
    print(f"Train accuracy: {train_metrics['accuracy'].iloc[0] * 100:.2f}%")

    print("\nBest model based on macro:")
    print(f"Accuracy: {mac_final_metrics['accuracy'].iloc[0] * 100:.2f}% | "
          f"Recall: {mac_final_metrics['macro_recall'].iloc[0] * 100:.2f}% | "
          f"Precision: {mac_final_metrics['macro_precision'].iloc[0] * 100:.2f}% | "
          f"F1: {mac_final_metrics['macro_f1'].iloc[0] * 100:.2f}%")
    print(mac_final_cm)

    print("\nBest model based on weighted:")
    print(f"Accuracy: {w_final_metrics['accuracy'].iloc[0] * 100:.2f}% | "
          f"Recall: {w_final_metrics['weighted_recall'].iloc[0] * 100:.2f}% | "
          f"Precision: {w_final_metrics['weighted_precision'].iloc[0] * 100:.2f}% | "
          f"F1: {w_final_metrics['weighted_f1'].iloc[0] * 100:.2f}%")
    print(w_final_cm)

    print("\nBest model based on micro:")
    print(f"Accuracy: {mic_final_metrics['accuracy'].iloc[0] * 100:.2f}% | "
          f"Recall: {mic_final_metrics['micro_recall'].iloc[0] * 100:.2f}% | "
          f"Precision: {mic_final_metrics['micro_precision'].iloc[0] * 100:.2f}% | "
          f"F1: {mic_final_metrics['micro_f1'].iloc[0] * 100:.2f}%")
    print(mic_final_cm)
