In [2]:
import networkx as nx
import pandas as pd
import scipy.sparse as sp
import numpy as np
import os
import torch
from torch_geometric.data import Data
import time
import psutil
import time


In [3]:
def create_full_pyg(node_file, edge_file, label):
    """Build one PyTorch Geometric graph from a node/edge CSV pair and profile the build.

    Both CSVs are read *before* profiling starts, so the reported time, memory
    and CPU figures cover only the conversion of the loaded frames into
    tensors and the final ``Data`` object.

    Args:
        node_file: Path to the node CSV. Needs an ``IP Address`` column (node
            identity) and a ``Label`` column; every other column becomes a
            node feature (assumed numeric -- TODO confirm).
        edge_file: Path to the edge CSV. Needs ``Src IP`` / ``Dst IP`` columns
            plus the identifier/label columns dropped below; every other
            column becomes an edge feature (assumed numeric -- TODO confirm).
        label: Integer class label stored as the graph-level target ``y``.

    Returns:
        A tuple ``(data, time_per_file, ram_per_file, cpu_per_file)``:
        ``data`` is the ``Data`` object (``x``, ``edge_index``, ``edge_attr``,
        ``y``); ``time_per_file`` is the wall-clock build time in
        milliseconds; ``ram_per_file`` is the change in this process's RSS in
        MB (can be negative); ``cpu_per_file`` is this process's CPU time
        divided by the wall-clock time of the build, as a percentage (can
        exceed 100 when several cores are busy).

    Raises:
        KeyError: If a required column is missing, or an edge references an
            IP address that does not appear in the node file.
    """
    # 1. Read the data before any resource measurement starts.
    node_df = pd.read_csv(node_file)
    edge_df = pd.read_csv(edge_file)

    # 2. Start measuring once reading is done.
    process = psutil.Process(os.getpid())
    initial_memory_mb = process.memory_info().rss / (1024 * 1024)
    # CPU seconds consumed by this process so far. Sampling the process
    # counters before/after the build measures the build itself, unlike the
    # system-wide psutil.cpu_percent(), which reports an unrelated window.
    cpu_before = process.cpu_times()
    start = time.perf_counter()

    # 3. Build node features; each IP address maps to its row index.
    node_ids = {ip: idx for idx, ip in enumerate(node_df['IP Address'])}
    node_features = node_df.drop(columns=['IP Address', 'Label']).values
    x = torch.tensor(node_features, dtype=torch.float)

    # Row 0 holds source node indices, row 1 destination node indices.
    edge_index = torch.tensor([
        [node_ids[src_ip] for src_ip in edge_df['Src IP']],
        [node_ids[dst_ip] for dst_ip in edge_df['Dst IP']]
    ], dtype=torch.long)

    # Identifier and label columns are not features; drop them.
    edge_features = edge_df.drop(columns=[
        'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'Label', 'Cat_Label', 'Family_Label',
    ]).values
    edge_attr = torch.tensor(edge_features, dtype=torch.float)

    y = torch.tensor([label], dtype=torch.long)
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

    end = time.perf_counter()
    cpu_after = process.cpu_times()
    final_memory_mb = process.memory_info().rss / (1024 * 1024)

    elapsed_s = end - start
    cpu_seconds = (cpu_after.user - cpu_before.user) + (cpu_after.system - cpu_before.system)

    time_per_file = elapsed_s * 1000  # seconds -> milliseconds
    ram_per_file = final_memory_mb - initial_memory_mb
    # Guard against a zero-length window on very coarse clocks.
    cpu_per_file = (cpu_seconds / elapsed_s) * 100 if elapsed_s > 0 else 0.0

    return data, time_per_file, ram_per_file, cpu_per_file


In [None]:
import os
import csv

def process_all_files(node_dir, edge_dir, label, output_csv="process_stats_category.csv"):
    """Convert every node/edge CSV pair found under ``node_dir`` and log per-file costs.

    ``node_dir`` is walked recursively. For each ``*.csv`` node file the edge
    file with the same relative path under ``edge_dir`` is looked up; when it
    exists the pair is passed to ``create_full_pyg``. Files that are not CSVs,
    or that have no matching edge file, are counted and reported. A pair that
    raises during conversion is reported and skipped.

    ``output_csv`` is overwritten: it receives one row per converted file
    followed by a blank row and a summary (file count, averages, skip counts).

    Args:
        node_dir: Root directory of the node CSV files.
        edge_dir: Root directory of the edge CSV files, mirroring ``node_dir``.
        label: Class label forwarded to ``create_full_pyg`` for every graph.
        output_csv: Path of the statistics CSV to write.

    Returns:
        ``(pyg_data_list, total_time, total_ram, total_cpu)`` -- the converted
        graphs and the *summed* (not averaged) per-file measurements.
    """
    graphs = []
    count_not_csv = 0
    count_missing_edge = 0
    total_files = 0
    total_time = total_ram = total_cpu = 0

    with open(output_csv, mode="w", newline="", encoding="utf-8") as stats_file:
        stats = csv.writer(stats_file)
        stats.writerow(["File Name", "Time (ms)", "RAM (MB)", "CPU (%)"])

        for root, _subdirs, files in os.walk(node_dir):
            # Edge files mirror the node tree: same sub-folder, same file name.
            edge_root = os.path.join(edge_dir, os.path.relpath(root, node_dir))

            for file_name in files:
                # Skip (and count) anything that is not a CSV.
                if not file_name.endswith(".csv"):
                    count_not_csv += 1
                    print(f"[B·ªè qua] Kh√¥ng ph·∫£i CSV file: {file_name}")
                    continue

                node_file = os.path.join(root, file_name)
                edge_file = os.path.join(edge_root, file_name)

                # Skip (and count) node files without a matching edge file.
                if not os.path.exists(edge_file):
                    count_missing_edge += 1
                    print(f"[Thi·∫øu Edge File] Kh√¥ng t√¨m th·∫•y file edge t∆∞∆°ng ·ª©ng cho: {node_file}")
                    continue

                try:
                    graph, time_ms, ram_mb, cpu_percent = create_full_pyg(node_file, edge_file, label)
                    graphs.append(graph)
                    stats.writerow([file_name, time_ms, ram_mb, cpu_percent])

                    total_time += time_ms
                    total_ram += ram_mb
                    total_cpu += cpu_percent
                    total_files += 1
                except Exception as e:
                    # Report the failing file and carry on with the rest.
                    print(f"[L·ªói] Kh√¥ng x·ª≠ l√Ω ƒë∆∞·ª£c file: {file_name} ‚Äî {e}")

    # Averages are reported as 0 when nothing was converted.
    if total_files:
        avg_time = total_time / total_files
        avg_ram = total_ram / total_files
        avg_cpu = total_cpu / total_files
    else:
        avg_time = avg_ram = avg_cpu = 0

    # Append the summary below the per-file rows, separated by a blank row.
    with open(output_csv, mode="a", newline="", encoding="utf-8") as stats_file:
        csv.writer(stats_file).writerows([
            [],
            ["T·ªïng s·ªë file ƒë√£ x·ª≠ l√Ω", total_files],
            ["Th·ªùi gian trung b√¨nh (ms)", avg_time],
            ["RAM trung b√¨nh (MB)", avg_ram],
            ["CPU trung b√¨nh (%)", avg_cpu],
            ["S·ªë file kh√¥ng ph·∫£i CSV", count_not_csv],
            ["S·ªë file thi·∫øu edge", count_missing_edge],
        ])

    print(f"\nüìä T·ªïng s·ªë file ƒë√£ x·ª≠ l√Ω: {total_files}")
    print(f"‚è±Ô∏è Th·ªùi gian trung b√¨nh: {avg_time:.2f} ms")
    print(f"üß† RAM trung b√¨nh tƒÉng th√™m: {avg_ram:.2f} MB")
    print(f"üñ•Ô∏è CPU trung b√¨nh: {avg_cpu:.2f} %")
    print(f"‚ö†Ô∏è S·ªë file kh√¥ng ph·∫£i CSV: {count_not_csv}")
    print(f"‚ùå S·ªë file thi·∫øu edge t∆∞∆°ng ·ª©ng: {count_missing_edge}")

    return graphs, total_time, total_ram, total_cpu


In [None]:
# Per-class directories holding the node CSV files; each is passed as
# `node_dir` to process_all_files.
# NOTE(review): absolute, machine-specific Windows paths -- adjust before
# running on another machine.
benign_node = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Node_Normalized\1Benign"
sms_malware_node = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Node_Normalized\2Smsmalware"
ransomware_node = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Node_Normalized\3Ransomware"
adware_node = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Node_Normalized\4Adware"
scareware_node = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Node_Normalized\5Scareware"


# Per-class directories holding the matching edge CSV files; each is passed
# as `edge_dir` to process_all_files.
# NOTE(review): these mix "\" and "/" separators, and "2SMSmalware" differs in
# letter case from the node folder "2Smsmalware". Presumably harmless on a
# case-insensitive Windows filesystem -- confirm the real folder names.
benign_edge = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Normalized/1Benign"
sms_malware_edge = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Normalized/2SMSmalware"
ransomware_edge = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Normalized/3Ransomware"
adware_edge = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Normalized/4Adware"
scareware_edge = r"C:\Users\LEENT\Desktop\CICandMal17\Test512_Normalized/5Scareware"


In [None]:
import csv
import torch

# Summary log: one row per dataset, appended by log_to_csv.
log_csv = r"C:\Users\LEENT\Desktop\CICandMal17\Results\pyg_resource\binary\process_log_pyg_binary.csv"

# Create the results folder up front; open(..., "w") does not create missing
# parent directories and would otherwise fail with FileNotFoundError.
os.makedirs(os.path.dirname(log_csv), exist_ok=True)

# Start a fresh log containing only the header row (any previous log is overwritten).
with open(log_csv, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Dataset", "Graphs", "Total Time (ms)", "Total RAM (MB)", "Total CPU (%)"])

def log_to_csv(dataset_name, graph_count, total_time, total_ram, total_cpu):
    """Append one summary row to the CSV file at the module-level ``log_csv`` path.

    The values are written in argument order: dataset name, graph count,
    time, RAM, CPU.
    """
    record = (dataset_name, graph_count, total_time, total_ram, total_cpu)
    with open(log_csv, mode="a", newline="", encoding="utf-8") as log_file:
        csv.writer(log_file).writerow(record)

folder_datas = []
data_list = []
total_time = 0
total_ram = 0
total_cpu = 0

# One entry per dataset, processed in this order:
# (name written to the log, progress heading, node dir, edge dir, binary label).
# Binary labelling: benign graphs are class 0, every malware family is class 1.
datasets = [
    ("Benign", "Benign graph", benign_node, benign_edge, 0),
    ("SMSMalware", "SMSMalware graph:", sms_malware_node, sms_malware_edge, 1),
    ("Ransomware", "Ransomware graph:", ransomware_node, ransomware_edge, 1),
    ("Adware", "Adware graph:", adware_node, adware_edge, 1),
    ("Scareware", "Scareware graph:", scareware_node, scareware_edge, 1),
]

# Per-file statistics are written next to the summary log, one CSV per
# dataset. Relying on process_all_files' default output path would make every
# dataset overwrite the previous dataset's statistics file.
stats_dir = os.path.dirname(log_csv)
os.makedirs(stats_dir, exist_ok=True)

for dataset_name, heading, node_dir, edge_dir, label in datasets:
    stats_csv = os.path.join(stats_dir, f"process_stats_{dataset_name.lower()}.csv")
    folder_datas, per_time, per_ram, per_cpu = process_all_files(
        node_dir, edge_dir, label, output_csv=stats_csv
    )
    print(f"{heading} {len(folder_datas)}")
    data_list.extend(folder_datas)
    total_time += per_time
    total_ram += per_ram
    total_cpu += per_cpu
    print(f"Total time: {total_time}")
    print(f"Total ram: {total_ram}")
    print(f"Total CPU: {total_cpu}")
    # NOTE(review): each row logs the *running* totals across the datasets
    # processed so far, alongside the graph count of this dataset only.
    # Confirm this mix is intended (per_time / per_ram / per_cpu hold the
    # per-dataset values).
    log_to_csv(dataset_name, len(folder_datas), total_time, total_ram, total_cpu)


pyg_file = r"C:\Users\LEENT\Desktop\CICandMal17\Graph\bin1_full_graph_data.pt"
print(f"Total graph: {len(data_list)}")
print(f"Total time: {total_time}")
print(f"Total ram: {total_ram}")
print(f"Total CPU: {total_cpu}")

# Make sure the output folder exists so the long conversion above is not lost
# to a missing directory at save time.
os.makedirs(os.path.dirname(pyg_file), exist_ok=True)
torch.save(data_list, pyg_file)
print(f"‚úÖ K·∫øt qu·∫£ ƒë√£ l∆∞u v√†o {log_csv}")
