### Libraries

In [5]:
%%capture
%reset -f  
'generic imports'
import pandas as pd
import json
import time 
import os
import torch
from IPython.display import display, clear_output
from numba import cuda  
from sklearn.model_selection import train_test_split

'data augmentation imports'
from be_great import GReaT # https://github.com/kathrinse/be_great

### GPU

In [7]:
# Report the GPU that will be used for training.
# The numba probing is done only when PyTorch can see a CUDA device; otherwise
# those calls would raise before the "No GPU available" branch could be reached.
if torch.cuda.is_available():
    # Select GPU device
    cuda.select_device(0)
    # Clear the deallocations pending on the current context
    cuda.current_context().deallocations.clear()
    # Get the current CUDA context and GPU information
    ctx = cuda.current_context()
    # Get the current memory information from the cuda context
    # (index 0 is the value reported below as available memory, in bytes)
    meminfo = ctx.get_memory_info()
    # Get the device name
    device_name = cuda.gpus[0].name.decode()
    # Get the CUDA driver version. The variable keeps its historical name,
    # but the value is the CUDA driver version, not the cuDNN version.
    cudnn_version = cuda.cudadrv.driver.get_version()
    # Keep both prints: the first one deliberately does not end the line
    print('GPU is available', end=' ')
    print(f'({device_name})\nAvailable GPU memory: {meminfo[0] / 1024**3:.1f} GB\nCUDA driver version: {cudnn_version}')
else:
    print("No GPU available")

GPU is available (NVIDIA GeForce RTX 4060 Laptop GPU)
Available GPU memory: 7.5 GB
cuDNN version: (12, 2)


### Loading Data

In [8]:
# Training set: attack classes plus the "Normal" class (100k rows)
train_csv_path = '../data/EdgeIIot_train_100k.csv'
df_train = pd.read_csv(train_csv_path, low_memory=False)

#### Calculate the number of rows to augment per attack type

In [9]:
# Number of rows every attack type should reach after augmentation
TARGET_ROWS_PER_CLASS = 100000

# Count the rows of each Attack_type in a single pass over the column
rows_per_attack = df_train["Attack_type"].value_counts()

# Rows still missing per attack type. The keys are inserted in order of first
# appearance so that the dictionary order (which the group split relies on)
# is unchanged; the result is clamped at 0 so that a class already above the
# target never yields a negative sample count.
counts_dict = {
    attack: max(TARGET_ROWS_PER_CLASS - int(rows_per_attack.get(attack, 0)), 0)
    for attack in df_train["Attack_type"].unique()
}

# "Normal" is not augmented; the default avoids a KeyError if it is absent
counts_dict.pop("Normal", None)

print("Rows to be augmented by attack type:\n")
for key, value in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True):
    print(f"{key:<22} {value:>10}")

Rows to be augmented by attack type:

MITM                        99718
Fingerprinting              99293
Ransomware                  92237
XSS                         87969
Port_Scanning               84085
Backdoor                    80756
Uploading                   70466
DDoS_HTTP                   61084
Password                    60175
DDoS_TCP                    59947
Vulnerability_scanner       59914
SQL_injection               59245
DDoS_ICMP                   45562
DDoS_UDP                     3034


#### Divide the attack types into 2 groups

In [10]:
# Halve the list of attack types into two groups (fixed seed for repeatability)
attack_types = list(counts_dict.keys())
counts_dict_1, counts_dict_2 = train_test_split(attack_types, test_size=0.5, random_state=42)

groups_by_label = {"Group 1:": counts_dict_1, "Group 2:": counts_dict_2}

# Members of each group
for label, attacks in groups_by_label.items():
    print(label, attacks)

# Total number of rows to be generated for each group
print("\nNo of rows by group:")
for label, attacks in groups_by_label.items():
    print(label, sum(counts_dict[attack] for attack in attacks))

Group 1: ['Password', 'Fingerprinting', 'DDoS_ICMP', 'SQL_injection', 'XSS', 'Backdoor', 'Vulnerability_scanner']
Group 2: ['Uploading', 'Ransomware', 'DDoS_UDP', 'MITM', 'Port_Scanning', 'DDoS_HTTP', 'DDoS_TCP']

No of rows by group:
Group 1: 492914
Group 2: 470571


### Discard Redundant Data

In [11]:
# 'Normal' already has 100.000 rows, so it is left out of the augmentation.
# 'Attack_label' is dropped as well: it can be rebuilt from Attack_type afterwards.
df_attack = (
    df_train
    .loc[df_train["Attack_type"].ne("Normal")]
    .drop(columns=["Attack_label"])
)

# The full training frame is not used past this point
del df_train

In [7]:
# Keep only the rows whose attack type belongs to group 1
in_group_1 = df_attack["Attack_type"].isin(counts_dict_1)
df_attack_1 = df_attack.loc[in_group_1]

# Size of the group-1 frame before augmentation
print("Shape of the dataframe before augmentation:", df_attack_1.shape)

Shape of the dataframe before augmentation: (207086, 47)


### Training Data Augmentation Model: Group 1

In [10]:
# Settings for the group-1 GReaT model (distilgpt2 backbone)
great_params_1 = dict(
    llm="distilgpt2",                                # Language model to use
    epochs=1,                                        # Number of epochs to train
    save_steps=2000,                                 # Save model weights every x steps
    logging_steps=500,                               # Log the loss and learning rate every x steps
    experiment_dir="checkpoints/Great/GReat_iter3",  # Directory to save the model weights
    batch_size=16,                                   # Batch size used for training
)

# Build the model from the settings above
model = GReaT(**great_params_1)

In [27]:
# Train the model on the group-1 attack rows (df_attack_1).
# fit() is the last expression of the cell, so the object it returns is
# echoed in the cell output after the training log.
model.fit(df_attack_1)

  4%|▍         | 500/12943 [06:45<2:34:27,  1.34it/s] 

{'loss': 0.5297, 'learning_rate': 4.806845399057406e-05, 'epoch': 0.04}


  8%|▊         | 1000/12943 [16:09<2:30:11,  1.33it/s]

{'loss': 0.4146, 'learning_rate': 4.6136907981148113e-05, 'epoch': 0.08}


 12%|█▏        | 1500/12943 [24:54<2:37:32,  1.21it/s] 

{'loss': 0.3994, 'learning_rate': 4.420536197172217e-05, 'epoch': 0.12}


 15%|█▌        | 2000/12943 [32:24<2:27:17,  1.24it/s] 

{'loss': 0.3915, 'learning_rate': 4.2273815962296224e-05, 'epoch': 0.15}


 19%|█▉        | 2500/12943 [41:50<2:17:09,  1.27it/s] 

{'loss': 0.3865, 'learning_rate': 4.034226995287028e-05, 'epoch': 0.19}


 23%|██▎       | 3000/12943 [51:47<2:08:00,  1.29it/s] 

{'loss': 0.3829, 'learning_rate': 3.8410723943444336e-05, 'epoch': 0.23}


 27%|██▋       | 3500/12943 [1:00:05<2:14:15,  1.17it/s]

{'loss': 0.3812, 'learning_rate': 3.647917793401839e-05, 'epoch': 0.27}


 31%|███       | 4000/12943 [1:07:57<1:55:43,  1.29it/s] 

{'loss': 0.3788, 'learning_rate': 3.4547631924592447e-05, 'epoch': 0.31}


 35%|███▍      | 4500/12943 [1:15:52<1:52:36,  1.25it/s] 

{'loss': 0.3773, 'learning_rate': 3.26160859151665e-05, 'epoch': 0.35}


 39%|███▊      | 5000/12943 [1:24:59<1:40:03,  1.32it/s] 

{'loss': 0.3753, 'learning_rate': 3.068453990574056e-05, 'epoch': 0.39}


 42%|████▏     | 5500/12943 [1:34:58<4:16:02,  2.06s/it] 

{'loss': 0.374, 'learning_rate': 2.8752993896314613e-05, 'epoch': 0.42}


 46%|████▋     | 6000/12943 [1:43:20<1:56:36,  1.01s/it] 

{'loss': 0.3744, 'learning_rate': 2.682144788688867e-05, 'epoch': 0.46}


 50%|█████     | 6500/12943 [1:52:29<1:36:58,  1.11it/s] 

{'loss': 0.3734, 'learning_rate': 2.4889901877462724e-05, 'epoch': 0.5}


 54%|█████▍    | 7000/12943 [2:00:46<1:16:31,  1.29it/s] 

{'loss': 0.373, 'learning_rate': 2.2958355868036776e-05, 'epoch': 0.54}


 58%|█████▊    | 7500/12943 [2:06:28<40:57,  2.22it/s]   

{'loss': 0.3723, 'learning_rate': 2.1026809858610832e-05, 'epoch': 0.58}


 62%|██████▏   | 8000/12943 [2:11:34<37:20,  2.21it/s]  

{'loss': 0.3716, 'learning_rate': 1.9095263849184887e-05, 'epoch': 0.62}


 66%|██████▌   | 8500/12943 [2:17:15<1:05:18,  1.13it/s]

{'loss': 0.3708, 'learning_rate': 1.7163717839758943e-05, 'epoch': 0.66}


 70%|██████▉   | 9000/12943 [2:21:39<48:58,  1.34it/s]  

{'loss': 0.3711, 'learning_rate': 1.5232171830333e-05, 'epoch': 0.7}


 73%|███████▎  | 9500/12943 [2:27:07<25:50,  2.22it/s]  

{'loss': 0.3708, 'learning_rate': 1.3300625820907054e-05, 'epoch': 0.73}


 77%|███████▋  | 10000/12943 [2:31:47<23:00,  2.13it/s] 

{'loss': 0.3699, 'learning_rate': 1.1369079811481111e-05, 'epoch': 0.77}


 81%|████████  | 10500/12943 [2:37:27<18:32,  2.20it/s]  

{'loss': 0.3697, 'learning_rate': 9.437533802055165e-06, 'epoch': 0.81}


 85%|████████▍ | 11000/12943 [2:43:08<14:45,  2.20it/s]  

{'loss': 0.3694, 'learning_rate': 7.505987792629221e-06, 'epoch': 0.85}


 89%|████████▉ | 11500/12943 [2:48:35<10:55,  2.20it/s]  

{'loss': 0.3694, 'learning_rate': 5.574441783203276e-06, 'epoch': 0.89}


 93%|█████████▎| 12000/12943 [2:54:15<49:18,  3.14s/it]  

{'loss': 0.3693, 'learning_rate': 3.6428957737773316e-06, 'epoch': 0.93}


 97%|█████████▋| 12500/12943 [2:59:57<03:25,  2.15it/s]  

{'loss': 0.3687, 'learning_rate': 1.7113497643513869e-06, 'epoch': 0.97}


100%|██████████| 12943/12943 [3:04:00<00:00,  1.17it/s]

{'train_runtime': 11040.1783, 'train_samples_per_second': 18.757, 'train_steps_per_second': 1.172, 'train_loss': 0.38289573543878386, 'epoch': 1.0}





<be_great.great_trainer.GReaTTrainer at 0x20661a7a520>

In [28]:
# Save the model in the directory the "Data Generation" section loads it from
# (GReaT.load_from_dir("checkpoints/GReat/GReat_df_attack_1")).
group_1_model_dir = "checkpoints/GReat/GReat_df_attack_1"

# Create the parent folder up front; save() presumably creates only the final
# directory, not its parents.
os.makedirs(os.path.dirname(group_1_model_dir), exist_ok=True)
model.save(group_1_model_dir)

# Free the trained model and its training frame before the next group
del model, df_attack_1

### Training Data Augmentation Model: Group 2

In [30]:
# Keep only the rows whose attack type belongs to group 2
in_group_2 = df_attack["Attack_type"].isin(counts_dict_2)
df_attack_2 = df_attack.loc[in_group_2]

# Size of the group-2 frame before augmentation
print("Shape of the dataframe before augmentation:", df_attack_2.shape)

Shape of the dataframe before augmentation: (229429, 47)


In [29]:
# Instantiate the GReaT model for group 2 with the distilgpt2 language model.
# It gets its own experiment directory: sharing the group-1 folder would mix
# the step-numbered checkpoints of the two training runs.
model2 = GReaT(llm="distilgpt2",                  # Language model to use
               epochs=1,                           # Number of epochs to train
               save_steps=2000,                    # Save model weights every x steps
               logging_steps=500,                  # Log the loss and learning rate every x steps
               experiment_dir="checkpoints/Great/GReat_iter3_group2", # Directory to save the model weights
               batch_size=16                       # Batch size used for training
              )

In [31]:
# Train the group-2 model on the group-2 attack rows (df_attack_2)
model2.fit(df_attack_2)

  3%|▎         | 500/14340 [08:47<1:46:48,  2.16it/s] 

{'loss': 0.5028, 'learning_rate': 4.825662482566248e-05, 'epoch': 0.03}


  7%|▋         | 1000/14340 [18:05<11:01:50,  2.98s/it]

{'loss': 0.3902, 'learning_rate': 4.651324965132497e-05, 'epoch': 0.07}


 10%|█         | 1500/14340 [26:56<3:02:51,  1.17it/s] 

{'loss': 0.3756, 'learning_rate': 4.476987447698745e-05, 'epoch': 0.1}


 14%|█▍        | 2000/14340 [36:08<2:42:16,  1.27it/s] 

{'loss': 0.3681, 'learning_rate': 4.302649930264993e-05, 'epoch': 0.14}


 17%|█▋        | 2500/14340 [44:15<2:07:02,  1.55it/s] 

{'loss': 0.364, 'learning_rate': 4.128312412831242e-05, 'epoch': 0.17}


 21%|██        | 3000/14340 [52:41<1:27:02,  2.17it/s] 

{'loss': 0.3609, 'learning_rate': 3.95397489539749e-05, 'epoch': 0.21}


 24%|██▍       | 3500/14340 [1:02:43<1:37:36,  1.85it/s] 

{'loss': 0.3588, 'learning_rate': 3.779637377963738e-05, 'epoch': 0.24}


 28%|██▊       | 4000/14340 [1:11:53<1:26:14,  2.00it/s] 

{'loss': 0.3569, 'learning_rate': 3.6052998605299864e-05, 'epoch': 0.28}


 31%|███▏      | 4500/14340 [1:21:03<4:47:01,  1.75s/it] 

{'loss': 0.3555, 'learning_rate': 3.4309623430962344e-05, 'epoch': 0.31}


 35%|███▍      | 5000/14340 [1:30:03<1:27:46,  1.77it/s] 

{'loss': 0.3548, 'learning_rate': 3.2566248256624825e-05, 'epoch': 0.35}


 38%|███▊      | 5500/14340 [1:39:45<1:10:13,  2.10it/s] 

{'loss': 0.3539, 'learning_rate': 3.082287308228731e-05, 'epoch': 0.38}


 42%|████▏     | 6000/14340 [1:48:56<1:18:23,  1.77it/s] 

{'loss': 0.3536, 'learning_rate': 2.9079497907949792e-05, 'epoch': 0.42}


 45%|████▌     | 6500/14340 [1:58:22<1:21:39,  1.60it/s] 

{'loss': 0.3526, 'learning_rate': 2.7336122733612275e-05, 'epoch': 0.45}


 49%|████▉     | 7000/14340 [2:06:50<5:26:12,  2.67s/it] 

{'loss': 0.3518, 'learning_rate': 2.5592747559274755e-05, 'epoch': 0.49}


 52%|█████▏    | 7500/14340 [2:15:19<1:11:26,  1.60it/s]

{'loss': 0.3519, 'learning_rate': 2.3849372384937242e-05, 'epoch': 0.52}


 56%|█████▌    | 8000/14340 [2:24:39<4:09:39,  2.36s/it]

{'loss': 0.3513, 'learning_rate': 2.2105997210599723e-05, 'epoch': 0.56}


 59%|█████▉    | 8500/14340 [2:34:26<2:01:48,  1.25s/it]

{'loss': 0.351, 'learning_rate': 2.0362622036262206e-05, 'epoch': 0.59}


 63%|██████▎   | 9000/14340 [2:43:42<53:23,  1.67it/s]  

{'loss': 0.35, 'learning_rate': 1.8619246861924686e-05, 'epoch': 0.63}


 66%|██████▌   | 9500/14340 [2:54:16<1:26:34,  1.07s/it]

{'loss': 0.3505, 'learning_rate': 1.687587168758717e-05, 'epoch': 0.66}


 70%|██████▉   | 10000/14340 [3:08:27<55:00,  1.31it/s]  

{'loss': 0.3496, 'learning_rate': 1.5132496513249652e-05, 'epoch': 0.7}


 73%|███████▎  | 10500/14340 [3:20:45<1:16:42,  1.20s/it]

{'loss': 0.3497, 'learning_rate': 1.3389121338912134e-05, 'epoch': 0.73}


 77%|███████▋  | 11000/14340 [3:36:04<1:06:05,  1.19s/it] 

{'loss': 0.3495, 'learning_rate': 1.1645746164574617e-05, 'epoch': 0.77}


 80%|████████  | 11500/14340 [3:50:24<1:22:01,  1.73s/it]

{'loss': 0.3489, 'learning_rate': 9.9023709902371e-06, 'epoch': 0.8}


 81%|████████  | 11588/14340 [3:52:36<41:02,  1.12it/s]  

In [None]:
# Save the model in the directory the "Data Generation" section loads it from
# (GReaT.load_from_dir("checkpoints/GReat/GReat_df_attack_2")).
group_2_model_dir = "checkpoints/GReat/GReat_df_attack_2"

# Create the parent folder up front; save() presumably creates only the final
# directory, not its parents.
os.makedirs(os.path.dirname(group_2_model_dir), exist_ok=True)
model2.save(group_2_model_dir)

# NOTE(review): unlike the group-1 cell, model2 itself is kept alive here;
# delete it as well if no later cell needs it.
del df_attack_2

## Data Generation

In [12]:
# Load the trained group-1 model (df_attack_1) from disk for sampling.
# NOTE(review): this directory must be the one the group-1 model was saved to — confirm.
model = GReaT.load_from_dir("checkpoints/GReat/GReat_df_attack_1")

In [13]:
# Generate the missing rows for every group-1 attack type into a single CSV
for i, attack in enumerate(counts_dict_1):
    # Number of synthetic rows requested for this attack type
    n_att_samples = counts_dict[attack]

    # Sampling starts from the Attack_type column, with all weight on `attack`
    samples = model.sample(
        n_samples=n_att_samples,
        max_length=1000,
        k=100,
        start_col="Attack_type",
        start_col_dist={attack: 1},
    )

    # The first attack type (re)creates the file with a header row;
    # every following one appends its rows without a header.
    samples.to_csv(
        "EdgeIIoT_train_GReaT_p1.csv",
        mode='w' if i == 0 else 'a',
        header=(i == 0),
        index=False,
    )

60234it [1:45:39,  9.50it/s]                           
99322it [2:57:14,  9.34it/s]                             
45616it [1:19:23,  9.58it/s]                           
59308it [1:49:03,  9.06it/s]                           
87985it [2:35:52,  9.41it/s]                             
80841it [2:22:31,  9.45it/s]                             
100%|██████████| 59914/59914 [1:51:00<00:00,  9.00it/s]


In [15]:
# Drop the reference to the group-1 model before loading the next one
del model
# Load the trained group-2 model (df_attack_2) from disk for sampling.
# NOTE(review): this directory must be the one the group-2 model was saved to — confirm.
model = GReaT.load_from_dir("checkpoints/GReat/GReat_df_attack_2")

In [16]:
# Generate the missing rows for every group-2 attack type into a single CSV
for i, attack in enumerate(counts_dict_2):
    # Number of synthetic rows requested for this attack type
    n_att_samples = counts_dict[attack]

    # Sampling starts from the Attack_type column, with all weight on `attack`
    samples = model.sample(
        n_samples=n_att_samples,
        max_length=1000,
        k=128,
        start_col="Attack_type",
        start_col_dist={attack: 1},
    )

    # The first attack type (re)creates the file with a header row;
    # every following one appends its rows without a header.
    samples.to_csv(
        "EdgeIIoT_train_GReaT_p2.csv",
        mode='w' if i == 0 else 'a',
        header=(i == 0),
        index=False,
    )

70486it [1:57:31, 10.00it/s]                           
92361it [2:38:17,  9.72it/s]                             
3065it [05:07,  9.96it/s]                          
99808it [2:49:06,  9.84it/s]                             
84176it [2:23:38,  9.77it/s]                             
61136it [1:41:43, 10.02it/s]                           
59967it [1:42:58,  9.71it/s]                           


### Augmented data verification and concatenation

In [None]:
# load the generated data
df_gen_1 = pd.read_csv("EdgeIIoT_train_GReaT_p1.csv")