### Libraries

In [76]:
%%capture
%reset -f  
'generic imports'
import pandas as pd
import json
import time 
from IPython.display import display, clear_output
from numba import cuda  

'data augmentation imports'
from realtabformer import REaLTabFormer  # https://github.com/worldbank/REaLTabFormer

### GPU

In [77]:
# Get the current CUDA context and GPU information
ctx = cuda.current_context()
meminfo = ctx.get_memory_info()
device_name = cuda.gpus[0].name.decode()
cudnn_version = cuda.cudadrv.driver.get_version()

# Print the GPU information
print(f'GPU: {device_name}\nAvailable GPU memory: {meminfo[0] / 1024**3:.1f} GB\ncuDNN version: {cudnn_version}')

GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Available GPU memory: 7.5 GB
cuDNN version: (12, 2)


### Loading Data

In [78]:
df_train = pd.read_csv('../data/EdgeIIot_train_100k.csv', low_memory=False)

#### Attack types to be augmented

In [79]:
# Counts how many rows of each Attack_type are missing (100.000)
counts_dict = {}
for attack in df_train["Attack_type"].unique():
    counts_dict[attack] = 100000 - df_train[df_train["Attack_type"] == attack].shape[0]

print("Rows to be augmented by attack type:\n")
for key, value in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True):
    print(f"{key:<22} {value:>10}")

Rows to be augmented by attack type:

MITM                        99718
Fingerprinting              99293
Ransomware                  92237
XSS                         87969
Port_Scanning               84085
Backdoor                    80756
Uploading                   70466
DDoS_HTTP                   61084
Password                    60175
DDoS_TCP                    59947
Vulnerability_scanner       59914
SQL_injection               59245
DDoS_ICMP                   45562
DDoS_UDP                     3034
Normal                          0


### Discard Redudant Data

In [82]:
# Since 'Normal' Attack_type already has 100.000 rows, we do not consider it for augmentation
df_attack = df_train[df_train["Attack_type"] != "Normal"]

# Drop the Attack_label because it can be reconstructed from the Attack_type after augmentation
df_attack = df_attack.drop(columns=["Attack_label"])

### Data type conversion (to save space)

In [26]:
# calculate the size of df_attack before
size_before = df_attack.memory_usage(deep=True).sum() / 1024**2

# select only the columns that are not object type
df_numeric = df_attack.select_dtypes(exclude=["object"])

# convert all numeric columns to unsigned integer type
df_numeric = df_numeric.apply(pd.to_numeric, downcast="unsigned")

# print df_numeric data types before and after conversion side by side
print(pd.concat([df_attack.dtypes, df_numeric.dtypes], axis=1,keys=["Before", "After"]))

# drop all numeric columns from df_attack
df_attack = df_attack.drop(columns=df_numeric.columns)

# concatenate df_attack and newly convertes df_numeric columns
df_attack = pd.concat([df_attack, df_numeric], axis=1)

# calculate the size of df_attack after conversion
size_after = df_attack.memory_usage(deep=True).sum() / 1024**2

# print size before and after conversion
print(f"\nSize of df_attack before data conversion: {size_before:.2f} MBytes, after: {size_after:.2f} MBytes")


  new_result = trans(result).astype(dtype)


                            Before    After
arp.opcode                 float64    uint8
arp.hw.size                float64    uint8
icmp.checksum              float64   uint16
icmp.seq_le                float64   uint16
icmp.unused                float64    uint8
http.content_length        float64   uint32
http.request.method         object      NaN
http.referer                object      NaN
http.request.version        object      NaN
http.response              float64    uint8
http.tls_port              float64    uint8
tcp.ack                    float64   uint32
tcp.ack_raw                float64   uint32
tcp.checksum               float64   uint16
tcp.connection.fin         float64    uint8
tcp.connection.rst         float64    uint8
tcp.connection.syn         float64    uint8
tcp.connection.synack      float64    uint8
tcp.flags                  float64    uint8
tcp.flags.ack              float64    uint8
tcp.len                    float64   uint16
tcp.seq                    float

In [None]:
# print head of df_attack
df_attack.head(10).style.set_properties(**{'text-align': 'left'})

Unnamed: 0,http.request.method,http.referer,http.request.version,dns.qry.name.len,mqtt.conack.flags,mqtt.protoname,mqtt.topic,Attack_type,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,tcp.checksum,tcp.connection.fin,tcp.connection.rst,tcp.connection.syn,tcp.connection.synack,tcp.flags,tcp.flags.ack,tcp.len,tcp.seq,udp.stream,udp.time_delta,dns.qry.name,dns.qry.qu,dns.qry.type,dns.retransmission,dns.retransmit_request,dns.retransmit_request_in,mqtt.conflag.cleansess,mqtt.conflags,mqtt.hdrflags,mqtt.len,mqtt.msg_decoded_as,mqtt.msgtype,mqtt.proto_len,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_UDP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1622034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Password,0,0,0,0,0,0,0,0,303,3453508832,24759,0,0,0,0,16,1,0,262.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_UDP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,2594368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_UDP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1871277,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Password,0,0,0,0,0,0,0,0,1,1080021225,15803,0,0,0,0,16,1,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_TCP,0,0,0,0,0,0,0,0,2140609672,2140609672,20485,0,0,1,0,2,0,120,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Backdoor,0,0,0,0,0,0,0,0,1,1453306566,6843,0,0,0,0,16,1,1448,1463329.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Password,0,0,0,0,0,0,0,0,0,0,16528,0,0,1,0,2,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_UDP,0,0,0,57101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,10834,0,655220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_UDP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1510333,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# separate df_attack by Attack_type into 2 dataframes with similar number of lines
df_attack_1 = df_attack[df_attack["Attack_type"].isin(["Backdoor", 
                                                       "DDoS_HTTP", 
                                                       "DDoS_ICMP", 
                                                       "DDoS_TCP", 
                                                       "DDoS_UDP", 
                                                       "Password"])]

df_attack_2 = df_attack[df_attack["Attack_type"].isin(["Port_Scanning", 
                                                       "SQL_injection", 
                                                       "Uploading", 
                                                       "Vulnerability_scanner", 
                                                       "Ransomware",
                                                       "XSS", 
                                                       "Fingerprinting", 
                                                       "MITM"])]
print("-----------------------")
print("df_attack_1")
# print number of lines per Attack_type for each dataframe and total number of lines for each dataframe
display(df_attack_1['Attack_type'].value_counts().to_frame().style.set_properties(**{'text-align': 'left'}))
# print sum of lines for df_attack_1
print(f"Number of lines: {df_attack_1['Attack_type'].value_counts().sum()}")
print(f"\nSize: {df_attack_1.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------\n")
print("df_attack_2")
# print number of lines per Attack_type for each dataframe and total number of lines for each dataframe
display(df_attack_2['Attack_type'].value_counts().to_frame().style.set_properties(**{'text-align': 'left'}))
# print sum of lines for df_attack_2
print(f"Number of lines: {df_attack_2['Attack_type'].value_counts().sum()}")
print(f"Size: {df_attack_2.memory_usage(deep=True).sum() / 1024**2:.2f} MBytes")
print("-----------------------")
      


-----------------------
df_attack_1


Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
DDoS_UDP,96966
DDoS_ICMP,54438
DDoS_TCP,40053
Password,39825
DDoS_HTTP,38916
Backdoor,19244


Number of lines: 289442

Size: 154.04 MBytes
-----------------------

df_attack_2


Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
SQL_injection,40755
Vulnerability_scanner,40086
Uploading,29534
Port_Scanning,15915
XSS,12031
Ransomware,7763
Fingerprinting,707
MITM,282


Number of lines: 147073
Size: 78.81 MBytes
-----------------------


# Training df_attack_1 classes model

In [None]:
# REalTabFormer instation: non-relational data variant
rtf_model_df1 = REaLTabFormer(model_type="tabular",
                          batch_size=1,
                          gradient_accumulation_steps=1,
                          checkpoints_dir = "rtf_checkpoint_df_attack_1",
                          logging_steps=100,
                          random_state= 42,
                          epochs= 10,
                          train_size = 0.8,                    # 80% of the data for training, 20% for validation
                          numeric_max_len= 12
                          )

# Note: number of bootstrap samples has influnce in required RAM memory, before optimization

In [None]:
# train df_attack_1 model
rtf_model_df1.fit(df_attack_1)
# save model
rtf_model_df1.save("rtf_models/")

# Training df_attack_2 classes model

In [None]:
rtf_model_df2 = REaLTabFormer(model_type="tabular",
                          batch_size=1,
                          gradient_accumulation_steps=1,
                          checkpoints_dir = "rtf_checkpoint_df_attack_2_v2",
                          logging_steps=100,
                          random_state= config['random_state'],
                          epochs= 10,
                          train_size = 0.8,                    # 80% of the data for training, 20% for validation
                          numeric_max_len= 12
                          )

In [None]:
# train df_attack_2 model
rtf_model_df2.fit(df_attack_2)
# save model
rtf_model_df2.save("rtf_models/")

Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/500 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    500.000000
mean       0.000020
std        0.001220
min       -0.003965
25%       -0.000775
50%       -0.000038
75%        0.000776
max        0.003186
dtype: float64
Sensitivity threshold: 0.0020209899999999994 qt_max: 0.05


Map:   0%|          | 0/147073 [00:00<?, ? examples/s]

  0%|          | 0/588290 [00:00<?, ?it/s]

{'loss': 1.3782, 'learning_rate': 4.999575039521325e-05, 'epoch': 0.0}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.8080899119377136, 'eval_runtime': 86.7685, 'eval_samples_per_second': 339.006, 'eval_steps_per_second': 339.006, 'epoch': 0.0}
{'loss': 0.8018, 'learning_rate': 4.999150079042649e-05, 'epoch': 0.0}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.7078542709350586, 'eval_runtime': 86.7677, 'eval_samples_per_second': 339.008, 'eval_steps_per_second': 339.008, 'epoch': 0.0}
{'loss': 0.6935, 'learning_rate': 4.998725118563974e-05, 'epoch': 0.0}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.6651482582092285, 'eval_runtime': 86.6115, 'eval_samples_per_second': 339.62, 'eval_steps_per_second': 339.62, 'epoch': 0.0}
{'loss': 0.66, 'learning_rate': 4.998300158085298e-05, 'epoch': 0.0}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.6385543942451477, 'eval_runtime': 86.8812, 'eval_samples_per_second': 338.566, 'eval_steps_per_second': 338.566, 'epoch': 0.0}
{'loss': 0.643, 'learning_rate': 4.997875197606623e-05, 'epoch': 0.0}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.6007709503173828, 'eval_runtime': 86.9079, 'eval_samples_per_second': 338.462, 'eval_steps_per_second': 338.462, 'epoch': 0.0}
{'loss': 0.6057, 'learning_rate': 4.997450237127948e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5687059164047241, 'eval_runtime': 87.061, 'eval_samples_per_second': 337.867, 'eval_steps_per_second': 337.867, 'epoch': 0.01}
{'loss': 0.5865, 'learning_rate': 4.997025276649272e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5682404637336731, 'eval_runtime': 87.056, 'eval_samples_per_second': 337.886, 'eval_steps_per_second': 337.886, 'epoch': 0.01}
{'loss': 0.5505, 'learning_rate': 4.9966003161705966e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5672934651374817, 'eval_runtime': 87.0135, 'eval_samples_per_second': 338.051, 'eval_steps_per_second': 338.051, 'epoch': 0.01}
{'loss': 0.5514, 'learning_rate': 4.9961753556919214e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5517113208770752, 'eval_runtime': 86.6447, 'eval_samples_per_second': 339.49, 'eval_steps_per_second': 339.49, 'epoch': 0.01}
{'loss': 0.5709, 'learning_rate': 4.9957503952132455e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5463361740112305, 'eval_runtime': 86.7171, 'eval_samples_per_second': 339.206, 'eval_steps_per_second': 339.206, 'epoch': 0.01}
{'loss': 0.5804, 'learning_rate': 4.9953254347345703e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5300026535987854, 'eval_runtime': 86.7999, 'eval_samples_per_second': 338.883, 'eval_steps_per_second': 338.883, 'epoch': 0.01}
{'loss': 0.5469, 'learning_rate': 4.9949004742558945e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5260112881660461, 'eval_runtime': 91.4163, 'eval_samples_per_second': 321.77, 'eval_steps_per_second': 321.77, 'epoch': 0.01}
{'loss': 0.5513, 'learning_rate': 4.994475513777219e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5178199410438538, 'eval_runtime': 87.2602, 'eval_samples_per_second': 337.095, 'eval_steps_per_second': 337.095, 'epoch': 0.01}
{'loss': 0.5347, 'learning_rate': 4.9940505532985434e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5148817300796509, 'eval_runtime': 87.1964, 'eval_samples_per_second': 337.342, 'eval_steps_per_second': 337.342, 'epoch': 0.01}
{'loss': 0.5343, 'learning_rate': 4.9936255928198675e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5088757276535034, 'eval_runtime': 87.4644, 'eval_samples_per_second': 336.308, 'eval_steps_per_second': 336.308, 'epoch': 0.01}
{'loss': 0.5022, 'learning_rate': 4.993200632341192e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5114428400993347, 'eval_runtime': 88.1467, 'eval_samples_per_second': 333.705, 'eval_steps_per_second': 333.705, 'epoch': 0.01}
{'loss': 0.5125, 'learning_rate': 4.992775671862517e-05, 'epoch': 0.01}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5197890400886536, 'eval_runtime': 87.8676, 'eval_samples_per_second': 334.765, 'eval_steps_per_second': 334.765, 'epoch': 0.01}
{'loss': 0.5012, 'learning_rate': 4.992350711383841e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5029837489128113, 'eval_runtime': 86.929, 'eval_samples_per_second': 338.38, 'eval_steps_per_second': 338.38, 'epoch': 0.02}
{'loss': 0.5211, 'learning_rate': 4.991925750905166e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5060832500457764, 'eval_runtime': 87.2112, 'eval_samples_per_second': 337.285, 'eval_steps_per_second': 337.285, 'epoch': 0.02}
{'loss': 0.5125, 'learning_rate': 4.99150079042649e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5022186040878296, 'eval_runtime': 87.2425, 'eval_samples_per_second': 337.164, 'eval_steps_per_second': 337.164, 'epoch': 0.02}
{'loss': 0.5096, 'learning_rate': 4.991075829947815e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4957203269004822, 'eval_runtime': 87.1856, 'eval_samples_per_second': 337.384, 'eval_steps_per_second': 337.384, 'epoch': 0.02}
{'loss': 0.5058, 'learning_rate': 4.99065086946914e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.5001901388168335, 'eval_runtime': 87.1545, 'eval_samples_per_second': 337.504, 'eval_steps_per_second': 337.504, 'epoch': 0.02}
{'loss': 0.5337, 'learning_rate': 4.990225908990464e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4981704354286194, 'eval_runtime': 87.0427, 'eval_samples_per_second': 337.938, 'eval_steps_per_second': 337.938, 'epoch': 0.02}
{'loss': 0.514, 'learning_rate': 4.989800948511789e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4919542372226715, 'eval_runtime': 87.0912, 'eval_samples_per_second': 337.749, 'eval_steps_per_second': 337.749, 'epoch': 0.02}
{'loss': 0.5003, 'learning_rate': 4.989375988033113e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.49487432837486267, 'eval_runtime': 87.2317, 'eval_samples_per_second': 337.205, 'eval_steps_per_second': 337.205, 'epoch': 0.02}
{'loss': 0.5057, 'learning_rate': 4.9889510275544376e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4943791329860687, 'eval_runtime': 87.1911, 'eval_samples_per_second': 337.363, 'eval_steps_per_second': 337.363, 'epoch': 0.02}
{'loss': 0.4749, 'learning_rate': 4.9885260670757624e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4891112446784973, 'eval_runtime': 87.2456, 'eval_samples_per_second': 337.152, 'eval_steps_per_second': 337.152, 'epoch': 0.02}
{'loss': 0.4924, 'learning_rate': 4.9881011065970865e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.48647600412368774, 'eval_runtime': 87.2604, 'eval_samples_per_second': 337.095, 'eval_steps_per_second': 337.095, 'epoch': 0.02}
{'loss': 0.5016, 'learning_rate': 4.9876761461184113e-05, 'epoch': 0.02}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4884108006954193, 'eval_runtime': 87.6413, 'eval_samples_per_second': 335.63, 'eval_steps_per_second': 335.63, 'epoch': 0.02}
{'loss': 0.5099, 'learning_rate': 4.987251185639736e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.48287197947502136, 'eval_runtime': 87.1516, 'eval_samples_per_second': 337.515, 'eval_steps_per_second': 337.515, 'epoch': 0.03}
{'loss': 0.5127, 'learning_rate': 4.98682622516106e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.48393625020980835, 'eval_runtime': 87.2292, 'eval_samples_per_second': 337.215, 'eval_steps_per_second': 337.215, 'epoch': 0.03}
{'loss': 0.4917, 'learning_rate': 4.986401264682385e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4867366552352905, 'eval_runtime': 87.0638, 'eval_samples_per_second': 337.856, 'eval_steps_per_second': 337.856, 'epoch': 0.03}
{'loss': 0.4855, 'learning_rate': 4.985976304203709e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4825843572616577, 'eval_runtime': 87.1494, 'eval_samples_per_second': 337.524, 'eval_steps_per_second': 337.524, 'epoch': 0.03}
{'loss': 0.4976, 'learning_rate': 4.985551343725034e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4814685881137848, 'eval_runtime': 87.5595, 'eval_samples_per_second': 335.943, 'eval_steps_per_second': 335.943, 'epoch': 0.03}
{'loss': 0.4898, 'learning_rate': 4.985126383246359e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4771744906902313, 'eval_runtime': 86.8869, 'eval_samples_per_second': 338.543, 'eval_steps_per_second': 338.543, 'epoch': 0.03}
{'loss': 0.4903, 'learning_rate': 4.984701422767683e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4816703796386719, 'eval_runtime': 86.9602, 'eval_samples_per_second': 338.258, 'eval_steps_per_second': 338.258, 'epoch': 0.03}
{'loss': 0.4756, 'learning_rate': 4.984276462289008e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4855087399482727, 'eval_runtime': 87.0189, 'eval_samples_per_second': 338.03, 'eval_steps_per_second': 338.03, 'epoch': 0.03}
{'loss': 0.4943, 'learning_rate': 4.983851501810332e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4876137673854828, 'eval_runtime': 87.4759, 'eval_samples_per_second': 336.264, 'eval_steps_per_second': 336.264, 'epoch': 0.03}
{'loss': 0.4648, 'learning_rate': 4.9834265413316566e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4786631166934967, 'eval_runtime': 87.0368, 'eval_samples_per_second': 337.961, 'eval_steps_per_second': 337.961, 'epoch': 0.03}
{'loss': 0.4803, 'learning_rate': 4.9830015808529814e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4723789095878601, 'eval_runtime': 87.0693, 'eval_samples_per_second': 337.834, 'eval_steps_per_second': 337.834, 'epoch': 0.03}
{'loss': 0.4348, 'learning_rate': 4.982576620374305e-05, 'epoch': 0.03}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4770941138267517, 'eval_runtime': 87.2875, 'eval_samples_per_second': 336.99, 'eval_steps_per_second': 336.99, 'epoch': 0.03}
{'loss': 0.4667, 'learning_rate': 4.98215165989563e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4762062728404999, 'eval_runtime': 87.0746, 'eval_samples_per_second': 337.814, 'eval_steps_per_second': 337.814, 'epoch': 0.04}
{'loss': 0.4775, 'learning_rate': 4.9817266994169545e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47890007495880127, 'eval_runtime': 87.1682, 'eval_samples_per_second': 337.451, 'eval_steps_per_second': 337.451, 'epoch': 0.04}
{'loss': 0.4588, 'learning_rate': 4.9813017389382786e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47520866990089417, 'eval_runtime': 86.9764, 'eval_samples_per_second': 338.195, 'eval_steps_per_second': 338.195, 'epoch': 0.04}
{'loss': 0.495, 'learning_rate': 4.9808767784596034e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4725222587585449, 'eval_runtime': 86.9049, 'eval_samples_per_second': 338.473, 'eval_steps_per_second': 338.473, 'epoch': 0.04}
{'train_runtime': 4058.2147, 'train_samples_per_second': 144.963, 'train_steps_per_second': 144.963, 'train_loss': 0.5467119615342882, 'epoch': 0.04}


  0%|          | 0/30000 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 30080 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.0020209899999999994,                         val_sensitivity: -0.012873200000000001,                             val_sensitivities: [-0.012738199999999998, -0.012475200000000002, -0.011756400000000002, -0.0128942, -0.012177200000000004, -0.012274399999999996, -0.012677600000000002, -0.0134254, -0.013196200000000002, -0.012864600000000002, -0.013237199999999998, -0.0145034, -0.012708400000000002, -0.013518799999999997, -0.0126508]


  0%|          | 0/1176580 [00:00<?, ?it/s]

{'loss': 0.4659, 'learning_rate': 4.9804518179809275e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47596824169158936, 'eval_runtime': 91.5578, 'eval_samples_per_second': 321.273, 'eval_steps_per_second': 321.273, 'epoch': 0.04}
{'loss': 0.464, 'learning_rate': 4.980026857502252e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4708021581172943, 'eval_runtime': 90.9691, 'eval_samples_per_second': 323.351, 'eval_steps_per_second': 323.351, 'epoch': 0.04}
{'loss': 0.4454, 'learning_rate': 4.979601897023577e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47192785143852234, 'eval_runtime': 90.7988, 'eval_samples_per_second': 323.958, 'eval_steps_per_second': 323.958, 'epoch': 0.04}
{'loss': 0.4779, 'learning_rate': 4.979176936544901e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4728752672672272, 'eval_runtime': 90.7726, 'eval_samples_per_second': 324.052, 'eval_steps_per_second': 324.052, 'epoch': 0.04}
{'loss': 0.4849, 'learning_rate': 4.978751976066226e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.4760623276233673, 'eval_runtime': 90.6315, 'eval_samples_per_second': 324.556, 'eval_steps_per_second': 324.556, 'epoch': 0.04}
{'loss': 0.4575, 'learning_rate': 4.978327015587551e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47221875190734863, 'eval_runtime': 90.6136, 'eval_samples_per_second': 324.62, 'eval_steps_per_second': 324.62, 'epoch': 0.04}
{'loss': 0.4811, 'learning_rate': 4.977902055108875e-05, 'epoch': 0.04}


  0%|          | 0/29415 [00:00<?, ?it/s]

{'eval_loss': 0.47343870997428894, 'eval_runtime': 90.5622, 'eval_samples_per_second': 324.804, 'eval_steps_per_second': 324.804, 'epoch': 0.04}
{'train_runtime': 657.5831, 'train_samples_per_second': 1789.249, 'train_steps_per_second': 1789.249, 'train_loss': 0.06301491590646598, 'epoch': 0.04}


  0%|          | 0/30000 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 30080 samples generated. Sampling efficiency is: 100.0000%
Saving not-best model...
Critic round: 10,                     sensitivity_threshold: 0.0020209899999999994,                         val_sensitivity: 0.00399436,                             val_sensitivities: [0.0036542, 0.0032517999999999996, 0.0028190000000000003, 0.0022500000000000003, 0.003990800000000001, 0.005039199999999999, 0.005099600000000001, 0.003762600000000001, 0.0034178000000000004, 0.0046236, 0.0044816, 0.0031036, 0.0047409999999999996, 0.005065000000000001, 0.0046156]
Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


# Data Generation

In [None]:
# load best df_attack_1 model (id000016979230192943532032)
rtf_model_df1 = REaLTabFormer.load_from_dir(path="rtf_models/id000016979230192943532032")

In [None]:
def sample_by_category(model, category_dict, columns):
    """
    Samples from the model by category.
    :param model: the model to sample from
    :param category_dict: a dictionary with the number of samples to generate for each category
    :param columns: the columns to include in the output dataframe
    :return: a dataframe with the samples
    """
    sampled_df = pd.DataFrame(columns=columns)
    for category, n_samples in category_dict.items():
        start_time = time.time()
        while len(sampled_df[sampled_df['Attack_type'] == category]) < n_samples:
            
            samples = model.sample(n_samples=20000)
            # select only the samples with the desired category
            category_samples = samples[samples['Attack_type'] == category]
            # discard the excess samples if necessary
            if len(category_samples) > n_samples:
                category_samples = category_samples.sample(n=n_samples)
            # add the selected samples to the sampled_df dataframe
            sampled_df = pd.concat([sampled_df, category_samples], ignore_index=True)
            
            elapsed_time = time.time() - start_time
            if elapsed_time > 5:
                clear_output(wait=True)
                print(f"{len(sampled_df[sampled_df['Attack_type'] == category])}/{n_samples}", end="\r")
                start_time = time.time()
    return sampled_df

In [None]:
# Dictionary with the number of samples to be generated with rtf_model_df1
counts_df1= {'DDoS_UDP': 3034,
             'DDoS_HTTP':61084,
             'Password': 60175,
             'DDoS_TCP': 59947,
             'Backdoor': 80756,
             'DDoS_ICMP': 45562}

In [None]:
# Sample by category for df_attack_1
RTB_dataset_df1 = sample_by_category(rtf_model_df1, counts_df1, df_train.columns)

  0%|          | 0/10000 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 10112 samples generated. Sampling efficiency is: 100.0000%
46182/45562

In [None]:
# Save RTB_dataset_df1 to csv file
RTB_dataset_df1.to_csv("RTB_dataset_df1.csv", index=False)

In [None]:
# Dictionary with the number of samples to be generated with rtf_model_df2
counts_df2= {"Port_Scanning": 84085, 
             "SQL_injection":59245 , 
             "Uploading":70466, 
             "Vulnerability_scanner":59914, 
             "Ransomware":92237, 
             "Fingerprinting":99293,
             "XSS": 87969,
             "MITM":99718}

In [None]:
# Load best df_attack_2 model (id000016979230192943532032)
rtf_model_df2 = REaLTabFormer.load_from_dir(path="rtf_models/id000016980601259525519360")

In [None]:
# TESTE

# counts_df2= {"XSS": 87969}
# rtf_model_df2 = REaLTabFormer.load_from_dir(path="rtf_models/id000016982514417271404544")  

# RTB_dataset_df2 = sample_by_category(rtf_model_df2, 
#                                      counts_df2, 
#                                      df_train.columns)

# save RTB_dataset_df2 to csv file
# RTB_dataset_df2.to_csv("RTB_dataset_df2_XSS.csv", index=False)

88183/87969

# Augmented data verification and concatenation

In [85]:
# Load RTB_dataset_df1 and RTB_dataset_df2
RTB_dataset_df1 = pd.read_csv("../data/RTB_dataset_df1.csv", low_memory=False)
RTB_dataset_df2 = pd.read_csv("../data/RTB_dataset_df2_v2.csv", low_memory=False)

In [86]:
# Attack type per dataset
display(RTB_dataset_df1['Attack_type'].value_counts().to_frame().style.set_caption("RTB_dataset_df1"))
display(RTB_dataset_df2['Attack_type'].value_counts().to_frame().style.set_caption("RTB_dataset_df2"))

Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
Backdoor,81549
DDoS_HTTP,62613
Password,62373
DDoS_TCP,60349
DDoS_ICMP,46182
DDoS_UDP,3258


Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
MITM,99812
Fingerprinting,99745
Ransomware,94039
XSS,88183
Port_Scanning,84733
Uploading,70749
SQL_injection,61693
Vulnerability_scanner,61128


In [87]:
# Concatenate RTB_dataset_df1 and RTB_dataset_df2
RTB_dataset = pd.concat([RTB_dataset_df1, RTB_dataset_df2], ignore_index=True)
del RTB_dataset_df1, RTB_dataset_df2

# Save RTB_dataset to csv file
RTB_dataset.to_csv("../data/RTB_dataset.csv", index=False)

In [88]:
# Count the number of rows per Attack_type in RTB_dataset
counts_dict_rtb = {}
for attack in RTB_dataset["Attack_type"].unique():
    counts_dict_rtb[attack] = RTB_dataset[RTB_dataset["Attack_type"] == attack].shape[0]
    
# Count the number of rows per Attack_type in df_train
counts_dict_train = {}
for attack in df_train["Attack_type"].unique():
    counts_dict_train[attack] = df_train[df_train["Attack_type"] == attack].shape[0]

counts_dict_total = {}
# Sum if key exists in both dictionaries
for key in counts_dict_rtb.keys() & counts_dict_train.keys():
    counts_dict_total[key] = counts_dict_rtb[key] + counts_dict_train[key]

counts_excess = {}
# Subtract 100.000 to get the number of rows to be removed
for key, value in counts_dict_total.items():
    counts_excess[key] = value - 100000

counts_excess

{'Backdoor': 793,
 'Ransomware': 1802,
 'DDoS_HTTP': 1529,
 'Password': 2198,
 'MITM': 94,
 'XSS': 214,
 'Port_Scanning': 648,
 'Uploading': 283,
 'DDoS_TCP': 402,
 'Fingerprinting': 452,
 'SQL_injection': 2448,
 'DDoS_UDP': 224,
 'DDoS_ICMP': 620,
 'Vulnerability_scanner': 1214}

In [90]:
# Remove excess rows from RTB_dataset by selecting a random sample of rows to be removed
for key, value in counts_excess.items():
    if value > 0:
        RTB_dataset = RTB_dataset.drop(RTB_dataset[RTB_dataset["Attack_type"] == key].sample(n=value, random_state=42).index)

# Concatenate RTB_dataset and df_train
df_balanced = pd.concat([df_train, RTB_dataset], ignore_index=True)

display(df_balanced['Attack_type'].value_counts().to_frame().style.set_properties(**{'text-align': 'left'}))

Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
DDoS_UDP,100000
Password,100000
DDoS_TCP,100000
Backdoor,100000
DDoS_ICMP,100000
Port_Scanning,100000
Vulnerability_scanner,100000
SQL_injection,100000
DDoS_HTTP,100000
Uploading,100000


In [91]:
# Save df_balanced to csv file
df_balanced.to_csv("../data/EdgeIIot_train_100k_RTB_balanced.csv", index=False)

In [92]:
# Fill Attack_label column with 1 if Attack_type is not Normal, 0 otherwise
df_balanced["Attack_label"] = df_balanced["Attack_type"].apply(lambda x: 1 if x != "Normal" else 0)

In [98]:
display(df_balanced.describe().T.style.set_properties(**{'text-align': 'left'}))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
arp.opcode,1500000.0,2.1e-05,0.006,0.0,0.0,0.0,0.0,2.0
arp.hw.size,1500000.0,8.4e-05,0.02245,0.0,0.0,0.0,0.0,6.0
icmp.checksum,1500000.0,3824.459987,11903.002549,0.0,0.0,0.0,0.0,65533.0
icmp.seq_le,1500000.0,2379.180167,9827.70476,0.0,0.0,0.0,0.0,65534.0
icmp.unused,1500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http.content_length,1500000.0,13.561247,104.671315,0.0,0.0,0.0,0.0,83655.0
http.response,1500000.0,0.042481,0.201685,0.0,0.0,0.0,0.0,1.0
http.tls_port,1500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tcp.ack,1500000.0,72415358.421074,305137710.739505,0.0,0.0,1.0,486.0,3178709301.0
tcp.ack_raw,1500000.0,1273166826.755494,1280284214.791586,0.0,0.0,997129949.0,2296324424.0,4294926648.0
