### Libraries

In [2]:
%%capture
%reset -f  
'generic imports'
import pandas as pd
import json
import time 
from IPython.display import display, clear_output
from numba import cuda  

'data augmentation imports'
from be_great import GReaT # https://github.com/kathrinse/be_great

### GPU

In [3]:
# Select GPU device (change the device number as needed)
cuda.select_device(0)

# Clear GPU memory allocations
cuda.current_context().deallocations.clear()

In [4]:
# Get the current CUDA context and GPU information
ctx = cuda.current_context()
meminfo = ctx.get_memory_info()
device_name = cuda.gpus[0].name.decode()
cudnn_version = cuda.cudadrv.driver.get_version()

# Print the GPU information
print(f'GPU: {device_name}\nAvailable GPU memory: {meminfo[0] / 1024**3:.1f} GB\ncuDNN version: {cudnn_version}')

GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Available GPU memory: 7.6 GB
cuDNN version: (12, 2)


### Loading Data

In [5]:
# Load data relative to the training set, with Attack_type "Normal" with 100k rows
df_train = pd.read_csv('../data/EdgeIIot_train_100k.csv', low_memory=False)

# sample 1000 rows from the training set
df_train = df_train.sample(n=1000, random_state=42)

###  Training Data Augmentation Model

In [6]:
# instantiate the GReaT model with the distilgpt2 language model
model = GReaT(llm="distilgpt2",                   # Language model to use
              epochs=10,                          # Number of epochs to train
              save_steps=2000,                    # Save model weights every x steps
              logging_steps=500,                  # Log the loss and learning rate every x steps
              experiment_dir="checkpoints/GReat", # Name of the directory where all intermediate steps are saved
              batch_size=16                       
             )

In [7]:
# Train the model
model.fit(df_train)

  0%|          | 0/630 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacty of 7.75 GiB of which 78.38 MiB is free. Process 15848 has 8.65 MiB memory in use. Including non-PyTorch memory, this process has 7.65 GiB memory in use. Of the allocated memory 7.43 GiB is allocated by PyTorch, and 108.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Save the model

In [None]:
model.save("checkpoints/GReat/trained_model")

## Data Generation

In [None]:
def sample_by_category(model, category_dict, columns):
    # create empty dataframe with columns 
    sampled_df = pd.DataFrame(columns=columns)
    for category, n_samples in category_dict.items():
        start_time = time.time()
        while len(sampled_df[sampled_df['Attack_type'] == category]) < n_samples:
            
            samples = model.sample(n_samples=20000)
            # select only the samples with the desired category
            category_samples = samples[samples['Attack_type'] == category]
            # discard the excess samples if necessary
            if len(category_samples) > n_samples:
                category_samples = category_samples.sample(n=n_samples)
            # add the selected samples to the sampled_df dataframe
            sampled_df = pd.concat([sampled_df, category_samples], ignore_index=True)
            
            elapsed_time = time.time() - start_time
            if elapsed_time > 5:
                clear_output(wait=True)
                print(f"{len(sampled_df[sampled_df['Attack_type'] == category])}/{n_samples}", end="\r")
                start_time = time.time()
    return sampled_df

In [None]:
samples = model.sample(n_samples, k=50, start_col="target", start_col_dist={"0": 0.5, "1": 0.5, "2": 0})