In [None]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.26.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.28-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.28-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.18.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.23.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [None]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.sampling import Condition
from google.colab import drive
import os

In [None]:

# Locations of the saved artefacts inside the mounted Google Drive.
model_path = '/content/drive/MyDrive/ANOMALY_DETECTION/ctgan_final_model.pkl'
clean_data_path = '/content/drive/MyDrive/ANOMALY_DETECTION/clean_data.csv'

# Expose the Drive under /content/drive so the two paths above resolve.
drive.mount('/content/drive')

# Restore the previously trained CTGAN synthesizer.
# NOTE(review): the .pkl extension suggests this is a pickle-based load; only
# point it at model files you created yourself.
ctgan = CTGANSynthesizer.load(model_path)

# Original (cleaned) dataset; later cells filter it on its 'Label' column.
data = pd.read_csv(clean_data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


# Build a class-balanced dataset from `data`:
#   * BENIGN is undersampled to `undersample_size` rows.
#   * Every attack class keeps all of its real rows and is topped up to its
#     target size, either with CTGAN samples (classes with at least
#     UNLEARNABLE_THRESHOLD real rows) or by repeating real rows (smaller classes).
# The result is shuffled, written to Drive and summarised.

# Target number of rows for BENIGN and for every non-rare attack class.
undersample_size = 230124

# Per-class row counts expected in `data`. The keys define which classes are
# processed; the values are only used to detect drift against the loaded data.
all_counts = {
    'BENIGN': 2271285, 'DoS Hulk': 230124, 'PortScan': 158804, 'DDoS': 128025,
    'DoS GoldenEye': 10293, 'FTP-Patator': 7935, 'SSH-Patator': 5897,
    'DoS slowloris': 5796, 'DoS Slowhttptest': 5499, 'Bot': 1956,
    'Web Attack - Brute Force': 1507, 'Web Attack - XSS': 652,
    'Infiltration': 36, 'Web Attack - Sql Injection': 21, 'Heartbleed': 11
}

# Classes with fewer real rows than this are topped up by duplication instead
# of CTGAN sampling (the name suggests they are considered too small to learn).
UNLEARNABLE_THRESHOLD = 5000
# Classes with fewer real rows than RARE_CLASS_THRESHOLD only grow to
# RARE_CLASS_TARGET rows instead of `undersample_size`.
RARE_CLASS_THRESHOLD = 1000
RARE_CLASS_TARGET = 10000
# Seed shared by the BENIGN undersampling and the final shuffle.
RANDOM_STATE = 42

# Final dataset chunks, CTGAN conditions, and the row count each class should reach.
data_chunks = []
conditions_list = []
target_sizes = {'BENIGN': undersample_size}

# Only classes listed in all_counts end up in the output; make any omission visible.
unlisted_labels = sorted(set(data['Label'].dropna().unique()) - set(all_counts), key=str)
if unlisted_labels:
    print(f"WARNING: labels present in data but missing from all_counts are dropped: {unlisted_labels}")

# Add the undersampled BENIGN data first.
# (DataFrame.sample raises ValueError if fewer than `undersample_size` BENIGN rows exist.)
benign_data = data[data['Label'] == 'BENIGN'].sample(n=undersample_size, random_state=RANDOM_STATE)
data_chunks.append(benign_data)

# Handle attack classes
for attack, expected_count in all_counts.items():
    if attack == 'BENIGN':
        continue

    original_samples = data[data['Label'] == attack]
    # Use the real row count, not the hard-coded one, so the arithmetic below
    # always matches the rows that are actually appended.
    count = len(original_samples)
    if count == 0:
        # Nothing to keep, duplicate, or condition on; also avoids dividing by zero.
        print(f"WARNING: no rows labelled '{attack}' found in data; skipping.")
        continue
    if count != expected_count:
        print(f"WARNING: '{attack}' has {count} rows in data (expected {expected_count}); using the actual count.")

    # Target size depends on rarity
    target_size = RARE_CLASS_TARGET if count < RARE_CLASS_THRESHOLD else undersample_size
    n_samples_to_generate = target_size - count

    # The real rows are always kept, even when the class already meets its target.
    data_chunks.append(original_samples)
    target_sizes[attack] = max(target_size, count)

    if n_samples_to_generate <= 0:
        continue

    if count >= UNLEARNABLE_THRESHOLD:
        print(f"Adding a condition to generate {n_samples_to_generate} CTGAN samples for '{attack}' (target={target_size})...")
        conditions_list.append(Condition(
            num_rows=n_samples_to_generate,
            column_values={'Label': attack}
        ))
    else:
        print(f"Duplicating samples for '{attack}' (target={target_size})...")
        # Smallest number of whole copies that covers the shortfall (ceiling division).
        n_repeats = -(-n_samples_to_generate // count)
        duplicated_samples = pd.concat(
            [original_samples] * n_repeats,
            ignore_index=True
        )
        data_chunks.append(duplicated_samples.head(n_samples_to_generate))

# Sample all conditional data at once
if conditions_list:
    print("\nSampling all conditional data with CTGAN...")
    generated_samples = ctgan.sample_from_conditions(conditions=conditions_list)
    data_chunks.append(generated_samples)

# Merge all chunks
balanced_data = pd.concat(data_chunks, ignore_index=True)

# Shuffle and save
balanced_data = balanced_data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
output_path = '/content/drive/MyDrive/ANOMALY_DETECTION/final_balanced_dataset.csv'
balanced_data.to_csv(output_path, index=False)

# Summary
class_distribution = balanced_data['Label'].value_counts()
print(f"\nTotal rows in final balanced dataset: {len(balanced_data)}")
print("Class distribution in final dataset:")
print(class_distribution)

# Non-fatal check: report any class that did not reach its intended size
# (for example if the synthesizer returned fewer rows than requested).
for label, wanted in target_sizes.items():
    obtained = int(class_distribution.get(label, 0))
    if obtained != wanted:
        print(f"WARNING: '{label}' has {obtained} rows in the final dataset, expected {wanted}.")

Adding a condition to generate 71320 CTGAN samples for 'PortScan' (target=230124)...
Adding a condition to generate 102099 CTGAN samples for 'DDoS' (target=230124)...
Adding a condition to generate 219831 CTGAN samples for 'DoS GoldenEye' (target=230124)...
Adding a condition to generate 222189 CTGAN samples for 'FTP-Patator' (target=230124)...
Adding a condition to generate 224227 CTGAN samples for 'SSH-Patator' (target=230124)...
Adding a condition to generate 224328 CTGAN samples for 'DoS slowloris' (target=230124)...
Adding a condition to generate 224625 CTGAN samples for 'DoS Slowhttptest' (target=230124)...
Duplicating samples for 'Bot' (target=230124)...
Duplicating samples for 'Web Attack - Brute Force' (target=230124)...
Duplicating samples for 'Web Attack - XSS' (target=10000)...
Duplicating samples for 'Infiltration' (target=10000)...
Duplicating samples for 'Web Attack - Sql Injection' (target=10000)...
Duplicating samples for 'Heartbleed' (target=10000)...

Sampling all co

Sampling conditions: 100%|██████████| 1288619/1288619 [2:08:52<00:00, 166.66it/s]



Total rows in final balanced dataset: 2571364
Class distribution in final dataset:
Label
DoS Slowhttptest              230124
SSH-Patator                   230124
FTP-Patator                   230124
DoS Hulk                      230124
PortScan                      230124
DDoS                          230124
BENIGN                        230124
DoS GoldenEye                 230124
DoS slowloris                 230124
Bot                           230124
Web Attack - Brute Force      230124
Infiltration                   10000
Heartbleed                     10000
Web Attack - XSS               10000
Web Attack - Sql Injection     10000
Name: count, dtype: int64


In [None]:
# Save a second copy of the balanced dataset under the CTGAN_syn_data.csv name.
# The previous target was a Windows user-profile path; on the Colab runtime
# this notebook mounts Drive from, that string is just an odd file name in the
# temporary working directory, so the copy was never persisted. Write it next
# to the other project files on the mounted Drive instead.
save_path = '/content/drive/MyDrive/ANOMALY_DETECTION/CTGAN_syn_data.csv'
balanced_data.to_csv(save_path, index=False)