In [1]:
# Mount Google Drive at /gdrive; the dataset paths used later in this notebook live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
# Upload kaggle.json from the local machine.
# files.upload() returns a dict mapping each file name to its raw bytes. Leaving
# the call as the cell's last expression makes Jupyter echo that dict, which
# writes the API key into the saved notebook output. Bind the result and report
# only the file names instead.
from google.colab import files
uploaded = files.upload()
print("Uploaded:", sorted(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'<redacted: Kaggle API credentials removed from saved output; revoke and regenerate this key>'}

In [3]:
# Create the ~/.kaggle directory; -p makes this a no-op if it already exists.
!mkdir -p ~/.kaggle

In [4]:
# Move the uploaded kaggle.json out of the working directory into ~/.kaggle/.
!mv kaggle.json ~/.kaggle/

In [5]:
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# === UNSW-NB15 — Synthetic por cuotas (GaussianCopula) ===
!pip install -q sdv tqdm pandas numpy

import os, math, gc, time
from collections import Counter

import numpy as np
import pandas as pd
from sdv.metadata import Metadata
from sdv.sampling import Condition
from sdv.single_table import GaussianCopulaSynthesizer
from tqdm.auto import tqdm

# ===================== CONFIG =====================
# Input: the real dataset CSV, streamed in chunks with pandas.
REAL_CSV  = "/gdrive/MyDrive/Datasets/UNSW_NB15_full_clean.csv"
# Outputs: the synthetic rows (CSV) and the detected SDV metadata (JSON).
# NOTE(review): both file names say "ctgan", but the model fitted in this cell
# is GaussianCopulaSynthesizer — confirm the names are intentional.
SYN_CSV   = "/gdrive/MyDrive/Datasets/synthetic_UNSW_NB15_ctgan.csv"
META_JSON = "/gdrive/MyDrive/Datasets/synthetic_UNSW_NB15_ctgan.json"

# Seed for the NumPy RandomState used while sampling chunks and for the final
# shuffle of the training sample.
RANDOM_STATE = 42

# Number of synthetic rows written for the Normal class and for each attack class.
NORMAL_N      = 200_000
ATTACK_N_EACH = 30_000

# Rows per chunk when streaming REAL_CSV.
CHUNK_SIZE = 250_000

# Training-sample limits: TRAIN_TOTAL_N caps the final training frame,
# TRAIN_NORMAL_MAX caps the Normal rows drawn, and TRAIN_ATTACK_MIN_EACH is the
# per-attack-class draw quota (the sampling loop uses it as an upper bound,
# despite the MIN in its name; classes with fewer rows contribute what they have).
TRAIN_TOTAL_N         = 250_000
TRAIN_NORMAL_MAX      = 140_000
TRAIN_ATTACK_MIN_EACH = 35_000

# Passed to GaussianCopulaSynthesizer as default_distribution and
# enforce_min_max_values respectively.
DEFAULT_DISTRIBUTION = "gamma"
ENFORCE_MINMAX = False
# ================================================

def norm_cols(cols):
    """Return normalized column names.

    Each name is converted to str, stripped of surrounding whitespace, has
    spaces, slashes and hyphens replaced by underscores, and is lower-cased.
    """
    normalized = []
    for col in cols:
        name = str(col).strip()
        for separator in (" ", "/", "-"):
            name = name.replace(separator, "_")
        normalized.append(name.lower())
    return normalized

def enforce_label_attackcat(df):
    """Return a copy of df with a cleaned attack_cat and a label derived from it.

    attack_cat is cast to str and stripped of surrounding whitespace. label is
    0 where attack_cat equals "normal" (case-insensitive) and 1 otherwise.
    The input frame is left untouched.
    """
    result = df.copy()
    cleaned = result["attack_cat"].astype(str).str.strip()
    result["attack_cat"] = cleaned
    is_attack = cleaned.str.lower().ne("normal")
    result["label"] = is_attack.astype(int)
    return result

def approx_line_count(path: str) -> int:
    """Return the number of lines in the file minus one (for a header row), never below zero."""
    total = 0
    with open(path, "rb") as handle:
        for _ in handle:
            total += 1
    data_rows = total - 1
    return data_rows if data_rows > 0 else 0

def append_csv(df_part, path):
    """Append df_part to the CSV at path, writing the header row only when the file does not exist yet."""
    if os.path.exists(path):
        df_part.to_csv(path, mode="a", header=False, index=False)
    else:
        df_part.to_csv(path, mode="a", header=True, index=False)

# ===================== scan classes =====================
# First pass over the real CSV: tally how many rows carry each attack_cat value.
n_lines = approx_line_count(REAL_CSV)
# Only used as the progress-bar total; at least 1 so tqdm has a sensible bound.
n_chunks = max(1, math.ceil(n_lines / CHUNK_SIZE))
cnt_attack = Counter()

reader = pd.read_csv(REAL_CSV, chunksize=CHUNK_SIZE, low_memory=False)
for ch in tqdm(reader, total=n_chunks, desc="Scan clases UNSW", unit="chunk"):
    ch.columns = norm_cols(ch.columns)
    stripped = ch["attack_cat"].astype(str).str.strip()
    cnt_attack.update(stripped.tolist())
    # Release the chunk before reading the next one to keep peak memory low.
    del ch, stripped
    gc.collect()

# Every observed class except Normal (compared case-insensitively), in sorted order.
attack_cats = sorted(c for c in cnt_attack if str(c).strip().lower() != "normal")
print("[INFO] attack_cat detectadas (sin Normal):", attack_cats)

# ===================== build df_train =====================
# Second pass over the real CSV: draw the sample used to fit the synthesizer.
# Per chunk, up to TRAIN_ATTACK_MIN_EACH rows are collected for each attack
# class and up to TRAIN_NORMAL_MAX rows for Normal; `seen` tracks how many rows
# each class has contributed so far.
rng = np.random.RandomState(RANDOM_STATE)
train_parts, seen = [], Counter()

reader = pd.read_csv(REAL_CSV, chunksize=CHUNK_SIZE, low_memory=False)
for ch in tqdm(reader, total=n_chunks, desc="df_train UNSW", unit="chunk"):
    ch.columns = norm_cols(ch.columns)
    ch = enforce_label_attackcat(ch)

    for cat in attack_cats:
        need = TRAIN_ATTACK_MIN_EACH - seen[cat]
        if need <= 0:
            continue
        sub = ch[ch["attack_cat"] == cat]
        if len(sub) == 0:
            continue
        take = min(need, len(sub))
        # randint documents integer bounds, so pass 10**9 rather than the float 1e9.
        train_parts.append(sub.sample(n=take, random_state=int(rng.randint(0, 10**9))))
        seen[cat] += take

    needN = TRAIN_NORMAL_MAX - seen["Normal"]
    if needN > 0:
        subN = ch[ch["attack_cat"].str.lower() == "normal"]
        if len(subN) > 0:
            takeN = min(needN, len(subN))
            train_parts.append(subN.sample(n=takeN, random_state=int(rng.randint(0, 10**9))))
            seen["Normal"] += takeN

    del ch
    gc.collect()

    # Stop early only when every per-class quota is full. Breaking on the total
    # row count (as before) could end the scan before attack classes that appear
    # late in the file had contributed any rows.
    quotas_full = seen["Normal"] >= TRAIN_NORMAL_MAX and all(
        seen[c] >= TRAIN_ATTACK_MIN_EACH for c in attack_cats
    )
    if quotas_full:
        break

if not train_parts:
    # pd.concat([]) would raise an opaque "No objects to concatenate" error.
    raise ValueError(f"No training rows could be sampled from {REAL_CSV}")

df_train = pd.concat(train_parts, ignore_index=True)

# Enforce TRAIN_TOTAL_N by discarding surplus Normal rows first. A plain
# shuffle + head() drops scarce attack rows as readily as Normal ones.
excess = len(df_train) - TRAIN_TOTAL_N
if excess > 0:
    normal_rows = df_train.index[df_train["attack_cat"].str.lower() == "normal"]
    n_drop = min(excess, len(normal_rows))
    if n_drop > 0:
        drop_idx = rng.choice(normal_rows.to_numpy(), size=n_drop, replace=False)
        df_train = df_train.drop(index=drop_idx)

# Shuffle; head() remains as a final guard in case attack rows alone exceed the cap.
df_train = df_train.sample(frac=1.0, random_state=RANDOM_STATE)
df_train = df_train.head(TRAIN_TOTAL_N).reset_index(drop=True)

# ===================== fit SDV =====================
# Work on a copy so df_train keeps its original dtypes.
df_sdv = df_train.copy()
for c in df_sdv.columns:
    # Convert pandas categoricals to plain object columns before metadata detection.
    if str(df_sdv[c].dtype) == "category":
        df_sdv[c] = df_sdv[c].astype("object")

# Both target columns are handed to SDV as strings.
# NOTE(review): presumably so that label is detected as categorical rather than
# numerical — confirm against the saved metadata JSON.
df_sdv["attack_cat"] = df_sdv["attack_cat"].astype(str)
df_sdv["label"] = df_sdv["label"].astype(str)

metadata = Metadata.detect_from_dataframe(df_sdv)
# save_to_json refuses to overwrite an existing file, so re-running this cell
# would fail here. Remove the previous metadata file first, mirroring how the
# synthetic CSV is cleared before it is rewritten.
if os.path.exists(META_JSON):
    os.remove(META_JSON)
metadata.save_to_json(META_JSON)

synth = GaussianCopulaSynthesizer(
    metadata,
    default_distribution=DEFAULT_DISTRIBUTION,
    enforce_min_max_values=ENFORCE_MINMAX
)

print("[INFO] Entrenando sintetizador UNSW...")
synth.fit(df_sdv)

# ===================== generate quotas =====================
# Start from a clean output file: append_csv writes the header only when the
# file does not exist yet.
if os.path.exists(SYN_CSV):
    os.remove(SYN_CSV)

# Classes the synthesizer was actually fitted on. A condition on a category that
# never appeared in the training frame cannot be sampled.
trained_cats = set(df_sdv["attack_cat"].unique())
normal_value = next(
    (c for c in sorted(trained_cats) if c.strip().lower() == "normal"), None
)


def _sample_class(cat_value, n_rows):
    """Return n_rows synthetic rows sampled conditionally on attack_cat == cat_value.

    Unconditional sampling followed by overwriting attack_cat (the previous
    approach) stamps the class name onto rows drawn from the overall data
    distribution, so the features do not correspond to the class they are
    labelled with. Conditioning keeps features and class consistent.
    """
    cond = Condition(num_rows=n_rows, column_values={"attack_cat": cat_value})
    out = synth.sample_from_conditions(conditions=[cond])
    out.columns = norm_cols(out.columns)
    # Re-derives label as an int from attack_cat (it was modelled as a string).
    return enforce_label_attackcat(out)


if normal_value is None:
    print("[WARN] Clase Normal ausente en df_train; no se generan filas Normal.")
else:
    synN = _sample_class(normal_value, NORMAL_N)
    # Write the canonical class name and label, as the output file always has.
    synN["attack_cat"] = "Normal"
    synN["label"] = 0
    append_csv(synN, SYN_CSV)
    del synN
    gc.collect()

for cat in tqdm(attack_cats, desc="Generando ataques UNSW", unit="clase"):
    if cat not in trained_cats:
        # Skip rather than emit relabelled rows that do not represent the class.
        print("[WARN] Clase ausente en df_train; se omite:", cat)
        continue
    synA = _sample_class(cat, ATTACK_N_EACH)
    synA["attack_cat"] = str(cat)
    synA["label"] = 1
    append_csv(synA, SYN_CSV)
    del synA
    gc.collect()

print("[OK] Sintético guardado:", SYN_CSV)

# validation: re-read the synthetic CSV in chunks and tally rows per attack_cat
cnt_syn = Counter()
syn_reader = pd.read_csv(SYN_CSV, chunksize=200_000)
for ch in syn_reader:
    cleaned = ch["attack_cat"].astype(str).str.strip()
    cnt_syn.update(cleaned.tolist())
print("[SYN] attack_cat:", dict(cnt_syn))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/200.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.5/201.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Scan clases UNSW:   0%|          | 0/11 [00:00<?, ?chunk/s]

[INFO] attack_cat detectadas (sin Normal): ['Analysis', 'Backdoors', 'DoS', 'Exploits', 'Fuzzers', 'Generic', 'PortScan', 'Shellcode']


df_train UNSW:   0%|          | 0/11 [00:00<?, ?chunk/s]

[INFO] Entrenando sintetizador UNSW...


Generando ataques UNSW:   0%|          | 0/8 [00:00<?, ?clase/s]

[OK] Sintético guardado: /gdrive/MyDrive/Datasets/synthetic_UNSW_NB15_ctgan.csv
[SYN] attack_cat: {'Normal': 200000, 'Analysis': 30000, 'Backdoors': 30000, 'DoS': 30000, 'Exploits': 30000, 'Fuzzers': 30000, 'Generic': 30000, 'PortScan': 30000, 'Shellcode': 30000}


In [7]:
# Compare the class balance of the real and synthetic datasets.
# Only attack_cat is needed, so usecols avoids loading every column of the
# multi-million-row real CSV into memory. The paths come from the config
# constants instead of being repeated as literals.
print("[REAL] attack_cat:", pd.read_csv(REAL_CSV, usecols=["attack_cat"], low_memory=False)["attack_cat"].value_counts().head(30))
print("[SYN]  attack_cat:", pd.read_csv(SYN_CSV, usecols=["attack_cat"], low_memory=False)["attack_cat"].value_counts().head(30))

[REAL] attack_cat: attack_cat
Normal       2218764
Generic       217450
Exploits       44525
Fuzzers        24246
DoS            16353
PortScan       13987
Analysis        2677
Shellcode       1511
Backdoors        534
Name: count, dtype: int64
[SYN]  attack_cat: attack_cat
Normal       200000
Analysis      30000
Backdoors     30000
DoS           30000
Exploits      30000
Fuzzers       30000
Generic       30000
PortScan      30000
Shellcode     30000
Name: count, dtype: int64
