In [1]:
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jlvelaalonso","key":"72687d455a314ef5280bf2076c59622b"}'}

In [3]:
!mkdir -p ~/.kaggle

In [4]:
!cp kaggle.json ~/.kaggle/

In [5]:
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
# === KITSUNE — GaussianCopula "1 sintetizador por clase" (REAL condicional) + anti-OOM + progreso ===
!pip install -q sdv packaging tqdm

import os, gc, time, math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

# ===================== CONFIG =====================
REAL_CSV  = "/gdrive/MyDrive/Datasets/KITSUNE_full_clean.csv"
SYN_CSV   = "/gdrive/MyDrive/Datasets/synthetic_kitsune_ctgan.csv"
META_DIR  = "/gdrive/MyDrive/Datasets/kitsune_metadata_per_class"   # guardará un json por clase (opcional pero útil)

RANDOM_STATE = 42

# Cuotas sintéticas finales
NORMAL_N      = 200_000
ATTACK_N_EACH = 30_000

# Lectura por chunks (Kitsune es enorme)
CHUNK_SIZE = 300_000

# Muestras para entrenar SDV por clase (sube/baja según RAM)
TRAIN_NORMAL_N = 120_000     # muestra para entrenar "Normal"
TRAIN_ATTACK_N = 30_000      # muestra para entrenar cada ataque (DoS/MitM/...)

# SDV params
DEFAULT_DISTRIBUTION = "gamma"
ENFORCE_MINMAX = False
# ================================================

os.makedirs(META_DIR, exist_ok=True)
rng = np.random.RandomState(RANDOM_STATE)

def normalize_cols(cols):
    return [str(c).strip().replace(" ", "_").replace("/", "_").replace("-", "_").lower() for c in cols]

def enforce_attackcat_label_inplace(df_any: pd.DataFrame) -> pd.DataFrame:
    if "attack_cat" not in df_any.columns:
        raise ValueError("Falta 'attack_cat' en el CSV.")
    df_any["attack_cat"] = df_any["attack_cat"].astype(str).str.strip()
    # CHECK DURO: label depende SOLO de attack_cat
    df_any["label"] = (df_any["attack_cat"].str.lower() != "normal").astype(int)
    return df_any

def approx_line_count(path: str) -> int:
    with open(path, "rb") as f:
        return max(sum(1 for _ in f) - 1, 0)

def append_csv(df_part: pd.DataFrame, path: str):
    header = not os.path.exists(path)
    df_part.to_csv(path, index=False, mode="a", header=header)

def reorder_attackcat_label_first(df_part: pd.DataFrame) -> pd.DataFrame:
    # garantiza attack_cat, label al principio
    cols = df_part.columns.tolist()
    front = [c for c in ["attack_cat", "label"] if c in cols]
    rest = [c for c in cols if c not in front]
    return df_part[front + rest]

def build_class_sample(klass: str, target_n: int, n_chunks: int) -> pd.DataFrame:
    """
    Construye una muestra SOLO de la clase 'klass' leyendo el CSV por chunks.
    """
    parts = []
    seen = 0

    reader = pd.read_csv(REAL_CSV, low_memory=False, chunksize=CHUNK_SIZE)
    for ch in tqdm(reader, total=n_chunks, desc=f"Sampling REAL [{klass}]", unit="chunk"):
        ch.columns = normalize_cols(ch.columns.tolist())
        ch = enforce_attackcat_label_inplace(ch)

        if klass.lower() == "normal":
            sub = ch[ch["attack_cat"].str.lower() == "normal"]
        else:
            sub = ch[ch["attack_cat"] == klass]

        if len(sub) > 0:
            need = target_n - seen
            take = min(need, len(sub))
            # sample aleatorio dentro del chunk (si el chunk es enorme, evita sesgo por orden)
            samp = sub.sample(n=take, random_state=int(rng.randint(0, 1e9)))
            parts.append(samp)
            seen += len(samp)

        del ch, sub
        gc.collect()

        if seen >= target_n:
            break

    if not parts:
        return pd.DataFrame()

    df_sample = pd.concat(parts, ignore_index=True)
    del parts
    gc.collect()

    # barajar
    df_sample = df_sample.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

    # asegurar tamaño exacto si sobra
    if len(df_sample) > target_n:
        df_sample = df_sample.sample(n=target_n, random_state=RANDOM_STATE).reset_index(drop=True)

    return df_sample

def fit_synth_for_class(df_train: pd.DataFrame, klass: str) -> GaussianCopulaSynthesizer:
    """
    Entrena un GaussianCopulaSynthesizer SOLO con df_train (una clase).
    """
    # SDV no admite category
    df_sdv = df_train.copy()
    for c in df_sdv.columns:
        if str(df_sdv[c].dtype) == "category":
            df_sdv[c] = df_sdv[c].astype("object")

    # forzar strings en categóricas clave
    df_sdv["attack_cat"] = df_sdv["attack_cat"].astype(str)
    df_sdv["label"] = df_sdv["label"].astype(str)

    metadata = Metadata.detect_from_dataframe(df_sdv)
    meta_path = os.path.join(META_DIR, f"metadata_{klass}.json".replace(" ", "_"))
    metadata.save_to_json(meta_path)

    synth = GaussianCopulaSynthesizer(
        metadata,
        default_distribution=DEFAULT_DISTRIBUTION,
        enforce_min_max_values=ENFORCE_MINMAX
    )

    t0 = time.time()
    synth.fit(df_sdv)
    print(f"[OK] fit({klass}) en {(time.time()-t0)/60:.2f} min | train={df_sdv.shape}")

    del df_sdv
    gc.collect()

    return synth

# ===================== 0) Preparación: contar clases reales por chunks =====================
n_lines = approx_line_count(REAL_CSV)
n_chunks = max(1, int(math.ceil(n_lines / CHUNK_SIZE)))
print(f"[INFO] REAL filas aprox: {n_lines:,} | chunks: {n_chunks} | chunksize: {CHUNK_SIZE:,}")

cnt = Counter()
reader = pd.read_csv(REAL_CSV, low_memory=False, chunksize=CHUNK_SIZE)
for ch in tqdm(reader, total=n_chunks, desc="Scan clases REAL", unit="chunk"):
    ch.columns = normalize_cols(ch.columns.tolist())
    if "attack_cat" not in ch.columns:
        raise ValueError("El CSV REAL no tiene attack_cat.")
    cnt.update(ch["attack_cat"].astype(str).str.strip().tolist())
    del ch
    gc.collect()

attack_cats = sorted([c for c in cnt.keys() if str(c).strip().lower() != "normal"])
print("[INFO] attack_cats detectadas (sin Normal):", attack_cats)
print("[INFO] top10 real:", dict(Counter(cnt).most_common(10)))

if not attack_cats:
    raise RuntimeError("No se detectaron ataques en KITSUNE_full_clean.csv (solo Normal).")

# ===================== 1) Preparar salida =====================
if os.path.exists(SYN_CSV):
    os.remove(SYN_CSV)
print("[INFO] SYN_CSV:", SYN_CSV)

# ===================== 2) Generar Normal (sintetizador entrenado SOLO con Normal) =====================
print("\n==================== CLASE: Normal ====================")
df_train_N = build_class_sample("Normal", TRAIN_NORMAL_N, n_chunks=n_chunks)
if df_train_N.empty:
    raise RuntimeError("No pude construir muestra para Normal. Revisa attack_cat en el REAL.")
print("[INFO] train Normal:", df_train_N.shape, "| attack_cat:", df_train_N["attack_cat"].value_counts().to_dict())

synth_N = fit_synth_for_class(df_train_N, "Normal")

t0 = time.time()
synN = synth_N.sample(num_rows=NORMAL_N)
synN.columns = normalize_cols(synN.columns.tolist())
synN = enforce_attackcat_label_inplace(synN)

# forzar coherencia estricta
synN["attack_cat"] = "Normal"
synN["label"] = 0

synN = reorder_attackcat_label_first(synN)
append_csv(synN, SYN_CSV)

del df_train_N, synth_N, synN
gc.collect()
print(f"[OK] Normal generado={NORMAL_N:,} en {(time.time()-t0)/60:.2f} min")

# ===================== 3) Generar ataques (1 sintetizador por ataque) =====================
print("\n==================== ATAQUES (por clase) ====================")

for cat in tqdm(attack_cats, desc="Ataques: entrenar+generar", unit="clase"):
    print(f"\n----- CLASE: {cat} -----")
    df_train_A = build_class_sample(cat, TRAIN_ATTACK_N, n_chunks=n_chunks)

    if df_train_A.empty:
        print(f"[AVISO] No pude construir muestra para {cat}. Se omite.")
        continue

    print("[INFO] train:", df_train_A.shape, "| dist:", df_train_A["attack_cat"].value_counts().to_dict())

    synth_A = fit_synth_for_class(df_train_A, cat)

    t1 = time.time()
    synA = synth_A.sample(num_rows=ATTACK_N_EACH)
    synA.columns = normalize_cols(synA.columns.tolist())
    synA = enforce_attackcat_label_inplace(synA)

    # forzar clase objetivo
    synA["attack_cat"] = str(cat)
    synA["label"] = 1

    synA = reorder_attackcat_label_first(synA)
    append_csv(synA, SYN_CSV)

    del df_train_A, synth_A, synA
    gc.collect()
    print(f"[OK] {cat} generado={ATTACK_N_EACH:,} en {(time.time()-t1)/60:.2f} min")

print("\n[OK] Sintético multiclase guardado en:", SYN_CSV)
print("[OK] Metadatas por clase en:", META_DIR)

# ===================== 4) Validación rápida del SYN (sin cargar entero) =====================
from collections import Counter
cnt_syn_attack = Counter()
cnt_syn_label  = Counter()

syn_reader = pd.read_csv(SYN_CSV, chunksize=200_000, low_memory=False)
for ch in tqdm(syn_reader, desc="Validando SYN (chunks)", unit="chunk"):
    cnt_syn_attack.update(ch["attack_cat"].astype(str).str.strip().tolist())
    cnt_syn_label.update(ch["label"].astype(str).str.strip().tolist())

print("\n[SYN] label:", dict(cnt_syn_label))
print("[SYN] attack_cat top15:", dict(Counter(cnt_syn_attack).most_common(15)))

# checks duros
bad1 = 0
bad2 = 0
syn_reader2 = pd.read_csv(SYN_CSV, chunksize=200_000, low_memory=False)
for ch in syn_reader2:
    ac = ch["attack_cat"].astype(str).str.lower()
    lb = pd.to_numeric(ch["label"], errors="coerce").fillna(0).astype(int)
    bad1 += int(((ac == "normal") & (lb != 0)).sum())
    bad2 += int(((ac != "normal") & (lb != 1)).sum())
print("\n[CHECK] Normal con label!=0:", bad1)
print("[CHECK] Ataque con label!=1:", bad2)
print("\n[FIN] Generación por clase lista (condicional real).")


[INFO] REAL filas aprox: 20,438,941 | chunks: 69 | chunksize: 300,000


Scan clases REAL:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] attack_cats detectadas (sin Normal): ['DDoS', 'DoS', 'Fuzzers', 'Generic', 'MitM', 'PortScan']
[INFO] top10 real: {'Normal': 16144581, 'DoS': 1539294, 'MitM': 1145272, 'Generic': 1025715, 'Fuzzers': 432783, 'DDoS': 85596, 'PortScan': 65700}
[INFO] SYN_CSV: /gdrive/MyDrive/Datasets/synthetic_kitsune_ctgan.csv



Sampling REAL [Normal]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train Normal: (120000, 7) | attack_cat: {'Normal': 120000}
[OK] fit(Normal) en 0.17 min | train=(120000, 7)
[OK] Normal generado=200,000 en 0.07 min



Ataques: entrenar+generar:   0%|          | 0/6 [00:00<?, ?clase/s]


----- CLASE: DDoS -----


Sampling REAL [DDoS]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'DDoS': 30000}
[OK] fit(DDoS) en 0.04 min | train=(30000, 7)
[OK] DDoS generado=30,000 en 0.01 min

----- CLASE: DoS -----


Sampling REAL [DoS]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'DoS': 30000}
[OK] fit(DoS) en 0.03 min | train=(30000, 7)
[OK] DoS generado=30,000 en 0.01 min

----- CLASE: Fuzzers -----


Sampling REAL [Fuzzers]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'Fuzzers': 30000}
[OK] fit(Fuzzers) en 0.06 min | train=(30000, 7)
[OK] Fuzzers generado=30,000 en 0.02 min

----- CLASE: Generic -----


Sampling REAL [Generic]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'Generic': 30000}
[OK] fit(Generic) en 0.07 min | train=(30000, 7)
[OK] Generic generado=30,000 en 0.02 min

----- CLASE: MitM -----


Sampling REAL [MitM]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'MitM': 30000}
[OK] fit(MitM) en 0.05 min | train=(30000, 7)
[OK] MitM generado=30,000 en 0.01 min

----- CLASE: PortScan -----


Sampling REAL [PortScan]:   0%|          | 0/69 [00:00<?, ?chunk/s]

[INFO] train: (30000, 7) | dist: {'PortScan': 30000}
[OK] fit(PortScan) en 0.05 min | train=(30000, 7)
[OK] PortScan generado=30,000 en 0.01 min

[OK] Sintético multiclase guardado en: /gdrive/MyDrive/Datasets/synthetic_kitsune_ctgan.csv
[OK] Metadatas por clase en: /gdrive/MyDrive/Datasets/kitsune_metadata_per_class


Validando SYN (chunks): 0chunk [00:00, ?chunk/s]


[SYN] label: {'0': 200000, '1': 180000}
[SYN] attack_cat top15: {'Normal': 200000, 'DDoS': 30000, 'DoS': 30000, 'Fuzzers': 30000, 'Generic': 30000, 'MitM': 30000, 'PortScan': 30000}

[CHECK] Normal con label!=0: 0
[CHECK] Ataque con label!=1: 0

[FIN] Generación por clase lista (condicional real).


In [8]:
print("[REAL] attack_cat:", pd.read_csv("/gdrive/MyDrive/Datasets/KITSUNE_full_clean.csv", low_memory=False)["attack_cat"].value_counts().head(30))
print("[SYN]  attack_cat:", pd.read_csv("/gdrive/MyDrive/Datasets/synthetic_kitsune_ctgan.csv", low_memory=False)["attack_cat"].value_counts().head(30))

[REAL] attack_cat: attack_cat
Normal      16144581
DoS          1539294
MitM         1145272
Generic      1025715
Fuzzers       432783
DDoS           85596
PortScan       65700
Name: count, dtype: int64
[SYN]  attack_cat: attack_cat
Normal      200000
DDoS         30000
DoS          30000
Fuzzers      30000
Generic      30000
MitM         30000
PortScan     30000
Name: count, dtype: int64
