In [1]:
from google.colab import drive

# Google Drive is attached under this directory; the dataset paths used by the
# notebook live below it. force_remount=True re-mounts even if a mount exists.
GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /gdrive


In [2]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a {filename: bytes} dict, and echoing that dict writes
# the raw kaggle.json contents (username and API key) into the saved cell output.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jlvelaalonso","key":"<REDACTED>"}'}

In [3]:
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle

In [4]:
# Move the uploaded credentials file from the working directory into ~/.kaggle/.
!mv kaggle.json ~/.kaggle/

In [5]:
# Restrict the credentials file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# === CICIDS2017 (MachineLearningCVE) — SDV (GaussianCopula) por cuotas + robusto en memoria === 04/01/2026
!pip install -q sdv packaging tqdm

import os, gc, time, math
from collections import Counter

import numpy as np
import pandas as pd
from sdv.metadata import Metadata
from sdv.sampling import Condition
from sdv.single_table import GaussianCopulaSynthesizer
from tqdm.auto import tqdm

# ===================== CONFIG =====================
# Input: the real dataset. It must contain an attack_cat column (checked during the scan).
REAL_CSV  = "/gdrive/MyDrive/Datasets/MachineLearningCVE_full_clean.csv"
# Output: the synthetic dataset; deleted and rewritten on every run.
# NOTE(review): the file name says "ctgan" but this notebook fits a
# GaussianCopulaSynthesizer — consider renaming (a later cell reads this same path).
SYN_CSV   = "/gdrive/MyDrive/Datasets/synthetic_MachineLearningCVE_ctgan.csv"
# Output: SDV metadata detected from the training sample.
META_JSON = "/gdrive/MyDrive/Datasets/MachineLearningCVE_metadata_gc.json"

# Seed for the chunk-sampling RNG and for the final shuffle/trim of df_train.
RANDOM_STATE = 42

# Final synthetic quotas (tune them here)
NORMAL_N      = 200_000
ATTACK_N_EACH = 30_000   # for each detected class != Normal (DoS, DDoS, PortScan, BruteForce, Generic...)

# Chunked reading (keeps RAM from filling up)
CHUNK_SIZE = 250_000

# Sample used to train SDV (forcing attack rows in)
# TRAIN_TOTAL_N: collection stops once this many rows were gathered; df_train is trimmed to it.
# TRAIN_NORMAL_MAX: at most this many Normal rows are collected.
# TRAIN_ATTACK_MIN_EACH: rows collected per attack class; despite the name it acts as a
#   per-class target/cap (a class is no longer sampled once it reaches this count).
TRAIN_TOTAL_N         = 250_000
TRAIN_NORMAL_MAX      = 140_000
TRAIN_ATTACK_MIN_EACH = 35_000

# Passed to GaussianCopulaSynthesizer as default_distribution / enforce_min_max_values.
DEFAULT_DISTRIBUTION = "gamma"
ENFORCE_MINMAX = False
# ================================================

def normalize_cols(cols):
    """Return normalized copies of the given column names.

    Each name is converted to ``str``, stripped of surrounding whitespace, has
    every space, "/" and "-" replaced by "_", and is lower-cased.

    Args:
        cols: iterable of column names (any type convertible with ``str``).

    Returns:
        list[str]: normalized names, in the same order as ``cols``.
    """
    cleaned = []
    for raw in cols:
        name = str(raw).strip()
        for separator in (" ", "/", "-"):
            name = name.replace(separator, "_")
        cleaned.append(name.lower())
    return cleaned

def enforce_label_attackcat_inplace(df_any: pd.DataFrame) -> pd.DataFrame:
    """Clean ``attack_cat`` and derive the binary ``label`` column from it.

    The frame is modified in place: ``attack_cat`` becomes a stripped string
    column and ``label`` is (re)written as 0 where ``attack_cat`` equals
    "normal" (case-insensitive) and 1 everywhere else.

    Args:
        df_any: frame that must contain an ``attack_cat`` column.

    Returns:
        The same ``df_any`` object, for call chaining.

    Raises:
        ValueError: if ``attack_cat`` is not among the columns.
    """
    if "attack_cat" in df_any.columns:
        cats = df_any["attack_cat"].astype(str).str.strip()
        df_any["attack_cat"] = cats
        is_attack = cats.str.lower() != "normal"
        df_any["label"] = is_attack.astype(int)
        return df_any
    raise ValueError("Falta 'attack_cat' en el REAL. Revisa tu full_clean.")

def approx_line_count(path: str) -> int:
    """Count the data rows of a text file by counting its lines.

    The file is read in binary mode and one line is subtracted for the header.

    Args:
        path: file to inspect.

    Returns:
        Number of lines minus one, never below zero.
    """
    total = 0
    with open(path, "rb") as handle:
        for _line in handle:
            total += 1
    return total - 1 if total > 0 else 0

def append_csv(df_part: pd.DataFrame, path: str):
    """Append ``df_part`` to the CSV at ``path``, writing the header only once.

    The header row is written when the target does not exist yet or exists but
    is empty (zero bytes); otherwise only the data rows are appended. Checking
    only for existence would leave a headerless CSV when an empty file was
    already present at ``path``.

    Args:
        df_part: rows to append; the index is not written.
        path: destination CSV file.
    """
    needs_header = (not os.path.exists(path)) or os.path.getsize(path) == 0
    df_part.to_csv(path, index=False, mode="a", header=needs_header)

# ===================== 1) PASS 1: count classes (without loading the file into RAM) =====================
# The line count is only used to size the progress bars of both passes.
n_lines = approx_line_count(REAL_CSV)
n_chunks = max(1, math.ceil(n_lines / CHUNK_SIZE))
print(f"[INFO] REAL filas aprox: {n_lines:,} | chunks: {n_chunks} | chunksize: {CHUNK_SIZE:,}")

reader = pd.read_csv(REAL_CSV, low_memory=False, chunksize=CHUNK_SIZE)
cnt_attack = Counter()

for ch in tqdm(reader, total=n_chunks, desc="Scan clases (chunks)", unit="chunk"):
    ch.columns = normalize_cols(list(ch.columns))
    if "attack_cat" not in ch.columns:
        raise ValueError("Este CSV no tiene attack_cat.")
    chunk_labels = ch["attack_cat"].astype(str).str.strip()
    cnt_attack.update(chunk_labels.tolist())
    # Release the chunk before the next one is read.
    del ch
    gc.collect()

# Every class other than "normal" (case-insensitive) counts as an attack class.
attack_cats = sorted(c for c in cnt_attack if str(c).strip().lower() != "normal")
print("[INFO] attack_cat detectadas (sin Normal):", attack_cats)
print("[INFO] top10 real:", dict(cnt_attack.most_common(10)))

if len(attack_cats) == 0:
    raise RuntimeError("No detecto clases de ataque. Revisa MachineLearningCVE_full_clean.csv.")

# ===================== 2) PASS 2: build the stratified df_train =====================
# Walks the real CSV chunk by chunk and collects up to TRAIN_ATTACK_MIN_EACH rows
# per attack class and up to TRAIN_NORMAL_MAX Normal rows.
# NOTE(review): quotas are filled in file order (each chunk is sampled until the
# class quota is reached), so rows from later chunks are not represented once a
# quota is full.
# The per-chunk sampling seeds are drawn from rng in a fixed order; changing the
# order of the sample() calls changes which rows are selected.
rng = np.random.RandomState(RANDOM_STATE)
train_parts = []
seen_per_cat = Counter()

reader = pd.read_csv(REAL_CSV, low_memory=False, chunksize=CHUNK_SIZE)

for ch in tqdm(reader, total=n_chunks, desc="Construyendo df_train (chunks)", unit="chunk"):
    ch.columns = normalize_cols(ch.columns.tolist())
    ch = enforce_label_attackcat_inplace(ch)

    # A) attacks: take rows of each class until its quota is reached
    for cat in attack_cats:
        need = TRAIN_ATTACK_MIN_EACH - seen_per_cat[cat]
        if need <= 0:
            continue
        sub = ch[ch["attack_cat"] == cat]
        if len(sub) == 0:
            continue
        take = min(need, len(sub))
        samp = sub.sample(n=take, random_state=int(rng.randint(0, 1e9)))
        train_parts.append(samp)
        seen_per_cat[cat] += len(samp)

    # B) Normal: capped at TRAIN_NORMAL_MAX (matched case-insensitively, counted under "Normal")
    needN = TRAIN_NORMAL_MAX - seen_per_cat["Normal"]
    if needN > 0:
        subN = ch[ch["attack_cat"].str.lower() == "normal"]
        if len(subN) > 0:
            takeN = min(needN, len(subN))
            sampN = subN.sample(n=takeN, random_state=int(rng.randint(0, 1e9)))
            train_parts.append(sampN)
            seen_per_cat["Normal"] += len(sampN)

    del ch
    gc.collect()

    # NOTE(review): stopping here skips the remaining chunks, so a class that
    # only appears later in the file would be missing from df_train.
    if sum(seen_per_cat.values()) >= TRAIN_TOTAL_N:
        break

df_train = pd.concat(train_parts, ignore_index=True)
del train_parts
gc.collect()

# Shuffle, then trim to TRAIN_TOTAL_N.
# NOTE(review): the trim is a uniform random sample over all rows, so it also
# removes rows of scarce attack classes, not only Normal rows.
df_train = df_train.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
if len(df_train) > TRAIN_TOTAL_N:
    df_train = df_train.sample(n=TRAIN_TOTAL_N, random_state=RANDOM_STATE).reset_index(drop=True)

print("\n[INFO] df_train:", df_train.shape)
print("[INFO] df_train attack_cat:", df_train["attack_cat"].value_counts().to_dict())
print("[INFO] df_train label:", df_train["label"].value_counts().to_dict())

# ===================== 3) SDV fit =====================
# Fit on a copy so df_train keeps its original dtypes.
df_sdv = df_train.copy()
for c in df_sdv.columns:
    # Pandas "category" columns are converted to plain object columns.
    if str(df_sdv[c].dtype) == "category":
        df_sdv[c] = df_sdv[c].astype("object")

# Both class columns are handed to SDV as strings (label becomes "0"/"1").
df_sdv["attack_cat"] = df_sdv["attack_cat"].astype(str)
df_sdv["label"] = df_sdv["label"].astype(str)

metadata = Metadata.detect_from_dataframe(df_sdv)
# Remove a metadata file left by a previous run before saving, the same way
# SYN_CSV is reset before generation: SDV's save_to_json raises when the target
# file already exists, so without this a re-run of the cell would stop here.
if os.path.exists(META_JSON):
    os.remove(META_JSON)
metadata.save_to_json(META_JSON)

synth = GaussianCopulaSynthesizer(
    metadata,
    default_distribution=DEFAULT_DISTRIBUTION,
    enforce_min_max_values=ENFORCE_MINMAX
)

print("\n[INFO] Entrenando sintetizador (fit) con muestra estratificada...")
t_fit = time.time()
synth.fit(df_sdv)
print(f"[OK] fit() terminado en {(time.time()-t_fit)/60:.2f} min")

# ===================== 4) Generate synthetic data by quota (incremental) =====================
def _sample_class(cat_value: str, n_rows: int) -> pd.DataFrame:
    """Sample ``n_rows`` synthetic rows whose ``attack_cat`` equals ``cat_value``.

    Sampling is conditioned on ``attack_cat`` so the generated features follow
    what the synthesizer learned for that class. Drawing rows unconditionally
    and writing the class name over them afterwards would give every class the
    same feature distribution and make the labels meaningless.

    Args:
        cat_value: class value exactly as it appears in the training sample.
        n_rows: number of rows requested.

    Returns:
        The sampled frame with normalized column names and an integer ``label``
        recomputed from ``attack_cat`` (0 for Normal, 1 otherwise).
    """
    condition = Condition(num_rows=n_rows, column_values={"attack_cat": cat_value})
    syn = synth.sample_from_conditions(conditions=[condition])
    if len(syn) != n_rows:
        print(f"[WARN] '{cat_value}': pedidas {n_rows} filas, generadas {len(syn)}")
    syn.columns = normalize_cols(syn.columns.tolist())
    return enforce_label_attackcat_inplace(syn)


# A class can only be conditioned on if the synthesizer saw it during fit.
# Check this before the previous output file is deleted.
cat_counts = df_sdv["attack_cat"].value_counts()
normal_value = next(
    (c for c in cat_counts.index if str(c).strip().lower() == "normal"), None
)
missing_cats = [c for c in attack_cats if c not in cat_counts.index]
if normal_value is None or missing_cats:
    raise RuntimeError(
        "Clases ausentes en la muestra de entrenamiento; no se pueden generar condicionadas. "
        f"Normal presente: {normal_value is not None} | ataques ausentes: {missing_cats}"
    )

# Start from a clean output file; append_csv writes the header on the first append.
if os.path.exists(SYN_CSV):
    os.remove(SYN_CSV)

# Normal
print("\n[INFO] Generando 'Normal' =", NORMAL_N)
t0 = time.time()
synN = _sample_class(normal_value, NORMAL_N)
# Keep the canonical spelling in the output regardless of the casing in the real data.
synN["attack_cat"] = "Normal"
append_csv(synN, SYN_CSV)
del synN
gc.collect()
print(f"[OK] Normal guardado en {(time.time()-t0)/60:.2f} min")

# Attacks, one class at a time so only one batch is held in memory
for cat in tqdm(attack_cats, desc="Generando ataques (por clase)", unit="clase"):
    synA = _sample_class(str(cat), ATTACK_N_EACH)
    append_csv(synA, SYN_CSV)
    del synA
    gc.collect()

print("\n[OK] Sintético guardado en:", SYN_CSV)
print("[OK] Metadata guardada en:", META_JSON)

# ===================== 5) Quick validation of the synthetic file (without loading it whole) =====================
syn_reader = pd.read_csv(SYN_CSV, chunksize=200_000, low_memory=False)
cnt_syn_attack, cnt_syn_label = Counter(), Counter()

for ch in tqdm(syn_reader, desc="Validando SYN (chunks)", unit="chunk"):
    # Tally both class columns of the chunk as stripped strings.
    for tally, column in ((cnt_syn_attack, "attack_cat"), (cnt_syn_label, "label")):
        tally.update(ch[column].astype(str).str.strip().tolist())

print("\n[SYN] label:", dict(cnt_syn_label))
print("[SYN] attack_cat top15:", dict(cnt_syn_attack.most_common(15)))
print("\n[FIN] CICIDS_SYN listo (deberías ver BruteForce/PortScan/DoS/DDoS si existen en el REAL).")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/200.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.5/201.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Scan clases (chunks):   0%|          | 0/12 [00:00<?, ?chunk/s]

[INFO] attack_cat detectadas (sin Normal): ['BruteForce', 'DDoS', 'DoS', 'Generic', 'PortScan']
[INFO] top10 real: {'Normal': 2273097, 'DoS': 252661, 'PortScan': 158930, 'DDoS': 128027, 'BruteForce': 15342, 'Generic': 2686}


Construyendo df_train (chunks):   0%|          | 0/12 [00:00<?, ?chunk/s]


[INFO] df_train: (250000, 80)
[INFO] df_train attack_cat: {'Normal': 133123, 'PortScan': 33295, 'DoS': 33253, 'DDoS': 33235, 'BruteForce': 14552, 'Generic': 2542}
[INFO] df_train label: {0: 133123, 1: 116877}

[INFO] Entrenando sintetizador (fit) con muestra estratificada...
[OK] fit() terminado en 5.88 min

[INFO] Generando 'Normal' = 200000
[OK] Normal guardado en 0.95 min


Generando ataques (por clase):   0%|          | 0/5 [00:00<?, ?clase/s]


[OK] Sintético guardado en: /gdrive/MyDrive/Datasets/synthetic_MachineLearningCVE_ctgan.csv
[OK] Metadata guardada en: /gdrive/MyDrive/Datasets/MachineLearningCVE_metadata_gc.json


Validando SYN (chunks): 0chunk [00:00, ?chunk/s]


[SYN] label: {'0': 200000, '1': 150000}
[SYN] attack_cat top15: {'Normal': 200000, 'BruteForce': 30000, 'DDoS': 30000, 'DoS': 30000, 'Generic': 30000, 'PortScan': 30000}

[FIN] CICIDS_SYN listo (deberías ver BruteForce/PortScan/DoS/DDoS si existen en el REAL).


In [8]:
# Class distribution of the real vs. the synthetic file. Only attack_cat is
# parsed (usecols), so the multi-million-row real CSV is not loaded with all of
# its columns just to count one of them.
for tag, csv_path in (
    ("[REAL] attack_cat:", "/gdrive/MyDrive/Datasets/MachineLearningCVE_full_clean.csv"),
    ("[SYN]  attack_cat:", "/gdrive/MyDrive/Datasets/synthetic_MachineLearningCVE_ctgan.csv"),
):
    class_counts = pd.read_csv(csv_path, usecols=["attack_cat"], low_memory=False)["attack_cat"].value_counts()
    print(tag, class_counts.head(30))


[REAL] attack_cat: attack_cat
Normal        2273097
DoS            252661
PortScan       158930
DDoS           128027
BruteForce      15342
Generic          2686
Name: count, dtype: int64
[SYN]  attack_cat: attack_cat
Normal        200000
BruteForce     30000
DDoS           30000
DoS            30000
Generic        30000
PortScan       30000
Name: count, dtype: int64
