In [8]:
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


In [9]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"jlvelaalonso","key":"72687d455a314ef5280bf2076c59622b"}'}

In [10]:
!mkdir -p ~/.kaggle

In [11]:
!cp kaggle.json ~/.kaggle/

In [12]:
!chmod 600 ~/.kaggle/kaggle.json

In [13]:
# === KITSUNE — Kaggle + unificación/limpieza + taxonomía final (estilo UNSW)
# Salida: /gdrive/MyDrive/Datasets/KITSUNE_full_clean.csv con columnas al inicio: attack_cat, label
!pip install -q kaggle pandas numpy tqdm

import os, sys, glob, time, shutil, zipfile, subprocess, gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# ============================================================
# 1) TAXONOMÍA FINAL (COMÚN A UNSW / CICIDS / KITSUNE)
# ============================================================
ALLOWED_ATTACKS = {
    "Normal",
    "Fuzzers","Exploits","DoS","Reconnaissance","Generic","Analysis",
    "Shellcode","Backdoors","DDoS","PortScan","MitM","BruteForce","Shellcode","Worms"
}

MAP_ATTACK = {
    # Normal
    "benign":"Normal","Benign":"Normal","BENIGN":"Normal",
    "normal":"Normal","Normal":"Normal",
    "nan":"Normal","NaN":"Normal","None":"Normal","":"Normal",

    # clave:
    "Attack": "Generic","attack": "Generic","Unknown": "Generic",

    # Unificaciones
    "Attack":"Generic","attack":"Generic","Unknown":"Generic","unknown":"Generic",

    # DoS / DDoS
    "dos":"DoS","Dos":"DoS","DoS":"DoS",
    "ddos":"DDoS","Ddos":"DDoS","DDoS":"DDoS",

    # PortScan
    "Port Scan":"PortScan","port scan":"PortScan",
    "portscan":"PortScan","Portscan":"PortScan","PortScan":"PortScan",
    "Reconnaissance":"PortScan","reconnaissance":"PortScan",

    # MITM
    "mitm":"MitM","MITM":"MitM","MitM":"MitM",

    # BruteForce
    "Bruteforce":"BruteForce","bruteforce":"BruteForce","Brute Force":"BruteForce","brute force":"BruteForce",

}
def normalize_attack_cat_series(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.replace(MAP_ATTACK)
    s = s.apply(lambda x: "Normal" if str(x).strip().lower() == "normal" else x)
    s = s.apply(lambda x: x if x in ALLOWED_ATTACKS else ("Normal" if str(x).strip()=="" else "Generic"))
    return s

def enforce_attackcat_label(df: pd.DataFrame) -> pd.DataFrame:
    """
    CHECK DURO DEFINITIVO:
      - attack_cat == Normal  => label = 0
      - attack_cat != Normal  => label = 1
    """
    df = df.copy()
    if "attack_cat" not in df.columns:
        raise ValueError("Falta 'attack_cat'.")
    df["attack_cat"] = normalize_attack_cat_series(df["attack_cat"])
    df["label"] = (df["attack_cat"] != "Normal").astype(int).astype("category")
    return df

# ============================================================
# 2) UTILIDADES KAGGLE
# ============================================================
def run(cmd):
    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return p.returncode, p.stdout, p.stderr

def ensure_kaggle_token():
    run([sys.executable, "-m", "pip", "install", "-q", "kaggle"])
    kag_dir = Path.home() / ".kaggle"
    kag_dir.mkdir(parents=True, exist_ok=True)
    token = kag_dir / "kaggle.json"
    if not token.exists():
        from google.colab import files  # type: ignore
        print("→ Sube tu kaggle.json (Kaggle > Account > Create API Token)…")
        up = files.upload()
        cand = [k for k in up.keys() if k.endswith("kaggle.json")]
        if not cand:
            raise FileNotFoundError("No se subió kaggle.json.")
        shutil.copy(cand[0], token)
    os.chmod(token, 0o600)

def robust_read_csv(path):
    for enc in ("utf-8","latin1","cp1252"):
        try:
            return pd.read_csv(path, low_memory=False, encoding=enc)
        except Exception:
            pass
    return pd.read_csv(path, low_memory=False, engine="python")

def normalize_cols(df):
    df.columns = [str(c).strip().replace(" ","_").replace("/","_").replace("-","_").lower() for c in df.columns]
    return df

# ============================================================
# 3) DESCARGA KITSUNE (CANDIDATOS)
# ============================================================
ensure_kaggle_token()

CANDIDATES = [
    "ernie55ernie/5-tuple-packets-of-kitsune-network-attack-dataset",  # ✅ el que te ha funcionado
    "ymirsky/network-attack-dataset-kitsune",
    "ycshin/kitsune-network-attack-dataset",
]

zip_path = None
for ds in CANDIDATES:
    print(f"\n[INFO] Intentando Kaggle: {ds}")
    pre = set(glob.glob("/content/*.zip"))
    rc, out, err = run(["kaggle","datasets","download","-d",ds,"-p","/content","-w"])
    if rc == 0:
        time.sleep(1.0)
        post = set(glob.glob("/content/*.zip"))
        newz = list(post - pre)
        zip_path = newz[0] if newz else (max(list(post), key=os.path.getmtime) if post else None)
        if zip_path and os.path.exists(zip_path):
            print("[OK] ZIP descargado:", zip_path)
            break
    print("[FALLO]", (err or out)[0:200].replace("\n"," "), "…")

if not zip_path:
    raise RuntimeError("No se pudo descargar Kitsune con los candidatos. Sube el ZIP manualmente.")

TARGET_DIR = "/content/kitsune"
os.makedirs(TARGET_DIR, exist_ok=True)
print(f"[INFO] Descomprimiendo {zip_path} → {TARGET_DIR}")
with zipfile.ZipFile(zip_path) as z:
    z.extractall(TARGET_DIR)

# ============================================================
# 4) INFERIR attack_cat DESDE NOMBRE DE FICHERO (KITSUNE)
# ============================================================
def attack_cat_from_path(path: str):
    s = os.path.basename(path).lower()

    # benign/baseline si existiera
    if any(t in s for t in ["benign","normal","baseline","idle","clean"]):
        return "Normal"

    # kitsune ataques típicos
    if "mirai" in s or "botnet" in s:
        return "DDoS"
    if "scan" in s:
        return "Reconnaissance"
    if "fuzz" in s:
        return "Fuzzers"
    if "dos" in s or "flood" in s or "reneg" in s:
        return "DoS"
    if "mitm" in s or "arp" in s or "spoof" in s:
        return "MitM"

    return "Generic"

# ============================================================
# 5) CARGA + UNIFICACIÓN (CON PROGRESO)
# ============================================================
csv_files = sorted(glob.glob(f"{TARGET_DIR}/**/*.csv", recursive=True))
if not csv_files:
    raise FileNotFoundError("No se encontraron CSV dentro del ZIP extraído.")

print(f"[INFO] CSV encontrados: {len(csv_files)}")

dfs = []
cats = []

for f in tqdm(csv_files, desc="Cargando CSV Kitsune", unit="csv"):
    df_i = robust_read_csv(f)
    df_i = normalize_cols(df_i)

    file_cat = attack_cat_from_path(f)

    # si el csv trae alguna etiqueta interna, la usamos SOLO para detectar benign/ataque
    label_cols = [c for c in df_i.columns if c in ("label","class","attack","anomaly","is_anomaly","malicious")]
    if label_cols:
        lab = df_i[label_cols[0]].astype(str).str.strip().str.lower()
        df_i = df_i.drop(columns=label_cols, errors="ignore")
        is_benign = lab.isin({"0","benign","normal","false","no"})
        cat = np.where(is_benign, "Normal", file_cat)
    else:
        cat = np.array([file_cat] * len(df_i), dtype=object)

    dfs.append(df_i)
    cats.append(cat)

df = pd.concat(dfs, ignore_index=True)
df = normalize_cols(df)

attack_cat_all = np.concatenate(cats, axis=0)
df["attack_cat"] = pd.Series(attack_cat_all, dtype="object")

del dfs, cats
gc.collect()

# ============================================================
# 6) CHECK DURO + LIMPIEZA NUMÉRICA
# ============================================================
df = enforce_attackcat_label(df)

# Limpieza numérica mínima (sin dropna global)
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns.tolist() if c not in ("label",)]
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)

for c in tqdm(num_cols, desc="Imputando numéricas (mediana)", unit="col"):
    if df[c].isna().any():
        med = df[c].median()
        df[c] = df[c].fillna(0.0 if np.isnan(med) else med)

# Drop columnas constantes/vacías
const_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
df = df.drop(columns=const_cols, errors="ignore")

# ============================================================
# 7) REORDENAR COLUMNAS: attack_cat, label PRIMERO
# ============================================================
cols = df.columns.tolist()
cols = ["attack_cat", "label"] + [c for c in cols if c not in ("attack_cat","label")]
df = df[cols]

# ============================================================
# 8) GUARDAR + VERIFICAR
# ============================================================
OUT = "/gdrive/MyDrive/Datasets/KITSUNE_full_clean.csv"
df.to_csv(OUT, index=False, encoding="utf-8")

print("\n[OK] Guardado:", OUT)
print("Shape:", df.shape)

print("\nlabel:")
print(df["label"].astype(str).value_counts(normalize=True).round(3))

print("\nattack_cat (top 15):")
print(df["attack_cat"].value_counts().head(15))

print("\n[CHECK FINAL]")
print("Generic con label=0 →", int(((df.attack_cat=="Generic")&(df.label.astype(int)==0)).sum()))
print("Normal con label=1  →", int(((df.attack_cat=="Normal")&(df.label.astype(int)==1)).sum()))



[INFO] Intentando Kaggle: ernie55ernie/5-tuple-packets-of-kitsune-network-attack-dataset
[OK] ZIP descargado: /content/5-tuple-packets-of-kitsune-network-attack-dataset.zip
[INFO] Descomprimiendo /content/5-tuple-packets-of-kitsune-network-attack-dataset.zip → /content/kitsune
[INFO] CSV encontrados: 9


Cargando CSV Kitsune:   0%|          | 0/9 [00:00<?, ?csv/s]

Imputando numéricas (mediana):   0%|          | 0/3 [00:00<?, ?col/s]


[OK] Guardado: /gdrive/MyDrive/Datasets/KITSUNE_full_clean.csv
Shape: (20438941, 7)

label:
label
0    0.79
1    0.21
Name: proportion, dtype: float64

attack_cat (top 15):
attack_cat
Normal      16144581
DoS          1539294
MitM         1145272
Generic      1025715
Fuzzers       432783
DDoS           85596
PortScan       65700
Name: count, dtype: int64

[CHECK FINAL]
Generic con label=0 → 0
Normal con label=1  → 0
