In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jlvelaalonso","key":"72687d455a314ef5280bf2076c59622b"}'}

In [3]:
!mkdir -p ~/.kaggle

In [4]:
!mv kaggle.json ~/.kaggle/

In [5]:
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# ============================================================
# UNSW-NB15 — Descarga Kaggle + Unificación + Limpieza + Normalización (1 script) 04/01/2026
# Salida: /gdrive/MyDrive/Datasets/UNSW_NB15_full_clean.csv
#   - attack_cat (normalizada a ALLOWED)
#   - label binaria 0/1 en category (derivada SOLO de attack_cat)
#   - attack_cat aparece ANTES que label en el CSV final
# ============================================================

!pip install -q kaggle pandas numpy tqdm

import os
import time
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# ===================== CONFIG =====================
KAGGLE_DATASET = "mrwellsdavid/unsw-nb15"
ZIP_PATH = "/content/unsw-nb15.zip"
EXTRACT_DIR = "/content/unsw-nb15"
OUT_CSV = "/gdrive/MyDrive/Datasets/UNSW_NB15_full_clean.csv"
# ==================================================

# ============================================================
# NORMALIZADOR attack_cat (MISMA TAXONOMÍA QUE KITSUNE/CICIDS)
# ============================================================
ALLOWED = {
    "Normal",
    "Fuzzers","Exploits","DoS","Reconnaissance","Generic","Analysis","Shellcode",
    "Backdoors","DDoS","PortScan","MitM","DoS","DDoS","PortScan","BruteForce","Generic"
}

MAP = {
    "Benign": "Normal",
    "BENIGN": "Normal",
    "normal": "Normal",
    "benign": "Normal",

    # Hemos añadido Reconnaissance a PortScan ya que en UNSW se reconoce como tal
    "Port Scan": "PortScan",
    "Portscan": "PortScan",
    "portscan": "PortScan",
    "PortScan": "PortScan",
    "Reconnaissance":"PortScan",
    "reconnaissance":"PortScan",

    "bruteforce":"BruteForce",
    "BruteForce":"BruteForce",
    "brute force":"BruteForce",
    "Brute Force":"BruteForce",

    "Ddos": "DDoS",
    "ddos": "DDoS",
    "DDoS": "DDoS",

    "Dos": "DoS",
    "dos": "DoS",
    "DoS": "DoS",

    "MITM": "MitM",
    "mitm": "MitM",
    "MitM": "MitM",

    # clave:
    "Attack": "Generic",
    "attack": "Generic",
    "Unknown": "Generic",
    "": "Normal",
    "None": "Normal",
    "nan": "Normal",
    "NaN": "Normal",
}

def normalize_cols(cols):
    return [str(c).strip().replace(" ", "_").replace("/", "_").replace("-", "_").lower() for c in cols]

def normalize_attack_cat_series(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.replace(MAP)
    # normal exacto
    s = s.apply(lambda x: "Normal" if str(x).strip().lower() == "normal" else x)
    # cualquier cosa rara -> Generic (si no es Normal)
    s = s.apply(lambda x: x if x in ALLOWED else ("Normal" if str(x).strip()=="" else "Generic"))
    return s

def enforce_attackcat_label(df: pd.DataFrame) -> pd.DataFrame:
    """
    CHECK DURO ÚNICO:
      - attack_cat == Normal => label=0
      - attack_cat != Normal => label=1
    """
    df = df.copy()
    if "attack_cat" not in df.columns:
        raise ValueError("Falta attack_cat.")
    df["attack_cat"] = normalize_attack_cat_series(df["attack_cat"])
    df["label"] = (df["attack_cat"] != "Normal").astype(int).astype("category")
    return df

# ===================== 1) Descargar y descomprimir =====================
print(f">> Descargando dataset Kaggle: {KAGGLE_DATASET}")
t0 = time.time()
!kaggle datasets download -d $KAGGLE_DATASET -p /content/ -w
print(f"[OK] Descargado en {time.time()-t0:.1f}s")

print(">> Descomprimiendo…")
t1 = time.time()
!unzip -o /content/unsw-nb15.zip -d /content/unsw-nb15 > /dev/null
print(f"[OK] Extraído en {(time.time()-t1)/60:.2f} min → {EXTRACT_DIR}")

# ===================== 2) Localizar partes + features =====================
base = Path(EXTRACT_DIR)
parts = [base / f"UNSW-NB15_{i}.csv" for i in range(1, 5)]
features_path = base / "NUSW-NB15_features.csv"  # suele venir con ese nombre

for p in parts:
    if not p.exists():
        raise FileNotFoundError(f"No existe: {p}")
if not features_path.exists():
    raise FileNotFoundError(f"No existe: {features_path}")

# ===================== 3) Leer diccionario features =====================
t2 = time.time()
feat = pd.read_csv(features_path, encoding="latin1")
name_col = None
for cand in ["Name","name","Feature","feature","Attribute","attribute"]:
    if cand in feat.columns:
        name_col = cand
        break
if name_col is None:
    if feat.shape[1] == 1:
        name_col = feat.columns[0]
    else:
        raise ValueError("No encuentro la columna con nombres en NUSW-NB15_features.csv")

feature_names = feat[name_col].astype(str).str.strip().tolist()
print(f"[OK] Features leídas en {time.time()-t2:.1f}s | n_features={len(feature_names)}")

# ===================== 4) Leer fragmentos con progreso =====================
dfs = []
t3 = time.time()
for p in tqdm(parts, desc="Leyendo fragmentos UNSW (1/4..4/4)", unit="fichero"):
    tmp = pd.read_csv(p, header=None, low_memory=False, encoding="latin1")

    if tmp.shape[1] == len(feature_names):
        tmp.columns = feature_names
    else:
        tmp2 = pd.read_csv(p, header=0, low_memory=False, encoding="latin1")
        if tmp2.shape[1] == len(feature_names):
            tmp2.columns = feature_names
            tmp = tmp2
        else:
            raise ValueError(
                f"Las columnas de {p.name} no coinciden con el diccionario de features: "
                f"{tmp.shape[1]} vs {len(feature_names)}"
            )

    dfs.append(tmp)
print(f"[OK] Fragmentos cargados en {(time.time()-t3)/60:.2f} min")

# ===================== 5) Unir + normalizar columnas =====================
t4 = time.time()
df = pd.concat(dfs, ignore_index=True)
del dfs
df.columns = normalize_cols(df.columns.tolist())
print(f"[OK] Concatenado en {time.time()-t4:.1f}s | shape={df.shape}")

# ===================== 6) Limpieza numérica mínima (sin dropna global) =====================
t5 = time.time()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)
print(f"[OK] inf→NaN en {time.time()-t5:.1f}s | num_cols={len(num_cols)}")

t6 = time.time()
for c in tqdm(num_cols, desc="Imputando numéricas (mediana)", unit="col"):
    if df[c].isna().any():
        med = df[c].median()
        df[c] = df[c].fillna(0.0 if np.isnan(med) else med)
print(f"[OK] Imputación numérica en {(time.time()-t6)/60:.2f} min")

# ===================== 7) Asegurar attack_cat, normalizar a ALLOWED y CHECK DURO =====================
if "label" not in df.columns:
    raise ValueError("No encuentro 'label' en UNSW (revisa features).")

# Si faltase attack_cat (no debería), lo creamos desde label:
if "attack_cat" not in df.columns:
    df["label"] = pd.to_numeric(df["label"], errors="coerce").fillna(0).astype(int).clip(0, 1)
    df["attack_cat"] = np.where(df["label"] == 0, "Normal", "Generic")

# Normalizar y CHECK DURO único
df = enforce_attackcat_label(df)

# ===================== 8) Eliminar IP/puertos (opcional seguro) =====================
for col in ["srcip", "dstip", "sport", "dsport"]:
    if col in df.columns:
        df = df.drop(columns=[col], errors="ignore")

# ===================== 9) Reordenar columnas: attack_cat luego label =====================
cols = df.columns.tolist()
# quitar si existen, y reinsertar al inicio
for k in ["attack_cat", "label"]:
    if k in cols:
        cols.remove(k)
new_cols = ["attack_cat", "label"] + cols
df = df[new_cols]

# ===================== 10) Guardar CSV =====================
t7 = time.time()
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"[OK] Guardado en {time.time()-t7:.1f}s → {OUT_CSV}")
print("Shape final:", df.shape)

# ===================== 11) Verificaciones finales =====================
print("\nColumnas clave presentes:", [c for c in df.columns if c in ["attack_cat","label","proto","service","state"]])

print("\nDistribución label (binaria):")
print(df["label"].astype(str).value_counts(normalize=True).round(3))

print("\nattack_cat (top 15):")
print(df["attack_cat"].value_counts().head(15))

print("\n[VERIFICACIÓN FINAL]")
print("Generic con label=0 →", int(((df["attack_cat"]=="Generic") & (df["label"].astype(int)==0)).sum()))
print("Normal con label=1  →", int(((df["attack_cat"]=="Normal")  & (df["label"].astype(int)==1)).sum()))

#df.head(5)


>> Descargando dataset Kaggle: mrwellsdavid/unsw-nb15
Dataset URL: https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15
License(s): unknown
Downloading unsw-nb15.zip to .
  0% 0.00/149M [00:00<?, ?B/s]
100% 149M/149M [00:00<00:00, 1.60GB/s]
[OK] Descargado en 2.3s
>> Descomprimiendo…
[OK] Extraído en 0.14 min → /content/unsw-nb15
[OK] Features leídas en 0.0s | n_features=49


Leyendo fragmentos UNSW (1/4..4/4):   0%|          | 0/4 [00:00<?, ?fichero/s]

[OK] Fragmentos cargados en 0.30 min
[OK] Concatenado en 0.5s | shape=(2540047, 49)
[OK] inf→NaN en 2.3s | num_cols=40


Imputando numéricas (mediana):   0%|          | 0/40 [00:00<?, ?col/s]

[OK] Imputación numérica en 0.00 min
[OK] Guardado en 58.4s → /gdrive/MyDrive/Datasets/UNSW_NB15_full_clean.csv
Shape final: (2540047, 45)

Columnas clave presentes: ['attack_cat', 'label', 'proto', 'state', 'service']

Distribución label (binaria):
label
0    0.874
1    0.126
Name: proportion, dtype: float64

attack_cat (top 15):
attack_cat
Normal       2218764
Generic       217450
Exploits       44525
Fuzzers        24246
DoS            16353
PortScan       13987
Analysis        2677
Shellcode       1511
Backdoors        534
Name: count, dtype: int64

[VERIFICACIÓN FINAL]
Generic con label=0 → 0
Normal con label=1  → 0
