In [None]:
# --- Colab Setup for netFound Pre-processing (rev-2, 2025-05-28) ---
"""
Single-cell script that prepares a Google Colab runtime for the **netFound**
pre-processing pipeline.

Key points
-----------
* **PcapPlusPlus** – builds from *master* by default (change `PCAPPP_TAG` to a
  release tag like `v24.09` for a reproducible build).  A zip-archive fallback
  is used if *git clone* fails (GitHub error 128, network hiccup, etc.).
* **No duplicated blocks** – previous edits accidentally left two build paths;
  this revision consolidates them.
* **Safe temp cleanup** – avoids `FileNotFoundError` if the unzip directory is
  moved.
* **Helper utilities** – `run()` prints every command; `ensure_dir()` creates
  paths idempotently.
"""

import io
import os
import shutil
import subprocess
import sys
import urllib.request
import zipfile
from pathlib import Path

# ---------------------------------------------------------------------------
# Helper utilities
# ---------------------------------------------------------------------------

def run(cmd, **kw):
    """Log a command, execute it, and fail loudly on a non-zero exit status.

    Args:
        cmd: Sequence of program arguments. Items are passed through ``str``
            for the log line only; the sequence itself is handed to
            ``subprocess.run`` unchanged.
        **kw: Extra keyword arguments forwarded to ``subprocess.run`` (for
            example ``cwd=`` or ``capture_output=True``). ``check`` and
            ``text`` default to ``True`` but may be overridden by the caller.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command, so callers
        that capture output can actually read it.

    Raises:
        subprocess.CalledProcessError: if ``check`` is true (the default) and
            the command exits with a non-zero status.
    """
    # flush so the log line is emitted before anything the child process prints
    print("→", " ".join(map(str, cmd)), flush=True)
    # setdefault (rather than hard-coded keywords) avoids a duplicate-keyword
    # TypeError when a caller passes check= or text= explicitly
    kw.setdefault("check", True)
    kw.setdefault("text", True)
    return subprocess.run(cmd, **kw)


def ensure_dir(path: Path):
    """Create *path* together with any missing parents.

    Nothing happens when something already exists at *path*, so the call can
    be repeated safely.
    """
    if path.exists():
        return
    path.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# Constants (edit here if needed)
# ---------------------------------------------------------------------------
# Archive the user uploads by hand, and the directory it is unpacked into (step 1).
NETFOUND_ZIP_PATH = Path("/content/netFound.zip")
NETFOUND_DIR      = Path("/content/netFound")

# PcapPlusPlus: git ref to build, clone URL, and local checkout path (step 3).
PCAPPP_TAG  = "master"   # use a tag (e.g. "v24.09") for deterministic builds
PCAPPP_REPO = "https://github.com/seladb/PcapPlusPlus.git"
PCAPPP_DIR  = Path("/content/PcapPlusPlus")

# pcap-splitter: clone URL, local checkout path, and the installed binary
# whose presence makes step 4 skip the build.
PCAPSPLITTER_REPO = "https://github.com/shramos/pcap-splitter.git"
PCAPSPLITTER_DIR  = Path("/content/pcap-splitter")
PCAPSPLITTER_BIN  = Path("/usr/local/bin/PcapSplitter")

# System packages installed with apt-get in step 2.
APT_DEPS = ["cmake", "g++", "libpcap-dev", "git", "parallel", "make"]
# Python packages installed in step 6 only when the unpacked repository has no
# requirements.txt of its own.
PY_REQS_FALLBACK = [
    "pyarrow", "pandas", "datasets", "transformers",
    "scikit-learn", "huggingface_hub", "torch", "google-generativeai",
]

# ---------------------------------------------------------------------------
# 0 – Ensure the user uploaded netFound.zip
# ---------------------------------------------------------------------------
print("\n=== 0 / 6  Check for netFound.zip ===")
# The archive has to be uploaded by hand; stop the cell early when it is absent.
if NETFOUND_ZIP_PATH.exists():
    print("Found", NETFOUND_ZIP_PATH)
else:
    print("[USER ACTION]  Please upload 'netFound.zip' to /content and rerun.")
    sys.exit(1)

# ---------------------------------------------------------------------------
# 1 – Unzip netFound repository
# ---------------------------------------------------------------------------
print("\n=== 1 / 6  Unzipping netFound ===")
temp_dir = Path("/content/_nf_unzip")

# Clear leftovers from an earlier run: the old checkout first, then the
# scratch directory the archive is unpacked into.
for leftover in (NETFOUND_DIR, temp_dir):
    if leftover.exists():
        shutil.rmtree(leftover)
ensure_dir(temp_dir)
run(["unzip", "-qq", str(NETFOUND_ZIP_PATH), "-d", str(temp_dir)])

# A single top-level entry is taken to be the repository root; with any other
# count the whole scratch directory becomes the repository.
items = list(temp_dir.iterdir())
if len(items) == 1:
    repo_root = items[0]
else:
    repo_root = temp_dir
shutil.move(str(repo_root), str(NETFOUND_DIR))

# The scratch directory no longer exists when it was itself moved above.
if temp_dir.exists():
    shutil.rmtree(temp_dir)
print("Unzipped into", NETFOUND_DIR)

# ---------------------------------------------------------------------------
# 2 – Install system dependencies
# ---------------------------------------------------------------------------
print("\n=== 2 / 6  Installing apt dependencies ===")
# Refresh the package index first, then install everything in one transaction.
for apt_args in (["update", "-qq"], ["install", "-y", "-qq", *APT_DEPS]):
    run(["apt-get", *apt_args])
print("System packages installed.")

# ---------------------------------------------------------------------------
# 3 – Clone + build PcapPlusPlus (with zip fallback)
# ---------------------------------------------------------------------------
print(f"\n=== 3 / 6  Building PcapPlusPlus ({PCAPPP_TAG}) ===")
# Remove any checkout from a previous run so the clone (or the zip fallback's
# rename) lands in a fresh directory.
if PCAPPP_DIR.exists():
    shutil.rmtree(PCAPPP_DIR)


def download_pcappp_zip(ref: str):
    """Download a GitHub zip archive of PcapPlusPlus and unpack it to ``PCAPPP_DIR``.

    Args:
        ref: Branch or tag name. ``"master"`` is fetched as a branch archive;
            every other value is treated as a tag.

    Raises:
        urllib.error.URLError: if the archive cannot be downloaded.
        RuntimeError: if the archive does not contain exactly one top-level
            directory.
    """
    kind = "heads" if ref == "master" else "tags"
    url = f"https://github.com/seladb/PcapPlusPlus/archive/refs/{kind}/{ref}.zip"
    print("[fallback] downloading", url)
    # a timeout keeps a stalled connection from hanging the notebook cell forever
    with urllib.request.urlopen(url, timeout=120) as rsp:
        data = rsp.read()

    dest_parent = PCAPPP_DIR.parent
    # A failed clone may have left a partial checkout behind; the final rename
    # needs the destination to be absent.
    if PCAPPP_DIR.exists():
        shutil.rmtree(PCAPPP_DIR)

    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        # GitHub picks the archive's top-level folder name and it is not always
        # "PcapPlusPlus-<ref>" verbatim (a leading "v" of a version tag is
        # typically dropped), so read the name from the archive instead of
        # guessing it.
        roots = {name.split("/", 1)[0] for name in zf.namelist() if name.strip("/")}
        if len(roots) != 1:
            raise RuntimeError(
                f"Unexpected archive layout for {url}: top-level entries {sorted(roots)}"
            )
        zf.extractall(dest_parent)

    extracted = dest_parent / roots.pop()
    if extracted != PCAPPP_DIR:
        extracted.rename(PCAPPP_DIR)

# Shallow-clone the requested ref; "master" needs no explicit --branch.
clone_cmd = ["git", "clone", "--depth", "1"]
if PCAPPP_TAG != "master":
    clone_cmd += ["--branch", PCAPPP_TAG]
clone_cmd += [PCAPPP_REPO, str(PCAPPP_DIR)]

try:
    run(clone_cmd)
except subprocess.CalledProcessError as clone_err:
    print(f"[WARN] git clone failed (exit {clone_err.returncode}); falling back to zip …")
    download_pcappp_zip(PCAPPP_TAG)

# Configure, compile and install from inside the checkout; the working
# directory stays there for the commands that follow.
os.chdir(PCAPPP_DIR)
for cmake_step in (
    ["cmake", "-S", ".", "-B", "build", "-DCMAKE_BUILD_TYPE=Release"],
    ["cmake", "--build", "build", "-j", "2"],
    ["sudo", "cmake", "--install", "build"],
):
    run(cmake_step)
print("PcapPlusPlus installed in /usr/local")

# ---------------------------------------------------------------------------
# 4 – Build + install PcapSplitter
# ---------------------------------------------------------------------------
print("\n=== 4 / 6  Building PcapSplitter ===")
if PCAPSPLITTER_BIN.exists():
    print("PcapSplitter already present – skipping build.")
else:
    # Build from a clean checkout, then copy the binary to its install path.
    if PCAPSPLITTER_DIR.exists():
        shutil.rmtree(PCAPSPLITTER_DIR)
    run(["git", "clone", PCAPSPLITTER_REPO, str(PCAPSPLITTER_DIR)])
    os.chdir(PCAPSPLITTER_DIR)
    run(["make", "-j", "2"])
    installed_bin = str(PCAPSPLITTER_BIN)
    run(["sudo", "cp", "pcapsplitter", installed_bin])
    run(["sudo", "chmod", "+x", installed_bin])
    print("PcapSplitter installed.")

# ---------------------------------------------------------------------------
# 5 – Compile netFound C++ helpers
# ---------------------------------------------------------------------------
print("\n=== 5 / 6  Building netFound helpers ===")
cpp_src   = NETFOUND_DIR / "src/pre_process/packets_processing_src"
cpp_build = cpp_src / "build"

# Look for the sources *before* creating the build directory: creating it with
# parents=True when the sources are absent would fabricate an empty source tree
# and make later "does the source directory exist?" checks pass by accident.
if not (cpp_src / "CMakeLists.txt").exists():
    print("[WARN] CMakeLists.txt missing – helpers skipped.")
else:
    ensure_dir(cpp_build)
    # cmake/make below rely on the current directory being the build directory
    os.chdir(cpp_build)
    run(["cmake", ".."])
    run(["make", "-j", "2"])
    # The compiled tools are copied next to the pre-processing scripts.
    helpers_out = NETFOUND_DIR / "src/pre_process"
    for tool in ("1_filter", "3_field_extraction"):
        built = cpp_build / tool
        if built.exists():
            shutil.copy2(built, helpers_out / tool)
            (helpers_out / tool).chmod(0o755)
            print("  •", tool, "built →", helpers_out / tool)
        else:
            print("  • [WARN]", tool, "missing in build output")

# ---------------------------------------------------------------------------
# 6 – Install Python dependencies
# ---------------------------------------------------------------------------
print("\n=== 6 / 6  Installing Python deps ===")
req = NETFOUND_DIR / "requirements.txt"
# Prefer the repository's own requirements file; fall back to the built-in list.
pip_install = [sys.executable, "-m", "pip", "install", "-q"]
if req.exists():
    pip_install += ["-r", str(req)]
else:
    pip_install += PY_REQS_FALLBACK
run(pip_install)
print("Python packages installed.")

print("\n=== Setup complete!  Current WD:", os.getcwd(), "===")
print("Upload PCAPs to 'attack_data/raw/{0,1}' and continue with preprocessing.")


=== 0 / 6  Check for netFound.zip ===
Found /content/netFound.zip

=== 1 / 6  Unzipping netFound ===
→ unzip -qq /content/netFound.zip -d /content/_nf_unzip
Unzipped into /content/netFound

=== 2 / 6  Installing apt dependencies ===
→ apt-get update -qq
→ apt-get install -y -qq cmake g++ libpcap-dev git parallel make
System packages installed.

=== 3 / 6  Building PcapPlusPlus (master) ===
→ git clone --depth 1 https://github.com/seladb/PcapPlusPlus.git /content/PcapPlusPlus
→ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
→ cmake --build build -j 2
→ sudo cmake --install build
PcapPlusPlus installed in /usr/local

=== 4 / 6  Building PcapSplitter ===
PcapSplitter already present – skipping build.

=== 5 / 6  Building netFound helpers ===
[WARN] CMakeLists.txt missing – helpers skipped.

=== 6 / 6  Installing Python deps ===
→ /usr/bin/python3 -m pip install -q pyarrow pandas datasets transformers scikit-learn huggingface_hub torch google-generativeai
Python packages installed.

=== S

In [None]:
# --- Cell 2 : Upload netFound scripts, PCAP data & configs (generic file-names) ---

import os
import glob

print("🔄  User action: upload your data & scripts to /content")
print("   •  Any *.pcap file name is accepted – no specific pattern required.\n")

# --------------------------------------------------------------------
# 1. Directory layout: every folder below is created empty when the user
#    has not uploaded it yet.
# --------------------------------------------------------------------
ATTACK_DIR = "/content/attack_data/raw/0"
BENIGN_DIR = "/content/attack_data/raw/1"
NF_SRC_DIR = "/content/netFound/src/pre_process"
NF_CFG_DIR = "/content/netFound/configs"

for required_dir in [ATTACK_DIR, BENIGN_DIR, NF_SRC_DIR, NF_CFG_DIR]:
    os.makedirs(required_dir, exist_ok=True)

# --------------------------------------------------------------------
# 2. Quick inventory of uploaded PCAPs
# --------------------------------------------------------------------
def count_pcaps(path):
    """Return how many entries directly inside *path* match ``*.pcap`` or ``*.PCAP``."""
    total = 0
    for pattern in ("*.pcap", "*.PCAP"):
        total += len(glob.glob(os.path.join(path, pattern)))
    return total

for pcap_dir in (ATTACK_DIR, BENIGN_DIR):
    print(f"📂 {pcap_dir}  →  {count_pcaps(pcap_dir)} PCAP files detected")

# --------------------------------------------------------------------
# 3. Placeholder tokenizer config, written only when none exists yet
# --------------------------------------------------------------------
dummy_cfg = os.path.join(NF_CFG_DIR, "TestFinetuningConfig.json")
placeholder_json = (
    '{"IPFields": [], "TCPFields": [], "UDPFields": [], '
    '"ICMPFields": [], "Payload": [], "internalIPs": ["127.0.0.1/8"]}'
)
if not os.path.exists(dummy_cfg):
    with open(dummy_cfg, "w") as fh:
        fh.write(placeholder_json)
    print(f"📝  Created placeholder tokenizer config at {dummy_cfg}")

print("\n✅  Directory structure ready. Proceed to the next cell to compile the C++ tools.")


🔄  User action: upload your data & scripts to /content
   •  Any *.pcap file name is accepted – no specific pattern required.

📂 /content/attack_data/raw/0  →  10 PCAP files detected
📂 /content/attack_data/raw/1  →  10 PCAP files detected
📝  Created placeholder tokenizer config at /content/netFound/configs/TestFinetuningConfig.json

✅  Directory structure ready. Proceed to the next cell to compile the C++ tools.


In [None]:

# --- Cell 3: Compile netFound's C++ Preprocessing Tools ---
# This cell assumes you have uploaded the netFound/src/pre_process/packets_processing_src/ directory.

import os
import subprocess

# Paths for the C++ helper build. The repository root used here is the nested
# "/content/netFound/netFound" directory.
NETFOUND_DIR_COLAB = "/content/netFound/netFound"
CPP_SRC_DIR_COLAB = os.path.join(NETFOUND_DIR_COLAB, "src/pre_process/packets_processing_src")
CPP_BUILD_DIR_COLAB = os.path.join(CPP_SRC_DIR_COLAB, "build")
PRE_PROCESS_SCRIPTS_DIR_COLAB = os.path.join(NETFOUND_DIR_COLAB, "src/pre_process")

if not os.path.exists(CPP_SRC_DIR_COLAB):
    print(f"ERROR: Directory '{CPP_SRC_DIR_COLAB}' not found. Please upload it from your netFound clone.")
else:
    print(f"Compiling netFound C++ tools in {CPP_BUILD_DIR_COLAB}...")
    os.makedirs(CPP_BUILD_DIR_COLAB, exist_ok=True)

    # cmake/make run relative to the build directory; the previous working
    # directory is restored in the `finally` clause below.
    original_cwd = os.getcwd()
    os.chdir(CPP_BUILD_DIR_COLAB)

    # Stays False unless both cmake and make finish without raising.
    cmake_success = False
    try:
        # Ensure CMakeLists.txt is present (path is relative to the build dir)
        if not os.path.exists("../CMakeLists.txt"):
             print(f"ERROR: CMakeLists.txt not found in {CPP_SRC_DIR_COLAB}. Cannot compile C++ tools.")
        else:
            subprocess.run(["cmake", ".."], check=True, capture_output=True)
            # NOTE(review): make's output is not captured, so a make failure
            # reports "N/A" for STDOUT/STDERR in the handler below.
            subprocess.run(["make", "-j", "2", "-s"], check=True) # -s for silent
            cmake_success = True
    except subprocess.CalledProcessError as e:
        print(f"ERROR during C++ tools compilation (cmake or make): {e}")
        # Captured output is bytes here because text mode was not requested.
        print("STDOUT:", e.stdout.decode() if e.stdout else "N/A")
        print("STDERR:", e.stderr.decode() if e.stderr else "N/A")
    finally:
        os.chdir(original_cwd)

    if cmake_success:
        # Copy each expected binary next to the pre-processing scripts and
        # mark it executable; a missing binary is reported but not fatal.
        print("Copying compiled tools to src/pre_process/...")
        for tool_name in ["1_filter", "3_field_extraction"]:
            source_path = os.path.join(CPP_BUILD_DIR_COLAB, tool_name)
            dest_path = os.path.join(PRE_PROCESS_SCRIPTS_DIR_COLAB, tool_name)
            if os.path.exists(source_path):
                subprocess.run(["cp", source_path, dest_path], check=True)
                subprocess.run(["chmod", "+x", dest_path], check=True)
                print(f"  Copied and set +x for {dest_path}")
            else:
                print(f"  ERROR: Compiled tool '{tool_name}' not found at {source_path}.")
        print("netFound C++ tools compiled and copied.")
    else:
        print("C++ tools compilation failed. Preprocessing scripts may not run correctly.")

# NOTE(review): this line is printed unconditionally, including after the
# error paths above.
print("\nSetup for C++ tools complete. Proceed to run the preprocessing pipeline.")


Compiling netFound C++ tools in /content/netFound/netFound/src/pre_process/packets_processing_src/build...
Copying compiled tools to src/pre_process/...
  Copied and set +x for /content/netFound/netFound/src/pre_process/1_filter
  Copied and set +x for /content/netFound/netFound/src/pre_process/3_field_extraction
netFound C++ tools compiled and copied.

Setup for C++ tools complete. Proceed to run the preprocessing pipeline.


In [None]:

# --- Cell 4: Run Local-Style Preprocessing Script in Colab ---
# This cell executes the bash script logic from your "Part 1" directly in Colab.
# It assumes Cells 1, 2, and 3 have completed successfully.

import subprocess
import os
import shutil

print("Starting local-style netFound PCAP preprocessing pipeline in Colab...")

# --- Configuration (every path is absolute under /content) ---
NETFOUND_REPO_DIR_COLAB = "/content/netFound"
INPUT_DATA_ROOT_FOLDER_COLAB = "/content/attack_data"  # holds raw/0/*.pcap and raw/1/*.pcap
TOKENIZER_JSON_CONFIG_COLAB = os.path.join(NETFOUND_REPO_DIR_COLAB, "configs/TestFinetuningConfig.json")
FINAL_ARROW_OUTPUT_BASE_COLAB = os.path.join(INPUT_DATA_ROOT_FOLDER_COLAB, "final", "arrow_per_pcap")

# All pipeline scripts sit in the same pre_process folder of the nested checkout.
_pre_process_dir = os.path.join(NETFOUND_REPO_DIR_COLAB, "netFound/src/pre_process")
FILTER_SCRIPT_COLAB = os.path.join(_pre_process_dir, "1_filter.sh")
SPLIT_SCRIPT_COLAB = os.path.join(_pre_process_dir, "2_pcap_splitting.sh")
EXTRACT_SCRIPT_COLAB = os.path.join(_pre_process_dir, "3_extract_fields.sh")
TOKENIZE_PY_SCRIPT_COLAB = os.path.join(_pre_process_dir, "Tokenize.py")
COLLECT_PY_SCRIPT_COLAB = os.path.join(_pre_process_dir, "CollectTokensInFiles.py")
TCP_OPTIONS_FLAG_COLAB = "0"

# Mark the shell scripts executable; a missing script only produces a warning.
for script_path in (FILTER_SCRIPT_COLAB, SPLIT_SCRIPT_COLAB, EXTRACT_SCRIPT_COLAB):
    if not os.path.exists(script_path):
        print(f"Warning: Script {script_path} not found!")
        continue
    subprocess.run(["chmod", "+x", script_path], check=True)


# --- Echo the effective configuration before starting ---
for caption, value in (
    ("netFound Repository in Colab", NETFOUND_REPO_DIR_COLAB),
    ("Input Data Root in Colab", INPUT_DATA_ROOT_FOLDER_COLAB),
    ("Tokenizer Config", TOKENIZER_JSON_CONFIG_COLAB),
    ("Final Arrow Output Base", FINAL_ARROW_OUTPUT_BASE_COLAB),
):
    print(f"{caption}: {value}")

def _run_stage(cmd, what, indent="    "):
    """Run one pipeline command; print its stderr and return False if it failed."""
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"{indent}Error in {what}: {result.stderr}")
        return False
    return True


def _has_pcaps(directory):
    """Return True if *directory* is a directory holding a ``.pcap`` file (any letter case)."""
    return os.path.isdir(directory) and any(
        name.lower().endswith(".pcap") for name in os.listdir(directory)
    )


for label_dir_name_colab in ["0", "1"]:
    print(f"\n--- Processing Label: {label_dir_name_colab} ---")

    RAW_PCAP_INPUT_DIR_C = os.path.join(INPUT_DATA_ROOT_FOLDER_COLAB, "raw", label_dir_name_colab)
    FILTERED_PCAP_DIR_C = os.path.join(INPUT_DATA_ROOT_FOLDER_COLAB, "filtered", label_dir_name_colab)
    SPLIT_FLOWS_DIR_C = os.path.join(INPUT_DATA_ROOT_FOLDER_COLAB, "split", label_dir_name_colab)
    EXTRACTED_FIELDS_DIR_C = os.path.join(INPUT_DATA_ROOT_FOLDER_COLAB, "extracted", label_dir_name_colab)
    FINAL_ARROW_LABEL_DIR_C = os.path.join(FINAL_ARROW_OUTPUT_BASE_COLAB, label_dir_name_colab)

    for stage_dir in (FILTERED_PCAP_DIR_C, SPLIT_FLOWS_DIR_C,
                      EXTRACTED_FIELDS_DIR_C, FINAL_ARROW_LABEL_DIR_C):
        os.makedirs(stage_dir, exist_ok=True)

    # Case-insensitive so that *.PCAP uploads (which the inventory cell counts)
    # are not silently skipped here.
    if not _has_pcaps(RAW_PCAP_INPUT_DIR_C):
        print(f"  No PCAP files found in {RAW_PCAP_INPUT_DIR_C}. Skipping label {label_dir_name_colab}.")
        continue

    # Stages 1-3 are best-effort: a failure is reported and the pipeline moves
    # on, so partial results from earlier stages can still be tokenized.
    print(f"  [1/4] Filtering PCAPs from {RAW_PCAP_INPUT_DIR_C} to {FILTERED_PCAP_DIR_C}...")
    cmd_filter = ["bash", FILTER_SCRIPT_COLAB, RAW_PCAP_INPUT_DIR_C, FILTERED_PCAP_DIR_C]
    _run_stage(cmd_filter, "1_filter.sh")

    print(f"  [2/4] Splitting PCAPs from {FILTERED_PCAP_DIR_C} into flows in {SPLIT_FLOWS_DIR_C}...")
    cmd_split = ["bash", SPLIT_SCRIPT_COLAB, FILTERED_PCAP_DIR_C, SPLIT_FLOWS_DIR_C]
    _run_stage(cmd_split, "2_pcap_splitting.sh")

    print(f"  [3/4] Extracting fields from flows in {SPLIT_FLOWS_DIR_C} to {EXTRACTED_FIELDS_DIR_C}...")
    cmd_extract = ["bash", EXTRACT_SCRIPT_COLAB, SPLIT_FLOWS_DIR_C, EXTRACTED_FIELDS_DIR_C, TCP_OPTIONS_FLAG_COLAB]
    _run_stage(cmd_extract, "3_extract_fields.sh")

    print(f"  [4/4] Tokenizing extracted fields to Arrow format in {FINAL_ARROW_LABEL_DIR_C}...")
    if not os.path.exists(EXTRACTED_FIELDS_DIR_C) or not os.listdir(EXTRACTED_FIELDS_DIR_C):
        print(f"    No subfolders found in {EXTRACTED_FIELDS_DIR_C} to tokenize. Check previous steps.")
        continue

    for pcap_subfolder_name_colab in os.listdir(EXTRACTED_FIELDS_DIR_C):
        pcap_subfolder_path_colab = os.path.join(EXTRACTED_FIELDS_DIR_C, pcap_subfolder_name_colab)
        if not os.path.isdir(pcap_subfolder_path_colab):
            continue

        TEMP_SHARD_DIR_C = os.path.join(FINAL_ARROW_LABEL_DIR_C, f"{pcap_subfolder_name_colab}_shards")
        os.makedirs(TEMP_SHARD_DIR_C, exist_ok=True)

        print(f"    Tokenizing flows from {pcap_subfolder_path_colab} (ID: '{pcap_subfolder_name_colab}')...")
        cmd_tokenize = ["python3", TOKENIZE_PY_SCRIPT_COLAB,
                        "--conf_file", TOKENIZER_JSON_CONFIG_COLAB,
                        "--input_dir", pcap_subfolder_path_colab,
                        "--output_dir", TEMP_SHARD_DIR_C,
                        "--label", pcap_subfolder_name_colab,  # pcap folder name is stored as the label
                        "--cores", "1"]
        tokenized_ok = _run_stage(
            cmd_tokenize, f"Tokenize.py for {pcap_subfolder_name_colab}", indent="      ")

        COMBINED_ARROW_FILE_C = os.path.join(FINAL_ARROW_LABEL_DIR_C, f"{pcap_subfolder_name_colab}.arrow")
        print(f"    Combining shards for '{pcap_subfolder_name_colab}' into {COMBINED_ARROW_FILE_C}...")
        cmd_collect = ["python3", COLLECT_PY_SCRIPT_COLAB, TEMP_SHARD_DIR_C, COMBINED_ARROW_FILE_C]
        collected_ok = _run_stage(
            cmd_collect, f"CollectTokensInFiles.py for {pcap_subfolder_name_colab}", indent="      ")

        # Only discard the intermediate shards (and report success) when both
        # steps succeeded; otherwise keep them so the failure can be inspected
        # or the collection step re-run.
        if not (tokenized_ok and collected_ok):
            print(f"    '{pcap_subfolder_name_colab}' did not complete; shards kept in {TEMP_SHARD_DIR_C}")
            continue

        if os.path.exists(TEMP_SHARD_DIR_C):
            shutil.rmtree(TEMP_SHARD_DIR_C)
        print(f"    Done with '{pcap_subfolder_name_colab}'. Arrow file at: {COMBINED_ARROW_FILE_C}")

print("\n--- Colab Preprocessing Finished ---")
print(f"Final Arrow files should be in: {FINAL_ARROW_OUTPUT_BASE_COLAB}/<label_dir_name>/<original_pcap_basename>.arrow")
print("The 'labels' column in these Arrow files contains the <original_pcap_basename> (e.g., 'patator-multi-cloud-attack-57497_attack').")
print("You can now download this directory or use it directly in Colab for Part 2 (Training Concept Mapping).")


Starting local-style netFound PCAP preprocessing pipeline in Colab...
netFound Repository in Colab: /content/netFound
Input Data Root in Colab: /content/attack_data
Tokenizer Config: /content/netFound/configs/TestFinetuningConfig.json
Final Arrow Output Base: /content/attack_data/final/arrow_per_pcap

--- Processing Label: 0 ---
  [1/4] Filtering PCAPs from /content/attack_data/raw/0 to /content/attack_data/filtered/0...
  [2/4] Splitting PCAPs from /content/attack_data/filtered/0 into flows in /content/attack_data/split/0...
  [3/4] Extracting fields from flows in /content/attack_data/split/0 to /content/attack_data/extracted/0...
  [4/4] Tokenizing extracted fields to Arrow format in /content/attack_data/final/arrow_per_pcap/0...
    Tokenizing flows from /content/attack_data/extracted/0/patator-multi-cloud-attack-78806 (ID: 'patator-multi-cloud-attack-78806')...
    Combining shards for 'patator-multi-cloud-attack-78806' into /content/attack_data/final/arrow_per_pcap/0/patator-multi

In [None]:
# --- Cell 1 (Part 2): Setup for Training - Uploads & Path Verification (Revised for datasets) ---
import os
import sys
import subprocess

print("--- Pip Installing/Updating Packages ---")
# NOTE(review): the pinned install below is commented out, so this cell does
# not install or upgrade anything; it only prints the two status lines.
# Re-enable the call to pin transformers/datasets to the listed versions.
# subprocess.run(["pip", "install", "-q", "-U",
#                 "transformers==4.38.2",  # A recent stable version
#                 "datasets==2.18.0",    # A recent stable version, known to have from_arrow
#                 "torch",
#                 "numpy",
#                 "scikit-learn",
#                 "huggingface_hub",
#                 "pyarrow"], check=True)
print("Python packages (transformers, datasets, torch, etc.) ensured.")


--- Pip Installing/Updating Packages ---
Python packages (transformers, datasets, torch, etc.) ensured.


In [None]:

print("\n--- User Action Required: Upload Files (if not already present) ---")

# Locations that the checks below actually look at.
CONCEPTS_FILE_PATH_PT2 = "/content/concepts.txt"
BINNED_CONCEPT_SCORES_FILE_PATH_PT2 = "/content/concept_scores.txt"
ARROW_DATA_BASE_DIR_PT2 = "/content/attack_data/final/arrow_per_pcap"
NETFOUND_TRAIN_SRC_DIR_PT2 = "/content/netFound/netFound/src/train"
UPLOADED_CONFIG_JSON_PATH_PT2 = "/content/config.json"

# The instructions are rendered from the constants above so the paths shown to
# the user can never drift from the paths that are verified (they previously
# named /content/arrow_per_pcap/ and /content/netFound/src/train/).
print(f"1. Your 'concepts.txt' file to {CONCEPTS_FILE_PATH_PT2}.")
print(f"2. Your 'concept_scores.txt' file to {BINNED_CONCEPT_SCORES_FILE_PATH_PT2}.")
print(f"3. The 'arrow_per_pcap/' directory to {ARROW_DATA_BASE_DIR_PT2}/ (containing 0/*.arrow and 1/*.arrow files).")
print(f"4. The 'src/train/' directory from your 'netFound' clone to {NETFOUND_TRAIN_SRC_DIR_PT2}/.")
print(f"5. Your 'config.json' to {UPLOADED_CONFIG_JSON_PATH_PT2}")

# Basic checks: (is it present?, message to print when it is not)
_upload_checks = [
    (os.path.exists(CONCEPTS_FILE_PATH_PT2),
     f"ERROR: '{CONCEPTS_FILE_PATH_PT2}' not found."),
    (os.path.exists(BINNED_CONCEPT_SCORES_FILE_PATH_PT2),
     f"ERROR: '{BINNED_CONCEPT_SCORES_FILE_PATH_PT2}' not found."),
    (os.path.isdir(ARROW_DATA_BASE_DIR_PT2),
     f"ERROR: Arrow data directory '{ARROW_DATA_BASE_DIR_PT2}' not found."),
    (os.path.isdir(NETFOUND_TRAIN_SRC_DIR_PT2),
     f"ERROR: netFound src/train directory '{NETFOUND_TRAIN_SRC_DIR_PT2}' not found."),
    (os.path.exists(UPLOADED_CONFIG_JSON_PATH_PT2),
     f"ERROR: '{UPLOADED_CONFIG_JSON_PATH_PT2}' not found."),
]
for _present, _complaint in _upload_checks:
    if not _present:
        print(_complaint)

# Make both src/train and its parent src/ importable; the parent is inserted
# last and therefore ends up first on sys.path.
netfound_src_dir = os.path.dirname(NETFOUND_TRAIN_SRC_DIR_PT2)  # /content/netFound/netFound/src
for _import_root in (NETFOUND_TRAIN_SRC_DIR_PT2, netfound_src_dir):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

print("\nSetup check complete. If no ERRORs, proceed to the next cell for training.")


--- User Action Required: Upload Files (if not already present) ---
1. Your 'concepts.txt' file to /content/concepts.txt.
2. Your 'concept_scores.txt' file to /content/concept_scores.txt.
3. The 'arrow_per_pcap/' directory to /content/arrow_per_pcap/ (containing 0/*.arrow and 1/*.arrow files).
4. The 'src/train/' directory from your 'netFound' clone to /content/netFound/src/train/.
5. Your 'config.json' to /content/config.json
ERROR: '/content/config.json' not found.

Setup check complete. If no ERRORs, proceed to the next cell for training.


In [None]:
!pip install --upgrade --quiet datasets


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0m

In [None]:
# --- Cell 2 (Part 2): Training the Concept Mapping Model (Robust Arrow Reading + Pandas Fallback) ---

import os
import sys
import re
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset as TorchDataset, DataLoader
import pyarrow as pa
import pyarrow.ipc as ipc
import pyarrow.parquet as pq
import pandas as pd
from datasets import Dataset as HFDataset, concatenate_datasets
from huggingface_hub import hf_hub_download

# 1) Patch sys.path to include netFound source
TRAIN_SRC = "/content/netFound/netFound/src/train"
sys.path.insert(0, TRAIN_SRC)
from NetFoundModels    import NetFoundBase, AttentivePooling
from NetfoundConfig    import NetfoundConfig
from NetfoundTokenizer import NetFoundTokenizer

# 2) Configuration
# Hugging Face repo id from which "pytorch_model.bin" is downloaded below.
HF_MODEL            = "snlucsb/netFound-640M-base"
# Text file read as one concept name per non-empty line.
CONCEPTS_FILE       = "/content/concepts.txt"
# Text file with per-file binned concept scores ("File: packet_<id>.txt" blocks).
SCORES_FILE         = "/content/concept_scores.txt"
# JSON used to build the NetfoundConfig (missing keys are filled with defaults).
CONFIG_JSON         = "/content/config.json"
# Root that holds the 0/ and 1/ folders of .arrow files.
ARROW_DATA_DIR      = "/content/attack_data/final/arrow_per_pcap"
# NOTE(review): presumably the Arrow column that carries the pcap id; its use
# is not visible in this part of the file — confirm.
ID_FIELD            = "labels"
# Training hyper-parameters (consumed further down, outside this view).
BATCH_SIZE          = 4
LR                  = 5e-5
EPOCHS              = 25
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE              = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 3) Load base concepts
with open(CONCEPTS_FILE, 'r', encoding='utf-8') as f:
    base_concepts = [l.strip() for l in f if l.strip()]
concept2idx = {c: i for i, c in enumerate(base_concepts)}
NUM_CONCEPTS = len(base_concepts)

# 4) Parse binned scores
# A "File: packet_<id>.txt" line opens a new record; the following
# '- "<concept>" ...: <bin>' lines belong to it until the next header.
_FILE_HEADER_RE = re.compile(r"File:\s*packet_(.+?)\.txt")
_SCORE_LINE_RE = re.compile(r'-\s*"(.*?)".*:\s*(\d+)')


def _scores_to_tensor(pairs, pcap_id):
    """Turn ``[(concept_name, bin), ...]`` into a dense long tensor of bins.

    Concepts not mentioned in *pairs* keep bin 0.

    Raises:
        KeyError: if a concept name is not listed in the concepts file; the
            message names the offending concept and record.
    """
    scores = torch.zeros(NUM_CONCEPTS, dtype=torch.long)
    for cname, b in pairs:
        if cname not in concept2idx:
            raise KeyError(
                f"Concept {cname!r} (record {pcap_id!r} in {SCORES_FILE}) "
                f"is not listed in {CONCEPTS_FILE}"
            )
        scores[concept2idx[cname]] = b
    return scores


binned_scores = {}
cur_id, buf = None, []
with open(SCORES_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        m = _FILE_HEADER_RE.match(line)
        if m:
            # A new header closes the previous record; records that collected
            # no score lines are dropped.
            if cur_id and buf:
                binned_scores[cur_id] = _scores_to_tensor(buf, cur_id)
            cur_id, buf = m.group(1), []
            continue
        m2 = _SCORE_LINE_RE.match(line)
        if m2 and cur_id:
            buf.append((m2.group(1).strip(), int(m2.group(2))))
# Flush the last record, which has no following header to close it.
if cur_id and buf:
    binned_scores[cur_id] = _scores_to_tensor(buf, cur_id)
assert binned_scores, "No binned scores parsed!"

# 5) Load NetfoundConfig
with open(CONFIG_JSON, 'r') as cfg_file:
    cfgd = json.load(cfg_file)

# Values used for any key the uploaded JSON does not provide.
defaults = dict(
    vocab_size=65539,
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=3072,
    max_bursts=12,
    max_burst_length=109,
    metaFeatures=4,
    pad_token_id=NetFoundTokenizer.PAD_TOKEN,
)
for key in defaults:
    if key not in cfgd:
        cfgd[key] = defaults[key]

# Without an explicit limit, the maximum length is bursts x tokens-per-burst.
if "model_max_length" not in cfgd:
    cfgd["model_max_length"] = cfgd["max_bursts"] * cfgd["max_burst_length"]
net_cfg = NetfoundConfig(**cfgd)

# 6) Instantiate & freeze NetFoundBase
net_base = NetFoundBase(config=net_cfg)
try:
    path = hf_hub_download(repo_id=HF_MODEL, filename="pytorch_model.bin")
    # NOTE(security): torch.load unpickles the checkpoint, which can execute
    # code from the downloaded file. Only use trusted repositories; consider
    # weights_only=True if the installed torch version supports it.
    sd = torch.load(path, map_location="cpu")

    # Keep the weights stored under "base_transformer." and drop that prefix.
    # Slicing (instead of str.replace) removes only the leading occurrence.
    prefix = "base_transformer."
    adapted = {k[len(prefix):]: v for k, v in sd.items() if k.startswith(prefix)}
    if not adapted:
        # No prefixed keys: fall back to the keys the model itself knows.
        # state_dict() is built once rather than once per checkpoint key.
        known_keys = net_base.state_dict()
        adapted = {k: v for k, v in sd.items() if k in known_keys}
    net_base.load_state_dict(adapted, strict=False)
except Exception as exc:
    # Best-effort: training may continue from random weights, but say why the
    # download/load failed instead of hiding it. A bare `except:` would also
    # have swallowed KeyboardInterrupt and SystemExit.
    print("⚠️ HF weights not loaded, using random init.")
    print(f"   Reason: {type(exc).__name__}: {exc}")

# Freeze the backbone: no gradients, evaluation mode.
for p in net_base.parameters():
    p.requires_grad = False
net_base.to(DEVICE).eval()

pooler = AttentivePooling(config=net_cfg).to(DEVICE)



class ConceptMapper(nn.Module):
    """Shared MLP trunk followed by one linear classification head per concept.

    For a 2-D input of shape ``(batch, hid)`` the forward pass returns logits
    of shape ``(batch, n_concepts, n_bins)``.
    """

    def __init__(self, hid, n_concepts, n_bins):
        super().__init__()
        trunk_width = hid // 2
        # Trunk is built before the heads; attribute names are part of the
        # checkpoint key layout ("fc.*", "heads.*").
        self.fc = nn.Sequential(
            nn.Linear(hid, trunk_width),
            nn.ReLU(),
            nn.Dropout(0.25),
        )
        head_layers = []
        for _ in range(n_concepts):
            head_layers.append(nn.Linear(trunk_width, n_bins))
        self.heads = nn.ModuleList(head_layers)

    def forward(self, x):
        shared = self.fc(x)
        per_concept = [head(shared) for head in self.heads]
        # Stack along a new axis 1 so each concept gets its own slice of logits.
        return torch.stack(per_concept, dim=1)

# Bins are numbered from 0, so the bin count is the largest observed bin plus one.
highest_bin = max(scores.max().item() for scores in binned_scores.values())
NUM_BINS = highest_bin + 1
mapper = ConceptMapper(net_cfg.hidden_size, NUM_CONCEPTS, NUM_BINS).to(DEVICE)
# Helper (must be defined before __init__)
# 1) Robust Arrow loader (file‐IPC, then stream‐IPC)
def _load_arrow_table(path):
    """Read the Arrow table stored at *path*.

    The file is first opened as an Arrow IPC file; if that is rejected with
    ``ArrowInvalid`` it is re-read from the start as an IPC stream.
    """
    with open(path, "rb") as handle:
        try:
            file_reader = ipc.open_file(handle)
            return file_reader.read_all()
        except pa.lib.ArrowInvalid:
            # Rewind before retrying: the failed attempt may have consumed bytes.
            handle.seek(0)
            stream_reader = ipc.open_stream(handle)
            return stream_reader.read_all()

class ConceptDataset(TorchDataset):
    """Pairs per-pcap ``.arrow`` files with their binned concept scores.

    ``__init__`` indexes the files under ``arrow_dir/0`` and ``arrow_dir/1``
    whose pcap id is a key of ``score_map``; ``__getitem__`` loads one file and
    turns it into padded model inputs plus the matching score tensor.
    """

    def __init__(self, arrow_dir, score_map, cfg):
        """
        arrow_dir: path to .../arrow_per_pcap (sub-directories "0" and "1")
        score_map: dict mapping pcap_id (with suffix) -> binned-score Tensor
        cfg:       NetfoundConfig; max_bursts and max_burst_length are read

        Raises AssertionError when no .arrow file matches a score_map key.
        """
        self.score_map = score_map
        self.cfg       = cfg
        self.mb        = cfg.max_bursts
        self.mbl       = cfg.max_burst_length

        # 2) Gather only those .arrow files whose pcap_id is in score_map.
        # Files under "0" default to the "attack" suffix, files under "1" to
        # "benign2"; a missing sub-directory is simply skipped.
        self.entries = []
        for lbl, suffix in (("0", "attack"), ("1", "benign2")):
            sub = os.path.join(arrow_dir, lbl)
            if not os.path.isdir(sub):
                continue
            # os.listdir returns names in arbitrary order; sorting keeps the
            # index -> sample mapping identical across runs and machines.
            for fn in sorted(os.listdir(sub)):
                if not fn.endswith(".arrow"):
                    continue
                base = os.path.splitext(fn)[0]  # e.g. "patator-multi-cloud-attack-57497"
                # A stem that already carries "_attack"/"_benign2" is used as
                # is; otherwise the folder-based suffix is appended.
                if base.endswith(("_attack", "_benign2")):
                    pid = base
                else:
                    pid = f"{base}_{suffix}"

                if pid in score_map:
                    self.entries.append((os.path.join(sub, fn), pid))

        # Explicit raise (same type and message as the former assert) so the
        # check still runs when Python is started with -O.
        if not self.entries:
            raise AssertionError("No .arrow files matched your concept_scores keys!")

    def __len__(self):
        """Number of indexed (file, pcap id) pairs."""
        return len(self.entries)

    def __getitem__(self, idx):
        """Load entry ``idx`` and return ``(inputs, target)``.

        ``inputs`` is a dict of tensors: "input_ids" and "attention_mask"
        (long), "directions", "iats", "bytes" and "pkt_count" (float), each of
        length max_bursts * max_burst_length, plus a scalar long "protocol".
        ``target`` is ``score_map[pid]``, returned unchanged.
        """
        path, pid = self.entries[idx]
        # 3) Load the entire Arrow table for this pcap
        table = _load_arrow_table(path)
        data  = table.to_pydict()  # dict of column name -> list of values

        # 4) Build input_ids & attention_mask: every burst slot is CLS + its
        # tokens, cut to max_burst_length and right-padded; slots beyond the
        # available bursts are all padding.
        cls, pad = NetFoundTokenizer.CLS_TOKEN, NetFoundTokenizer.PAD_TOKEN
        toks, mask = [], []
        bursts = data["burst_tokens"]
        for b in range(self.mb):
            if b < len(bursts):
                seq = ([cls] + list(bursts[b]))[:self.mbl]
            else:
                seq = []
            real = len(seq)              # number of non-padding positions
            fill = self.mbl - real
            toks.extend(seq + [pad] * fill)
            mask.extend([1] * real + [0] * fill)

        inp = {
            "input_ids":      torch.tensor(toks, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
        }

        # 5) Attach meta-features: one value per burst, repeated over that
        # burst's unmasked positions and zeroed wherever the mask is 0.
        for mk, arrk in zip(
            ["directions", "iats", "bytes", "pkt_count"],
            ["directions",  "iats",  "bytes",  "counts"]
        ):
            vals, feat = data[arrk], []
            for b in range(self.mb):
                v = float(vals[b]) if b < len(vals) else 0.0
                if mk == "directions" and b < len(vals):
                    # Directions are encoded as +1 (truthy) / -1 (falsy).
                    v = 1.0 if vals[b] else -1.0
                burst_mask = mask[b * self.mbl:(b + 1) * self.mbl]
                feat.extend(v if m else 0.0 for m in burst_mask)
            inp[mk] = torch.tensor(feat, dtype=torch.float)

        # 6) Protocol feature: first value of the column, 0 if it is empty
        proto_vals = data["protocol"]
        proto_id   = proto_vals[0] if len(proto_vals) > 0 else 0
        inp["protocol"] = torch.tensor(proto_id, dtype=torch.long)

        # 7) Return inputs + target vector
        # (expected to be a LongTensor of shape [num_concepts] — confirm
        # against the code that builds score_map)
        target = self.score_map[pid]
        return inp, target

# --- Usage: index the Arrow files, then batch them for training ---
dataset = ConceptDataset(
    arrow_dir=ARROW_DATA_DIR,
    score_map=binned_scores,
    cfg=net_cfg,
)
print("Num samples:", len(dataset))  # expected to be > 0
loader = DataLoader(
    dataset,
    shuffle=True,
    num_workers=2,
    batch_size=BATCH_SIZE,
)

# 8) Training Loop
# Trains the ConceptMapper heads on top of the frozen NetFoundBase features.
# Only mapper.parameters() are handed to the optimizer.
opt  = optim.AdamW(mapper.parameters(), lr=LR)
crit = nn.CrossEntropyLoss()

for ep in range(1, EPOCHS + 1):
    mapper.train()
    total_loss = 0.0

    for inputs, targets in loader:
        # Move everything to device
        for k, v in inputs.items():
            inputs[k] = v.to(DEVICE)
        targets = targets.to(DEVICE)

        # Prepare NetFound inputs
        # (the dataset key "directions" is passed under the keyword "direction")
        model_inputs = {
            "input_ids":      inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "protocol":       inputs["protocol"],   # shape [batch]
            "direction":      inputs["directions"],
            "iats":           inputs["iats"],
            "bytes":          inputs["bytes"],
            "pkt_count":      inputs["pkt_count"],
            "return_dict":    True,                # ensure ModelOutput is returned
        }

        # 1) Forward through NetFoundBase (no autograd graph for the backbone)
        with torch.no_grad():
            base_out = net_base(**model_inputs)
            hidden   = base_out.last_hidden_state   # [batch, seq_len, hidden_size]

        # 2) Pool & map
        # The strided slice keeps position 0 of every burst slot, which is
        # where ConceptDataset places the CLS token of a non-empty burst.
        # NOTE(review): pooler runs outside no_grad but its parameters are not
        # given to `opt`, so it stays at its initial weights while gradients
        # are still computed for it — confirm this is intended.
        reps   = hidden[:, ::net_cfg.max_burst_length, :]  # [batch, bursts, hidden]
        h_tok  = pooler(reps)                              # [batch, hidden]
        logits = mapper(h_tok)                             # [batch, concepts, bins]

        # 3) Compute loss & update
        # One cross-entropy term per concept, summed into a single scalar.
        loss = sum(crit(logits[:, c, :], targets[:, c])
                   for c in range(NUM_CONCEPTS))
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()

    # 4) Log
    # Per-batch losses are sums over concepts, so divide by both counts.
    avg_loss = total_loss / (len(loader) * NUM_CONCEPTS)
    print(f"Epoch {ep}/{EPOCHS} — Avg loss/concept: {avg_loss:.4f}")

print("✅ Training complete.")


Num samples: 10
Epoch 1/25 — Avg loss/concept: 2.3139
Epoch 2/25 — Avg loss/concept: 2.2777
Epoch 3/25 — Avg loss/concept: 2.2547
Epoch 4/25 — Avg loss/concept: 2.2395
Epoch 5/25 — Avg loss/concept: 2.2023
Epoch 6/25 — Avg loss/concept: 2.1830
Epoch 7/25 — Avg loss/concept: 2.1443
Epoch 8/25 — Avg loss/concept: 2.1152
Epoch 9/25 — Avg loss/concept: 2.0874
Epoch 10/25 — Avg loss/concept: 2.0662
Epoch 11/25 — Avg loss/concept: 2.0317
Epoch 12/25 — Avg loss/concept: 2.0149
Epoch 13/25 — Avg loss/concept: 1.9774
Epoch 14/25 — Avg loss/concept: 1.9446
Epoch 15/25 — Avg loss/concept: 1.9232
Epoch 16/25 — Avg loss/concept: 1.8916
Epoch 17/25 — Avg loss/concept: 1.8347
Epoch 18/25 — Avg loss/concept: 1.8003
Epoch 19/25 — Avg loss/concept: 1.7902
Epoch 20/25 — Avg loss/concept: 1.7640
Epoch 21/25 — Avg loss/concept: 1.7086
Epoch 22/25 — Avg loss/concept: 1.6915
Epoch 23/25 — Avg loss/concept: 1.6526
Epoch 24/25 — Avg loss/concept: 1.6132
Epoch 25/25 — Avg loss/concept: 1.5909
✅ Training complete.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Freeze your trained ConceptMapper
for p in mapper.parameters():
    p.requires_grad = False
# requires_grad=False only stops weight updates. The mapper contains
# nn.Dropout(0.25) and the concept-training loop left it in train mode
# (mapper.train()), so switch to eval mode to get deterministic concept
# features from the frozen mapper.
mapper.eval()
# NOTE(review): pooler is also used purely as a feature extractor below; if
# AttentivePooling contains dropout/normalisation layers it likely wants
# .eval() too — confirm against its definition.

# Output head: a single linear layer from the per-concept features to the
# controller label logits.
NUM_OUTPUT_CLASSES = 2  # set to your number of classes
output_head = nn.Linear(
    in_features=NUM_CONCEPTS,
    out_features=NUM_OUTPUT_CLASSES,
)
output_head = output_head.to(DEVICE)

# Loss and optimizer (the optimizer sees the output head's parameters only)
crit2 = nn.CrossEntropyLoss()
opt2 = optim.AdamW(params=output_head.parameters(), lr=1e-3)

# DataLoader for controller labels (expects (inputs, y_true) tuples)
# e.g., controller_loader = DataLoader(controller_dataset, ...)
# NOTE(review): controller_loader is not created in the code shown here; it
# must be defined before this cell runs.

# Train the output head on concept features produced by the frozen stack
# (net_base -> pooler -> mapper).
NUM_EPOCHS_OUT = 10
for epoch in range(1, NUM_EPOCHS_OUT + 1):
    output_head.train()
    total_loss = 0.0
    for inputs, y_true in controller_loader:
        # move to device
        for k,v in inputs.items(): inputs[k] = v.to(DEVICE)
        y_true = y_true.to(DEVICE)

        # 1) Get frozen embeddings -> concept features
        # Everything up to the concept features runs without autograd.
        with torch.no_grad():
            base_out = net_base(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                protocol=inputs['protocol'],
                direction=inputs['directions'],
                iats=inputs['iats'],
                bytes=inputs['bytes'],
                pkt_count=inputs['pkt_count'],
                return_dict=True
            )
            hidden = base_out.last_hidden_state
            # Keep position 0 of every burst slot, then pool across bursts.
            reps   = hidden[:, ::net_cfg.max_burst_length, :]
            h_tok  = pooler(reps)
            logits_c = mapper(h_tok)  # [B, C, bins]
            # Each concept is reduced to its highest-scoring bin index, cast
            # to float so it can feed the linear output head.
            concept_feats = logits_c.argmax(dim=-1).float()  # [B, C]

        # 2) Forward through output head
        y_hat = output_head(concept_feats)
        loss  = crit2(y_hat, y_true)

        # 3) Backprop on output head only
        opt2.zero_grad()
        loss.backward()
        opt2.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg = total_loss / len(controller_loader)
    print(f"[Output Head] Epoch {epoch}/{NUM_EPOCHS_OUT} - Avg Loss: {avg:.4f}")
