In [1]:
# Cell 1: Global Configuration Initialization with Debug Slider
import os
import sys
import pickle
import logging
import ipywidgets as widgets
from IPython.display import display
import builtins

# Initialize logging
logger = logging.getLogger('smbreplay')

# Detach AND close handlers left over from a previous run of this cell.
# Merely resetting `logger.handlers` would drop the references without closing
# them, leaking one open log-file descriptor per re-run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# One formatter shared by both handlers so console and file output stay in sync.
# NOTE(review): the format emits %(asctime)s twice (once bare, once in brackets);
# kept unchanged here in case something parses the existing layout - confirm.
log_formatter = logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(asctime)s] %(message)s',
    datefmt='%a %b %d %H:%M:%S %Y'
)

# Stream handler for Jupyter output
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setFormatter(log_formatter)
logger.addHandler(stream_handler)

# File handler for persistent logs
log_file = '/home/jovyan/work/smbreplay/smbreplay.log'
os.makedirs(os.path.dirname(log_file), exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)

# Map verbosity levels (0-3) to logging levels
VERBOSITY_TO_LOGGING = {
    0: logging.CRITICAL,  # Only critical errors
    1: logging.INFO,      # Info and above
    2: logging.DEBUG,     # Debug and above
    3: logging.DEBUG      # Same as 2, but can extend for finer granularity later
}

# Export logger to builtins so it is reachable by bare name without an import
builtins.logger = logger

# Initialize configurations
config_dir = "/home/jovyan/work/smbreplay/"
os.makedirs(config_dir, exist_ok=True)
config_file = os.path.join(config_dir, "config.pkl")

# Defaults are only installed when the attribute is missing, so re-running the
# cell keeps whatever is already stored on `builtins`.
if not hasattr(builtins, 'pcap_config'):
    builtins.pcap_config = {
        "capture_path": None,
        "verbose_level": 0  # Default to CRITICAL
    }
if not hasattr(builtins, 'replay_config'):
    # NOTE(review): "PASSWORD" looks like a placeholder that is expected to be
    # overridden from config.pkl - confirm; avoid committing real credentials here.
    builtins.replay_config = {
        "server_ip": "10.216.29.241",
        "domain": "nas-deep.local",
        "username": "jtownsen",
        "password": "PASSWORD",
        "tree_name": "2pm",
        "max_wait": 5.0
    }

# Load from config.pkl if it exists
if os.path.exists(config_file):
    try:
        # SECURITY: pickle.load can execute arbitrary code; config.pkl must only
        # ever be a file written by this notebook, never one from an untrusted source.
        with open(config_file, 'rb') as f:
            loaded_config = pickle.load(f)
        if not isinstance(loaded_config, dict):
            raise pickle.UnpicklingError(
                f"expected a dict, got {type(loaded_config).__name__}"
            )
        # Only keys that already exist in the defaults are accepted, so stale or
        # unknown keys in the file are ignored. Non-dict sections are skipped.
        for section_name, target in (
            ('pcap_config', builtins.pcap_config),
            ('replay_config', builtins.replay_config),
        ):
            stored_section = loaded_config.get(section_name)
            if isinstance(stored_section, dict):
                target.update({k: v for k, v in stored_section.items() if k in target})
        # Never write the password to stdout or the persistent log file.
        replay_config_for_log = dict(builtins.replay_config)
        if "password" in replay_config_for_log:
            replay_config_for_log["password"] = "***"
        logger.info(f"Loaded config from {config_file}: pcap_config={builtins.pcap_config}, replay_config={replay_config_for_log}")
    except (pickle.PickleError, OSError, EOFError, AttributeError, ImportError, IndexError) as e:
        # A truncated or corrupt pickle raises EOFError/AttributeError/ImportError/
        # IndexError rather than PickleError; all of them fall back to the defaults.
        logger.critical(f"Failed to load {config_file}: {e}. Using defaults.")

# Set logger level from pcap_config
logger.setLevel(VERBOSITY_TO_LOGGING.get(builtins.pcap_config.get("verbose_level", 0), logging.CRITICAL))

# Define debug slider
# The starting position mirrors the persisted verbosity (0 when none is stored).
slider_kwargs = {
    "value": builtins.pcap_config.get("verbose_level", 0),
    "min": 0,
    "max": 3,
    "step": 1,
    "description": "Debug Level:",
    "disabled": False,
    # With continuous_update off, observers fire on release rather than per drag step.
    "continuous_update": False,
    "orientation": "horizontal",
    "readout": True,
    "readout_format": "d",
}
builtins.debug_slider = widgets.IntSlider(**slider_kwargs)

# Define debug slider handler
def on_debug_slider_change(change):
    """Apply a new verbosity from the debug slider and persist it.

    Stores ``change["new"]`` as ``pcap_config["verbose_level"]``, sets the
    logger level through ``VERBOSITY_TO_LOGGING`` (CRITICAL for unmapped
    values), and saves both ``pcap_config`` and ``replay_config`` to
    ``config_file``.

    The save goes to a sibling ``<config_file>.tmp`` and is then moved into
    place with ``os.replace``. Opening ``config_file`` directly in ``'wb'``
    mode would truncate it before the dump had succeeded, so a failed dump
    would destroy the previously saved configuration.

    Args:
        change: Change mapping passed by the widget observer; only the
            ``"new"`` key is read.

    Save failures are logged at CRITICAL level and never raised.
    """
    verbosity = change["new"]
    builtins.pcap_config["verbose_level"] = verbosity
    logger.setLevel(VERBOSITY_TO_LOGGING.get(verbosity, logging.CRITICAL))
    # Save updated config to config.pkl (via a temp file, see docstring)
    tmp_file = f"{config_file}.tmp"
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump({'pcap_config': builtins.pcap_config, 'replay_config': builtins.replay_config}, f)
        os.replace(tmp_file, config_file)
        logger.info(f"Updated {config_file} with verbose_level={verbosity} (logging level: {logging.getLevelName(logger.level)})")
    except (pickle.PickleError, OSError, AttributeError, TypeError) as e:
        # Unpicklable values surface as TypeError/AttributeError, not PickleError.
        try:
            os.remove(tmp_file)
        except OSError:
            # Best-effort cleanup: the temp file may never have been created.
            pass
        logger.critical(f"Failed to save {config_file}: {e}")
    logger.debug(f"Debug slider changed to verbose_level={verbosity}")

# Attach handler: only changes of the slider's "value" trait are observed
builtins.debug_slider.observe(on_debug_slider_change, names="value")

# Initialize other globals (an existing operations list is kept as-is)
builtins.operations = getattr(builtins, 'operations', [])

# Initialize capture and server config globals
builtins.capture = builtins.pcap_config.get("capture_path")
# Each of these replay_config entries is mirrored onto builtins under the same name.
for replay_key in ("server_ip", "domain", "username", "password", "tree_name"):
    setattr(builtins, replay_key, builtins.replay_config.get(replay_key))
builtins.all_cells_run = False

# Display debug slider
display(builtins.debug_slider)
logger.info("Cell 1 initialized. Configuration set in /home/jovyan/work/smbreplay/config.pkl.")

ModuleNotFoundError: No module named 'ipywidgets'

# Project Summary

## Objective
Develop a system to capture, store, and replay SMB2 (Server Message Block version 2) network traffic in a controlled lab environment for diagnostic, testing, and protocol interaction analysis, critical for file sharing in Windows-based systems.

## Approach
- **Capture**: Extract SMB2 packets from PCAP files using `ntap-tshark` via SSH.
- **Storage**: Store data in Parquet files by session (`smb2.sesid`) with JSON metadata.
- **Replay**: Replicate file operations on a lab server using `impacket` (replacing `smbclient` due to `pysmb` deprecation).
- **User Interface**: Provide an interactive dashboard for PCAP selection, ingestion, session visualization, and replay configuration, with settings in `config.pkl`.
- **Modular Design**: Use reusable Python functions in `builtins` across notebook cells.

## Tools
- **Data Storage**: Parquet (compressed), JSON (metadata).
- **Packet Capture**: `tshark`/`ntap-tshark`.
- **Replay**: `impacket`.
- **UI and Data Handling**: `ipywidgets`, `pandas`, `pyarrow`.
- **SSH Interactions**: `paramiko`, `subprocess`.
- **Memory Monitoring**: `psutil`.
- **Logging**: Structured with `JupyterOutputHandler`.

## Development Environment
- Jupyter notebooks (Cells 1–11) in a container with an 8 GB memory limit.
- Remote server access via SSH for packet capture and operations.

## Lab Server Details
- **IP**: 10.216.29.241
- **Domain**: nas-deep.local
- **Username**: jtownsen
- **Password**: [REDACTED]
- **Share**: 2pm

## Key Workflow
1. Capture SMB2 traffic with `ntap-tshark`.
2. Process and store sessions in Parquet files with JSON metadata.
3. Replay sessions using `impacket` on the lab server.
4. Manage via interactive UI with settings in `config.pkl`.

## Pre-Trace Conditions
Ensure the lab server’s file system matches the original state by pre-creating directories and files based on `smb2.filename` and `smb2.cmd` before replay.

## Phases and Current Status
### Phase 1: Comprehensive SMB2 Field Capture
- **Status**: Completed
- **Details**: Captured 619 SMB2 fields (e.g., `smb2.cmd`, `smb2.filename`) using `ntap-tshark`, stored in Parquet with zstd compression. Fixed `smb2.filename` accuracy.

### Phase 2: Session-Based Storage
- **Status**: Completed
- **Details**: Organized data by `smb2.sesid` into Parquet files (e.g., `smb2_session_0x98fc00000000d580.parquet`) with JSON metadata. Validated with 5,741 frames from a 319,000-packet PCAP.

### Phase 3: Replay Mechanism Development
- **Status**: In Progress
- **Details**: Transitioning to `impacket`. Partial `smb2.cmd` mapping (e.g., 5 → Create). Focus on completing `impacket` integration and `smb2.cmd` mapping (0–18).
- **Recent Progress**: UI enhancements (tooltips, `smb2.nt_status` mapping), replay logic refactoring.
- **Time Remaining**: 2–3 days

### Phase 4: Validation and Iteration
- **Status**: Not Started
- **Details**: Plan to compare original and replayed PCAPs, handle edge cases, and scale to multiple sessions.
- **Time Estimate**: 2–4 days

### Phase 5: Automation and Deployment
- **Status**: Partially Completed
- **Details**: Interactive UI with dynamic dropdowns and `config.pkl` settings. Automation and packaging pending.
- **Time Estimate**: 3–5 days

## Timeline
- **Completed**: Phases 1 and 2
- **Phase 3**: 2–3 days
- **Phase 4**: 2–4 days
- **Phase 5**: 3–5 days
- **Total Remaining**: 7–12 days

## Key Functions by Cell
Below are the key functions from each cell, detailing their roles in the system:

### Cell 1: Global Configuration Initialization
- **`on_debug_slider_change(change)`**:
  - Updates logging verbosity via a slider (levels 0–3: 0 → CRITICAL, 1 → INFO, 2–3 → DEBUG).
  - Saves both `pcap_config` and `replay_config` to `config.pkl`.
  - Logs verbosity changes.
- **Purpose**: Initializes logging (`smbreplay.log`), `pcap_config` (capture path, verbosity), and `replay_config` (server details). Stores configs in `/home/jovyan/work/smbreplay/config.pkl`. Displays debug slider.

### Cell 2: Setup and SMB2 Utility Functions
- **`shorten_path(full_path, max_components=3)`**:
  - Shortens file paths to the last `max_components` for display.
- **`normalize_path(path)`**:
  - Normalizes paths (lowercase, backslashes to slashes) for comparison.
- **`get_tree_name_mapping(frames)`**:
  - Maps `smb2.tid` to share names from Tree Connect frames.
- **`check_ssh_connectivity()`**:
  - Verifies SSH connection to `backend` server for `ntap-tshark`.
- **Purpose**: Sets up `itables` for interactive tables, defines SMB2 command mappings (`SMB2_OP_NAME_DESC`), FSCTL constants, and file/info level mappings. Ensures SSH connectivity.

### Cell 3: SMB2 Field Definitions
- **`normalize_hex_field(value, field_name)`**:
  - Normalizes hex fields (e.g., `smb2.nt_status`, `smb2.sesid`) to uppercase hex format (32-bit or 64-bit).
- **`normalize_fid(value)`**:
  - Normalizes `smb2.fid` to 128-bit hex, handling UUID or hex formats.
- **Purpose**: Defines 619 SMB2 fields from `smb2_fields.txt`, tracking fields (e.g., `frame.number`), and hex fields for normalization. Validates critical fields and defines mappings for `smb2.cmd`, `smb2.nt_status`, etc.

### Cell 4: NTAP-Tshark Processing
- **`build_tshark_command(capture, fields, reassembly, packet_limit, log_level, temp_dir, verbose)`**:
  - Constructs SSH command for `ntap-tshark` to extract SMB2 fields from PCAP.
- **`extract_fields(line, fields)`**:
  - Parses `tshark` output lines into dictionaries with frame, stream, IP, and SMB2 fields.
- **`process_tshark_output(cmd, fields)`**:
  - Processes `tshark` output into a DataFrame, optimizing memory and normalizing fields.
- **`save_to_parquet(df, parquet_path)`**:
  - Saves DataFrame to Parquet with zstd compression, handling multi-value fields.
- **`create_remote_directory(case_number, trace_name, force_reingest)`**:
  - Creates and verifies remote session storage directory (`/stingray/<case_number>/.tracer/<trace_name>/sessions`).
- **`clear_directory(directory)`**:
  - Clears files in a remote directory for re-ingestion.
- **`status_callback(message)`**:
  - Logs status messages for ingestion progress.
- **Purpose**: Handles `ntap-tshark` execution, data parsing, and storage in Parquet files. Manages remote directories with SSH.

### Cell 5: Ingestion and Session Extraction
- **`get_packet_count(capture_path)`**:
  - Retrieves packet count from PCAP using `ntap-capinfos`.
- **`normalize_sesid(sesid_str)`**:
  - Normalizes `smb2.sesid`, handling lists/commas and excluding invalid values.
- **`normalize_cmd(cmd_str)`**:
  - Normalizes `smb2.cmd`, handling lists/commas.
- **`save_session_metadata(case_number, trace_name, sessions, output_dir)`**:
  - Saves session metadata (e.g., session count, frame count) to JSON.
- **`run_ingestion(capture_path, reassembly_enabled, force_reingest, verbose)`**:
  - Orchestrates PCAP ingestion: validates PCAP, extracts fields with `tshark`, splits into sessions by `smb2.sesid`, and saves to Parquet and JSON.
- **Purpose**: Manages ingestion of PCAP files, splitting into session-based Parquet files, and storing metadata.

### Cell 6: Session Selection and Utilities
- **`load_capture()`**:
  - Loads capture path from `config.pkl` or `pcap_config`, validating existence.
- **`get_output_dir(capture)`**:
  - Derives session storage directory from capture path, ensuring write access.
- **`list_session_files(output_dir)`**:
  - Lists session Parquet files, normalizing to lowercase and removing duplicates.
- **`shorten_path(path, max_length=50, min_filename_length=20)`**:
  - Shortens paths for display, prioritizing filenames.
- **`normalize_path(path)`**:
  - Normalizes paths for comparison, preserving leading slashes.
- **`get_tree_name_mapping(df)`**:
  - Maps `smb2.tid` to tree names from Tree Connect requests.
- **Purpose**: Provides utilities for loading captures, managing session directories, and handling path normalization for UI display.

### Cell 7: Session Loading and Filtering
- **`load_and_summarize_session(capture, session_file)`**:
  - Loads session Parquet file, returning frames, field options, file options, and default fields.
- **`update_operations(capture, session_file, selected_file, selected_fields)`**:
  - Prepares operations data, normalizing fields and filtering by file if specified.
- **Purpose**: Loads and filters session data, preparing operations for display with normalized fields and mapped descriptions.

### Cell 8: Replay Mechanism
- **`setup_pre_trace_state(conn, selected_operations, default_tree_id)`**:
  - Sets up lab server file system by creating directories and pre-existing files before replay.
- **`replay_session(selected_operations, output_widget)`**:
  - Replays SMB2 operations using `impacket`, handling Tree Connect, Create, Close, Read, and Write commands. Manages `tid` and `fid` mappings.
- **Purpose**: Implements session replay on the lab server, ensuring pre-trace state and executing SMB2 commands.

### Cell 9: Dashboard Setup
- **Purpose**: Initializes dashboard widgets (e.g., `case_number_input`, `capture_dropdown`, `session_dropdown`, `ingest_button`, `replay_button`) and output widgets (`log_output`, `output_cell`). Sets up `JupyterOutputHandler` for logging and initializes `replay_config` values.

### Cell 10: Event Handlers and Rendering
- **`status_callback(message)`**:
  - Logs and displays status messages in `log_output`.
- **`update_progress(message)`**:
  - Updates progress messages in `progress_output`.
- **`update_button_states()`**:
  - Enables/disables buttons based on session availability.
- **`render_page()`**:
  - Renders operations DataTable with mandatory (`Frame`, `Command`, `Path`, `smb2.nt_status`) and optional columns, including summaries of commands and create actions.
- **`on_case_number_change(change)`**:
  - Updates `capture_dropdown` with PCAP files for the entered case number.
- **`on_capture_change(change)`**:
  - Updates `session_dropdown` with session files, adjusts button states, and saves `capture_path` to `config.pkl`.
- **`on_ingest_button_clicked(b)`**:
  - Triggers PCAP ingestion with `run_ingestion` (no force re-ingest).
- **`on_reingest_button_clicked(b)`**:
  - Triggers PCAP re-ingestion with `force_reingest=True`.
- **`on_replay_button_clicked(b)`**:
  - Initiates session replay via `replay_session`.
- **`on_session_change(change)`**:
  - Loads session data, updates `file_combobox` and `check_fields_select`, and renders operations.
- **`on_file_change(change)`**:
  - Filters operations by selected file and re-renders the table.
- **`on_fields_change(change)`**:
  - Updates table columns based on selected fields and re-renders.
- **`on_save_config(b)`**:
  - Saves `replay_config` to `config.pkl`, preserving `pcap_config`.
- **`initialize_dashboard()`**:
  - Auto-loads capture and sessions if pre-set in `config.pkl`.
- **Purpose**: Handles UI interactions, rendering operations tables, and managing configuration saves.

### Cell 11: Dashboard Display
- **`update_dashboard_layout(verbose_level)`**:
  - Constructs and displays the dashboard, conditionally showing logs based on verbosity.
- **Purpose**: Finalizes dashboard display, integrating all widgets and ensuring dynamic log visibility.

## Recent Developments
- Standardized storage path: `/stingray/<case_number>/.tracer/<trace_name>/sessions`.
- Optimized directory clearing and logging.
- Fixed `smb2.sesid` and `smb2.cmd` normalization.
- Enhanced UI with tooltips and dynamic table rendering.
- Added debug slider for verbosity control (Cell 1).

## Next Steps
- **Verify Ingestion**: Re-run on `az3-CVO-python.pcapng` and test with a smaller PCAP.
- **Finalize `smb2.cmd` Mapping**: Complete mappings (0–18) using `SMB2_OP_NAME_DESC`.
- **Develop `impacket` Replay**: Fully implement and test in Cell 8.
- **Start Phase 4**: Validate replay accuracy by comparing PCAPs.
- **Complete Phase 5**: Automate and package the system.

## Getting Started
- **Priority**: Finalize `smb2.cmd` mapping and `impacket` replay.
- **Key Cells**:
  - **Cell 1**: Configuration and logging.
  - **Cells 8–10**: Replay, dashboard, and event handling.
  - **Cell 11**: Dashboard display.
- **Action Plan**:
  1. Test replay script in Cell 8 with `impacket`.
  2. Re-run ingestion on `az3-CVO-python.pcapng` and validate.
  3. Complete `smb2.cmd` mappings (0–18).
  4. Begin Phase 4 validation.

# Project Summary

## Objective
Develop a system to capture, store, and replay SMB2 (Server Message Block version 2) network traffic in a controlled lab environment for diagnostic, testing, and protocol interaction analysis, critical for file sharing in Windows-based systems.

## Approach
- **Capture**: Extract SMB2 packets from PCAP files using `ntap-tshark` via SSH.
- **Storage**: Store data in Parquet files by session (`smb2.sesid`) with JSON metadata.
- **Replay**: Replicate file operations on a lab server using `impacket` (replacing `smbclient` due to `pysmb` deprecation).
- **User Interface**: Provide an interactive dashboard for PCAP selection, ingestion, session visualization, and replay configuration, with settings in `config.pkl`.
- **Modular Design**: Use reusable Python functions in `builtins` across notebook cells.

## Tools
- **Data Storage**: Parquet (compressed), JSON (metadata).
- **Packet Capture**: `tshark`/`ntap-tshark`.
- **Replay**: `impacket`.
- **UI and Data Handling**: `ipywidgets`, `pandas`, `pyarrow`.
- **SSH Interactions**: `paramiko`, `subprocess`.
- **Memory Monitoring**: `psutil`.
- **Logging**: Structured with `JupyterOutputHandler`.

## Development Environment
- Jupyter notebooks (Cells 1–11) in a container with an 8 GB memory limit.
- Remote server access via SSH for packet capture and operations.

## Lab Server Details
- **IP**: 10.216.29.241
- **Domain**: nas-deep.local
- **Username**: jtownsen
- **Password**: [REDACTED]
- **Share**: 2pm

## Key Workflow
1. Capture SMB2 traffic with `ntap-tshark`.
2. Process and store sessions in Parquet files with JSON metadata.
3. Replay sessions using `impacket` on the lab server.
4. Manage via interactive UI with settings in `config.pkl`.

## Pre-Trace Conditions
Ensure the lab server’s file system matches the original state by pre-creating directories and files based on `smb2.filename` and `smb2.cmd` before replay.

## Phases and Current Status
### Phase 1: Comprehensive SMB2 Field Capture
- **Status**: Completed
- **Details**: Captured 619 SMB2 fields (e.g., `smb2.cmd`, `smb2.filename`) using `ntap-tshark`, stored in Parquet with zstd compression. Fixed `smb2.filename` accuracy.

### Phase 2: Session-Based Storage
- **Status**: Completed
- **Details**: Organized data by `smb2.sesid` into Parquet files (e.g., `smb2_session_0x98fc00000000d580.parquet`) with JSON metadata. Validated with 5,741 frames from a 319,000-packet PCAP.

### Phase 3: Replay Mechanism Development
- **Status**: In Progress
- **Details**: Transitioning to `impacket`. Partial `smb2.cmd` mapping (e.g., 5 → Create). Focus on completing `impacket` integration and `smb2.cmd` mapping (0–18).
- **Recent Progress**: UI enhancements (tooltips, `smb2.nt_status` mapping), replay logic refactoring.
- **Time Remaining**: 2–3 days

### Phase 4: Validation and Iteration
- **Status**: Not Started
- **Details**: Plan to compare original and replayed PCAPs, handle edge cases, and scale to multiple sessions.
- **Time Estimate**: 2–4 days

### Phase 5: Automation and Deployment
- **Status**: Partially Completed
- **Details**: Interactive UI with dynamic dropdowns and `config.pkl` settings. Automation and packaging pending.
- **Time Estimate**: 3–5 days

## Timeline
- **Completed**: Phases 1 and 2
- **Phase 3**: 2–3 days
- **Phase 4**: 2–4 days
- **Phase 5**: 3–5 days
- **Total Remaining**: 7–12 days

## Key Functions by Cell
Below are the key functions from each cell, detailing their roles in the system:

### Cell 1: Global Configuration Initialization
- **`on_debug_slider_change(change)`**:
  - Updates logging verbosity via a slider (levels 0–3: 0 → CRITICAL, 1 → INFO, 2–3 → DEBUG).
  - Saves both `pcap_config` and `replay_config` to `config.pkl`.
  - Logs verbosity changes.
- **Purpose**: Initializes logging (`smbreplay.log`), `pcap_config` (capture path, verbosity), and `replay_config` (server details). Stores configs in `/home/jovyan/work/smbreplay/config.pkl`. Displays debug slider.

### Cell 2: Setup and SMB2 Utility Functions
- **`shorten_path(full_path, max_components=3)`**:
  - Shortens file paths to the last `max_components` for display.
- **`normalize_path(path)`**:
  - Normalizes paths (lowercase, backslashes to slashes) for comparison.
- **`get_tree_name_mapping(frames)`**:
  - Maps `smb2.tid` to share names from Tree Connect frames.
- **`check_ssh_connectivity()`**:
  - Verifies SSH connection to `backend` server for `ntap-tshark`.
- **Purpose**: Sets up `itables` for interactive tables, defines SMB2 command mappings (`SMB2_OP_NAME_DESC`), FSCTL constants, and file/info level mappings. Ensures SSH connectivity.

### Cell 3: SMB2 Field Definitions
- **`normalize_hex_field(value, field_name)`**:
  - Normalizes hex fields (e.g., `smb2.nt_status`, `smb2.sesid`) to uppercase hex format (32-bit or 64-bit).
- **`normalize_fid(value)`**:
  - Normalizes `smb2.fid` to 128-bit hex, handling UUID or hex formats.
- **Purpose**: Defines 619 SMB2 fields from `smb2_fields.txt`, tracking fields (e.g., `frame.number`), and hex fields for normalization. Validates critical fields and defines mappings for `smb2.cmd`, `smb2.nt_status`, etc.

### Cell 4: NTAP-Tshark Processing
- **`build_tshark_command(capture, fields, reassembly, packet_limit, log_level, temp_dir, verbose)`**:
  - Constructs SSH command for `ntap-tshark` to extract SMB2 fields from PCAP.
- **`extract_fields(line, fields)`**:
  - Parses `tshark` output lines into dictionaries with frame, stream, IP, and SMB2 fields.
- **`process_tshark_output(cmd, fields)`**:
  - Processes `tshark` output into a DataFrame, optimizing memory and normalizing fields.
- **`save_to_parquet(df, parquet_path)`**:
  - Saves DataFrame to Parquet with zstd compression, handling multi-value fields.
- **`create_remote_directory(case_number, trace_name, force_reingest)`**:
  - Creates and verifies remote session storage directory (`/stingray/<case_number>/.tracer/<trace_name>/sessions`).
- **`clear_directory(directory)`**:
  - Clears files in a remote directory for re-ingestion.
- **`status_callback(message)`**:
  - Logs status messages for ingestion progress.
- **Purpose**: Handles `ntap-tshark` execution, data parsing, and storage in Parquet files. Manages remote directories with SSH.

### Cell 5: Ingestion and Session Extraction
- **`get_packet_count(capture_path)`**:
  - Retrieves packet count from PCAP using `ntap-capinfos`.
- **`normalize_sesid(sesid_str)`**:
  - Normalizes `smb2.sesid`, handling lists/commas and excluding invalid values.
- **`normalize_cmd(cmd_str)`**:
  - Normalizes `smb2.cmd`, handling lists/commas.
- **`save_session_metadata(case_number, trace_name, sessions, output_dir)`**:
  - Saves session metadata (e.g., session count, frame count) to JSON.
- **`run_ingestion(capture_path, reassembly_enabled, force_reingest, verbose)`**:
  - Orchestrates PCAP ingestion: validates PCAP, extracts fields with `tshark`, splits into sessions by `smb2.sesid`, and saves to Parquet and JSON.
- **Purpose**: Manages ingestion of PCAP files, splitting into session-based Parquet files, and storing metadata.

### Cell 6: Session Selection and Utilities
- **`load_capture()`**:
  - Loads capture path from `config.pkl` or `pcap_config`, validating existence.
- **`get_output_dir(capture)`**:
  - Derives session storage directory from capture path, ensuring write access.
- **`list_session_files(output_dir)`**:
  - Lists session Parquet files, normalizing to lowercase and removing duplicates.
- **`shorten_path(path, max_length=50, min_filename_length=20)`**:
  - Shortens paths for display, prioritizing filenames.
- **`normalize_path(path)`**:
  - Normalizes paths for comparison, preserving leading slashes.
- **`get_tree_name_mapping(df)`**:
  - Maps `smb2.tid` to tree names from Tree Connect requests.
- **Purpose**: Provides utilities for loading captures, managing session directories, and handling path normalization for UI display.

### Cell 7: Session Loading and Filtering
- **`load_and_summarize_session(capture, session_file)`**:
  - Loads session Parquet file, returning frames, field options, file options, and default fields.
- **`update_operations(capture, session_file, selected_file, selected_fields)`**:
  - Prepares operations data, normalizing fields and filtering by file if specified.
- **Purpose**: Loads and filters session data, preparing operations for display with normalized fields and mapped descriptions.

### Cell 8: Replay Mechanism
- **`setup_pre_trace_state(conn, selected_operations, default_tree_id)`**:
  - Sets up lab server file system by creating directories and pre-existing files before replay.
- **`replay_session(selected_operations, output_widget)`**:
  - Replays SMB2 operations using `impacket`, handling Tree Connect, Create, Close, Read, and Write commands. Manages `tid` and `fid` mappings.
- **Purpose**: Implements session replay on the lab server, ensuring pre-trace state and executing SMB2 commands.

### Cell 9: Dashboard Setup
- **Purpose**: Initializes dashboard widgets (e.g., `case_number_input`, `capture_dropdown`, `session_dropdown`, `ingest_button`, `replay_button`) and output widgets (`log_output`, `output_cell`). Sets up `JupyterOutputHandler` for logging and initializes `replay_config` values.

### Cell 10: Event Handlers and Rendering
- **`status_callback(message)`**:
  - Logs and displays status messages in `log_output`.
- **`update_progress(message)`**:
  - Updates progress messages in `progress_output`.
- **`update_button_states()`**:
  - Enables/disables buttons based on session availability.
- **`render_page()`**:
  - Renders operations DataTable with mandatory (`Frame`, `Command`, `Path`, `smb2.nt_status`) and optional columns, including summaries of commands and create actions.
- **`on_case_number_change(change)`**:
  - Updates `capture_dropdown` with PCAP files for the entered case number.
- **`on_capture_change(change)`**:
  - Updates `session_dropdown` with session files, adjusts button states, and saves `capture_path` to `config.pkl`.
- **`on_ingest_button_clicked(b)`**:
  - Triggers PCAP ingestion with `run_ingestion` (no force re-ingest).
- **`on_reingest_button_clicked(b)`**:
  - Triggers PCAP re-ingestion with `force_reingest=True`.
- **`on_replay_button_clicked(b)`**:
  - Initiates session replay via `replay_session`.
- **`on_session_change(change)`**:
  - Loads session data, updates `file_combobox` and `check_fields_select`, and renders operations.
- **`on_file_change(change)`**:
  - Filters operations by selected file and re-renders the table.
- **`on_fields_change(change)`**:
  - Updates table columns based on selected fields and re-renders.
- **`on_save_config(b)`**:
  - Saves `replay_config` to `config.pkl`, preserving `pcap_config`.
- **`initialize_dashboard()`**:
  - Auto-loads capture and sessions if pre-set in `config.pkl`.
- **Purpose**: Handles UI interactions, rendering operations tables, and managing configuration saves.

### Cell 11: Dashboard Display
- **`update_dashboard_layout(verbose_level)`**:
  - Constructs and displays the dashboard, conditionally showing logs based on verbosity.
- **Purpose**: Finalizes dashboard display, integrating all widgets and ensuring dynamic log visibility.

## Recent Developments
- Standardized storage path: `/stingray/<case_number>/.tracer/<trace_name>/sessions`.
- Optimized directory clearing and logging.
- Fixed `smb2.sesid` and `smb2.cmd` normalization.
- Enhanced UI with tooltips and dynamic table rendering.
- Added debug slider for verbosity control (Cell 1).

## Next Steps
- **Verify Ingestion**: Re-run on `az3-CVO-python.pcapng` and test with a smaller PCAP.
- **Finalize `smb2.cmd` Mapping**: Complete mappings (0–18) using `SMB2_OP_NAME_DESC`.
- **Develop `impacket` Replay**: Fully implement and test in Cell 8.
- **Start Phase 4**: Validate replay accuracy by comparing PCAPs.
- **Complete Phase 5**: Automate and package the system.

## Getting Started
- **Priority**: Finalize `smb2.cmd` mapping and `impacket` replay.
- **Key Cells**:
  - **Cell 1**: Configuration and logging.
  - **Cells 8–10**: Replay, dashboard, and event handling.
  - **Cell 11**: Dashboard display.
- **Action Plan**:
  1. Test replay script in Cell 8 with `impacket`.
  2. Re-run ingestion on `az3-CVO-python.pcapng` and validate.
  3. Complete `smb2.cmd` mappings (0–18).
  4. Begin Phase 4 validation.

# Project Summary

## Objective
Develop a system to capture, store, and replay SMB2 (Server Message Block version 2) network traffic in a controlled lab environment for diagnostic, testing, and protocol interaction analysis, critical for file sharing in Windows-based systems.

## Approach
- **Capture**: Extract SMB2 packets from PCAP files using `ntap-tshark` via SSH.
- **Storage**: Store data in Parquet files by session (`smb2.sesid`) with JSON metadata.
- **Replay**: Replicate file operations on a lab server using `impacket` (replacing `smbclient` due to `pysmb` deprecation).
- **User Interface**: Provide an interactive dashboard for PCAP selection, ingestion, session visualization, and replay configuration, with settings in `config.pkl`.
- **Modular Design**: Use reusable Python functions in `builtins` across notebook cells.

## Tools
- **Data Storage**: Parquet (compressed), JSON (metadata).
- **Packet Capture**: `tshark`/`ntap-tshark`.
- **Replay**: `impacket`.
- **UI and Data Handling**: `ipywidgets`, `pandas`, `pyarrow`.
- **SSH Interactions**: `paramiko`, `subprocess`.
- **Memory Monitoring**: `psutil`.
- **Logging**: Structured with `JupyterOutputHandler`.

## Development Environment
- Jupyter notebooks (Cells 1–11) in a container with an 8 GB memory limit.
- Remote server access via SSH for packet capture and operations.

## Lab Server Details
- **IP**: 10.216.29.241
- **Domain**: nas-deep.local
- **Username**: jtownsen
- **Password**: [REDACTED]
- **Share**: 2pm

## Key Workflow
1. Capture SMB2 traffic with `ntap-tshark`.
2. Process and store sessions in Parquet files with JSON metadata.
3. Replay sessions using `impacket` on the lab server.
4. Manage via interactive UI with settings in `config.pkl`.

## Pre-Trace Conditions
Ensure the lab server’s file system matches the original state by pre-creating directories and files based on `smb2.filename` and `smb2.cmd` before replay.

## Phases and Current Status
### Phase 1: Comprehensive SMB2 Field Capture
- **Status**: Completed
- **Details**: Captured 619 SMB2 fields (e.g., `smb2.cmd`, `smb2.filename`) using `ntap-tshark`, stored in Parquet with zstd compression. Fixed `smb2.filename` accuracy.

### Phase 2: Session-Based Storage
- **Status**: Completed
- **Details**: Organized data by `smb2.sesid` into Parquet files (e.g., `smb2_session_0x98fc00000000d580.parquet`) with JSON metadata. Validated with 5,741 frames from a 319,000-packet PCAP.

### Phase 3: Replay Mechanism Development
- **Status**: In Progress
- **Details**: Transitioning to `impacket`. Partial `smb2.cmd` mapping (e.g., 5 → Create). Focus on completing `impacket` integration and `smb2.cmd` mapping (0–18).
- **Recent Progress**: UI enhancements (tooltips, `smb2.nt_status` mapping), replay logic refactoring.
- **Time Remaining**: 2–3 days

### Phase 4: Validation and Iteration
- **Status**: Not Started
- **Details**: Plan to compare original and replayed PCAPs, handle edge cases, and scale to multiple sessions.
- **Time Estimate**: 2–4 days

### Phase 5: Automation and Deployment
- **Status**: Partially Completed
- **Details**: Interactive UI with dynamic dropdowns and `config.pkl` settings. Automation and packaging pending.
- **Time Estimate**: 3–5 days

## Timeline
- **Completed**: Phases 1 and 2
- **Phase 3**: 2–3 days
- **Phase 4**: 2–4 days
- **Phase 5**: 3–5 days
- **Total Remaining**: 7–12 days

## Key Functions by Cell
Below are the key functions from each cell, detailing their roles in the system:

### Cell 1: Global Configuration Initialization
- **`on_debug_slider_change(change)`**:
  - Updates logging verbosity via a slider (levels 0–3: 0 → CRITICAL, 1 → INFO, 2 and 3 → DEBUG).
  - Saves `pcap_config` to `config.pkl`.
  - Logs verbosity changes.
- **Purpose**: Initializes logging (`smbreplay.log`), `pcap_config` (capture path, verbosity), and `replay_config` (server details). Stores configs in `/home/jovyan/work/smbreplay/config.pkl`. Displays debug slider.

### Cell 2: Setup and SMB2 Utility Functions
- **`shorten_path(full_path, max_components=3)`**:
  - Shortens file paths to the last `max_components` for display.
- **`normalize_path(path)`**:
  - Normalizes paths (lowercase, forward slashes converted to backslashes) for comparison.
- **`get_tree_name_mapping(frames)`**:
  - Maps `smb2.tid` to share names from Tree Connect frames.
- **`check_ssh_connectivity()`**:
  - Verifies SSH connection to `backend` server for `ntap-tshark`.
- **Purpose**: Sets up `itables` for interactive tables, defines SMB2 command mappings (`SMB2_OP_NAME_DESC`), FSCTL constants, and file/info level mappings. Ensures SSH connectivity.

### Cell 3: SMB2 Field Definitions
- **`normalize_hex_field(value, field_name)`**:
  - Normalizes hex fields (e.g., `smb2.nt_status`, `smb2.sesid`) to uppercase hex format (32-bit or 64-bit).
- **`normalize_fid(value)`**:
  - Normalizes `smb2.fid` to 128-bit hex, handling UUID or hex formats.
- **Purpose**: Defines 619 SMB2 fields from `smb2_fields.txt`, tracking fields (e.g., `frame.number`), and hex fields for normalization. Validates critical fields and defines mappings for `smb2.cmd`, `smb2.nt_status`, etc.

### Cell 4: NTAP-Tshark Processing
- **`build_tshark_command(capture, fields, reassembly, packet_limit, log_level, temp_dir, verbose)`**:
  - Constructs SSH command for `ntap-tshark` to extract SMB2 fields from PCAP.
- **`extract_fields(line, fields)`**:
  - Parses `tshark` output lines into dictionaries with frame, stream, IP, and SMB2 fields.
- **`process_tshark_output(cmd, fields)`**:
  - Processes `tshark` output into a DataFrame, optimizing memory and normalizing fields.
- **`save_to_parquet(df, parquet_path)`**:
  - Saves DataFrame to Parquet with zstd compression, handling multi-value fields.
- **`create_remote_directory(case_number, trace_name, force_reingest)`**:
  - Creates and verifies remote session storage directory (`/stingray/<case_number>/.tracer/<trace_name>/sessions`).
- **`clear_directory(directory)`**:
  - Clears files in a remote directory for re-ingestion.
- **`status_callback(message)`**:
  - Logs status messages for ingestion progress.
- **Purpose**: Handles `ntap-tshark` execution, data parsing, and storage in Parquet files. Manages remote directories with SSH.

### Cell 5: Ingestion and Session Extraction
- **`get_packet_count(capture_path)`**:
  - Retrieves packet count from PCAP using `ntap-capinfos`.
- **`normalize_sesid(sesid_str)`**:
  - Normalizes `smb2.sesid`, handling lists/commas and excluding invalid values.
- **`normalize_cmd(cmd_str)`**:
  - Normalizes `smb2.cmd`, handling lists/commas.
- **`save_session_metadata(case_number, trace_name, sessions, output_dir)`**:
  - Saves session metadata (e.g., session count, frame count) to JSON.
- **`run_ingestion(capture_path, reassembly_enabled, force_reingest, verbose)`**:
  - Orchestrates PCAP ingestion: validates PCAP, extracts fields with `tshark`, splits into sessions by `smb2.sesid`, and saves to Parquet and JSON.
- **Purpose**: Manages ingestion of PCAP files, splitting into session-based Parquet files, and storing metadata.

### Cell 6: Session Selection and Utilities
- **`load_capture()`**:
  - Loads capture path from `config.pkl` or `pcap_config`, validating existence.
- **`get_output_dir(capture)`**:
  - Derives session storage directory from capture path, ensuring write access.
- **`list_session_files(output_dir)`**:
  - Lists session Parquet files, normalizing to lowercase and removing duplicates.
- **`shorten_path(path, max_length=50, min_filename_length=20)`**:
  - Shortens paths for display, prioritizing filenames.
- **`normalize_path(path)`**:
  - Normalizes paths for comparison, preserving leading slashes.
- **`get_tree_name_mapping(df)`**:
  - Maps `smb2.tid` to tree names from Tree Connect requests.
- **Purpose**: Provides utilities for loading captures, managing session directories, and handling path normalization for UI display.

### Cell 7: Session Loading and Filtering
- **`load_and_summarize_session(capture, session_file)`**:
  - Loads session Parquet file, returning frames, field options, file options, and default fields.
- **`update_operations(capture, session_file, selected_file, selected_fields)`**:
  - Prepares operations data, normalizing fields and filtering by file if specified.
- **Purpose**: Loads and filters session data, preparing operations for display with normalized fields and mapped descriptions.

### Cell 8: Replay Mechanism
- **`setup_pre_trace_state(conn, selected_operations, default_tree_id)`**:
  - Sets up lab server file system by creating directories and pre-existing files before replay.
- **`replay_session(selected_operations, output_widget)`**:
  - Replays SMB2 operations using `impacket`, handling Tree Connect, Create, Close, Read, and Write commands. Manages `tid` and `fid` mappings.
- **Purpose**: Implements session replay on the lab server, ensuring pre-trace state and executing SMB2 commands.

### Cell 9: Dashboard Setup
- **Purpose**: Initializes dashboard widgets (e.g., `case_number_input`, `capture_dropdown`, `session_dropdown`, `ingest_button`, `replay_button`) and output widgets (`log_output`, `output_cell`). Sets up `JupyterOutputHandler` for logging and initializes `replay_config` values.

### Cell 10: Event Handlers and Rendering
- **`status_callback(message)`**:
  - Logs and displays status messages in `log_output`.
- **`update_progress(message)`**:
  - Updates progress messages in `progress_output`.
- **`update_button_states()`**:
  - Enables/disables buttons based on session availability.
- **`render_page()`**:
  - Renders operations DataTable with mandatory (`Frame`, `Command`, `Path`, `smb2.nt_status`) and optional columns, including summaries of commands and create actions.
- **`on_case_number_change(change)`**:
  - Updates `capture_dropdown` with PCAP files for the entered case number.
- **`on_capture_change(change)`**:
  - Updates `session_dropdown` with session files, adjusts button states, and saves `capture_path` to `config.pkl`.
- **`on_ingest_button_clicked(b)`**:
  - Triggers PCAP ingestion with `run_ingestion` (no force re-ingest).
- **`on_reingest_button_clicked(b)`**:
  - Triggers PCAP re-ingestion with `force_reingest=True`.
- **`on_replay_button_clicked(b)`**:
  - Initiates session replay via `replay_session`.
- **`on_session_change(change)`**:
  - Loads session data, updates `file_combobox` and `check_fields_select`, and renders operations.
- **`on_file_change(change)`**:
  - Filters operations by selected file and re-renders the table.
- **`on_fields_change(change)`**:
  - Updates table columns based on selected fields and re-renders.
- **`on_save_config(b)`**:
  - Saves `replay_config` to `config.pkl`, preserving `pcap_config`.
- **`initialize_dashboard()`**:
  - Auto-loads capture and sessions if pre-set in `config.pkl`.
- **Purpose**: Handles UI interactions, rendering operations tables, and managing configuration saves.

### Cell 11: Dashboard Display
- **`update_dashboard_layout(verbose_level)`**:
  - Constructs and displays the dashboard, conditionally showing logs based on verbosity.
- **Purpose**: Finalizes dashboard display, integrating all widgets and ensuring dynamic log visibility.

## Recent Developments
- Standardized storage path: `/stingray/<case_number>/.tracer/<trace_name>/sessions`.
- Optimized directory clearing and logging.
- Fixed `smb2.sesid` and `smb2.cmd` normalization.
- Enhanced UI with tooltips and dynamic table rendering.
- Added debug slider for verbosity control (Cell 1).

## Next Steps
- **Verify Ingestion**: Re-run on `az3-CVO-python.pcapng` and test with a smaller PCAP.
- **Finalize `smb2.cmd` Mapping**: Complete mappings (0–18) using `SMB2_OP_NAME_DESC`.
- **Develop `impacket` Replay**: Fully implement and test in Cell 8.
- **Start Phase 4**: Validate replay accuracy by comparing PCAPs.
- **Complete Phase 5**: Automate and package the system.

## Getting Started
- **Priority**: Finalize `smb2.cmd` mapping and `impacket` replay.
- **Key Cells**:
  - **Cell 1**: Configuration and logging.
  - **Cells 8–10**: Replay, dashboard, and event handling.
  - **Cell 11**: Dashboard display.
- **Action Plan**:
  1. Test replay script in Cell 8 with `impacket`.
  2. Re-run ingestion on `az3-CVO-python.pcapng` and validate.
  3. Complete `smb2.cmd` mappings (0–18).
  4. Begin Phase 4 validation.

# Project Summary

## Objective
Develop a system to capture, store, and replay SMB2 (Server Message Block version 2) network traffic in a controlled lab environment for diagnostic, testing, and protocol interaction analysis, critical for file sharing in Windows-based systems.

## Approach
- **Capture**: Extract SMB2 packets from PCAP files using `ntap-tshark` via SSH.
- **Storage**: Store data in Parquet files by session (`smb2.sesid`) with JSON metadata.
- **Replay**: Replicate file operations on a lab server using `impacket` (replacing `smbclient` due to `pysmb` deprecation).
- **User Interface**: Provide an interactive dashboard for PCAP selection, ingestion, session visualization, and replay configuration, with settings in `config.pkl`.
- **Modular Design**: Use reusable Python functions in `builtins` across notebook cells.

## Tools
- **Data Storage**: Parquet (compressed), JSON (metadata).
- **Packet Capture**: `tshark`/`ntap-tshark`.
- **Replay**: `impacket`.
- **UI and Data Handling**: `ipywidgets`, `pandas`, `pyarrow`.
- **SSH Interactions**: `paramiko`, `subprocess`.
- **Memory Monitoring**: `psutil`.
- **Logging**: Structured with `JupyterOutputHandler`.

## Development Environment
- Jupyter notebooks (Cells 1–11) in a container with an 8 GB memory limit.
- Remote server access via SSH for packet capture and operations.

## Lab Server Details
- **IP**: 10.216.29.241
- **Domain**: nas-deep.local
- **Username**: jtownsen
- **Password**: [REDACTED]
- **Share**: 2pm

## Key Workflow
1. Capture SMB2 traffic with `ntap-tshark`.
2. Process and store sessions in Parquet files with JSON metadata.
3. Replay sessions using `impacket` on the lab server.
4. Manage via interactive UI with settings in `config.pkl`.

## Pre-Trace Conditions
Ensure the lab server’s file system matches the original state by pre-creating directories and files based on `smb2.filename` and `smb2.cmd` before replay.

## Phases and Current Status
### Phase 1: Comprehensive SMB2 Field Capture
- **Status**: Completed
- **Details**: Captured 619 SMB2 fields (e.g., `smb2.cmd`, `smb2.filename`) using `ntap-tshark`, stored in Parquet with zstd compression. Fixed `smb2.filename` accuracy.

### Phase 2: Session-Based Storage
- **Status**: Completed
- **Details**: Organized data by `smb2.sesid` into Parquet files (e.g., `smb2_session_0x98fc00000000d580.parquet`) with JSON metadata. Validated with 5,741 frames from a 319,000-packet PCAP.

### Phase 3: Replay Mechanism Development
- **Status**: In Progress
- **Details**: Transitioning to `impacket`. Partial `smb2.cmd` mapping (e.g., 5 → Create). Focus on completing `impacket` integration and `smb2.cmd` mapping (0–18).
- **Recent Progress**: UI enhancements (tooltips, `smb2.nt_status` mapping), replay logic refactoring.
- **Time Remaining**: 2–3 days

### Phase 4: Validation and Iteration
- **Status**: Not Started
- **Details**: Plan to compare original and replayed PCAPs, handle edge cases, and scale to multiple sessions.
- **Time Estimate**: 2–4 days

### Phase 5: Automation and Deployment
- **Status**: Partially Completed
- **Details**: Interactive UI with dynamic dropdowns and `config.pkl` settings. Automation and packaging pending.
- **Time Estimate**: 3–5 days

## Timeline
- **Completed**: Phases 1 and 2
- **Phase 3**: 2–3 days
- **Phase 4**: 2–4 days
- **Phase 5**: 3–5 days
- **Total Remaining**: 7–12 days

## Key Functions by Cell
Below are the key functions from each cell, detailing their roles in the system:

### Cell 1: Global Configuration Initialization
- **`on_debug_slider_change(change)`**:
  - Updates logging verbosity via a slider (levels 0–3: 0 → CRITICAL, 1 → INFO, 2 and 3 → DEBUG).
  - Saves `pcap_config` to `config.pkl`.
  - Logs verbosity changes.
- **Purpose**: Initializes logging (`smbreplay.log`), `pcap_config` (capture path, verbosity), and `replay_config` (server details). Stores configs in `/home/jovyan/work/smbreplay/config.pkl`. Displays debug slider.

### Cell 2: Setup and SMB2 Utility Functions
- **`shorten_path(full_path, max_components=3)`**:
  - Shortens file paths to the last `max_components` for display.
- **`normalize_path(path)`**:
  - Normalizes paths (lowercase, forward slashes converted to backslashes) for comparison.
- **`get_tree_name_mapping(frames)`**:
  - Maps `smb2.tid` to share names from Tree Connect frames.
- **`check_ssh_connectivity()`**:
  - Verifies SSH connection to `backend` server for `ntap-tshark`.
- **Purpose**: Sets up `itables` for interactive tables, defines SMB2 command mappings (`SMB2_OP_NAME_DESC`), FSCTL constants, and file/info level mappings. Ensures SSH connectivity.

### Cell 3: SMB2 Field Definitions
- **`normalize_hex_field(value, field_name)`**:
  - Normalizes hex fields (e.g., `smb2.nt_status`, `smb2.sesid`) to uppercase hex format (32-bit or 64-bit).
- **`normalize_fid(value)`**:
  - Normalizes `smb2.fid` to 128-bit hex, handling UUID or hex formats.
- **Purpose**: Defines 619 SMB2 fields from `smb2_fields.txt`, tracking fields (e.g., `frame.number`), and hex fields for normalization. Validates critical fields and defines mappings for `smb2.cmd`, `smb2.nt_status`, etc.

### Cell 4: NTAP-Tshark Processing
- **`build_tshark_command(capture, fields, reassembly, packet_limit, log_level, temp_dir, verbose)`**:
  - Constructs SSH command for `ntap-tshark` to extract SMB2 fields from PCAP.
- **`extract_fields(line, fields)`**:
  - Parses `tshark` output lines into dictionaries with frame, stream, IP, and SMB2 fields.
- **`process_tshark_output(cmd, fields)`**:
  - Processes `tshark` output into a DataFrame, optimizing memory and normalizing fields.
- **`save_to_parquet(df, parquet_path)`**:
  - Saves DataFrame to Parquet with zstd compression, handling multi-value fields.
- **`create_remote_directory(case_number, trace_name, force_reingest)`**:
  - Creates and verifies remote session storage directory (`/stingray/<case_number>/.tracer/<trace_name>/sessions`).
- **`clear_directory(directory)`**:
  - Clears files in a remote directory for re-ingestion.
- **`status_callback(message)`**:
  - Logs status messages for ingestion progress.
- **Purpose**: Handles `ntap-tshark` execution, data parsing, and storage in Parquet files. Manages remote directories with SSH.

### Cell 5: Ingestion and Session Extraction
- **`get_packet_count(capture_path)`**:
  - Retrieves packet count from PCAP using `ntap-capinfos`.
- **`normalize_sesid(sesid_str)`**:
  - Normalizes `smb2.sesid`, handling lists/commas and excluding invalid values.
- **`normalize_cmd(cmd_str)`**:
  - Normalizes `smb2.cmd`, handling lists/commas.
- **`save_session_metadata(case_number, trace_name, sessions, output_dir)`**:
  - Saves session metadata (e.g., session count, frame count) to JSON.
- **`run_ingestion(capture_path, reassembly_enabled, force_reingest, verbose)`**:
  - Orchestrates PCAP ingestion: validates PCAP, extracts fields with `tshark`, splits into sessions by `smb2.sesid`, and saves to Parquet and JSON.
- **Purpose**: Manages ingestion of PCAP files, splitting into session-based Parquet files, and storing metadata.

### Cell 6: Session Selection and Utilities
- **`load_capture()`**:
  - Loads capture path from `config.pkl` or `pcap_config`, validating existence.
- **`get_output_dir(capture)`**:
  - Derives session storage directory from capture path, ensuring write access.
- **`list_session_files(output_dir)`**:
  - Lists session Parquet files, normalizing to lowercase and removing duplicates.
- **`shorten_path(path, max_length=50, min_filename_length=20)`**:
  - Shortens paths for display, prioritizing filenames.
- **`normalize_path(path)`**:
  - Normalizes paths for comparison, preserving leading slashes.
- **`get_tree_name_mapping(df)`**:
  - Maps `smb2.tid` to tree names from Tree Connect requests.
- **Purpose**: Provides utilities for loading captures, managing session directories, and handling path normalization for UI display.

### Cell 7: Session Loading and Filtering
- **`load_and_summarize_session(capture, session_file)`**:
  - Loads session Parquet file, returning frames, field options, file options, and default fields.
- **`update_operations(capture, session_file, selected_file, selected_fields)`**:
  - Prepares operations data, normalizing fields and filtering by file if specified.
- **Purpose**: Loads and filters session data, preparing operations for display with normalized fields and mapped descriptions.

### Cell 8: Replay Mechanism
- **`setup_pre_trace_state(conn, selected_operations, default_tree_id)`**:
  - Sets up lab server file system by creating directories and pre-existing files before replay.
- **`replay_session(selected_operations, output_widget)`**:
  - Replays SMB2 operations using `impacket`, handling Tree Connect, Create, Close, Read, and Write commands. Manages `tid` and `fid` mappings.
- **Purpose**: Implements session replay on the lab server, ensuring pre-trace state and executing SMB2 commands.

### Cell 9: Dashboard Setup
- **Purpose**: Initializes dashboard widgets (e.g., `case_number_input`, `capture_dropdown`, `session_dropdown`, `ingest_button`, `replay_button`) and output widgets (`log_output`, `output_cell`). Sets up `JupyterOutputHandler` for logging and initializes `replay_config` values.

### Cell 10: Event Handlers and Rendering
- **`status_callback(message)`**:
  - Logs and displays status messages in `log_output`.
- **`update_progress(message)`**:
  - Updates progress messages in `progress_output`.
- **`update_button_states()`**:
  - Enables/disables buttons based on session availability.
- **`render_page()`**:
  - Renders operations DataTable with mandatory (`Frame`, `Command`, `Path`, `smb2.nt_status`) and optional columns, including summaries of commands and create actions.
- **`on_case_number_change(change)`**:
  - Updates `capture_dropdown` with PCAP files for the entered case number.
- **`on_capture_change(change)`**:
  - Updates `session_dropdown` with session files, adjusts button states, and saves `capture_path` to `config.pkl`.
- **`on_ingest_button_clicked(b)`**:
  - Triggers PCAP ingestion with `run_ingestion` (no force re-ingest).
- **`on_reingest_button_clicked(b)`**:
  - Triggers PCAP re-ingestion with `force_reingest=True`.
- **`on_replay_button_clicked(b)`**:
  - Initiates session replay via `replay_session`.
- **`on_session_change(change)`**:
  - Loads session data, updates `file_combobox` and `check_fields_select`, and renders operations.
- **`on_file_change(change)`**:
  - Filters operations by selected file and re-renders the table.
- **`on_fields_change(change)`**:
  - Updates table columns based on selected fields and re-renders.
- **`on_save_config(b)`**:
  - Saves `replay_config` to `config.pkl`, preserving `pcap_config`.
- **`initialize_dashboard()`**:
  - Auto-loads capture and sessions if pre-set in `config.pkl`.
- **Purpose**: Handles UI interactions, rendering operations tables, and managing configuration saves.

### Cell 11: Dashboard Display
- **`update_dashboard_layout(verbose_level)`**:
  - Constructs and displays the dashboard, conditionally showing logs based on verbosity.
- **Purpose**: Finalizes dashboard display, integrating all widgets and ensuring dynamic log visibility.

## Recent Developments
- Standardized storage path: `/stingray/<case_number>/.tracer/<trace_name>/sessions`.
- Optimized directory clearing and logging.
- Fixed `smb2.sesid` and `smb2.cmd` normalization.
- Enhanced UI with tooltips and dynamic table rendering.
- Added debug slider for verbosity control (Cell 1).

## Next Steps
- **Verify Ingestion**: Re-run on `az3-CVO-python.pcapng` and test with a smaller PCAP.
- **Finalize `smb2.cmd` Mapping**: Complete mappings (0–18) using `SMB2_OP_NAME_DESC`.
- **Develop `impacket` Replay**: Fully implement and test in Cell 8.
- **Start Phase 4**: Validate replay accuracy by comparing PCAPs.
- **Complete Phase 5**: Automate and package the system.

## Getting Started
- **Priority**: Finalize `smb2.cmd` mapping and `impacket` replay.
- **Key Cells**:
  - **Cell 1**: Configuration and logging.
  - **Cells 8–10**: Replay, dashboard, and event handling.
  - **Cell 11**: Dashboard display.
- **Action Plan**:
  1. Test replay script in Cell 8 with `impacket`.
  2. Re-run ingestion on `az3-CVO-python.pcapng` and validate.
  3. Complete `smb2.cmd` mappings (0–18).
  4. Begin Phase 4 validation.

In [None]:
# Cell 2: Setup, Imports, and SMB2 Utility Functions
import os
import subprocess
import re
from typing import Dict, List, Any
import pandas as pd
from impacket.smb3structs import *
from impacket.nt_errors import ERROR_MESSAGES
from itables import init_notebook_mode
import paramiko
import sys
import builtins

# Cell 1 publishes shared objects as attributes of the `builtins` module.
# Fail fast with an explicit message when this cell is run on a kernel where
# Cell 1 has not been executed yet, then bind a module-level alias.
if not hasattr(builtins, 'logger'):
    raise ImportError("Cell 1 must define logger in builtins.")
logger = builtins.logger

# Same guard for the replay settings dictionary that Cell 1 stores on builtins.
if not hasattr(builtins, 'replay_config'):
    raise ImportError("Cell 1 must define replay_config in builtins.")
replay_config = builtins.replay_config

# Switch itables on for this notebook session.
# NOTE(review): all_interactive=True presumably makes every DataFrame render
# as an interactive table — confirm against the installed itables version.
init_notebook_mode(all_interactive=True)
logger.debug("Initialized itables notebook mode with all_interactive=True")

# Constants
# SEP is a four-space string; it is not referenced in the visible part of
# this cell. NTAPSHARK_PATH is the path of the ntap-tshark executable.
SEP = 4 * " "
NTAPSHARK_PATH = "/usr/local/bin/ntap-tshark"

# SMB2 command code -> (request label, response label).
# Every label has the form "<command> Request" / "<command> Response", so the
# table is derived from the command names listed in command-code order.
SMB2_OP_NAME_DESC = {
    cmd_code: (f"{cmd_name} Request", f"{cmd_name} Response")
    for cmd_code, cmd_name in enumerate((
        "Negotiate Protocol",  # 0
        "Session Setup",       # 1
        "Session Logoff",      # 2
        "Tree Connect",        # 3
        "Tree Disconnect",     # 4
        "Create",              # 5
        "Close",               # 6
        "Flush",               # 7
        "Read",                # 8
        "Write",               # 9
        "Lock",                # 10
        "IOCTL",               # 11
        "Cancel",              # 12
        "Echo",                # 13
        "Query Directory",     # 14
        "Change Notify",       # 15
        "Query Info",          # 16
        "Set Info",            # 17
        "Oplock Break",        # 18
    ))
}
logger.debug(f"Defined SMB2_OP_NAME_DESC with {len(SMB2_OP_NAME_DESC)} command mappings")

# Utility Functions
def shorten_path(full_path: str, max_components: int = 3) -> str:
    """Abbreviate a backslash-separated path for display.

    Args:
        full_path: Path using ``\\`` as separator, or the literal
            "Entire Stream", which is passed through untouched.
        max_components: Number of trailing path components to keep.

    Returns:
        ``full_path`` unchanged when it is "Entire Stream" or has at most
        ``max_components`` components; otherwise ``...\\`` followed by the
        last ``max_components`` components.
    """
    logger.debug(f"Shortening path: {full_path}")
    if full_path != "Entire Stream":
        parts = full_path.split('\\')
        if len(parts) > max_components:
            kept_tail = '\\'.join(parts[-max_components:])
            shortened = '...\\' + kept_tail
            logger.debug(f"Shortened path to: {shortened}")
            return shortened
    # Sentinel value or already short enough: nothing to trim.
    return full_path

def normalize_path(path: str) -> str:
    """Return a canonical form of ``path`` for comparisons.

    Missing values (anything ``pd.isna`` reports as NA) and the placeholder
    strings "N/A", "" and "Entire Stream" all collapse to "N/A". Any other
    value is stripped of surrounding whitespace, lower-cased, and has its
    forward slashes replaced by backslashes.
    """
    logger.debug(f"Normalizing path: {path}")
    placeholder_values = ("N/A", "", "Entire Stream")
    if pd.isna(path) or path in placeholder_values:
        return "N/A"
    trimmed = path.strip()
    normalized = trimmed.lower().replace('/', '\\')
    logger.debug(f"Normalized path to: {normalized}")
    return normalized

def get_tree_name_mapping(frames: pd.DataFrame) -> Dict[str, str]:
    """Map tree IDs to share names based on Tree Connect frames.

    Each Tree Connect request (``smb2.cmd == '3'`` and ``smb2.flags.response``
    not equal to ``'True'``) that carries an ``smb2.tree`` path is paired with
    the first Tree Connect response whose ``frame.number`` is greater than the
    request's and which has a usable ``smb2.tid``. Pairing relies on frame
    order only; message IDs are not consulted.

    Args:
        frames: Frame records as a DataFrame, or a list that
            ``pd.DataFrame`` can convert. Command and response flag are
            compared as the strings ``'3'`` and ``'True'``.

    Returns:
        Dict of ``smb2.tid`` value -> last backslash-separated component of
        the request's tree path. Empty when the input has no frames or lacks
        the ``smb2.cmd``, ``smb2.flags.response`` or ``frame.number`` columns.
    """
    logger.info("Generating tree name mappings")
    tree_mapping: Dict[str, str] = {}
    if isinstance(frames, list):
        frames = pd.DataFrame(frames)
        logger.debug("Converted input list to DataFrame")

    # An empty capture (e.g. pd.DataFrame([])) has no columns at all, so the
    # column lookups below would raise KeyError. Report "no mappings" instead.
    required_columns = ('smb2.cmd', 'smb2.flags.response', 'frame.number')
    missing_columns = [col for col in required_columns if col not in frames.columns]
    if missing_columns:
        logger.warning(f"Cannot derive tree name mappings; missing columns: {', '.join(missing_columns)}")
        logger.info(f"Found {len(tree_mapping)} tree ID mappings")
        return tree_mapping

    is_tree_connect = frames['smb2.cmd'] == '3'
    request_frames = frames[is_tree_connect & (frames['smb2.flags.response'] != 'True')]
    logger.debug(f"Found {len(request_frames)} Tree Connect request frames")

    # Loop-invariant work: select the responses and parse their frame numbers
    # once instead of re-scanning the whole capture for every request.
    # Unparseable frame numbers become NaN and therefore never compare as
    # "later than" a request.
    response_frames = frames[is_tree_connect & (frames['smb2.flags.response'] == 'True')]
    response_numbers = pd.to_numeric(response_frames['frame.number'], errors='coerce')

    for _, request_frame in request_frames.iterrows():
        tree_path = request_frame.get('smb2.tree', None)
        if pd.isna(tree_path) or not tree_path:
            logger.debug(f"No tree path in request frame {request_frame.get('frame.number')}")
            continue
        # split() returns the whole string when there is no backslash, so no
        # separate "contains a backslash" check is needed.
        share_name = tree_path.split('\\')[-1]
        request_number = int(request_frame.get('frame.number', 0))
        later_responses = response_frames[response_numbers > request_number]
        for _, response_frame in later_responses.iterrows():
            tid = response_frame.get('smb2.tid', None)
            if tid and pd.notna(tid):
                tree_mapping[tid] = share_name
                logger.debug(f"Mapped tree ID {tid} to share {share_name} from frame {request_frame.get('frame.number')}")
                break

    logger.info(f"Found {len(tree_mapping)} tree ID mappings")
    return tree_mapping

def check_ssh_connectivity(
    ssh_host: str = "backend",
    ssh_user: str = "root",
    ssh_key: str = "/home/jovyan/.ssh/id_rsa",
    timeout: float = 5,
) -> bool:
    """Verify that an SSH session to the backend server can be opened.

    Args:
        ssh_host: Host name to connect to.
        ssh_user: Login user.
        ssh_key: Path of the private key file handed to paramiko.
        timeout: Value passed as paramiko's ``timeout`` argument to ``connect``.

    Returns:
        True when the connection attempt succeeds, False when any exception
        is raised along the way (the failure is logged at CRITICAL level).
    """
    logger.info(f"Checking SSH connection to {ssh_host}")
    try:
        client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts host keys that are not already
        # known, so the server's identity is not verified — confirm this is
        # acceptable for the lab environment.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(ssh_host, username=ssh_user, key_filename=ssh_key, timeout=timeout)
        finally:
            # Release the client whether or not connect() succeeded so a
            # failed attempt does not leave it open.
            client.close()
        logger.info(f"SSH connection to {ssh_host} successful")
        return True
    except Exception as e:
        logger.critical(f"SSH connection to {ssh_host} failed: {str(e)}")
        return False

# FSCTL_* control codes, keyed by symbolic name.
# File-system control codes (0x9xxxx) follow MS-FSCC; the network/SMB codes
# (DFS, PIPE, SRV, LMR, ...) follow the CtlCode table of the SMB2 IOCTL
# request in MS-SMB2 section 2.2.31.
FSCTL_CONSTANTS = {
    "FSCTL_CREATE_OR_GET_OBJECT_ID": 0x900c0,
    "FSCTL_DELETE_OBJECT_ID": 0x900a0,
    "FSCTL_DELETE_REPARSE_POINT": 0x900ac,
    "FSCTL_DUPLICATE_EXTENTS_TO_FILE": 0x98344,
    "FSCTL_FILESYSTEM_GET_STATISTICS": 0x90060,
    "FSCTL_FIND_FILES_BY_SID": 0x9008f,
    "FSCTL_GET_COMPRESSION": 0x9003c,
    "FSCTL_GET_INTEGRITY_INFORMATION": 0x9027c,
    "FSCTL_GET_NTFS_VOLUME_DATA": 0x90064,
    "FSCTL_GET_REFS_VOLUME_DATA": 0x902d8,
    "FSCTL_GET_OBJECT_ID": 0x9009c,
    "FSCTL_GET_REPARSE_POINT": 0x900a8,
    "FSCTL_GET_RETRIEVAL_POINTERS": 0x90073,
    "FSCTL_IS_PATHNAME_VALID": 0x9002c,
    "FSCTL_LMR_SET_LINK_TRACKING_INFORMATION": 0x1400ec,
    "FSCTL_OFFLOAD_READ": 0x94264,
    "FSCTL_OFFLOAD_WRITE": 0x98268,
    "FSCTL_QUERY_ALLOCATED_RANGES": 0x940cf,
    "FSCTL_QUERY_FAT_BPB": 0x90058,
    "FSCTL_QUERY_FILE_REGIONS": 0x90284,
    "FSCTL_QUERY_ON_DISK_VOLUME_INFO": 0x9013c,
    "FSCTL_QUERY_SPARING_INFO": 0x90138,
    "FSCTL_READ_FILE_USN_DATA": 0x900eb,
    "FSCTL_RECALL_FILE": 0x90117,
    "FSCTL_SET_COMPRESSION": 0x9c040,
    "FSCTL_SET_DEFECT_MANAGEMENT": 0x98134,
    "FSCTL_SET_ENCRYPTION": 0x900d7,
    "FSCTL_SET_INTEGRITY_INFORMATION": 0x9c280,
    "FSCTL_SET_OBJECT_ID": 0x90098,
    "FSCTL_SET_OBJECT_ID_EXTENDED": 0x900bc,
    "FSCTL_SET_SPARSE": 0x900c4,
    "FSCTL_SET_ZERO_DATA": 0x980c8,
    "FSCTL_SET_ZERO_ON_DEALLOCATION": 0x90194,
    "FSCTL_SIS_COPYFILE": 0x90100,
    "FSCTL_WRITE_USN_CLOSE_RECORD": 0x900ef,
    # Codes listed in MS-SMB2 2.2.31 (SMB2 IOCTL Request, CtlCode).
    "FSCTL_DFS_GET_REFERRALS": 0x60194,
    "FSCTL_PIPE_PEEK": 0x11400c,
    "FSCTL_PIPE_WAIT": 0x110018,
    "FSCTL_PIPE_TRANSCEIVE": 0x11c017,
    "FSCTL_SRV_COPYCHUNK": 0x1440f2,
    "FSCTL_SRV_ENUMERATE_SNAPSHOTS": 0x144064,
    "FSCTL_SRV_REQUEST_RESUME_KEY": 0x140078,
    "FSCTL_SRV_READ_HASH": 0x1441bb,
    "FSCTL_SRV_COPYCHUNK_WRITE": 0x1480f2,
    "FSCTL_LMR_REQUEST_RESILIENCY": 0x1401d4,
    "FSCTL_QUERY_NETWORK_INTERFACE_INFO": 0x1401fc,
    "FSCTL_SET_REPARSE_POINT": 0x900a4,
    "FSCTL_DFS_GET_REFERRALS_EX": 0x601b0,
    "FSCTL_FILE_LEVEL_TRIM": 0x98208,
    "FSCTL_VALIDATE_NEGOTIATE_INFO": 0x140204,
    "FSCTL_QUERY_SHARED_VIRTUAL_DISK_SUPPORT": 0x90300,
    "FSCTL_SVHDX_SYNC_TUNNEL_REQUEST": 0x90304,
}
logger.debug(f"Defined FSCTL_CONSTANTS with {len(FSCTL_CONSTANTS)} entries")

# Directory-listing information classes: symbolic name -> numeric class id.
FILE_INFO_CLASSES = dict(
    FILE_DIRECTORY_INFORMATION=1,
    FILE_FULL_DIRECTORY_INFORMATION=2,
    FILEID_FULL_DIRECTORY_INFORMATION=38,
    FILE_BOTH_DIRECTORY_INFORMATION=3,
    FILEID_BOTH_DIRECTORY_INFORMATION=37,
    FILENAMES_INFORMATION=12,
)
logger.debug(f"Defined FILE_INFO_CLASSES with {len(FILE_INFO_CLASSES)} entries")

# SMB2 info types (file / filesystem / security): symbolic name -> numeric code.
SMB2_INFO_LEVELS = dict(zip(
    ("SMB2_0_INFO_FILE", "SMB2_0_INFO_FILESYSTEM", "SMB2_0_INFO_SECURITY"),
    (0x01, 0x02, 0x03),
))
logger.debug(f"Defined SMB2_INFO_LEVELS with {len(SMB2_INFO_LEVELS)} entries")

# File and filesystem information classes: symbolic name -> numeric class id.
# The SMB2_FILE_* / SMB2_*_EA / SMB2_ATTRIBUTE_TAG entries are file information
# classes and the SMB2_FILESYSTEM_* entries are filesystem information classes
# (numbering per MS-FSCC). The two families share one dictionary, so the same
# number appears under both (e.g. 5 is SMB2_FILE_STANDARD_INFO and
# SMB2_FILESYSTEM_ATTRIBUTE_INFO); a reverse lookup by value alone is ambiguous.
SMB2_FILE_INFO_CLASSES = {
    "SMB2_FILE_ACCESS_INFO": 8,
    "SMB2_FILE_ALIGNMENT_INFO": 17,
    "SMB2_FILE_ALL_INFO": 18,
    "SMB2_FILE_ALTERNATE_NAME_INFO": 21,
    "SMB2_ATTRIBUTE_TAG_INFO": 35,
    "SMB2_FILE_BASIC_INFO": 4,
    "SMB2_FILE_COMPRESSION_INFO": 28,
    "SMB2_FILE_EA_INFO": 7,
    "SMB2_FULL_EA_INFO": 15,
    "SMB2_FILE_INTERNAL_INFO": 6,
    "SMB2_FILE_MODE_INFO": 16,
    "SMB2_FILE_NETWORK_OPEN_INFO": 34,
    "SMB2_FILE_PIPE_INFO": 23,
    "SMB2_FILE_POSITION_INFO": 14,
    "SMB2_FILE_STANDARD_INFO": 5,
    "SMB2_FILE_STREAM_INFO": 22,
    "SMB2_FILESYSTEM_ATTRIBUTE_INFO": 5,
    "SMB2_FILESYSTEM_CONTROL_INFO": 6,
    "SMB2_FILESYSTEM_DEVICE_INFO": 4,
    "SMB2_FILESYSTEM_FULL_SIZE_INFO": 7,
    "SMB2_FILESYSTEM_OBJECT_ID_INFO": 8,
    "SMB2_FILESYSTEM_SECTOR_SIZE_INFO": 11,
    "SMB2_FILESYSTEM_SIZE_INFO": 3,
    "SMB2_FILESYSTEM_VOLUME_INFO": 1,
    "SMB2_FILE_ALLOCATION_INFO": 19,
    "SMB2_FILE_DISPOSITION_INFO": 13,
    "SMB2_FILE_END_OF_FILE_INFO": 20,
    "SMB2_FILE_LINK_INFO": 11,
    "SMB2_FILE_RENAME_INFO": 10,
    "SMB2_FILE_SHORT_NAME_INFO": 40,         # FileShortNameInformation
    "SMB2_FILE_VALID_DATA_LENGTH_INFO": 39,  # FileValidDataLengthInformation
}
logger.debug(f"Defined SMB2_FILE_INFO_CLASSES with {len(SMB2_FILE_INFO_CLASSES)} entries")

# Publish this cell's helpers and lookup tables as attributes of the
# `builtins` module so other cells can reach them (Cell 3 pulls several of
# them in with `from builtins import ...`). ERROR_MESSAGES is the table
# imported from impacket.nt_errors at the top of this cell.
builtins.check_ssh_connectivity = check_ssh_connectivity
builtins.SMB2_OP_NAME_DESC = SMB2_OP_NAME_DESC
builtins.FSCTL_CONSTANTS = FSCTL_CONSTANTS
builtins.FILE_INFO_CLASSES = FILE_INFO_CLASSES
builtins.SMB2_INFO_LEVELS = SMB2_INFO_LEVELS
builtins.SMB2_FILE_INFO_CLASSES = SMB2_FILE_INFO_CLASSES
builtins.ERROR_MESSAGES = ERROR_MESSAGES
builtins.shorten_path = shorten_path
builtins.normalize_path = normalize_path
builtins.get_tree_name_mapping = get_tree_name_mapping

# Setup Confirmation
# Probe the SSH connection once when the cell runs and abort on failure.
# NOTE(review): sys.exit(1) raises SystemExit; inside a notebook kernel this
# presumably stops the cell rather than ending the process — confirm that is
# the intended abort mechanism (Cell 3 raises RuntimeError for the same
# connectivity failure).
if not check_ssh_connectivity():
    logger.critical("Setup aborted due to SSH connection failure")
    sys.exit(1)
else:
    logger.info(f"Setup initialized on {pd.Timestamp.now()} for remote ntap-tshark on backend")

In [None]:
# Cell 3: Define comprehensive SMB2 and related fields from smb2_fields.txt
import pandas as pd
import subprocess
import os
import uuid
from typing import List, Dict
import builtins

# Import from builtins (set in Cells 1–2)
from builtins import logger, check_ssh_connectivity, SMB2_OP_NAME_DESC, FSCTL_CONSTANTS, FILE_INFO_CLASSES, SMB2_INFO_LEVELS, SMB2_FILE_INFO_CLASSES, ERROR_MESSAGES

# Connection settings for the remote host that runs ntap-tshark
SSH_KEY = "/home/jovyan/.ssh/id_rsa"
SSH_USER = "root"
SSH_HOST = "backend"
NTAPSHARK_PATH = "/usr/local/bin/ntap-tshark"

# Tracking fields for network context (order is kept exactly as listed)
TRACKING_FIELDS = [
    "frame.number",
    "tcp.stream",
    "ip.src",
    "ip.dst",
    "frame.time",
    "frame.time_delta",
    "frame.len",
    "tcp.srcport",
    "tcp.dstport",
    "tcp.seq",
    "tcp.ack",
    "tcp.len",
    "ip.ttl",
    "ip.proto",
    "frame.time_epoch",
    "tcp.flags",
    "tcp.window_size",
    "ip.id",
]
logger.debug(f"Defined {len(TRACKING_FIELDS)} TRACKING_FIELDS")

# Hex fields requiring normalization
HEX_FIELDS = [
    "smb2.nt_status",
    "smb2.ioctl.function",
    "smb2.tid",
    "smb2.sesid",
    "smb2.msg_id",
    "smb2.fid",
    "smb2.create.action",  # Added for create action normalization
]
logger.debug(f"Defined HEX_FIELDS: {', '.join(HEX_FIELDS)}")

# Create action mappings (from impacket.smb3structs); the list position is the
# numeric action code, so the resulting dict is keyed 0..5.
CREATE_ACTION_DESC = dict(enumerate([
    "FILE_SUPERSEDED",
    "FILE_OPENED",
    "FILE_CREATED",
    "FILE_OVERWRITTEN",
    "FILE_EXISTS",
    "FILE_DOES_NOT_EXIST",
]))
logger.debug(f"Defined CREATE_ACTION_DESC with {len(CREATE_ACTION_DESC)} mappings")

# Generate smb2_fields.txt if missing. An empty file is treated as missing so
# that a previously failed generation cannot poison every later run.
smb2_fields_file = "smb2_fields.txt"
if not os.path.exists(smb2_fields_file) or os.path.getsize(smb2_fields_file) == 0:
    logger.info(f"Generating {smb2_fields_file}...")
    if not check_ssh_connectivity():
        logger.critical(f"Cannot connect to {SSH_HOST} for ntap-tshark")
        raise RuntimeError(f"Cannot connect to {SSH_HOST} for ntap-tshark")
    # Run ssh directly instead of a local `ssh ... | grep smb2 > file` shell
    # pipeline: the exit status is then ssh/ntap-tshark's own (not grep's), and
    # no file is created until the output is known to be usable.
    cmd = ["ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}", NTAPSHARK_PATH, "-G", "fields"]
    logger.debug(f"Executing command: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Local equivalent of the former `grep smb2` filter.
    smb2_dump_lines = [ln for ln in result.stdout.splitlines(keepends=True) if "smb2" in ln]
    if result.returncode != 0 or not smb2_dump_lines:
        # stdout can be the full field dump, so only log its head.
        logger.critical(f"Error generating {smb2_fields_file}: stderr={result.stderr}, stdout={result.stdout[:500]}")
        raise RuntimeError(f"Failed to generate {smb2_fields_file}")
    # Write under a temporary name and rename, so an interrupted write never
    # leaves a truncated smb2_fields.txt behind.
    tmp_fields_file = f"{smb2_fields_file}.tmp"
    with open(tmp_fields_file, "w") as f:
        f.writelines(smb2_dump_lines)
    os.replace(tmp_fields_file, smb2_fields_file)
    logger.info(f"{smb2_fields_file} generated successfully")

# Extract SMB2 fields from smb2_fields.txt
logger.info(f"Reading {smb2_fields_file}")
with open(smb2_fields_file, "r") as f:
    smb2_field_lines = f.readlines()
logger.info(f"Read {len(smb2_field_lines)} lines from {smb2_fields_file}")
if not smb2_field_lines:
    logger.critical(f"Error: {smb2_fields_file} is empty. Check ntap-tshark -G fields output.")
    raise RuntimeError(f"{smb2_fields_file} is empty")

# Keep only field rows: tab-separated, first column "F", third column is the
# filter name and must live in the smb2.* namespace.
SMB2_FIELDS = []
for line in smb2_field_lines:
    parts = line.strip().split("\t")
    if len(parts) >= 4 and parts[0] == "F" and parts[2].startswith("smb2."):
        SMB2_FIELDS.append(parts[2])
logger.info(f"Extracted {len(SMB2_FIELDS)} SMB2 fields from {smb2_fields_file}")
logger.debug(f"First 5 SMB2 fields: {', '.join(SMB2_FIELDS[:5])}...")

# Combine and deduplicate fields
FIELDS = sorted(set(TRACKING_FIELDS + SMB2_FIELDS))
logger.info(f"Combined {len(FIELDS)} unique fields for ingestion")

# Field corrections. With the "smb2." prefix filter above and the current
# TRACKING_FIELDS this branch cannot trigger; it is kept as a safety net.
if 'smb.file_name' in FIELDS:
    logger.warning(f"Replacing 'smb.file_name' with 'smb2.filename' in FIELDS")
    FIELDS[FIELDS.index('smb.file_name')] = 'smb2.filename'

# Validate critical fields (missing ones are only reported, not fatal)
CRITICAL_FIELDS = [
    "smb2.cmd", "smb2.sesid", "smb2.filename", "smb2.write_data", "smb2.read_data",
    "smb2.ioctl.function", "smb2.tid", "smb2.nt_status", "smb2.msg_id", "smb2.fid",
    "smb2.tree", "smb2.create.disposition", "smb2.create.options", "smb2.share_flags",
    "smb2.access_mask", "smb2.file_attributes", "smb2.infolevel", "smb2.buffer_code",
    "smb2.create.action"  # Added to ensure capture
]
missing_fields = [f for f in CRITICAL_FIELDS if f not in FIELDS]
if missing_fields:
    logger.warning(f"Critical fields missing: {', '.join(missing_fields)}. Proceeding as fields are confirmed present in smb2_fields.txt.")
else:
    logger.info(f"Critical fields validated: {', '.join(CRITICAL_FIELDS)}")

logger.info(f"Defined {len(FIELDS)} fields for ingestion")
logger.debug(f"First 10 fields: {', '.join(FIELDS[:10])}...")

# Generic normalization function for hex fields
def normalize_hex_field(value, field_name):
    """Normalize hex fields to uppercase hex format (0xXXXXXXXX or 0xXXXXXXXXXXXXXXXX).

    Args:
        value: Raw field value. May be a string (optionally "0x"-prefixed and
            optionally comma-separated, in which case the first entry is used),
            a real number (including numpy integer scalars), or a list-like of
            such values (the first element is used).
        field_name: tshark field name; selects the output width.
            'smb2.sesid' and 'smb2.msg_id' are rendered as 16 hex digits,
            'smb2.create.action' as a plain decimal string, everything else as
            8 hex digits.

    Returns:
        The normalized string, or None when the value is missing or cannot be
        parsed (a warning is logged for unparseable values).

    Note:
        Strings are always parsed as base 16, with or without a "0x" prefix.
    """
    import numbers  # function-scope import: only needed for the type check below

    logger.debug(f"Normalizing {field_name}: {value}")
    # Multi-valued fields can arrive as lists; pd.isna() on a list returns an
    # array whose truth value is ambiguous, so unwrap before the missing check.
    if pd.api.types.is_list_like(value):
        items = list(value)
        value = items[0] if items else None
    if value is None or pd.isna(value) or value == '':
        return None
    try:
        if isinstance(value, str):
            # Handle multi-valued strings (e.g., '0xC05D0000,0x00000000')
            value = value.split(',')[0].strip().lower().replace('0x', '')
            value = int(value, 16)
        elif isinstance(value, numbers.Real):
            # numbers.Real also covers numpy integer scalars, which are not
            # subclasses of the built-in int.
            value = int(value)
        else:
            logger.warning(f"Invalid {field_name} type: {type(value)}")
            return None
        if field_name in ['smb2.sesid', 'smb2.msg_id']:
            normalized = f"0x{value:016X}"  # 64-bit fields
        elif field_name == 'smb2.create.action':
            normalized = str(value)  # Keep as integer string for mapping
        else:
            normalized = f"0x{value:08X}"  # 32-bit fields
        logger.debug(f"Normalized {field_name} to: {normalized}")
        return normalized
    except (ValueError, TypeError, OverflowError) as e:
        # OverflowError: int() of an infinite float
        logger.warning(f"Invalid {field_name} format: {value}, error: {e}")
        return None

# Normalization function for smb2.fid
def normalize_fid(value):
    """Normalize smb2.fid to a 128-bit uppercase hex string, handling UUID or hex formats.

    Args:
        value: Raw smb2.fid value: a GUID-style string
            (e.g. '01d01154-9d3e-80b6-c1e9-ce0000000000'), a hex string with or
            without "0x", a number, or a list-like of those. For comma-separated
            strings and list-likes only the first entry is used.

    Returns:
        '0x' followed by 32 uppercase hex digits, or None when the value is
        missing or cannot be parsed (logged at debug level only, to limit spam).
    """
    logger.debug(f"Normalizing smb2.fid: {value}")
    # Unwrap list-likes first: pd.isna() on a list is not a usable boolean.
    if pd.api.types.is_list_like(value):
        items = list(value)
        value = items[0] if items else None
    if value is None or pd.isna(value) or value == '':
        return None
    try:
        if isinstance(value, str):
            # Take the first occurrence BEFORE format detection; otherwise a
            # multi-valued GUID string ('guid1,guid2') reaches uuid.UUID with a
            # comma in it and is rejected.
            value = value.split(',')[0].strip()
            # Handle UUID-like format (e.g., '01d01154-9d3e-80b6-c1e9-ce0000000000')
            if '-' in value:
                uuid_obj = uuid.UUID(value.replace('-', ''))
                normalized = f"0x{uuid_obj.int:032X}"
                logger.debug(f"Normalized UUID smb2.fid to: {normalized}")
                return normalized
            value = int(value.lower().replace('0x', ''), 16)
        elif isinstance(value, (int, float)):
            value = int(value)
        else:
            logger.warning(f"Invalid smb2.fid type: {type(value)}")
            return None
        normalized = f"0x{value:032X}"  # 128-bit field
        logger.debug(f"Normalized smb2.fid to: {normalized}")
        return normalized
    except (ValueError, TypeError, OverflowError) as e:
        logger.debug(f"Invalid smb2.fid format: {value}, error: {e}")
        return None  # Suppress warning to reduce log spam

# Combine info level mappings
def _merge_info_level_tables(*tables):
    """Merge info-level tables into one {decimal-code-string: name} lookup.

    The smb2.infolevel normalizer produces decimal strings such as "5", so the
    lookup must be keyed by code. Tables are accepted in either direction:
    a {name: int_code} entry is inverted, anything else is taken as
    {code: name} and keyed by str(code). When several string names share a
    code (file and filesystem classes reuse the same numbers, e.g. 5), the
    names are joined with " / " instead of silently overwriting each other.
    """
    merged = {}
    for table in tables:
        for key, value in table.items():
            if isinstance(value, int) and isinstance(key, str):
                code, name = str(value), key   # name -> code entry: invert it
            else:
                code, name = str(key), value   # already code -> name
            if code not in merged:
                merged[code] = name
                continue
            previous = merged[code]
            if isinstance(previous, str) and isinstance(name, str):
                if name not in previous.split(" / "):
                    merged[code] = f"{previous} / {name}"
            else:
                merged[code] = name  # non-string payloads: later table wins
    return merged


INFO_LEVEL_MAPPING = _merge_info_level_tables(FILE_INFO_CLASSES, SMB2_INFO_LEVELS, SMB2_FILE_INFO_CLASSES)
logger.debug(f"Combined {len(INFO_LEVEL_MAPPING)} info level mappings")

def normalize_decimal_field(value, field_name):
    """Normalize a numeric code field (smb2.cmd, smb2.infolevel) to a decimal string.

    Args:
        value: Raw value: a decimal or "0x"-prefixed string (the first entry of
            a comma-separated string is used), a number, or a list-like of
            those (the first element is used).
        field_name: Field name, used only in the warning message.

    Returns:
        The code as a decimal string (e.g. "5"), or None when the value is
        missing or unparseable. A value of 0 is a valid code and yields "0".
    """
    if pd.api.types.is_list_like(value):
        items = list(value)
        value = items[0] if items else None
    if value is None or pd.isna(value) or value == '':
        return None
    try:
        if isinstance(value, str):
            text = value.split(',')[0].strip()
            if not text:
                return None
            if text.lower().startswith('0x'):
                return str(int(text, 16))
            # int(float(...)) also accepts values rendered as "5.0"
            return str(int(float(text)))
        return str(int(value))
    except (ValueError, TypeError, OverflowError) as e:
        logger.warning(f"Invalid {field_name} format: {value}, error: {e}")
        return None


# Per-field lookup tables and normalizers. Each entry holds:
#   "mapping"     - dict from the normalized value to a display name ({} = none)
#   "normalize"   - callable turning a raw value into the mapping's key format
#   "description" - human-readable summary
FIELD_MAPPINGS = {
    "smb2.cmd": {
        "mapping": {str(k): v[0] if not v[1] else f"{v[0]} / {v[1]}" for k, v in SMB2_OP_NAME_DESC.items()},
        "normalize": lambda x: normalize_decimal_field(x, "smb2.cmd"),
        "description": "Maps SMB2 command codes to operation names (request/response)."
    },
    "smb2.nt_status": {
        # NOTE(review): the normalizer yields strings like "0xC0000022"; the
        # keys here are taken verbatim from ERROR_MESSAGES - confirm they are
        # in that same string form, otherwise lookups will never match.
        "mapping": {k: v[0] for k, v in ERROR_MESSAGES.items()},
        "normalize": lambda x: normalize_hex_field(x, "smb2.nt_status"),
        "description": "Maps NT status codes to error names (impacket)."
    },
    "smb2.ioctl.function": {
        "mapping": {str(f"0x{v:08X}"): k for k, v in FSCTL_CONSTANTS.items()},
        "normalize": lambda x: normalize_hex_field(x, "smb2.ioctl.function"),
        "description": "Maps IOCTL function codes to FSCTL names."
    },
    "smb2.tid": {
        "mapping": {},
        "normalize": lambda x: normalize_hex_field(x, "smb2.tid"),
        "description": "Normalizes tree ID to hex format."
    },
    "smb2.sesid": {
        "mapping": {},
        "normalize": lambda x: normalize_hex_field(x, "smb2.sesid"),
        "description": "Normalizes session ID to hex format."
    },
    "smb2.msg_id": {
        "mapping": {},
        "normalize": lambda x: normalize_hex_field(x, "smb2.msg_id"),
        "description": "Normalizes message ID to hex format."
    },
    "smb2.fid": {
        "mapping": {},
        "normalize": normalize_fid,
        "description": "Normalizes file ID to 128-bit hex format, handling UUIDs."
    },
    "smb2.infolevel": {
        "mapping": INFO_LEVEL_MAPPING,
        "normalize": lambda x: normalize_decimal_field(x, "smb2.infolevel"),
        "description": "Maps info level codes to file, directory, and filesystem info class names."
    },
    "smb2.create.action": {
        "mapping": {str(k): v for k, v in CREATE_ACTION_DESC.items()},
        "normalize": lambda x: normalize_hex_field(x, "smb2.create.action"),
        "description": "Maps create action codes to action names (e.g., FILE_OPENED)."
    }
}
logger.info(f"Field mappings defined for: {', '.join(FIELD_MAPPINGS.keys())}")
logger.info(f"Hex fields tracked: {', '.join(HEX_FIELDS)}")

builtins.FIELDS = FIELDS
builtins.FIELD_MAPPINGS = FIELD_MAPPINGS
builtins.HEX_FIELDS = HEX_FIELDS
builtins.CREATE_ACTION_DESC = CREATE_ACTION_DESC  # Export for use in other cells

In [None]:
# Cell 4: Define ntap-tshark processing functions
from typing import List, Dict, Tuple, Any
import subprocess
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import shlex
import os
import json
import psutil
import traceback
import builtins

# Import from builtins (set in Cells 1–3)
from builtins import logger, check_ssh_connectivity, FIELDS

# Connection settings for the remote ntap-tshark host. These repeat the values
# assigned in Cell 3 so that this cell states its own configuration.
SSH_KEY = "/home/jovyan/.ssh/id_rsa"
SSH_USER = "root"
SSH_HOST = "backend"
NTAPSHARK_PATH = "/usr/local/bin/ntap-tshark"
# File name for the full tshark output in Parquet form (not referenced elsewhere in this cell).
PARQUET_FILE = "tshark_output_full.parquet"

# Report how many fields will be requested; the debug line shows at most the first 50.
logger.info(f"Fields being extracted: {len(FIELDS)} fields")
logger.debug(f"Fields (first 50): {', '.join(FIELDS[:50])}...")

def build_tshark_command(capture, fields, reassembly=False, packet_limit=None, log_level="debug", temp_dir="/tmp", verbose=False):
    """Assemble the ssh invocation that runs ntap-tshark against a capture.

    The remote command reads `capture`, keeps only SMB2 packets, and prints the
    requested `fields` as pipe-separated columns with a header row and all
    occurrences of each field. Every remote argument is shell-quoted and the
    whole remote command is passed to ssh as one argument.

    Args:
        capture: Path of the capture file on the remote host.
        fields: Field names to print, one `-e` per entry, in this order.
        reassembly: Adds the `-2` flag when true.
        packet_limit: Adds `-c <limit>` when not None.
        log_level: Value for `--log-level`.
        temp_dir: Value for `--temp-dir`.
        verbose: Adds the `-V` flag when true.

    Returns:
        (cmd, fields): the argv list for subprocess and the `fields` argument
        unchanged.
    """
    logger.info(f"Building tshark command for capture: {capture}")

    fixed_options = (
        ("--log-level", log_level),
        ("--temp-dir", temp_dir),
        ("-r", capture),
        ("-Y", "smb2"),
        ("-T", "fields"),
        ("-E", "separator=|"),
        ("-E", "header=y"),
        ("-E", "occurrence=a"),
    )
    remote_argv = [NTAPSHARK_PATH]
    for flag, setting in fixed_options:
        remote_argv += [flag, setting]
    remote_argv.append("-q")

    if reassembly:
        remote_argv.append("-2")
        logger.debug("Enabled TCP reassembly with -2 flag")
    if packet_limit is not None:
        remote_argv += ["-c", str(packet_limit)]
        logger.debug(f"Set packet limit to {packet_limit}")
    if verbose:
        remote_argv.append("-V")
        logger.debug("Enabled verbose tshark output with -V flag")

    # One "-e <name>" pair per requested field, in the caller's order.
    remote_argv += [token for name in fields for token in ("-e", name)]

    remote_cmd = " ".join(map(shlex.quote, remote_argv))
    ssh_cmd = ["ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}", remote_cmd]

    logger.debug(f"Constructed SSH command: {' '.join(ssh_cmd)[:400]}...")
    logger.debug(f"Quoted ntap-tshark command: {remote_cmd[:400]}...")

    return ssh_cmd, fields

def extract_fields(line: str, fields: List[str]) -> Tuple[int, int, str, str, str, Dict[str, str]]:
    """Parse one pipe-separated tshark output line into its components.

    Args:
        line: A data line; columns are matched to `fields` by position.
        fields: Field names in the order they were requested from tshark.

    Returns:
        (frame, stream, ip_src, ip_dst, sesid, field_dict) where
        - frame is always 0 (the caller assigns its own frame numbering),
        - stream is the integer tcp.stream, or -1 when it is not numeric,
        - ip_src / ip_dst are the IPv4 addresses, falling back to the IPv6
          ones when the IPv4 column is empty,
        - sesid is the raw smb2.sesid text,
        - field_dict holds the remaining fields; the multi-value fields listed
          below become lists of strings when non-empty.
        An unparseable line, or any internal error, yields
        (0, -1, "", "", "", {}).
    """
    logger.debug(f"Extracting fields from line: {line.strip()[:100]}...")
    try:
        split_line = line.split("|")
        if not split_line or not split_line[0].strip() or len(split_line) < 5:
            logger.warning(f"Invalid line format (split on |): {line.strip()[:100]}...")
            return 0, -1, "", "", "", {}

        # Pad short lines so every requested field gets a (possibly empty) value.
        if len(split_line) < len(fields):
            logger.warning(f"Line has {len(split_line)} fields, expected at least {len(fields)}: {line.strip()[:100]}...")
            split_line.extend([""] * (len(fields) - len(split_line)))

        field_dict = {}
        for i, value in enumerate(split_line[:len(fields)]):
            # Keep only the part before the first \x02 character.
            cleaned_value = value.split("\x02")[0] if value else ""
            field_dict[fields[i]] = cleaned_value

        frame_number = field_dict.get("frame.number", "")
        if frame_number.isdigit() and int(frame_number) <= 10:
            logger.debug(f"Extracted fields (first 5): {dict(list(field_dict.items())[:5])}")

        frame = 0
        stream_str = field_dict.get("tcp.stream", "")
        stream = int(stream_str) if stream_str and stream_str.isdigit() else -1
        if not stream_str or not stream_str.isdigit():
            logger.warning(f"Invalid tcp.stream '{stream_str}' in line: {line.strip()[:100]}...")

        # Every requested field is present in field_dict (empty when the packet
        # lacks it), so a dict.pop default would never select the IPv6 column.
        # Pop both and fall back on emptiness instead.
        ipv4_src = field_dict.pop("ip.src", "")
        ipv6_src = field_dict.pop("ipv6.src", "")
        ip_src = ipv4_src or ipv6_src
        ipv4_dst = field_dict.pop("ip.dst", "")
        ipv6_dst = field_dict.pop("ipv6.dst", "")
        ip_dst = ipv4_dst or ipv6_dst
        sesid = field_dict.pop("smb2.sesid", "")
        field_dict.pop("frame.number", None)
        field_dict.pop("tcp.stream", None)

        # These keys are guaranteed to exist in the result.
        key_fields = ['smb2.cmd', 'smb2.filename', 'smb2.tid']
        for field in key_fields:
            if field not in field_dict:
                field_dict[field] = ""

        # Non-empty multi-value fields become lists; comma-separated values are
        # split and stripped. ('smb2.sesid' was popped above, so it is skipped.)
        multi_value_fields = ['smb2.sesid', 'smb2.cmd', 'smb2.filename', 'smb2.tid', 'smb2.nt_status', 'smb2.msg_id']
        for field in multi_value_fields:
            raw_value = field_dict.get(field)
            if raw_value:
                if ',' in raw_value:
                    field_dict[field] = [v.strip() for v in raw_value.split(',') if v.strip()]
                else:
                    field_dict[field] = [raw_value]

        return frame, stream, ip_src, ip_dst, sesid, field_dict
    except Exception as e:
        logger.critical(f"Error in extract_fields: {str(e)}\n{traceback.format_exc()}")
        return 0, -1, "", "", "", {}

def _parse_tshark_lines(lines, fields: List[str]) -> Tuple[List[Dict[str, Any]], int, int]:
    """Convert tshark's header + data lines into record dicts.

    Returns (records, line_count, skip_count). The first non-blank line is
    treated as the header and only compared against `fields`. Lines that
    extract_fields reports as unparseable (empty field dict) are skipped and
    counted instead of being stored as empty rows.
    """
    records = []
    line_count = 0
    skip_count = 0
    header_seen = False

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if not header_seen:
            header_fields = line.split("|")
            if header_fields != fields[:len(header_fields)]:
                logger.warning(f"Header fields mismatch! Expected: {fields[:10]}, Got: {header_fields[:10]}")
            logger.debug(f"Header row: {line[:200]}...")
            header_seen = True
            continue

        line_count += 1
        if line_count % 1000 == 0:
            logger.info(f"Processed {line_count} lines, memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
        if line_count <= 10:
            logger.debug(f"Raw line {line_count}: {line[:200]}...")
        try:
            _frame, stream, ip_src, ip_dst, sesid, field_dict = extract_fields(line, fields)
            if not field_dict:
                # extract_fields signals a bad line with an empty dict
                raise ValueError("line could not be parsed")
            # frame.number is the 1-based position of the data line in this
            # (filtered) output. NOTE(review): this is not the capture's own
            # frame number - confirm that is intended.
            corrected_frame = line_count
            field_dict['frame.number'] = str(corrected_frame)
            records.append({
                "frame.number": corrected_frame,
                "tcp.stream": stream,
                "ip.src": ip_src,
                "ip.dst": ip_dst,
                "smb2.sesid": sesid,
                **field_dict
            })
        except (KeyError, ValueError) as e:
            skip_count += 1
            if skip_count <= 5:
                logger.warning(f"Skipping line {line_count} due to error: {e} - Raw: {line[:100]}...")
            continue

    return records, line_count, skip_count


def process_tshark_output(cmd: List[str], fields: List[str]) -> pd.DataFrame:
    """Run a tshark command and load its pipe-separated output into a DataFrame.

    Args:
        cmd: argv list to execute (as built by build_tshark_command).
        fields: Field names in the order tshark prints them.

    Returns:
        A DataFrame with columns frame.number, tcp.stream, ip.src, ip.dst,
        smb2.sesid followed by the remaining `fields`; an empty DataFrame when
        no data lines were parsed.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero code.
        Exception: any other failure is logged and re-raised.
    """
    import tempfile  # function-scope import: only this function needs it

    logger.info(f"Processing tshark output with command: {' '.join(cmd)[:200]}...")
    try:
        # stderr goes to a temporary file rather than a pipe: an undrained
        # stderr pipe fills up when the child logs heavily, which blocks the
        # child and stalls this stdout loop forever.
        with tempfile.TemporaryFile(mode="w+", errors="replace") as stderr_file:
            with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True) as proc:
                try:
                    data, line_count, skip_count = _parse_tshark_lines(proc.stdout, fields)
                except BaseException:
                    # Do not leave the child running if parsing fails.
                    proc.kill()
                    raise
            # Leaving the Popen context closes stdout and waits for the child.
            stderr_file.seek(0)
            stderr_output = stderr_file.read()

        if proc.returncode != 0:
            logger.critical(f"tshark failed with exit code {proc.returncode}, stderr: {stderr_output}")
            raise subprocess.CalledProcessError(proc.returncode, cmd, output=json.dumps(data, indent=2) if data else "", stderr=stderr_output)

        if not data:
            logger.critical("No data extracted from tshark output")
            return pd.DataFrame()

        logger.info(f"Creating DataFrame with {len(data)} records, memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
        defined_columns = ["frame.number", "tcp.stream", "ip.src", "ip.dst", "smb2.sesid"]
        unique_fields = [f for f in fields if f not in defined_columns]
        df = pd.DataFrame(data, columns=defined_columns + unique_fields)
        logger.info(f"Processed {len(df)} frames from tshark output, skipped {skip_count} lines")
        logger.info(f"Total lines processed: {line_count}")

        # Log DataFrame memory usage before optimization
        df_memory_mb = df.memory_usage(deep=True).sum() / 1024**2
        logger.info(f"DataFrame memory usage before optimization: {df_memory_mb:.2f} MB")

        # Normalize smb2.sesid early
        if 'smb2.sesid' in df.columns:
            multi_value_count = df['smb2.sesid'].str.contains(',').sum()
            logger.debug(f"Found {multi_value_count} rows with multi-valued smb2.sesid")
            df['smb2.sesid'] = df['smb2.sesid'].apply(lambda x: ','.join(x) if isinstance(x, list) else x if x else '')
            logger.debug(f"Normalized smb2.sesid values (first 10): {list(df['smb2.sesid'].head(10))}")

        if 'tcp.stream' in df.columns:
            df['tcp.stream'] = pd.to_numeric(df['tcp.stream'], errors='coerce', downcast='integer')
            logger.info(f"Converted tcp.stream to dtype: {df['tcp.stream'].dtype}")

        # Downcast numeric columns
        for col in df.columns:
            if df[col].dtype == 'float64':
                df[col] = pd.to_numeric(df[col], errors='coerce', downcast='float')
            elif df[col].dtype == 'int64':
                df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

        # Collapse list-valued hex columns to their first element.
        hex_fields = ['smb2.nt_status', 'smb2.tid', 'smb2.sesid', 'smb2.fid', 'smb2.flags']
        for col in hex_fields:
            if col in df.columns:
                df[col] = df[col].map(lambda x: x[0] if isinstance(x, list) and x else x if not isinstance(x, list) else '')
                if col == 'smb2.tid':
                    logger.debug(f"Normalized smb2.tid values (first 10): {list(df['smb2.tid'].head(10))}")

        if 'frame.number' in df.columns:
            df['frame.number'] = pd.to_numeric(df['frame.number'], errors='coerce', downcast='integer')
        if 'ip.src' in df.columns:
            df['ip.src'] = df['ip.src'].astype('string')
        if 'ip.dst' in df.columns:
            df['ip.dst'] = df['ip.dst'].astype('string')
        if 'smb2.msg_id' in df.columns:
            df['smb2.msg_id'] = df['smb2.msg_id'].map(lambda x: x[0] if isinstance(x, list) and x else x)
            df['smb2.msg_id'] = pd.to_numeric(df['smb2.msg_id'], errors='coerce').astype('UInt64')

        # Log DataFrame memory usage after optimization
        df_memory_mb_opt = df.memory_usage(deep=True).sum() / 1024**2
        logger.info(f"DataFrame memory usage after optimization: {df_memory_mb_opt:.2f} MB")

        # Only dump columns that exist: `fields` decides which columns are
        # present, and a debug preview must not abort the ingestion.
        hex_preview_cols = [c for c in ['smb2.nt_status', 'smb2.tid', 'smb2.sesid'] if c in df.columns]
        multi_preview_cols = [c for c in ['smb2.filename', 'smb2.cmd'] if c in df.columns]
        logger.debug("Hex fields after processing:")
        logger.debug(df[hex_preview_cols].head().to_string())
        logger.debug("Multi-value fields before processing:")
        logger.debug(df[multi_preview_cols].head().to_string())

        return df
    except Exception as e:
        logger.critical(f"Error in process_tshark_output: {str(e)}\n{traceback.format_exc()}")
        raise

def _flatten_parquet_cell(value):
    """Render one multi-value cell as text: lists are comma-joined, missing values become ''."""
    if isinstance(value, str):
        return value
    if pd.api.types.is_list_like(value):
        return ','.join(map(str, value))
    # Explicit missing check instead of truthiness: 0 is a real value (e.g.
    # smb2.msg_id 0) and pd.NA cannot be used in a boolean context.
    if value is None or pd.isna(value):
        return ''
    return str(value)


def save_to_parquet(df, parquet_path):
    """Save DataFrame to Parquet file.

    The multi-value columns (smb2.sesid, smb2.cmd, smb2.filename, smb2.tid,
    smb2.nt_status, smb2.msg_id) are converted to plain strings on a copy of
    `df` before writing; `df` itself is not modified. The file is written with
    zstd compression and without the index.

    Args:
        df: DataFrame to persist.
        parquet_path: Destination file path.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    logger.info(f"Saving DataFrame to {parquet_path}")
    try:
        multi_value_fields = ['smb2.sesid', 'smb2.cmd', 'smb2.filename', 'smb2.tid', 'smb2.nt_status', 'smb2.msg_id']
        df_copy = df.copy()
        for col in multi_value_fields:
            if col in df_copy.columns:
                df_copy[col] = df_copy[col].apply(_flatten_parquet_cell)
        table = pa.Table.from_pandas(df_copy, preserve_index=False)
        pq.write_table(table, parquet_path, compression='zstd')
        logger.info(f"Saved DataFrame to {parquet_path}")
    except Exception as e:
        logger.critical(f"Error saving Parquet file {parquet_path}: {str(e)}\n{traceback.format_exc()}")
        raise

def create_remote_directory(case_number: str, trace_name: str, force_reingest: bool = False) -> str:
    """Create the remote session directory for a trace and return its path.

    The directory is /stingray/<case_number>/.tracer/<trace stem>/sessions,
    where the trace stem is `trace_name` up to its first '.'. After creation
    the whole case directory is chmod'ed recursively to 777, the directory is
    optionally emptied, and finally verified to exist and be writable.

    Args:
        case_number: Case identifier (directory name under /stingray).
        trace_name: Capture file name.
        force_reingest: When true, delete the existing contents of the
            sessions directory.

    Returns:
        The remote sessions directory path.

    Raises:
        RuntimeError: the SSH connectivity check failed.
        subprocess.CalledProcessError: a remote command failed.
    """
    logger.info(f"Creating remote directory for case {case_number}, trace {trace_name}")
    try:
        # Paths are computed first so the error handlers below can always
        # refer to output_dir.
        base_dir = os.path.join("/stingray", case_number)
        tracer_dir = os.path.join(base_dir, ".tracer")
        pcap_dir = os.path.join(tracer_dir, trace_name.split('.')[0])  # Remove all extensions consistently
        output_dir = os.path.join(pcap_dir, "sessions")

        if not check_ssh_connectivity():
            logger.critical(f"Cannot connect to {SSH_HOST} for directory creation")
            raise RuntimeError(f"SSH connection to {SSH_HOST} failed")

        # NOTE(review): chmod -R 777 is applied to the whole case directory,
        # not just the .tracer subtree - confirm that breadth is intended.
        cmd = [
            "ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}",
            f"mkdir -p {shlex.quote(output_dir)} && chmod -R 777 {shlex.quote(base_dir)}"
        ]
        logger.debug(f"Executing directory creation command: {' '.join(cmd)}")
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"Created remote directory: {output_dir} with 777 permissions (stdout: {result.stdout})")
        status_callback(f"Created remote directory: {output_dir}")

        if force_reingest:
            # The glob must stay outside the quoting. A quoted '.../*' is a
            # literal file name, so nothing would be removed.
            clear_cmd = [
                "ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}",
                f"rm -rf {shlex.quote(output_dir)}/*"
            ]
            logger.debug(f"Executing clear command for force_reingest: {' '.join(clear_cmd)}")
            result = subprocess.run(clear_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            logger.info(f"Cleared {output_dir} due to force_reingest (stdout: {result.stdout})")

        check_cmd = [
            "ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}",
            f"test -d {shlex.quote(output_dir)} && test -w {shlex.quote(output_dir)}"
        ]
        try:
            logger.debug(f"Verifying directory: {' '.join(check_cmd)}")
            result = subprocess.run(check_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            logger.info(f"Verified {output_dir} exists and is writable")
        except subprocess.CalledProcessError as e:
            logger.critical(f"Error: {output_dir} not writable after creation: {e.stderr}")
            status_callback(f"Error: {output_dir} not writable")
            raise

        return output_dir
    except subprocess.CalledProcessError as e:
        logger.critical(f"Error creating remote directory {output_dir}: {e.stderr} (stdout: {e.stdout})")
        status_callback(f"Error creating remote directory: {e.stderr}")
        raise
    except Exception as e:
        logger.critical(f"Error in create_remote_directory: {str(e)}\n{traceback.format_exc()}")
        status_callback(f"Error in create_remote_directory: {str(e)}")
        raise

def clear_directory(directory: str):
    """Clear all files in a remote directory.

    Runs `rm -rf <directory>/*` on the remote host over SSH.

    Args:
        directory: Remote directory whose contents are removed. Must name a
            real directory below the filesystem root.

    Raises:
        ValueError: `directory` is empty or resolves to the root directory.
        RuntimeError: the SSH connectivity check failed.
        subprocess.CalledProcessError: the remote command failed.
    """
    logger.info(f"Clearing directory: {directory}")
    try:
        # An empty or "/" path would expand to "rm -rf /*" on the remote host,
        # which is reached as root - refuse it outright.
        if not directory or not directory.strip().strip("/"):
            raise ValueError(f"Refusing to clear unsafe directory path: {directory!r}")

        if not check_ssh_connectivity():
            logger.critical(f"Cannot connect to {SSH_HOST} for directory clearing")
            raise RuntimeError(f"SSH connection to {SSH_HOST} failed")

        # The glob stays outside the quoting so the remote shell expands it.
        cmd = ["ssh", "-i", SSH_KEY, "-p", "22", f"{SSH_USER}@{SSH_HOST}",
               f"rm -rf {shlex.quote(directory)}/*"]
        logger.debug(f"Executing clear command: {' '.join(cmd)}")
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"Cleared all files in {directory} on {SSH_HOST} (stdout: {result.stdout})")
    except subprocess.CalledProcessError as e:
        logger.critical(f"Failed to clear directory {directory}: stderr={e.stderr}, stdout={e.stdout}")
        raise
    except Exception as e:
        logger.critical(f"Error in clear_directory: {str(e)}\n{traceback.format_exc()}")
        raise

def status_callback(message):
    """Report a status update by writing it to the shared logger at INFO level."""
    status_line = f"Status: {message}"
    logger.info(status_line)

# Publish this cell's helpers on builtins so later cells can import them by
# name; each function is stored under its own name.
vars(builtins).update({
    exported.__name__: exported
    for exported in (
        status_callback,
        build_tshark_command,
        process_tshark_output,
        save_to_parquet,
        create_remote_directory,
        clear_directory,
    )
})

logger.info("NTAPshark processing functions and utilities defined")

In [None]:
# Cell 5: Define ingestion and session extraction logic
import os
import time
import pickle
import traceback
from collections import OrderedDict
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import subprocess
import shlex
import psutil
import builtins
from typing import Dict

# Import required functions and variables from builtins with error handling.
# These names are attached to the builtins module by the earlier cells, so an
# ImportError here means one of those cells has not been run in this kernel.
try:
    from builtins import logger, status_callback, build_tshark_command, process_tshark_output, create_remote_directory, clear_directory, save_to_parquet, check_ssh_connectivity, FIELDS
    print("Successfully imported all required functions from builtins")
except ImportError as e:
    # logger itself may be the missing name, so report with print().
    print(f"ImportError in Cell 5: {e}")
    print("Make sure all previous cells (1-4) have been executed successfully")
    raise

# Session-id renderings already logged by normalize_sesid; used only to avoid
# repeating the same debug line. Reset at the start of run_ingestion.
_unique_sesids = set()

def get_packet_count(capture_path):
    """Ask ntap-capinfos on the remote host how many packets a capture holds.

    The first output line containing "Number of packets:" is parsed. A value
    carrying a 'k' or 'm' (any case) is scaled by 1,000 or 1,000,000.

    Args:
        capture_path: Path of the capture file on the remote host.

    Returns:
        The packet count as an int, or None when the command fails or its
        output cannot be parsed (the failure is logged and reported through
        status_callback).
    """
    logger.info(f"Retrieving packet count for {capture_path}")
    try:
        remote_cmd = f"/usr/local/bin/ntap-capinfos -c {shlex.quote(capture_path)}"
        cmd = ["ssh", "-i", "/home/jovyan/.ssh/id_rsa", "-p", "22", "root@backend", remote_cmd]
        logger.debug(f"Executing capinfos command: {' '.join(cmd)}")
        output = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

        count_line = next((ln for ln in output.splitlines() if "Number of packets:" in ln), None)
        if count_line is None:
            logger.critical(f"Could not parse packet count from capinfos output: {output}")
            status_callback(f"Error - Could not parse packet count for {capture_path}")
            return None

        count_str = count_line.split(":")[1].strip()
        try:
            lowered = count_str.lower()
            # 'k' is checked before 'm', and only the first match applies.
            for suffix, scale in (("k", 1000), ("m", 1000000)):
                if suffix in lowered:
                    count = int(float(lowered.replace(suffix, "")) * scale)
                    break
            else:
                count = int(count_str)
        except ValueError:
            logger.critical(f"Invalid packet count format: {count_str}")
            status_callback(f"Error - Invalid packet count format: {count_str}")
            return None

        logger.info(f"Packet count for {capture_path}: {count}")
        return count
    except subprocess.CalledProcessError as e:
        logger.critical(f"Error running capinfos: {e.stderr}")
        status_callback(f"Error running capinfos for {capture_path}: {e.stderr}")
        return None
    except Exception as e:
        logger.critical(f"Error in get_packet_count: {str(e)}\n{traceback.format_exc()}")
        status_callback(f"Error in get_packet_count: {str(e)}")
        return None

def normalize_sesid(sesid_str):
    """Normalize smb2.sesid values, handling lists and commas.

    Args:
        sesid_str: A comma-separated string of session ids, a list-like of
            session id strings, or a missing value (None / NaN / '').

    Returns:
        The distinct, whitespace-stripped session ids in first-seen order,
        with empty entries and the all-zero session id removed. Returns []
        for missing input or on any error (which is logged).
    """
    logger.debug(f"Normalizing sesid: {str(sesid_str)[:200]}")
    try:
        # Handle list-likes before the missing check: pd.isna() on a list
        # returns an array, which cannot be used as a boolean.
        if pd.api.types.is_list_like(sesid_str):
            raw_items = list(sesid_str)
        elif sesid_str is None or pd.isna(sesid_str) or not sesid_str:
            return []
        else:
            raw_items = sesid_str.split(',')
        if not raw_items:
            return []

        # Strip first, then filter, so padded entries such as
        # ' 0x0000000000000000' are recognised as the null session id.
        stripped = (item.strip() for item in raw_items if isinstance(item, str))
        sesids = list(dict.fromkeys(s for s in stripped if s and s != "0x0000000000000000"))

        # Log each distinct result only once.
        sesid_key = str(sesids)[:200]
        if sesid_key not in _unique_sesids:
            logger.debug(f"Normalized sesid to: {sesid_key}")
            _unique_sesids.add(sesid_key)

        return sesids
    except Exception as e:
        logger.critical(f"Error in normalize_sesid: {str(e)}\n{traceback.format_exc()}")
        return []

def normalize_cmd(cmd_str):
    """Split smb2.cmd input into a list of stripped command strings.

    Accepts a comma-separated string or a list. Missing input (None / NaN,
    an empty value, or a list containing a missing entry) yields []. Any
    error is logged and also yields [].
    """
    logger.debug(f"Normalizing cmd: {str(cmd_str)[:200]}")
    try:
        if isinstance(cmd_str, (list, pd.Series)):
            has_missing = pd.isna(cmd_str).any()
        else:
            has_missing = pd.isna(cmd_str)
        if has_missing or not cmd_str:
            return []

        pieces = cmd_str if isinstance(cmd_str, list) else cmd_str.split(',')
        return [piece.strip() for piece in pieces if piece]
    except Exception as e:
        logger.critical(f"Error in normalize_cmd: {str(e)}\n{traceback.format_exc()}")
        return []

def save_session_metadata(case_number: str, trace_name: str, sessions: Dict[str, pd.DataFrame], output_dir: str):
    """Write a JSON summary of the extracted sessions to <output_dir>/session_metadata.json.

    The summary records the case number, the trace name, the number of
    sessions, and for every session id its row count ("frame_count") and
    column count ("columns").

    Raises:
        Exception: any failure is logged and re-raised.
    """
    logger.info(f"Saving session metadata to {output_dir}")
    try:
        per_session = {}
        for sesid, session_df in sessions.items():
            per_session[sesid] = {
                "frame_count": len(session_df),
                "columns": len(session_df.columns),
            }

        metadata = {
            "case_number": case_number,
            "trace_name": trace_name,
            "session_count": len(sessions),
            "session_details": per_session,
        }

        metadata_path = os.path.join(output_dir, "session_metadata.json")
        with open(metadata_path, 'w') as out_file:
            json.dump(metadata, out_file, indent=2)
        logger.info(f"Saved session metadata to {metadata_path}")
    except Exception as e:
        logger.critical(f"Error in save_session_metadata: {str(e)}\n{traceback.format_exc()}")
        raise

def run_ingestion(capture_path=None, reassembly_enabled=False, force_reingest=False, verbose=False):
    """Orchestrate PCAP ingestion and per-session extraction.

    Steps: resolve the capture path (argument, else config.pkl), check SSH
    connectivity, validate the PCAP with tshark over SSH, run the tshark
    field extraction, split the resulting frames by SMB2 session id, and
    save the full frame table, one Parquet file per session and a JSON
    metadata summary into the output directory.

    Progress is reported through ``builtins.status_callback`` when it is
    callable, otherwise through a logging fallback.

    Args:
        capture_path: Path to the PCAP. When None, it is read from config.pkl.
        reassembly_enabled: Forwarded to ``build_tshark_command``.
        force_reingest: Forwarded to ``create_remote_directory``.
        verbose: Forwarded to ``build_tshark_command``.

    Returns:
        dict | None: ``{"full_df": DataFrame, "sessions": {sesid: DataFrame}}``
        on success; None on any failure (the failure is logged and reported
        through the status callback rather than raised).
    """
    global _unique_sesids
    logger.info(f"Starting ingestion with capture_path: {capture_path}, reassembly_enabled: {reassembly_enabled}, force_reingest: {force_reingest}, verbose: {verbose}")
    
    # Get status_callback from builtins first, before any validation
    if hasattr(builtins, 'status_callback') and callable(builtins.status_callback):
        status_callback = builtins.status_callback
        logger.debug("Using status_callback from builtins")
    else:
        # Fallback status_callback
        def status_callback(message):
            logger.info(f"Status: {message}")
        logger.warning("Using fallback status_callback - builtins.status_callback not available")
    
    try:
        # Validate that all required functions are available and callable
        required_functions = {
            'status_callback': status_callback,
            'build_tshark_command': build_tshark_command,
            'process_tshark_output': process_tshark_output,
            'create_remote_directory': create_remote_directory,
            'save_to_parquet': save_to_parquet,
            'check_ssh_connectivity': check_ssh_connectivity
        }
        
        for func_name, func in required_functions.items():
            if not callable(func):
                raise ValueError(f"Required function '{func_name}' is not callable: {type(func)}")
        
        logger.debug("All required functions validated as callable")

        # Initialize _unique_sesids
        _unique_sesids = set()
        logger.debug("Initialized _unique_sesids set")

        # Load capture_path from config if None
        config_file = "/home/jovyan/work/smbreplay/config.pkl"
        if capture_path is None and os.path.exists(config_file):
            try:
                with open(config_file, 'rb') as f:
                    # NOTE(security): pickle.load can execute arbitrary code;
                    # config.pkl must only ever be written by this notebook.
                    settings = pickle.load(f)
                    # Accept both layouts used in this notebook: a flat
                    # "capture_path" key, or the path nested under "pcap_config".
                    capture_path = (
                        settings.get("capture_path")
                        or (settings.get("pcap_config") or {}).get("capture_path")
                    )
                logger.info(f"Loaded capture_path from config.pkl: {capture_path}")
            except (pickle.PickleError, IOError) as e:
                logger.critical(f"Error loading {config_file}: {e}")
                status_callback(f"Error - Failed to load {config_file}: {e}")
                return None

        if not capture_path:
            logger.critical("No valid capture path available for re-ingestion")
            status_callback("Critical: No valid capture path available for re-ingestion")
            return None

        capture_path = os.path.abspath(capture_path)
        trace_name = os.path.basename(capture_path).split('.')[0]
        logger.info(f"Validating PCAP: {capture_path}")
        status_callback(f"Starting ingestion for {trace_name}")

        if not os.path.exists(capture_path):
            logger.critical(f"PCAP file not found: {capture_path}")
            status_callback(f"Error - PCAP file not found: {capture_path}")
            return None

        # Check connectivity BEFORE the SSH-based validation below; otherwise an
        # unreachable backend is misreported as an invalid or corrupt PCAP.
        if not check_ssh_connectivity():
            logger.critical("Cannot connect to SSH_HOST for ntap-tshark")
            status_callback("Error - Cannot connect to SSH_HOST for ntap-tshark")
            return None

        # Validate PCAP with tshark
        try:
            validate_cmd = [
                "ssh", "-i", "/home/jovyan/.ssh/id_rsa", "-p", "22", "root@backend",
                f"/usr/local/bin/ntap-tshark -r {shlex.quote(capture_path)} -c 1"
            ]
            logger.debug(f"Executing PCAP validation command: {' '.join(validate_cmd)}")
            result = subprocess.run(validate_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            logger.debug(f"PCAP validation stdout: {result.stdout}")
            logger.info(f"PCAP file {trace_name} validated")
            status_callback(f"PCAP file {trace_name} validated")
        except subprocess.CalledProcessError as e:
            logger.critical(f"Error validating PCAP: {e.stderr}")
            status_callback(f"Error - PCAP file invalid or corrupt: {e.stderr}")
            return None
        
        # Captures larger than 10000 packets are truncated to the first 10000.
        packet_count = get_packet_count(capture_path)
        packet_limit = 10000 if packet_count is not None and packet_count > 10000 else None
        logger.info(f"Packet limit set to: {packet_limit if packet_limit is not None else 'None (full capture)'}")
        status_callback(f"Packet limit set to: {packet_limit if packet_limit is not None else 'None (full capture)'}")
        
        # The case number is the path component after a leading /stingray/.
        parts = capture_path.split(os.sep)
        case_number = parts[2] if len(parts) >= 3 and parts[1] == 'stingray' else "unknown"
        logger.info(f"Ingesting {trace_name} for case {case_number}")
        status_callback(f"Ingesting {trace_name} for case {case_number}")
        
        base_fields = [
            "frame.number", "frame.time_epoch", "ip.src", "ip.dst", "smb2.sesid", 
            "smb2.cmd", "smb2.filename", "smb2.tid", "smb2.nt_status", "smb2.msg_id"
        ]
        # Work on a copy so the shared FIELDS list is not mutated by ingestion.
        additional_fields = list(FIELDS)
        if "smb2.ioctl.function" not in additional_fields:
            additional_fields.append("smb2.ioctl.function")
        fields = list(OrderedDict.fromkeys(base_fields + additional_fields))
        logger.info(f"Using {len(fields)} fields for tshark extraction")
        status_callback(f"Using {len(fields)} fields for tshark extraction")
        
        cmd, used_fields = build_tshark_command(capture_path, fields, reassembly=reassembly_enabled, packet_limit=packet_limit, verbose=verbose)
        logger.debug(f"Full tshark command: {' '.join(cmd)[:400]}...")
        status_callback(f"Full tshark command: {' '.join(cmd)[:400]}...")
        
        start_time = time.time()
        try:
            df = process_tshark_output(cmd, used_fields)
        except subprocess.CalledProcessError as e:
            logger.critical(f"Error during ingestion: {e.stderr}")
            status_callback(f"Error during ingestion: {e.stderr}")
            return None
        
        if df.empty:
            logger.critical("No data extracted from tshark output")
            status_callback("Error - No data extracted from tshark output")
            return None
        
        logger.info(f"Processed {len(df)} frames")
        status_callback(f"Processed {len(df)} frames")
        
        logger.info(f"Extracting unique session IDs from {len(df)} rows")
        status_callback(f"Extracting unique session IDs from {len(df)} rows")
        try:
            # Normalize the column once and reuse it for every session below,
            # instead of re-normalizing all rows once per session.
            normalized_sesids = df['smb2.sesid'].apply(normalize_sesid)
            unique_sesids = normalized_sesids.explode().unique()
            # Rows with no session id explode to NaN; drop those and empties.
            unique_sesids = [s for s in unique_sesids if s and str(s).lower() != 'nan']
            logger.info(f"Found {len(unique_sesids)} unique session IDs")
            status_callback(f"Found {len(unique_sesids)} unique session IDs")
        except Exception as e:
            logger.critical(f"Error extracting session IDs: {e}")
            status_callback(f"Error extracting session IDs: {e}")
            return None
        
        logger.info(f"Extracting {len(unique_sesids)} sessions")
        status_callback(f"Extracting {len(unique_sesids)} sessions")
        
        sessions = {}
        for i, sesid in enumerate(unique_sesids, 1):
            logger.debug(f"Processing session {i}/{len(unique_sesids)} - sesid: {sesid}")
            status_callback(f"Processing session {i}/{len(unique_sesids)} - sesid: {sesid}")
            
            # A frame belongs to the session when its normalized id list contains sesid.
            sesid_filter = normalized_sesids.apply(lambda ids: sesid in ids)
            session_df = df[sesid_filter].copy()
            
            session_df['smb2.cmd'] = session_df['smb2.cmd'].apply(normalize_cmd)
            session_df['smb2.filename'] = session_df['smb2.filename'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)
            # Copy each id list so session frames do not share list objects.
            session_df['smb2.sesid'] = normalized_sesids[sesid_filter].apply(lambda ids: list(ids))
            
            if not session_df.empty:
                sessions[sesid] = session_df
                logger.debug(f"Processed session {sesid} with {len(session_df)} frames")
                status_callback(f"Processed session {sesid} with {len(session_df)} frames")
            else:
                logger.warning(f"No frames found for session {sesid}")
                status_callback(f"Warning: No frames found for session {sesid}")
        
        logger.info(f"Extracted {len(sessions)} sessions")
        status_callback(f"Extracted {len(sessions)} sessions")
        
        output_dir = create_remote_directory(case_number, trace_name, force_reingest)
        if output_dir is None:
            logger.critical(f"Failed to create output directory for {trace_name}")
            status_callback(f"Error - Failed to create output directory for {trace_name}")
            return None
        logger.info(f"Output directory: {output_dir}")
        status_callback(f"Output directory: {output_dir}")
        
        parquet_path = os.path.join(output_dir, "tshark_output_full.parquet")
        try:
            # Warn (but continue) when less than 512 MiB of memory is available.
            if psutil.virtual_memory().available < 512 * 1024**2:
                logger.warning("Low available memory before saving Parquet files")
                status_callback("Warning: Low available memory before saving Parquet files")
            save_to_parquet(df, parquet_path)
            logger.info(f"Saved full data to {parquet_path}")
            status_callback(f"Saved full data to {parquet_path}")
            
            for sesid, session_df in sessions.items():
                session_parquet = os.path.join(output_dir, f"smb2_session_{sesid}.parquet")
                save_to_parquet(session_df, session_parquet)
                logger.info(f"Saved session {sesid} to {session_parquet}")
                status_callback(f"Saved session {sesid} to {session_parquet}")
            
            save_session_metadata(case_number, trace_name, sessions, output_dir)
            logger.info("Session metadata saved")
        except Exception as e:
            logger.critical(f"Error saving sessions or metadata: {str(e)}")
            status_callback(f"Error - Failed to save sessions or metadata: {str(e)}")
            return None
        
        elapsed_time = time.time() - start_time
        logger.info(f"Ingestion completed in {elapsed_time:.2f}s")
        status_callback(f"Ingestion completed in {elapsed_time:.2f}s")
        
        result = {"full_df": df, "sessions": sessions}
        logger.info(f"Ingestion result: {list(result.get('sessions', {}).keys())}")
        return result
    except Exception as e:
        logger.critical(f"Error in run_ingestion: {e}\n{traceback.format_exc()}")
        status_callback(f"Error in run_ingestion: {str(e)}")
        return None
        
# Publish run_ingestion on the builtins module so that later notebook cells
# can resolve it by bare name without importing anything from this cell.
builtins.run_ingestion = run_ingestion
logger.info("Ingestion and session extraction functions defined")

In [None]:
# Cell 6: Session Selection and Utility Functions
import os
import pandas as pd
import pickle
import subprocess
import shlex
import builtins

# Import from builtins (set in Cells 1–5)
from builtins import logger, check_ssh_connectivity, FIELDS, create_remote_directory, pcap_config

def load_capture():
    """Return the configured capture path, or None when none is usable.

    The path is read from config.pkl. Both layouts used in this notebook are
    accepted: the path nested under ``"pcap_config"`` (checked first) and a
    flat top-level ``"capture_path"`` key. When config.pkl is missing,
    unreadable, or names a path that does not exist, the in-memory
    ``pcap_config["capture_path"]`` is used instead, provided that file exists.

    Returns:
        str | None: An existing capture path, or None if neither source
        provides one.
    """
    logger.info("Loading capture path from config.pkl")
    config_file = "/home/jovyan/work/smbreplay/config.pkl"

    def _fallback_to_pcap_config(not_found_message):
        """Return pcap_config's capture path if it exists, else warn and return None."""
        capture = pcap_config.get("capture_path")
        if capture and os.path.exists(capture):
            logger.info(f"Fallback to pcap_config capture path: {capture}")
            return capture
        logger.warning(not_found_message)
        return None

    if not os.path.exists(config_file):
        logger.warning(f"No {config_file} found. Checking pcap_config.")
        return _fallback_to_pcap_config("No valid capture_path in pcap_config. Configure capture in Cell 8 dashboard.")

    try:
        with open(config_file, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code;
            # config.pkl must only ever be written by this notebook.
            settings = pickle.load(f)
    except (pickle.PickleError, EOFError, IOError) as e:
        # EOFError covers an empty or truncated config.pkl.
        logger.warning(f"Error loading {config_file}: {e}. Checking pcap_config.")
        return _fallback_to_pcap_config("No valid capture_path in pcap_config. Configure capture in Cell 8.")

    # NOTE(review): this logs the whole config; if config.pkl also stores
    # replay credentials they end up in the log file - confirm and redact.
    logger.debug(f"Loaded config.pkl contents: {settings}")

    capture = None
    if isinstance(settings, dict):
        nested = settings.get("pcap_config")
        if isinstance(nested, dict):
            capture = nested.get("capture_path")
        if not capture:
            capture = settings.get("capture_path")

    if capture and os.path.exists(capture):
        logger.info(f"Loaded capture path: {capture}")
        return capture

    logger.warning(f"Capture path {capture or 'missing'} in config.pkl does not exist. Checking pcap_config.")
    return _fallback_to_pcap_config("No valid capture_path in config.pkl or pcap_config. Configure capture in Cell 8 dashboard.")

def get_output_dir(capture):
    """Derive the output directory for a capture and confirm it is writable.

    The capture path must look like ``/stingray/<case_number>/...``; the case
    number and the trace name (file name up to the first dot) are passed to
    ``create_remote_directory``. The resulting directory is then checked over
    SSH for existence and write access.

    Args:
        capture: Path to the capture file.

    Returns:
        str | None: The output directory, or None when the capture is missing,
        the path format is not recognised, no directory could be obtained, or
        the remote check fails. Failures are logged as warnings, not raised.
    """
    logger.info(f"Deriving output directory for capture: {capture}")
    if not capture:
        logger.warning("No capture file provided. Configure in Cell 8 dashboard.")
        return None

    # Bound up front so the error handlers can always reference it.
    output_dir = None
    try:
        capture = os.path.normpath(capture)
        parts = capture.split(os.sep)
        if len(parts) < 3 or parts[1] != 'stingray':
            raise ValueError("Invalid capture path format")
        case_number = parts[2]
        trace_name = os.path.basename(capture).split('.')[0]  # Remove all extensions consistently
        
        # Use create_remote_directory from Cell 4
        output_dir = create_remote_directory(case_number, trace_name)
        if not output_dir:
            raise ValueError(f"No output directory available for {trace_name}")
        
        check_cmd = [
            "ssh", "-i", "/home/jovyan/.ssh/id_rsa", "-p", "22", "root@backend",
            f"test -d {shlex.quote(output_dir)} && test -w {shlex.quote(output_dir)}"
        ]
        logger.debug(f"Executing directory check command: {' '.join(check_cmd)}")
        subprocess.run(check_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"Output directory set to: {output_dir} with write access confirmed")
        return output_dir
    except subprocess.CalledProcessError as e:
        logger.warning(f"Output directory does not exist or is not writable: {output_dir}. Run Cell 5 to ingest trace with 777 permissions. Error: {e}")
        return None
    except Exception as e:
        # Path-format problems and helper failures land here.
        logger.warning(f"Could not determine output directory for capture {capture}: {e}")
        return None

def list_session_files(output_dir):
    """Return the session Parquet file names found in ``output_dir``.

    The directory is listed over SSH. Only names starting with
    ``smb2_session_`` and ending with ``.parquet`` are kept; they are
    lower-cased and de-duplicated, preserving first-seen order. An empty list
    is returned for a missing directory argument or when the listing fails.
    """
    # NOTE(review): names are lower-cased before being returned; confirm the
    # consumers never need the on-disk casing to open the files.
    logger.info(f"Listing session files in {output_dir}")
    if not output_dir:
        logger.warning(f"Invalid or missing output directory: {output_dir}")
        return []

    try:
        remote_command = f"ls {shlex.quote(output_dir)}"
        ls_cmd = ["ssh", "-i", "/home/jovyan/.ssh/id_rsa", "-p", "22", "root@backend", remote_command]
        logger.debug(f"Executing ls command: {' '.join(ls_cmd)}")
        listing = subprocess.run(ls_cmd, capture_output=True, text=True, check=True)

        # A dict keeps insertion order, which gives ordered de-duplication.
        seen = {}
        for entry in listing.stdout.splitlines():
            if entry.startswith('smb2_session_') and entry.endswith('.parquet'):
                seen.setdefault(entry.lower(), None)
        normalized_files = list(seen)

        if normalized_files:
            logger.info(f"Found {len(normalized_files)} unique session files in {output_dir}")
            logger.debug(f"Session files: {', '.join(normalized_files[:5])}...")
        else:
            logger.warning(f"No session files found in {output_dir}. Ensure ingestion completed successfully.")
        return normalized_files
    except subprocess.CalledProcessError as e:
        logger.warning(f"Error listing session files in {output_dir}: {e.stderr}")
        return []
    except Exception as e:
        logger.warning(f"Error listing session files in {output_dir}: {e}")
        return []

def shorten_path(path: str, max_length: int = 50, min_filename_length: int = 20) -> str:
    """Shorten a file path for display, prioritising the file name.

    Args:
        path: The path to shorten; backslashes and forward slashes are both
            treated as separators.
        max_length: Paths at or below this length are returned unchanged.
        min_filename_length: File names longer than this are cut to this
            length (including a trailing ``...``).

    Returns:
        str: ``"N/A"`` for non-string, blank or ``"N/A"`` input; the original
        path when it already fits; otherwise a ``/``-separated shortened form
        in which truncated parts end with ``...``.
    """
    logger.debug(f"Shortening path: {path}")
    if not isinstance(path, str) or not path.strip() or path == "N/A":
        return "N/A"
    if len(path) <= max_length:
        return path
    
    components = path.replace('\\', '/').split('/')
    filename = components[-1]
    if len(filename) > min_filename_length:
        # Clamp at 0: a negative bound would slice from the end of the name.
        keep = max(min_filename_length - 3, 0)
        filename = f"{filename[:keep]}..."
    
    # Budget left for the directory part; 3 characters are held in reserve
    # for a leading ".../" when the directory part cannot be shown at all.
    remaining_length = max_length - len(filename) - 3
    if remaining_length <= 0:
        shortened = f".../{filename}"
    else:
        path_part = '/'.join(components[:-1])
        if len(path_part) > remaining_length:
            # Clamp at 0: with a budget below 3 the old negative bound kept
            # almost the whole directory and produced a longer result.
            keep = max(remaining_length - 3, 0)
            path_part = f"{path_part[:keep]}..."
        shortened = f"{path_part}/{filename}"
    
    logger.debug(f"Shortened path to: {shortened}")
    return shortened

def normalize_path(path: str) -> str:
    """Return a canonical form of ``path`` for comparisons.

    The path is lower-cased, backslashes become forward slashes, and leading
    and trailing slashes are removed; a single leading slash is restored when
    the input started with ``/``. Non-string, blank or ``"N/A"`` input yields
    ``"N/A"``.
    """
    logger.debug(f"Normalizing path: {path}")

    unusable = not isinstance(path, str) or not path.strip() or path == "N/A"
    if unusable:
        return "N/A"

    body = path.lower().replace('\\', '/').strip('/')
    # Only a literal leading "/" counts as absolute; a leading backslash does not.
    normalized = f"/{body}" if path.startswith('/') else body

    logger.debug(f"Normalized path to: {normalized}")
    return normalized

def get_tree_name_mapping(df: pd.DataFrame) -> dict:
    """Map SMB2 tree IDs to tree names using Tree Connect request frames.

    A frame is used when ``smb2.cmd == '3'`` and
    ``smb2.flags.response == 'False'`` (both compared as strings) and it has
    a non-blank string ``smb2.tid`` other than ``'0'`` and a non-blank string
    ``smb2.tree``. Frames with missing (NaN) or non-string values in either
    column are skipped.

    Args:
        df: Frame table containing the columns ``smb2.cmd``,
            ``smb2.flags.response``, ``smb2.tid`` and ``smb2.tree``.

    Returns:
        dict: ``{tid: tree_name}``; empty when a required column is missing
        or no frame qualifies. A later frame overwrites an earlier one with
        the same tid.
    """
    # NOTE(review): this compares smb2.cmd to the scalar '3'; frames whose
    # smb2.cmd was turned into a list (as run_ingestion does for session
    # frames) would never match - confirm which frame table callers pass.
    logger.info("Generating tree name mapping")
    tree_mapping = {}
    required_columns = ['smb2.cmd', 'smb2.flags.response', 'smb2.tid', 'smb2.tree']
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        logger.warning(f"Missing columns for tree mapping: {missing}")
        return tree_mapping
    
    tree_connects = df[(df['smb2.cmd'] == '3') & (df['smb2.flags.response'] == 'False')]
    logger.debug(f"Found {len(tree_connects)} Tree Connect request frames")
    for _, row in tree_connects.iterrows():
        tid = row.get('smb2.tid', '')
        tree_name = row.get('smb2.tree', '')
        # Type-check before any str method: empty cells arrive as float NaN.
        if not isinstance(tid, str) or not isinstance(tree_name, str):
            continue
        if tid.strip() and tid != '0' and tree_name.strip():
            tree_mapping[tid] = tree_name
            logger.debug(f"Mapped tree ID {tid} to {tree_name}")
    logger.info(f"Tree mapping created with {len(tree_mapping)} records")
    return tree_mapping

# Publish the Cell 6 helpers on the builtins module so that later cells can
# resolve them by bare name (or via `from builtins import ...`).
builtins.load_capture = load_capture
builtins.get_output_dir = get_output_dir
builtins.shorten_path = shorten_path
builtins.normalize_path = normalize_path
builtins.get_tree_name_mapping = get_tree_name_mapping
builtins.list_session_files = list_session_files

# Nothing is loaded when this cell runs; this only records that the helpers
# above are now defined.
logger.info("Session utility functions initialized. Capture loading and session selection deferred to dashboard.")

In [None]:
# Cell 7: Session Loading and Filtering - Loads and filters SMB2 sessions
import json
import pandas as pd
import os
import time
import pyarrow.parquet as pq
import uuid
import builtins

# Import from builtins (set in Cells 1–6)
from builtins import logger, check_ssh_connectivity, FIELDS, FIELD_MAPPINGS, HEX_FIELDS, SMB2_OP_NAME_DESC, load_capture, get_output_dir, shorten_path, normalize_path, get_tree_name_mapping

# Module-level state shared by the functions in this cell.
# `operations` is cleared and refilled in place by update_operations, so the
# same list object is reused across calls.
operations = []
# Rebound by load_and_summarize_session to the most recently loaded session
# DataFrame; None until a session has been loaded.
session_frames = None
# Random UUID generated each time this cell runs.
# NOTE(review): not referenced by the code in this cell - confirm it is used elsewhere.
_execution_id = str(uuid.uuid4())  # Unique ID for this run

def load_and_summarize_session(capture, session_file):
    """Load one session Parquet file and derive the dashboard option lists.

    On a successful read the loaded frame is stored in the module-level
    ``session_frames`` and mirrored to ``builtins.session_frames`` so that
    readers of the builtins attribute see the current session rather than
    the value bound when this cell was first run.

    Args:
        capture: Capture path, used to derive the output directory.
        session_file: File name of the session Parquet file in that directory.

    Returns:
        tuple: ``(session_frames, field_options, file_options, selected_fields)``.
        ``field_options`` are the sorted ``smb2.*`` columns minus the volatile
        time fields, ``file_options`` the sorted distinct ``smb2.filename``
        values (excluding ``'N/A'`` and ``''``), and ``selected_fields`` the
        default fields that are present. ``(None, [], [], [])`` is returned
        when the directory, the file or its content is unavailable.
    """
    global session_frames
    logger.info(f"Loading session file: {session_file}")
    
    output_dir = get_output_dir(capture)
    if not output_dir:
        logger.warning("Invalid output directory. Configure capture in Cell 8 dashboard.")
        return None, [], [], []
    
    session_path = os.path.join(output_dir, session_file)
    if not os.path.exists(session_path):
        logger.warning(f"Session file not found: {session_path}. Note: Path may be remote; consider SSH validation.")
        return None, [], [], []
    
    try:
        loaded_frames = pq.read_table(session_path).to_pandas()
        logger.info(f"Loaded session {session_file} with {len(loaded_frames)} frames")
    except Exception as e:
        logger.warning(f"Error loading session file: {e}. Try re-ingesting the PCAP in Cell 5.")
        return None, [], [], []
    
    # Rebinding the module global alone does not update the attribute that
    # was copied onto builtins earlier, so publish the new frame there too.
    session_frames = loaded_frames
    builtins.session_frames = loaded_frames
    
    if loaded_frames.empty:
        logger.warning(f"No frames found in session {session_file}")
        return None, [], [], []
    
    # Get field options
    all_fields = sorted([col for col in loaded_frames.columns if col.startswith('smb2.')])
    volatile_fields = ["smb2.time", "smb2.frame.time"]
    field_options = [f for f in all_fields if f not in volatile_fields]
    default_fields = ['smb2.nt_status', 'smb2.create.action']
    selected_fields = [f for f in default_fields if f in field_options]
    
    # Get file options, excluding "Entire Stream"
    if 'smb2.filename' in loaded_frames.columns:
        filenames = loaded_frames['smb2.filename'].dropna()
    else:
        filenames = []
    file_options = sorted(set(filenames) - {'N/A', ''})  # No "Entire Stream" option
    
    logger.debug(f"Field options: {field_options[:5]}... (total: {len(field_options)})")
    logger.debug(f"File options: {file_options[:5]}... (total: {len(file_options)})")
    
    return loaded_frames, field_options, file_options, selected_fields

def update_operations(capture, session_file, selected_file=None, selected_fields=None):
    """Build the list of operation records for the dashboard.

    The module-level ``operations`` list is cleared and refilled in place.
    Each mapped field in ``FIELD_MAPPINGS`` is normalized and given a
    ``<field>_desc`` column; values without a mapping entry are described as
    ``Unknown (<value>)``, except ``smb2.create.action`` and
    ``smb2.ioctl.function`` which are left blank when not meaningful.

    Args:
        capture: Capture path, used only if the session still has to be loaded.
        session_file: Session Parquet file name, used only for that load.
        selected_file: When truthy, only frames whose normalized
            ``smb2.filename`` equals this (normalized) path are kept.
        selected_fields: Extra fields to copy into each record; defaults to
            ``['smb2.nt_status', 'smb2.create.action']``. Unknown columns are
            ignored.

    Returns:
        list[dict]: The ``operations`` list, one record per frame with the
        keys ``Frame``, ``Command``, ``Path``, ``Status``, ``StatusDesc``,
        ``TID``, ``orig_idx`` plus the selected fields. An empty list is
        returned when the session cannot be loaded or no frame matches.
    """
    global operations, session_frames
    operations.clear()
    logger.info(f"Preparing operations for session: {session_file}, file: {selected_file}, fields: {selected_fields}")
    
    # NOTE(review): already-loaded frames are reused whatever `session_file`
    # is; callers must reload via load_and_summarize_session when switching
    # sessions - confirm the dashboard does so.
    if session_frames is None or session_frames.empty:
        session_frames, _, _, _ = load_and_summarize_session(capture, session_file)
        if session_frames is None or session_frames.empty:
            logger.warning(f"Failed to load session data for {session_file}")
            return []
    
    if selected_fields is None:
        selected_fields = ['smb2.nt_status', 'smb2.create.action']
    selected_fields = [f for f in selected_fields if f in session_frames.columns]  # Filter invalid fields
    
    def describe(value, mapping):
        """Return the mapped description for value, or 'Unknown (<value>)'."""
        try:
            label = mapping.get(value)
        except TypeError:
            # Unhashable cell (e.g. a list) cannot be a mapping key.
            label = None
        return label if label is not None else f"Unknown ({value})"
    
    # Apply FIELD_MAPPINGS for normalization and mapping
    filtered_frames = session_frames.copy()
    for field in FIELD_MAPPINGS:
        if field in filtered_frames.columns:
            mapping = FIELD_MAPPINGS[field]["mapping"]
            normalize = FIELD_MAPPINGS[field]["normalize"]
            logger.debug(f"Normalizing field: {field}")
            filtered_frames[field] = filtered_frames[field].apply(normalize)
            
            # Special handling for fields that should only show when meaningful
            if field in ["smb2.create.action", "smb2.ioctl.function"]:
                # Only apply mapping to non-null, non-empty values
                filtered_frames[f"{field}_desc"] = filtered_frames[field].apply(
                    lambda x: mapping.get(str(x), "") if pd.notna(x) and str(x).strip() != "" and str(x) != "None" else ""
                )
            else:
                # Build the fallback per row; a single f-string over the whole
                # column would embed the repr of the entire Series in every row.
                filtered_frames[f"{field}_desc"] = filtered_frames[field].apply(
                    lambda x: describe(x, mapping)
                )
    
    # Filter frames based on selected file
    if selected_file is None or not selected_file:  # No filter means all frames
        logger.info("No file selected, processing all frames")
    else:
        filtered_frames = filtered_frames[filtered_frames['smb2.filename'].apply(normalize_path) == normalize_path(selected_file)]
    
    logger.debug(f"Filtered {len(filtered_frames)} frames")
    if filtered_frames.empty and selected_file is not None:
        logger.warning(f"No operations found for file: {selected_file}")
        return []
    
    # Helper function for normalization
    def normalize_field(field_value):
        """Return the first comma-separated item of a cell, or 'N/A' if empty."""
        if field_value is None:
            return "N/A"
        if not isinstance(field_value, str):
            if pd.api.types.is_list_like(field_value):
                # List-valued cell: treat it like its comma-joined form.
                field_value = ','.join(str(item) for item in field_value)
            elif pd.isna(field_value):
                return "N/A"
            else:
                field_value = str(field_value)
        if field_value.strip() == '':
            return "N/A"
        return field_value.split(',')[0].strip()

    # Process frames
    start_time = time.time()
    total_frames = len(filtered_frames)
    mandatory_fields = ['frame.number', 'smb2.cmd', 'smb2.filename', 'smb2.nt_status', 'smb2.flags.response']
    for position, (idx, row) in enumerate(filtered_frames.iterrows()):
        # Report progress by position: index labels are neither contiguous
        # after filtering nor guaranteed to be integers.
        if position and position % 10000 == 0:
            logger.debug(f"Processing frame {position}/{total_frames}")
        
        # Normalize fields
        filename = normalize_field(row.get('smb2.filename', 'N/A'))
        tid = normalize_field(row.get('smb2.tid', 'N/A'))
        path = shorten_path(filename) if filename != "N/A" else "N/A"
        
        # Use mapped and normalized fields
        status_display = row.get('smb2.nt_status_desc', 'N/A')
        if not isinstance(status_display, str):
            status_display = 'N/A'
        op_name = row.get('smb2.cmd_desc', 'Unknown Request / Response')
        
        # Handle status description
        status_desc = status_display.split('(')[0].strip() if status_display != 'N/A' else 'Not applicable'
        
        op = {
            'Frame': row.get('frame.number', 'N/A'),
            'Command': op_name,
            'Path': path,
            'Status': status_display,
            'StatusDesc': status_desc,
            'TID': tid,
            'orig_idx': idx
        }
        
        # Add selected fields
        for field in selected_fields:
            if field not in mandatory_fields:
                # Special handling for fields that should only show when meaningful
                if field in ["smb2.create.action", "smb2.ioctl.function"]:
                    value = row.get(f"{field}_desc", row.get(field, ""))
                    # Only include if there's actual meaningful data
                    if value and str(value).strip() != "" and str(value) != "N/A" and not str(value).startswith("Unknown"):
                        op[field] = str(value)
                    # Don't add the field if it's empty or meaningless
                else:
                    value = row.get(f"{field}_desc", row.get(field, 'N/A'))
                    op[field] = str(value) if value is not None else 'N/A'
        # The ioctl function is always included when present and meaningful,
        # whether or not it was selected.
        if 'smb2.ioctl.function' in filtered_frames.columns:
            ioctl_value = row.get('smb2.ioctl.function_desc', row.get('smb2.ioctl.function', ''))
            # Only include ioctl function if there's meaningful data
            if ioctl_value and str(ioctl_value).strip() != "" and str(ioctl_value) != "N/A" and not str(ioctl_value).startswith("Unknown"):
                op['smb2.ioctl.function'] = str(ioctl_value)
        
        operations.append(op)
        logger.debug(f"Processed row {idx}: filename={filename}, status_code={row.get('smb2.nt_status', 'N/A')}, tid={tid}, path={path}")
    
    logger.info(f"Processed {len(operations)} operations in {time.time() - start_time:.2f}s")
    return operations

# Publish the Cell 7 functions and state on the builtins module for Cell 8.
builtins.load_and_summarize_session = load_and_summarize_session
builtins.update_operations = update_operations
# `operations` is exported as the list object itself; update_operations
# mutates it in place, so readers of builtins.operations see later updates.
builtins.operations = operations
# NOTE: this exports the value bound right now (None). Code that later
# rebinds the module-level `session_frames` must also assign
# builtins.session_frames for readers of the builtins attribute to see it.
builtins.session_frames = session_frames

# Nothing is loaded when this cell runs; this only records that the
# functions above are now defined.
logger.info("Session loading and filtering utilities initialized. Session processing deferred to dashboard.")

In [None]:
# Cell 8: Replay Mechanism Definitions for SMB2 Sessions
import time
from impacket.smbconnection import SMBConnection, SessionError
from IPython.display import display, HTML, clear_output
import os
import builtins

# Import from builtins (set in Cells 1–7)
from builtins import logger, replay_config, SMB2_OP_NAME_DESC

def setup_pre_trace_state(conn, selected_operations, default_tree_id, share_name=None):
    """
    Set up the file system state on the lab server before replaying the selected operations.

    Every parent directory of every referenced path is created first
    (shallowest first). Each referenced path that is neither one of those
    directories nor created by the selected operations themselves is then
    opened-or-created as an empty file and closed again.

    Args:
        conn: SMBConnection object to the lab server.
        selected_operations: List of selected operation dictionaries.
        default_tree_id: Tree ID of the connected share, used for creating files.
        share_name: Name of the share used for creating directories
            (impacket's createDirectory takes a share name, not a tree ID).
            Defaults to replay_config["tree_name"].

    SessionError failures are logged and skipped; an already existing
    directory (STATUS_OBJECT_NAME_COLLISION) is ignored silently.
    """
    # CreateDisposition value: open the file if it exists, otherwise create it.
    FILE_OPEN_IF = 3

    logger.info("Setting up pre-trace state for selected operations")
    if share_name is None:
        share_name = replay_config.get("tree_name", "2pm")

    # NOTE(review): this reads raw tshark-style keys ('smb2.filename',
    # 'smb2.cmd', ...) from each operation; confirm the dashboard passes
    # dictionaries that carry those keys.
    # Collect all unique file paths and identify files created in the selected operations
    all_paths = set()
    created_files = set()
    for op in selected_operations:
        filename = op.get('smb2.filename', '')
        if filename and filename not in ['.', '..']:
            all_paths.add(filename)
        # Identify files created in the selected operations
        if (op.get('smb2.cmd') == '5' and 
            op.get('smb2.flags.response') == 'True' and 
            op.get('smb2.create.action') == 'FILE_CREATED'):
            created_files.add(filename)

    # Infer directories from paths: every proper prefix of a path is a directory
    directories = set()
    for path in all_paths:
        parts = path.split('\\')
        for i in range(1, len(parts)):
            dir_path = '\\'.join(parts[:i])
            if dir_path:
                directories.add(dir_path)

    # Create directories, parents before children
    for dir_path in sorted(directories, key=lambda x: (x.count('\\'), x)):
        try:
            conn.createDirectory(share_name, dir_path)
            logger.debug(f"Created directory: {dir_path}")
        except SessionError as e:
            if "STATUS_OBJECT_NAME_COLLISION" not in str(e):
                logger.error(f"Failed to create directory {dir_path}: {e}")

    # Create files that existed before the selected operations
    # NOTE(review): a referenced path that is itself a directory but has no
    # children in the selection is created as a file here - confirm.
    for path in sorted(all_paths):
        if path in directories or path in created_files:
            continue
        try:
            file_id = conn.createFile(default_tree_id, path, creationDisposition=FILE_OPEN_IF)
            # Release the handle right away; only the file's existence matters.
            conn.closeFile(default_tree_id, file_id)
            logger.debug(f"Created pre-existing file: {path}")
        except SessionError as e:
            logger.error(f"Failed to create file {path}: {e}")

def replay_session(selected_operations, output_widget):
    """
    Replay selected SMB2 operations against the lab server using impacket.

    Requests (``smb2.flags.response`` != 'True') are dispatched to a per-command
    handler; Tree Connect and Create responses are used only to map the tree/file
    IDs seen in the trace onto the IDs returned by the lab server.

    Args:
        selected_operations: List of selected operation dictionaries.
        output_widget: IPython widget for displaying replay status.

    Returns:
        None. Progress is logged and the final status is rendered in ``output_widget``.
        Any exception raised during the replay is logged and displayed, not re-raised.
    """
    # Local import so this cell does not depend on another cell having imported html.
    from html import escape

    logger.info("Starting replay_session for selected operations")
    if not selected_operations:
        logger.info("No operations selected for replay")
        with output_widget:
            clear_output(wait=True)
            display(HTML("<p style='color: red;'>No operations selected for replay.</p>"))
        return

    # Extract server configuration from replay_config
    server_ip = replay_config.get("server_ip", "10.216.29.241")
    domain = replay_config.get("domain", "nas-deep.local")
    username = replay_config.get("username", "jtownsen")
    password = replay_config.get("password", "")
    default_tree_name = replay_config.get("tree_name", "2pm")
    max_wait = replay_config.get("max_wait", 5.0)

    # The password is deliberately left out of this log line.
    logger.debug(f"Using replay_config: server_ip={server_ip}, domain={domain}, username={username}, tree_name={default_tree_name}, max_wait={max_wait}")

    try:
        # Establish SMB connection
        logger.debug(f"Connecting to SMB server: {server_ip}")
        conn = SMBConnection(server_ip, server_ip, timeout=max_wait)
        try:
            conn.login(username, password, domain)
            logger.info("Successfully connected to SMB server")

            # Connect to the default tree
            default_tree_id = conn.connectTree(default_tree_name)
            logger.debug(f"Connected to default tree {default_tree_name}, tree_id={default_tree_id}")

            # Setup pre-trace state for selected operations
            setup_pre_trace_state(conn, selected_operations, default_tree_id)

            # Initialize mappings for tree IDs and file IDs (trace ID -> lab server ID)
            tid_mapping = {}
            fid_mapping = {}
            # IDs returned by the most recent Tree Connect / Create request, waiting
            # for the matching response record to supply the original trace ID.
            state = {'last_new_tid': None, 'last_new_fid': None}

            # Command handlers. They all take the same five arguments because the
            # dispatch loop below calls every handler identically; a handler with a
            # shorter signature would raise TypeError and abort the replay.
            def handle_tree_connect(conn, op, tid_mapping, fid_mapping, state):
                share_path = op.get('smb2.tree', '')
                # Keep only the last path component (\\server\share -> share).
                share_name = share_path.split('\\')[-1] if '\\' in share_path else share_path
                state['last_new_tid'] = conn.connectTree(share_name)
                logger.debug(f"Tree Connect: {share_name}, new_tid={state['last_new_tid']}")

            def handle_create(conn, op, tid_mapping, fid_mapping, state):
                original_tid = op.get('smb2.tid', '')
                new_tid = tid_mapping.get(original_tid, default_tree_id)
                filename = op.get('smb2.filename', '')
                disposition = int(op.get('smb2.create_disposition', 1))  # Default FILE_OPEN
                state['last_new_fid'] = conn.createFile(new_tid, filename, disposition=disposition)
                logger.debug(f"Create: {filename}, new_fid={state['last_new_fid']}")

            def handle_close(conn, op, tid_mapping, fid_mapping, state):
                original_tid = op.get('smb2.tid', '')
                original_fid = op.get('smb2.fid', '')
                new_tid = tid_mapping.get(original_tid, default_tree_id)
                new_fid = fid_mapping.get(original_fid)
                if new_fid:
                    conn.closeFile(new_tid, new_fid)
                    logger.debug(f"Close: fid={original_fid}")

            def handle_read(conn, op, tid_mapping, fid_mapping, state):
                original_tid = op.get('smb2.tid', '')
                original_fid = op.get('smb2.fid', '')
                new_tid = tid_mapping.get(original_tid, default_tree_id)
                new_fid = fid_mapping.get(original_fid)
                if new_fid:
                    offset = int(op.get('smb2.read.offset', 0))
                    length = int(op.get('smb2.read.length', 1024))
                    conn.readFile(new_tid, new_fid, offset, length)
                    logger.debug(f"Read: fid={original_fid}, offset={offset}, length={length}")

            def handle_write(conn, op, tid_mapping, fid_mapping, state):
                original_tid = op.get('smb2.tid', '')
                original_fid = op.get('smb2.fid', '')
                new_tid = tid_mapping.get(original_tid, default_tree_id)
                new_fid = fid_mapping.get(original_fid)
                if new_fid:
                    offset = int(op.get('smb2.write.offset', 0))
                    data = bytes.fromhex(op.get('smb2.write_data', '')) if op.get('smb2.write_data') else b''
                    conn.writeFile(new_tid, new_fid, data, offset)
                    logger.debug(f"Write: fid={original_fid}, offset={offset}, data_length={len(data)}")

            command_handlers = {
                3: handle_tree_connect,  # Tree Connect
                5: handle_create,       # Create
                6: handle_close,        # Close
                8: handle_read,         # Read
                9: handle_write         # Write
            }

            # Process selected operations in order
            for op in selected_operations:
                is_response = op.get('smb2.flags.response') == 'True'
                try:
                    cmd = int(op.get('smb2.cmd', -1))
                except (TypeError, ValueError):
                    # One malformed record should not abort the whole replay.
                    logger.warning(f"Skipping operation with unparseable smb2.cmd: {op.get('smb2.cmd')!r}")
                    continue

                if not is_response:  # Request
                    if cmd in command_handlers:
                        try:
                            command_handlers[cmd](conn, op, tid_mapping, fid_mapping, state)
                        except (SessionError, ValueError) as e:
                            # SessionError: the server rejected the call.
                            # ValueError: a numeric/hex field in the record could not be parsed.
                            logger.error(f"Error executing command {cmd}: {e}")
                    elif 0 <= cmd <= 18:
                        logger.warning(f"Command {cmd} ({SMB2_OP_NAME_DESC.get(cmd, ('Unknown', 'Unknown'))[0]}) not yet implemented")
                    else:
                        logger.warning(f"Invalid command code: {cmd}")
                else:  # Response
                    if cmd == 3:  # Tree Connect response
                        original_tid = op.get('smb2.tid', '')
                        if state['last_new_tid'] is not None:
                            tid_mapping[original_tid] = state['last_new_tid']
                            logger.debug(f"Mapped tid {original_tid} to {state['last_new_tid']}")
                            state['last_new_tid'] = None
                    elif cmd == 5:  # Create response
                        original_fid = op.get('smb2.fid', '')
                        if state['last_new_fid'] is not None:
                            fid_mapping[original_fid] = state['last_new_fid']
                            logger.debug(f"Mapped fid {original_fid} to {state['last_new_fid']}")
                            state['last_new_fid'] = None
        finally:
            # Clean up on success *and* failure so a failed replay does not leave
            # a session open on the lab server.
            logger.debug("Disconnecting from SMB server")
            try:
                conn.close()
                logger.info("Disconnected from SMB server")
            except Exception as close_error:
                # Do not let a disconnect problem mask the outcome of the replay itself.
                logger.warning(f"Error while disconnecting from SMB server: {close_error}")

        with output_widget:
            clear_output(wait=True)
            display(HTML("<p style='color: green;'>Replay completed.</p>"))

    except Exception as e:
        logger.critical(f"Error during replay: {e}")
        with output_widget:
            clear_output(wait=True)
            # Escape the exception text: it may contain '<', '>' or '&' (paths, status names).
            display(HTML(f"<p style='color: red;'>Replay failed: {escape(str(e))}</p>"))

# Export to builtins for Cell 10
# Cell 10 imports replay_session with `from builtins import ...`, so it must be
# published on the builtins module rather than left as a plain notebook global.
builtins.replay_session = replay_session

In [None]:
# Cell 9: Dashboard Setup for Session Visualization and Configuration
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import os
import json
import pyarrow as pa
import pyarrow.parquet as pq
import builtins
import logging

# Import from builtins (set in Cells 1–8)
from builtins import logger, pcap_config, replay_config, capture
from builtins import load_capture, get_output_dir, shorten_path, normalize_path, get_tree_name_mapping, list_session_files
from builtins import load_and_summarize_session, update_operations, operations, session_frames, run_ingestion
from builtins import FIELD_MAPPINGS, HEX_FIELDS

# Custom JupyterOutputHandler for log_output
class JupyterOutputHandler(logging.Handler):
    """Logging handler that renders each formatted record inside an output widget.

    Args:
        output_widget: Context-manager widget (e.g. ``ipywidgets.Output``) that
            captures whatever is displayed while it is entered.
    """

    def __init__(self, output_widget):
        super().__init__()
        self.output_widget = output_widget
        self.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - [%(asctime)s] %(message)s", datefmt="%a %b %d %H:%M:%S %Y"))

    def emit(self, record):
        """Format ``record`` and display it as an HTML ``<pre>`` block in the widget."""
        # Local import keeps the class usable without touching the cell's import list.
        from html import escape
        try:
            # Escape the text: log messages routinely contain '<', '>' and '&'
            # (paths, exception reprs) which would otherwise be parsed as markup.
            msg = escape(self.format(record))
            with self.output_widget:
                display(HTML(f"<pre>{msg}</pre>"))
        except Exception:
            # Standard logging contract: a failing handler must not raise into
            # the code that called logger.info()/debug()/...
            self.handleError(record)

# Initialize output widgets
builtins.log_output = widgets.Output()
builtins.output_cell = widgets.Output()
builtins.progress_output = widgets.Output()
builtins.progress_label = widgets.HTML(value="Ready")

# Add JupyterOutputHandler to logger.
# The logger object survives cell re-runs, so first drop any handler attached by a
# previous run of this cell; otherwise every record is rendered once per run and
# into Output widgets that are no longer part of the dashboard. The match is by
# class name because re-running the cell re-creates the class object, which makes
# isinstance() checks against the new class fail for old handlers.
for stale_handler in [h for h in logger.handlers if h.__class__.__name__ == "JupyterOutputHandler"]:
    logger.removeHandler(stale_handler)
jupyter_handler = JupyterOutputHandler(builtins.log_output)
logger.addHandler(jupyter_handler)

# Log confirmation
logger.info("Initialized output widgets: log_output, output_cell, progress_output, progress_label")

# Define dashboard widgets
# Translate the logger's effective level back onto the 0-3 verbosity scale used by
# the slider (0 = CRITICAL only, 1 = INFO and above, 2/3 = DEBUG). Integer-dividing
# the level by 10 does not invert that scale: INFO (20) would give 2 and
# CRITICAL (50) would give 5, which is outside the slider's range.
initial_level = logger.getEffectiveLevel()
if initial_level >= logging.CRITICAL:
    initial_verbosity = 0
elif initial_level > logging.DEBUG:
    # INFO, and the in-between WARNING/ERROR levels that have no slider position.
    initial_verbosity = 1
else:
    initial_verbosity = 2

builtins.debug_slider = widgets.IntSlider(
    value=initial_verbosity,
    min=0,
    max=3,
    description="Debug Level:",
    layout={'width': '700px'}
)

# --- Case / capture / session selection ---------------------------------------
# Free-text case number field; starts empty with an example value as placeholder.
builtins.case_number_input = widgets.Text(
    value="", placeholder="2010373016", description="Case Number:",
    disabled=False, layout=dict(width='700px'),
)

# Both dropdowns start disabled and hold only their prompt entry.
builtins.capture_dropdown = widgets.Dropdown(
    options=["Select a capture"], description="Capture:",
    disabled=True, layout=dict(width='700px'),
)
builtins.session_dropdown = widgets.Dropdown(
    options=["Select a session"], description="Session:",
    disabled=True, layout=dict(width='700px'),
)

# --- Action buttons (all created disabled) ------------------------------------
builtins.ingest_button = widgets.Button(
    description="Ingest Trace", button_style="warning",
    tooltip="Ingest the selected capture into Parquet",
    disabled=True, layout=dict(visibility='visible'),
)
builtins.reingest_button = widgets.Button(
    description="Re-ingest", button_style="warning",
    tooltip="Re-ingest the selected capture, overwriting existing data",
    disabled=True, layout=dict(visibility='visible'),
)
builtins.replay_button = widgets.Button(
    description="Replay Session", button_style="success",
    tooltip="Replay the selected session on the lab server",
    disabled=True, layout=dict(visibility='visible'),
)

# Lay the three buttons out side by side.
builtins.button_box = widgets.HBox(
    [builtins.ingest_button, builtins.reingest_button, builtins.replay_button]
)

# --- File filter and field selection ------------------------------------------
# "Entire Stream" is the only option until the list is populated elsewhere.
builtins.file_combobox = widgets.Combobox(
    options=["Entire Stream"],
    placeholder="Select a file (or use Entire Stream for all)",
    description="File Filter:",
    ensure_option=True,
    disabled=True,
    layout=dict(width='900px'),
    style=dict(description_width='initial'),
    rows=10,
)

# Apply custom CSS for combobox width
display(HTML("""
<style>
    .widget-combobox > select {
        width: 900px !important;
        max-width: 900px !important;
        min-width: 900px !important;
    }
</style>
"""))

# Multi-select of columns to show; empty and disabled at creation time.
builtins.check_fields_select = widgets.SelectMultiple(
    options=[], description="Fields:", disabled=True,
    layout=dict(width='900px'),
    tooltip="Select fields to display as columns in the table for Entire Stream",
)

# Replay-server settings. Each field is pre-filled from replay_config and falls
# back to a default only when the key is missing.
builtins.server_ip_input = widgets.Text(
    value=replay_config.get("server_ip", "10.216.29.241"),
    placeholder="Enter server IP",
    description="Server IP:",
    disabled=False
)

builtins.domain_input = widgets.Text(
    value=replay_config.get("domain", "nas-deep.local"),
    placeholder="Enter domain",
    description="Domain:",
    disabled=False
)

builtins.username_input = widgets.Text(
    value=replay_config.get("username", "jtownsen"),
    placeholder="Enter username",
    description="Username:",
    disabled=False
)

# Never fall back to a literal credential: a password embedded in the notebook is
# shared with everyone who receives the file. An empty field (the same fallback
# replay_session uses) makes the user supply it explicitly.
builtins.password_input = widgets.Password(
    value=replay_config.get("password", ""),
    placeholder="Enter password",
    description="Password:",
    disabled=False
)

builtins.tree_name_input = widgets.Text(
    value=replay_config.get("tree_name", "2pm"),
    placeholder="Enter tree name",
    description="Tree Name:",
    disabled=False
)

builtins.max_time_input = widgets.FloatText(
    value=replay_config.get("max_wait", 5.0),
    description="Max Wait (s):",
    disabled=False
)

# Button for persisting the replay server settings; created disabled.
builtins.save_button = widgets.Button(
    description="Save Config", button_style="info",
    tooltip="Save replay server configuration", disabled=True,
)

# Stack every control vertically, one widget per row, in display order.
builtins.dashboard = widgets.VBox([
    builtins.debug_slider,
    builtins.case_number_input,
    builtins.capture_dropdown,
    builtins.session_dropdown,
    builtins.button_box,
    builtins.file_combobox,
    builtins.check_fields_select,
    builtins.server_ip_input,
    builtins.domain_input,
    builtins.username_input,
    builtins.password_input,
    builtins.tree_name_input,
    builtins.max_time_input,
    builtins.save_button,
    builtins.log_output,
    builtins.progress_output,
    builtins.output_cell,
])

# Pre-fill the case number from the configured capture path, when that path exists.
capture = load_capture()
if not (capture and os.path.exists(capture)):
    logger.info("No valid initial capture path in config. Enter a case number and select a capture.")
    with builtins.output_cell:
        clear_output(wait=True)
        display(HTML("<p>Enter a case number and select a capture to view data.</p>"))
else:
    # The case number is taken from the third os.sep-separated component of the path.
    path_parts = capture.split(os.sep)
    case_num = path_parts[2] if len(path_parts) > 2 else ""
    if case_num:
        builtins.case_number_input.value = case_num
        logger.info(f"Initialized case number: {case_num} from capture: {capture}")

# Re-publish the slider and both config dicts on builtins for later cells.
builtins.debug_slider = debug_slider
builtins.pcap_config = pcap_config
builtins.replay_config = replay_config
logger.info("Dashboard widgets initialized and exported to builtins.")

In [None]:
# Cell 10: Event Handlers and Rendering for Session Visualization Dashboard
import time
import pandas as pd
from IPython.display import display, HTML, clear_output
import os
import pickle
import builtins
import mimetypes
import struct

# Import from builtins (set in Cells 1–9) with fallback
try:
    from builtins import (
        logger, pcap_config, replay_config, replay_session,
        load_capture, get_output_dir, list_session_files, load_and_summarize_session,
        update_operations, operations, session_frames, run_ingestion,
        log_output, output_cell, progress_output, progress_label,
        case_number_input, capture_dropdown, session_dropdown,
        ingest_button, reingest_button, replay_button, button_box, file_combobox,
        check_fields_select, server_ip_input, domain_input, username_input,
        password_input, tree_name_input, max_time_input, save_button, dashboard,
        FIELD_MAPPINGS, HEX_FIELDS, CREATE_ACTION_DESC
    )
except ImportError as e:
    logger.critical(f"Failed to import from builtins: {e}. Check Cell 9 execution.")
    raise

# Absolute path of the pickled configuration file.
# NOTE(review): Cell 1 builds the same path from config_dir + "config.pkl";
# keep the two in sync if the location ever changes.
config_file = "/home/jovyan/work/smbreplay/config.pkl"

def status_callback(message):
    """Log a status message and also render it in ``log_output``.

    The message is written through ``logger.info`` and, independently of the
    logger's level, displayed as a ``<pre>`` block inside the log output widget.

    Args:
        message: Status text (any object; it is converted with ``str``).
    """
    # Local import keeps the function independent of the cell's import list.
    from html import escape

    logger.info(f"Status: {message}")
    with log_output:
        # Escape so text such as "<none>" or "a & b" is shown literally instead
        # of being interpreted as HTML.
        display(HTML(f"<pre>{escape(str(message))}</pre>"))

builtins.status_callback = status_callback

# File extensions accepted as captures when no signature matched.
_CAPTURE_EXTENSIONS = frozenset([
    '.pcap', '.pcapng', '.cap', '.dmp', '.5vw', '.acp', '.apc', '.atc',
    '.bfr', '.enc', '.erf', '.fdc', '.pkt', '.trc', '.trace', '.wpz',
    '.snoop', '.rf5', '.ntar', '.wpc', '.logpkt', '.out', '.raw'
])

# Substrings that mark a filename as capture-related (handles pcap1, trace_data, ...).
# Order matters only for which pattern is reported in the debug log.
_CAPTURE_NAME_PATTERNS = (
    'pcap', 'capture', 'trace', 'packet', 'sniff', 'dump', 'traffic',
    'network', 'wireshark', 'tcpdump', 'tshark', 'netmon', 'sniffer',
    'ethereal', 'cap', 'pkt', 'trc'
)

# Filename endings that identify a gzip-compressed capture.
_GZIP_CAPTURE_SUFFIXES = ('.pcap.gz', '.pcapng.gz', '.cap.gz', '.trace.gz', '.dmp.gz')

# Extensions that veto a filename-pattern match.
_NON_CAPTURE_EXTENSIONS = frozenset([
    '.parquet', '.db', '.sqlite', '.sqlite3', '.jpg', '.jpeg', '.png', '.gif',
    '.bmp', '.zip', '.tar', '.bz2', '.xz', '.rar', '.7z', '.pdf',
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.mp4', '.avi',
    '.mov', '.mkv', '.mp3', '.wav', '.exe', '.dll', '.so', '.dylib'
])

# (leading bytes, description used in the debug log) for capture formats.
_CAPTURE_MAGICS = (
    (b'\xa1\xb2\xc3\xd4', 'PCAP file by signature'),       # microsecond big endian
    (b'\xd4\xc3\xb2\xa1', 'PCAP file by signature'),       # microsecond little endian
    (b'\xa1\xb2\x3c\x4d', 'PCAP file by signature'),       # nanosecond big endian
    (b'\x4d\x3c\xb2\xa1', 'PCAP file by signature'),       # nanosecond little endian
    (b'\x0a\x0d\x0d\x0a', 'PCAP-NG file by signature'),    # Section Header Block
    (b'snoop\x00\x00\x00', 'Snoop capture file'),
    (b'\x05\x56', 'Visual Networks capture file'),
    # NOTE(review): Wireshark's wiretap appears to attribute the "TRSNIFF data"
    # magic to Network General Sniffer rather than Microsoft Network Monitor;
    # the log label is kept as-is, confirm before relabelling.
    (b'TRSNIFF data', 'Network Monitor capture file'),
    (b'XCP\x00', 'NetXray capture file'),
    (b'\x01\x10\x00\x00', 'LANalyzer capture file'),
)

# (leading bytes, description) for formats rejected before the gzip check.
_NON_CAPTURE_MAGICS = (
    (b'SQLite format 3\x00', 'SQLite database'),
    (b'\xff\xd8\xff', 'JPEG image'),
    (b'\x89PNG\r\n\x1a\n', 'PNG image'),
    (b'GIF8', 'GIF image'),
    (b'BM', 'BMP image'),
    (b'RIFF', 'RIFF/WEBP image'),
    (b'\x00\x00\x01\x00', 'ICO image'),
    (b'PK\x03\x04', 'ZIP archive'),
    (b'PK\x05\x06', 'ZIP archive'),
    (b'PK\x07\x08', 'ZIP archive'),
    (b'BZh', 'BZIP2 archive'),
    (b'\x7fELF', 'ELF archive'),
    (b'\xfd7zXZ\x00', 'XZ archive'),
    (b'Rar!', 'RAR archive'),
)

# (offset, bytes, description) for formats rejected after the gzip check.
_NON_CAPTURE_MAGICS_LATE = (
    (0, b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1', 'MS Office document'),
    (0, b'%PDF', 'PDF document'),
    (0, b'\x7b\x5c\x72\x74\x66', 'RTF document'),
    (4, b'ftyp', 'MP4/MOV document'),
    (0, b'\x00\x01\x00\x00Standard Jet DB', 'MS Access database'),
    (0, b'Microsoft C/C++ MSF 7.00', 'MS Debug database'),
)


def _looks_like_text(data):
    """Return True when ``data`` is valid UTF-8 made only of printable/whitespace characters."""
    try:
        decoded = data.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        return False
    return all(ch.isprintable() or ch.isspace() for ch in decoded)


def _gzip_capture_hint(filename):
    """Classify a lower-cased filename as a gzipped capture: 'suffix', 'pattern' or None."""
    if any(filename.endswith(suffix) for suffix in _GZIP_CAPTURE_SUFFIXES):
        return 'suffix'
    if filename.endswith('.gz'):
        base_filename = filename[:-3]  # Remove .gz
        if any(pattern in base_filename for pattern in _CAPTURE_NAME_PATTERNS):
            return 'pattern'
    return None


def is_capture_file(file_path):
    """
    Detect if a file is a network capture file using file signatures and heuristics.
    Handles files with non-standard names like 'pcap1', 'pcap2', 'trace_data', etc.

    Checks run in this order and the first decisive one wins:
    capture signatures, known capture extensions, known non-capture signatures
    (with gzip handled by filename), capture keywords in the filename, raw
    Ethernet/IPv4 header heuristics, and finally the MIME type guessed from the name.
    Only the first 32 bytes of the file are read.

    If anything raises (e.g. the file cannot be opened), the decision falls back
    to the extension and filename alone; no exception is propagated.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        bool: True if the file is identified as a capture file, False otherwise.
    """
    try:
        # Get filename and extension for analysis
        filename = os.path.basename(file_path).lower()
        file_extension = os.path.splitext(file_path)[1].lower()

        # Read file header for signature detection
        with open(file_path, 'rb') as f:
            header = f.read(32)

        if len(header) < 4:
            logger.debug(f"File {file_path} too small to be a capture file")
            return False

        # === PRIMARY: File signature detection ===
        # startswith() compares the full magic, so a slice-length mistake (the old
        # header[:11] vs. the 12-byte b'TRSNIFF data') cannot silently disable a check.
        for magic, description in _CAPTURE_MAGICS:
            if header.startswith(magic):
                logger.debug(f"File {file_path} detected as {description}")
                return True

        # EtherPeek/AiroPeek: 0x7F followed by a name containing "Peek"
        if header[0:1] == b'\x7f' and b'Peek' in header[:16]:
            logger.debug(f"File {file_path} detected as EtherPeek capture file")
            return True

        # Tektronix K12xx: "K12" near the start, or a 00 00 80 prefix
        if b'K12' in header[:16] or header[:3] == b'\x00\x00\x80':
            logger.debug(f"File {file_path} detected as Tektronix K12 capture file")
            return True

        # === SECONDARY: Extension-based detection ===
        if file_extension in _CAPTURE_EXTENSIONS:
            logger.debug(f"File {file_path} detected by known extension: {file_extension}")
            return True

        # === EXCLUSION CHECK: Known non-capture binary formats ===
        # Parquet: magic at the start, or in the last 4 bytes of what was read
        # (that is the real file tail only for files of 32 bytes or fewer).
        if header[:4] == b'PAR1' or header[-4:] == b'PAR1':
            logger.debug(f"File {file_path} detected as Parquet file - excluding")
            return False

        for magic, description in _NON_CAPTURE_MAGICS:
            if header.startswith(magic):
                logger.debug(f"File {file_path} detected as {description} - excluding")
                return False

        # GZIP content: accept only when the filename says it wraps a capture.
        if header.startswith(b'\x1f\x8b\x08'):
            hint = _gzip_capture_hint(filename)
            if hint == 'suffix':
                logger.debug(f"File {file_path} detected as gzipped capture file - including")
                return True
            if hint == 'pattern':
                logger.debug(f"File {file_path} detected as gzipped capture file by pattern - including")
                return True
            logger.debug(f"File {file_path} detected as regular GZIP file - excluding")
            return False

        for offset, magic, description in _NON_CAPTURE_MAGICS_LATE:
            if offset:
                # Offset signatures additionally require data beyond the magic itself.
                matched = (len(header) > offset + len(magic)
                           and header[offset:offset + len(magic)] == magic)
            else:
                matched = header.startswith(magic)
            if matched:
                logger.debug(f"File {file_path} detected as {description} - excluding")
                return False

        # Printable-text headers are never treated as raw packet data below.
        header_is_text = _looks_like_text(header)

        # === TERTIARY: Filename pattern matching ===
        for pattern in _CAPTURE_NAME_PATTERNS:
            if pattern not in filename:
                continue
            logger.debug(f"File {file_path} detected by filename pattern: '{pattern}' in '{filename}'")
            # Extra validation (needs at least 8 header bytes) to reduce false positives.
            if len(header) < 8:
                continue
            if header_is_text:
                logger.debug(f"File {file_path} appears to be text despite pattern match - skipping")
                continue

            # A .gz name without gzip content: still judged by its filename.
            if file_extension == '.gz':
                hint = _gzip_capture_hint(filename)
                if hint == 'suffix':
                    logger.debug(f"File {file_path} is gzipped capture file - allowing")
                    return True
                if hint == 'pattern':
                    logger.debug(f"File {file_path} is gzipped capture file by pattern - allowing")
                    return True
                logger.debug(f"File {file_path} is regular gzip file - excluding")
                continue

            if file_extension in _NON_CAPTURE_EXTENSIONS:
                logger.debug(f"File {file_path} has non-capture extension {file_extension} despite pattern match - excluding")
                continue

            return True

        # === QUATERNARY: Heuristic checks for unknown formats ===
        if len(header) >= 16:
            # EtherType at offset 12 of a raw Ethernet frame: IPv4, ARP, IPv6
            if header[12:14] in (b'\x08\x00', b'\x08\x06', b'\x86\xdd'):
                logger.debug(f"File {file_path} detected by ethernet frame pattern")
                return True

            # First byte 0x45/0x46 looks like an IPv4 header, but those are also the
            # ASCII letters 'E' and 'F'; without the text guard every plain-text file
            # starting with either letter would be reported as a capture.
            if (header[0:1] in (b'\x45', b'\x46') and len(header) > 20
                    and not header_is_text):
                logger.debug(f"File {file_path} detected by IP header pattern")
                return True

        # === FINAL: MIME type check ===
        mime_type, _ = mimetypes.guess_type(file_path)
        if mime_type and any(capture_type in mime_type.lower() for capture_type in ['pcap', 'capture', 'tcpdump']):
            logger.debug(f"File {file_path} detected as capture file by MIME type: {mime_type}")
            return True

        logger.debug(f"File {file_path} not detected as capture file. Extension: {file_extension}, Filename: {filename}, Header: {header[:8].hex()}")
        return False

    except Exception as e:
        logger.warning(f"Error detecting file type for {file_path}: {e}")
        # Enhanced fallback - check both extension and filename patterns
        filename = os.path.basename(file_path).lower()
        file_extension = os.path.splitext(file_path)[1].lower()

        # Check extension
        if file_extension in ['.pcap', '.pcapng', '.cap', '.dmp', '.trc', '.trace']:
            logger.debug(f"Fallback extension check for {file_path}: True")
            return True

        # Check filename patterns
        fallback_patterns = ['pcap', 'capture', 'trace', 'dump', 'packet', 'sniff']
        for pattern in fallback_patterns:
            if pattern in filename:
                logger.debug(f"Fallback filename pattern check for {file_path}: True (pattern: {pattern})")
                return True

        logger.debug(f"Fallback check for {file_path}: False")
        return False

def update_progress(message):
    """Record a progress message in the log and show it in the progress area.

    The text is logged at INFO level, stored as the value of ``progress_label``,
    and the label is then re-displayed inside ``progress_output`` after clearing it.

    Args:
        message: Progress text to show.
    """
    logger.info("Progress: {}".format(message))
    # Update the label first so the widget displayed below already carries the new text.
    setattr(progress_label, "value", message)
    with progress_output:
        clear_output(wait=True)
        display(progress_label)

def update_button_states():
    """Enable or disable the ingest, re-ingest and replay buttons.

    Session files are looked up for the currently loaded capture. When at least
    one exists, ingest is disabled and re-ingest/replay are enabled; otherwise
    the reverse applies.
    """
    logger.debug("Updating button states")

    # Resolve capture -> output directory -> session files, stopping at the
    # first missing link.
    session_files = []
    capture = load_capture()
    if capture:
        output_dir = get_output_dir(capture)
        if output_dir:
            session_files = list_session_files(output_dir)

    has_sessions = bool(session_files)
    ingest_button.disabled = has_sessions
    reingest_button.disabled = not has_sessions
    replay_button.disabled = not has_sessions
    logger.debug(f"Button states updated: ingest={ingest_button.disabled}, reingest={reingest_button.disabled}, replay={replay_button.disabled}")

# Export update_button_states to builtins
builtins.update_button_states = update_button_states

def render_page():
    """Render the session summary line and the operations table into ``output_cell``.

    Reads the ``operations`` list and the ``session_frames`` DataFrame that are
    in scope for this module (neither is a parameter). The table shows the
    mandatory columns ``Frame``, ``Command``, ``Path`` and ``smb2.nt_status``
    that exist in the data, followed by any extra columns currently selected
    in ``check_fields_select``. ``smb2.nt_status`` values are replaced by the
    names found in ``FIELD_MAPPINGS``; missing values and ``'N/A'`` become an
    empty string, and remaining NaN cells are rendered blank.

    The summary line reports the session id, the number of frames, and counts
    of the distinct ``smb2.cmd`` and ``smb2.create.action`` values seen in
    ``session_frames``.

    Any exception raised while rendering is logged at CRITICAL level and shown
    in red inside ``output_cell``; it is not re-raised. The progress label is
    always set to "Rendering complete" on exit.
    """

    def _count_mapped(column, get_mapping, label):
        """Count mapped names of the comma-separated integers in ``session_frames[column]``."""
        if column not in session_frames.columns:
            return pd.Series(dtype="int64")
        mapping = get_mapping()
        # The frame number is only used for the debug message below; fall back
        # to the row index so a missing column cannot abort the whole render.
        has_frame_numbers = 'frame.number' in session_frames.columns
        names = []
        for idx, raw in session_frames[column].dropna().items():
            for token in str(raw).split(','):
                token = token.strip()
                try:
                    names.append(mapping.get(str(int(token)), ""))
                except ValueError:
                    frame_ref = session_frames.loc[idx, 'frame.number'] if has_frame_numbers else idx
                    logger.debug(f"Invalid {label} value '{token}' in frame {frame_ref}")
        # An explicit dtype keeps the empty case free of pandas' dtype-inference warning.
        return pd.Series(names).value_counts() if names else pd.Series(dtype="int64")

    with output_cell:
        logger.info("Starting render_page")
        # NOTE(review): the output is cleared here only when there is nothing
        # to render; callers that want a fresh view clear it themselves first.
        if not operations:
            clear_output(wait=True)

        update_progress("Rendering operations...")

        try:
            logger.info(f"Operations list length: {len(operations)}")
            if not operations:
                logger.info("No operations data to render. Check session selection.")
                display(HTML("<p style='color: red;'>No operations to display. Ensure a session is selected.</p>"))
                return

            ops_df = pd.DataFrame(operations)
            if ops_df.empty:
                logger.info("Operations DataFrame is empty. Check data source.")
                display(HTML("<p style='color: red;'>No data in operations DataFrame. Check logs for details.</p>"))
                return

            logger.debug(f"Columns in ops_df: {ops_df.columns.tolist()}")
            if "smb2.nt_status" in ops_df.columns:
                logger.debug(f"Sample smb2.nt_status values: {ops_df['smb2.nt_status'].head().tolist()}")
            else:
                logger.warning("smb2.nt_status not found in operations DataFrame.")

            mandatory_columns = ['Frame', 'Command', 'Path', 'smb2.nt_status']
            available_columns = [col for col in mandatory_columns if col in ops_df.columns]
            if len(available_columns) != len(mandatory_columns):
                logger.debug(f"Missing columns: {set(mandatory_columns) - set(ops_df.columns)}. Using available columns: {available_columns}")
                if not available_columns:
                    raise ValueError("No mandatory columns found in DataFrame.")

            logger.debug("First 2 lines of operations:")
            for i, op in enumerate(operations[:2]):
                logger.debug(f"Line {i + 1}: {op}")

            if 'smb2.nt_status' in ops_df.columns:
                # Prefer a 'Status' column as the source when present, then map
                # each raw value to its name; unmapped or absent values become "".
                status_names = FIELD_MAPPINGS.get("smb2.nt_status", {}).get("mapping", {})
                ops_df['smb2.nt_status'] = ops_df.get('Status', ops_df['smb2.nt_status']).apply(
                    lambda x: status_names.get(str(x), "") if pd.notna(x) and x != 'N/A' else ""
                )

            optional_columns = [col for col in ops_df.columns if col in check_fields_select.value and col not in mandatory_columns]
            display_columns = available_columns + optional_columns
            if "smb2.nt_status" in ops_df.columns and "smb2.nt_status" not in display_columns:
                display_columns.append("smb2.nt_status")
            logger.debug(f"Rendering table with columns: {display_columns}")

            # Let the third column wrap instead of stretching the table.
            styles = [
                {"selector": "td:nth-child(3)", "props": [("max-width", "300px"), ("word-wrap", "break-word"), ("white-space", "normal")]}
            ]
            styled_df = ops_df[display_columns].fillna('').style.set_table_styles(styles).hide(axis="index")
            html_table = styled_df.to_html()

            # Only touch .iloc[0] when there is at least one row; an empty
            # frame that still carries the column used to raise IndexError.
            sesid = "N/A"
            if not session_frames.empty and "smb2.sesid" in session_frames.columns:
                sesid = session_frames["smb2.sesid"].iloc[0]
                if "smb2.sesid" in FIELD_MAPPINGS:
                    sesid = FIELD_MAPPINGS["smb2.sesid"]["normalize"](sesid) or sesid

            commands = _count_mapped(
                "smb2.cmd",
                lambda: FIELD_MAPPINGS.get("smb2.cmd", {}).get("mapping", {}),
                "command",
            )
            actions = _count_mapped("smb2.create.action", lambda: CREATE_ACTION_DESC, "create action")

            command_counts = commands.to_dict() if not commands.empty else {}
            action_counts = actions.to_dict() if not actions.empty else {}
            summary = f"<b>Session {sesid}</b>: {len(session_frames)} frames, {len(commands)} unique commands: {command_counts}, {len(actions)} unique create actions: {action_counts}"

            logger.info("Displaying summary and table")
            display(HTML(summary))
            display(HTML(html_table), display_id=True)
            logger.info("Finished rendering")
        except Exception as e:
            logger.critical(f"Error during rendering: {e}")
            display(HTML(f"<p style='color: red;'>Rendering failed: {e}</p>"))
        finally:
            update_progress("Rendering complete")

def on_case_number_change(change):
    """Refresh the capture dropdown for the case number carried by ``change``.

    ``change["new"]`` is stripped and used to build ``/stingray/<case>``. The
    directory tree is walked and every file accepted by ``is_capture_file`` is
    offered in ``capture_dropdown`` as a path relative to the case directory.
    An empty case number, a missing directory, or a directory without any
    capture file resets the dependent widgets and shows a message in
    ``output_cell`` instead.

    NOTE(review): a second function with this same name is defined further
    down in this file; when the definitions are executed in order, the later
    one replaces this one.

    Args:
        change: Mapping with the new text under the ``"new"`` key.
    """
    fields_tooltip = "Select fields to display as columns in the table for Entire Stream"

    def _show(markup):
        # Replace whatever output_cell currently holds with one HTML message.
        with output_cell:
            clear_output(wait=True)
            display(HTML(markup))

    def _reset_selection_widgets(include_session=False):
        # Put the capture/file/field pickers (and optionally the session
        # picker) back to their disabled placeholder state.
        capture_dropdown.options = ["Select a capture"]
        capture_dropdown.disabled = True
        if include_session:
            session_dropdown.options = ["Select a session"]
            session_dropdown.disabled = True
        file_combobox.options = ["Entire Stream"]
        file_combobox.disabled = True
        check_fields_select.options = []
        check_fields_select.disabled = True
        check_fields_select.tooltip = fields_tooltip

    case_number = change["new"].strip()
    logger.info(f"Case number changed to: {case_number}")

    if case_number == "":
        _reset_selection_widgets(include_session=True)
        save_button.disabled = True
        ingest_button.disabled = True
        reingest_button.disabled = True
        replay_button.disabled = True
        _show("<p>Info: No case number entered.</p>")
        return

    base_dir = f"/stingray/{case_number}"
    if not os.path.isdir(base_dir):
        logger.warning(f"Case directory not found: {base_dir}. Note: Path may be remote; consider SSH validation.")
        _reset_selection_widgets()
        _show(f"<p style='color: red;'>Case directory not found: {base_dir}</p>")
        return

    logger.info(f"Scanning {base_dir} for capture files using enhanced detection...")
    pcap_files = []
    file_count = 0
    error_count = 0
    fallback_suffixes = ('.pcap', '.pcapng', '.cap', '.trc', '.trace')

    for dirpath, _subdirs, filenames in os.walk(base_dir):
        for filename in filenames:
            file_count += 1
            full_path = os.path.join(dirpath, filename)
            try:
                # Content-based detection first.
                if is_capture_file(full_path):
                    rel_path = os.path.relpath(full_path, base_dir).replace("\\", "/")
                    pcap_files.append(rel_path)
                    logger.debug(f"Added capture file: {rel_path}")
            except Exception as e:
                error_count += 1
                logger.warning(f"Error checking file {full_path}: {e}")
                # Detection itself failed: accept the file on its extension alone.
                if filename.lower().endswith(fallback_suffixes):
                    rel_path = os.path.relpath(full_path, base_dir).replace("\\", "/")
                    pcap_files.append(rel_path)
                    logger.debug(f"Added capture file (fallback): {rel_path}")

    logger.info(f"Scanned {file_count} files, found {len(pcap_files)} capture files, {error_count} errors")

    if len(pcap_files) == 0:
        logger.warning(f"No capture files found in {base_dir} using enhanced detection (scanned {file_count} files, {error_count} errors)")
        _reset_selection_widgets()
        _show(f"<p style='color: red;'>No capture files found in {base_dir} (scanned {file_count} files)</p>")
        return

    logger.info(f"Found {len(pcap_files)} capture files in {base_dir}: {pcap_files[:5]}... (first 5 shown)")
    capture_dropdown.options = ["Select a capture"] + sorted(pcap_files)
    capture_dropdown.disabled = False
    capture_dropdown.value = "Select a capture"
    _show("<p>Info: Select a capture file to proceed.</p>")

def on_capture_change(change):
    """React to a new selection in the capture dropdown.

    For the placeholder entry every dependent widget is reset and disabled.
    Otherwise the absolute path ``/stingray/<case>/<relative path>`` is built
    from the case number input, validated with ``os.path.exists``, stored in
    ``pcap_config["capture_path"]`` and ``builtins.capture``, and written to
    ``config_file`` together with ``replay_config``. The session dropdown,
    file combobox and ingest / re-ingest / replay buttons are then set
    according to whether session files already exist for that capture.

    Args:
        change: Mapping with the selected relative capture path under ``"new"``.
    """
    capture_rel_path = change["new"]
    logger.info(f"Capture changed to: {capture_rel_path}")

    if capture_rel_path == "Select a capture":
        session_dropdown.options = ["Select a session"]
        session_dropdown.disabled = True
        file_combobox.options = ["Entire Stream"]
        file_combobox.disabled = True
        check_fields_select.options = []
        check_fields_select.disabled = True
        check_fields_select.tooltip = "Select fields to display as columns in the table for Entire Stream"
        for button in (save_button, ingest_button, reingest_button, replay_button):
            button.disabled = True
        with output_cell:
            clear_output(wait=True)
            display(HTML("<p>Info: No capture selected.</p>"))
        return

    capture_path = os.path.join(
        "/stingray", case_number_input.value.strip(), capture_rel_path
    ).replace("\\", "/")
    logger.debug(f"Constructed capture path: {capture_path}")
    if not os.path.exists(capture_path):
        logger.warning(f"Invalid capture path: {capture_path}")
        with output_cell:
            clear_output(wait=True)
            display(HTML(f"<p style='color: red;'>Invalid capture path: {capture_path}</p>"))
        return

    # Remember the selection in memory and on disk.
    pcap_config["capture_path"] = capture_path
    builtins.capture = capture_path
    try:
        snapshot = {'pcap_config': pcap_config, 'replay_config': replay_config}
        with open(config_file, 'wb') as f:
            pickle.dump(snapshot, f)
        logger.info(f"Saved pcap_config to {config_file}: capture_path={capture_path}, verbose_level={pcap_config['verbose_level']}")
    except Exception as e:
        logger.critical(f"Failed to save {config_file}: {e}")

    capture = load_capture()
    if capture is None:
        # load_capture() gave nothing back; carry on with the path built above.
        logger.warning(f"Failed to load capture from config.pkl, using constructed path: {capture_path}")
        capture = capture_path
        pcap_config["capture_path"] = capture
        builtins.capture = capture

    session_files = []
    if capture:
        output_dir = get_output_dir(capture)
        if output_dir:
            session_files = list_session_files(output_dir)

    has_sessions = bool(session_files)
    if has_sessions:
        session_dropdown.options = ["Select a session"] + session_files
    else:
        session_dropdown.options = ["Select a session"]
    session_dropdown.disabled = not has_sessions
    ingest_button.disabled = has_sessions
    reingest_button.disabled = not has_sessions
    replay_button.disabled = not has_sessions
    if has_sessions:
        logger.info(f"Populated session_dropdown with {len(session_files)} sessions: {session_files[:5]}...")
    else:
        logger.info("No session files found, ingest_button enabled for processing")

    file_combobox.options = ["Entire Stream"] + (session_files if has_sessions else [])
    file_combobox.disabled = not has_sessions
    check_fields_select.tooltip = f"Select fields to display as columns in the table for {file_combobox.value}"

    if has_sessions:
        info_markup = f"<p>Info: Found {len(session_files)} sessions. Select one or click 'Re-ingest' to rebuild.</p>"
    else:
        info_markup = "<p>Info: No sessions found. Click 'Ingest Trace' to process.</p>"
    with output_cell:
        clear_output(wait=True)
        display(HTML(info_markup))
    logger.debug(f"Button states: ingest={ingest_button.disabled}, reingest={reingest_button.disabled}, replay={replay_button.disabled}")

def _handle_ingest_click(force_reingest):
    """Shared body of the ingest and re-ingest button handlers.

    Loads the current capture path, disables the three action buttons while
    ``run_ingestion`` runs (its output is captured in ``progress_output``),
    then refreshes the capture-dependent widgets by re-running
    ``on_capture_change`` for the capture currently selected in the dropdown.
    Failures are logged at CRITICAL level and shown in ``output_cell``; they
    are not re-raised. Button states are always recomputed at the end.

    Args:
        force_reingest: Passed through to ``run_ingestion``; also selects the
            "re-ingest" wording for log and UI messages.
    """
    if force_reingest:
        button_label, noun, noun_title, gerund = "Re-ingest", "re-ingestion", "Re-ingestion", "Re-ingesting"
    else:
        button_label, noun, noun_title, gerund = "Ingest", "ingestion", "Ingestion", "Ingesting"

    logger.info(f"{button_label} button clicked")
    capture = load_capture()
    logger.info(f"Using capture_path = {capture} for {noun}")
    if not capture:
        logger.warning(f"No valid capture path available for {noun}.")
        with output_cell:
            clear_output(wait=True)
            display(HTML(f"<p style='color: red;'>No valid capture path available for {noun}.</p>"))
        return

    # Block further clicks while the (potentially long) ingestion is running.
    ingest_button.disabled = True
    reingest_button.disabled = True
    replay_button.disabled = True
    update_progress(f"{gerund} {os.path.basename(capture)}...")
    with progress_output:
        clear_output(wait=True)
        try:
            logger.info("Starting run_ingestion")
            result = run_ingestion(capture_path=capture, reassembly_enabled=True, force_reingest=force_reingest, verbose=None)
            logger.info(f"{noun_title} of {capture} completed. Result: {result is not None}")
            case_number = case_number_input.value.strip()
            if case_number:
                logger.info(f"Calling on_capture_change with {capture_dropdown.value}")
                on_capture_change({"new": capture_dropdown.value})
            else:
                logger.warning("Case number not set. Please select a case.")
            if result:
                logger.info(f"{noun_title} result: {list(result.get('sessions', {}).keys())}")
        except Exception as e:
            logger.critical(f"Error during {noun}: {e}")
            with output_cell:
                clear_output(wait=True)
                display(HTML(f"<p style='color: red;'>{noun_title} failed: {e}</p>"))
        finally:
            # Re-enable everything first so the buttons are usable even if the
            # state refresh below fails, then apply the session-based states.
            ingest_button.disabled = False
            reingest_button.disabled = False
            replay_button.disabled = False
            update_button_states()


def on_ingest_button_clicked(b):
    """Handle ingest button click: ingest the current capture without forcing a rebuild.

    Args:
        b: The clicked button (unused).
    """
    _handle_ingest_click(force_reingest=False)


def on_reingest_button_clicked(b):
    """Handle re-ingest button click: ingest the current capture with ``force_reingest=True``.

    Args:
        b: The clicked button (unused).
    """
    _handle_ingest_click(force_reingest=True)

def on_replay_button_clicked(b):
    """Replay the session currently chosen in ``session_dropdown``.

    With the placeholder entry selected, an error message is shown in
    ``output_cell`` and nothing else happens. Otherwise ``replay_session`` is
    called with the session and ``output_cell``; an exception from it is
    logged at CRITICAL level and displayed rather than re-raised. After a
    replay attempt the progress text is set to "Replay complete" and the
    button states are refreshed.

    Args:
        b: The clicked button (unused).
    """
    logger.info("Replay button clicked")

    if session_dropdown.value != "Select a session":
        update_progress(f"Replaying session {session_dropdown.value}...")
        try:
            replay_session(session_dropdown.value, output_cell)
            logger.info(f"Replay of {session_dropdown.value} completed.")
        except Exception as e:
            logger.critical(f"Error during replay: {e}")
            with output_cell:
                clear_output(wait=True)
                display(HTML(f"<p style='color: red;'>Replay failed: {e}</p>"))
        finally:
            update_progress("Replay complete")
            update_button_states()
    else:
        logger.info("No session selected for replay.")
        with output_cell:
            clear_output(wait=True)
            display(HTML("<p style='color: red;'>Select a session to replay.</p>"))

def on_session_change(change):
    """Handle session selection.

    For the placeholder entry the file/field pickers and the save and replay
    buttons are reset and disabled. Otherwise the session is loaded with
    ``load_and_summarize_session``, the file combobox and field selector are
    populated from its result, the module-level ``operations`` list is rebuilt
    for the entire stream (file filter ``None``), and the table is rendered.

    If the capture cannot be loaded or the session yields no frames, a warning
    is logged and the widgets are left untouched.

    Args:
        change: Mapping with the selected session name under ``"new"``.
    """
    global operations

    session_name = change["new"]
    logger.info(f"Session changed to: {session_name}")
    if session_name == "Select a session":
        file_combobox.options = ["Entire Stream"]
        file_combobox.disabled = True
        check_fields_select.options = []
        check_fields_select.disabled = True
        check_fields_select.tooltip = "Select fields to display as columns in the table for Entire Stream"
        save_button.disabled = True
        replay_button.disabled = True
        with output_cell:
            clear_output(wait=True)
            display(HTML("<p>Info: No session selected.</p>"))
        return

    capture = load_capture()
    if capture is None:
        logger.warning("Failed to load capture for session change")
        return

    # NOTE(review): session_frames is a local here, while render_page() reads a
    # session_frames that is not local to it - presumably the loader publishes
    # the frames elsewhere; confirm against load_and_summarize_session.
    session_frames, field_options, file_options, selected_fields = load_and_summarize_session(capture, session_name)
    if session_frames is None:
        logger.warning(f"Session frames not loaded. Check load_and_summarize_session.")
        return

    # The loader may hand back None for any of these; normalise once so every
    # later use (including len() in the log line below) is safe.
    file_options = file_options or []
    field_options = field_options or []
    selected_fields = selected_fields or []

    file_combobox.options = ["Entire Stream"] + file_options
    file_combobox.value = "Entire Stream"
    file_combobox.disabled = False
    check_fields_select.options = field_options
    check_fields_select.value = selected_fields
    check_fields_select.disabled = not file_options
    check_fields_select.tooltip = f"Select fields to display as columns in the table for {file_combobox.value}"
    save_button.disabled = False
    replay_button.disabled = False

    with output_cell:
        clear_output(wait=True)
        if file_options:
            display(HTML(f"<p>Info: Session loaded. Viewing entire stream or select a file to filter operations.</p>"))
        else:
            display(HTML("<p>Info: No files found in session. Viewing entire stream.</p>"))
    logger.info(f"Session changed to {session_name}, {len(file_options)} file options available")

    operations = update_operations(capture, session_name, None, check_fields_select.value)
    logger.info(f"Set operations to entire stream, count: {len(operations)}")
    render_page()

def on_file_change(change):
    """Re-filter the operations table when the file combobox changes.

    The field selector's tooltip is always updated to name the new filter.
    When a real session is selected, the module-level ``operations`` list is
    rebuilt for that session with the new file filter and the currently
    selected fields, and the table is rendered again.

    Args:
        change: Mapping with the new filter value under ``"new"``.
    """
    global operations

    new_filter = change['new']
    logger.info(f"File filter changed to: {new_filter} at {time.strftime('%H:%M:%S')}")
    check_fields_select.tooltip = f"Select fields to display as columns in the table for {new_filter}"

    if session_dropdown.value == "Select a session":
        return

    capture = load_capture()
    if capture is None:
        logger.warning("Failed to load capture for file change")
        return

    operations = update_operations(capture, session_dropdown.value, new_filter, check_fields_select.value)
    logger.info(f"update_operations returned, operations count: {len(operations)}")
    render_page()

def on_fields_change(change):
    """Rebuild and redraw the operations table when the selected fields change.

    Nothing beyond logging happens while the session dropdown still shows its
    placeholder. Otherwise the module-level ``operations`` list is rebuilt for
    the current session and file filter using the newly selected fields, and
    the table is rendered again.

    Args:
        change: Mapping with the new field selection under ``"new"``.
    """
    global operations

    chosen_fields = change['new']
    logger.info(f"Fields selection changed to: {chosen_fields} at {time.strftime('%H:%M:%S')}")

    if session_dropdown.value == "Select a session":
        return

    capture = load_capture()
    if capture is None:
        logger.warning("Failed to load capture for fields change")
        return

    operations = update_operations(capture, session_dropdown.value, file_combobox.value, chosen_fields)
    logger.info(f"update_operations returned, operations count: {len(operations)}")
    render_page()

def on_save_config(b):
    """Save replay server configuration to config.pkl.

    Copies the values of the replay input widgets into ``replay_config``,
    merges ``pcap_config`` and ``replay_config`` into whatever dictionary is
    already stored in ``config_file`` (so unrelated keys survive), writes the
    result back, and re-renders the operations table. Nothing is saved while
    the session dropdown still shows its placeholder.

    The password is stored in the pickle file as entered, but it is masked in
    every log message written by this function.

    Args:
        b: The clicked button (unused).
    """

    def _redacted(config):
        """Return a copy of ``config`` that is safe to log (password values masked)."""
        if not isinstance(config, dict):
            return config
        return {
            key: ("********" if key == "password" else _redacted(value))
            for key, value in config.items()
        }

    logger.info("Save config button clicked")
    if session_dropdown.value == "Select a session":
        logger.info("No session selected to save configuration.")
        with output_cell:
            clear_output(wait=True)
            display(HTML("<p style='color: red;'>Select a session to save configuration.</p>"))
        return

    replay_config.update({
        "server_ip": server_ip_input.value,
        "domain": domain_input.value,
        "username": username_input.value,
        "password": password_input.value,
        "tree_name": tree_name_input.value,
        "max_wait": max_time_input.value
    })
    try:
        # Start from the file's current content so keys other than the two
        # written below are preserved.
        current_config = {}
        if os.path.exists(config_file):
            # NOTE(security): pickle.load executes code embedded in the file.
            # This is only acceptable as long as config.pkl is written
            # exclusively by this notebook - do not point it at untrusted data.
            with open(config_file, 'rb') as f:
                current_config = pickle.load(f)
            if not isinstance(current_config, dict):
                logger.warning(f"Ignoring unexpected {type(current_config).__name__} content in {config_file}")
                current_config = {}
            logger.debug(f"Loaded existing config.pkl for save: {_redacted(current_config)}")
        current_config.update({
            'pcap_config': pcap_config,
            'replay_config': replay_config
        })
        with open(config_file, 'wb') as f:
            pickle.dump(current_config, f)
        logger.info(f"Saved configuration to {config_file}: pcap_config={pcap_config}, replay_config={_redacted(replay_config)}")
        with output_cell:
            clear_output(wait=True)
            display(HTML("<p style='color: green;'>Configuration saved.</p>"))
            render_page()
    except Exception as e:
        logger.critical(f"Error saving configuration to {config_file}: {e}")
        with output_cell:
            clear_output(wait=True)
            display(HTML(f"<p style='color: red;'>Error saving configuration: {e}</p>"))

def on_case_number_change(change):
    """Handle case number changes.

    ``change["new"]`` is stripped and used to build ``/stingray/<case>``. The
    directory tree is walked and every file accepted by ``is_capture_file`` is
    offered in ``capture_dropdown`` as a path relative to the case directory.
    If ``load_capture()`` returns a capture that is one of the files found, it
    is selected automatically and ``on_capture_change`` is invoked for it.

    An empty case number, a missing directory, or a directory without capture
    files resets the dependent widgets and shows a message in ``output_cell``.
    Any unexpected exception is logged at CRITICAL level (with traceback) and
    shown in ``output_cell``; it is not re-raised.

    Text that originates from the user (the case number, file paths, error
    messages) is HTML-escaped before being displayed.

    Args:
        change: Mapping with the new text under the ``"new"`` key.
    """
    import html  # stdlib; local import so the block does not rely on a file-level import

    def _show(markup):
        # Replace whatever output_cell currently holds with one HTML message.
        with output_cell:
            clear_output(wait=True)
            display(HTML(markup))

    def _reset_selection_widgets(include_session=False):
        # Put the capture/file/field pickers (and optionally the session
        # picker) back to their disabled placeholder state.
        capture_dropdown.options = ["Select a capture"]
        capture_dropdown.disabled = True
        if include_session:
            session_dropdown.options = ["Select a session"]
            session_dropdown.disabled = True
        file_combobox.options = ["Entire Stream"]
        file_combobox.disabled = True
        check_fields_select.options = []
        check_fields_select.disabled = True
        check_fields_select.tooltip = "Select fields to display as columns in the table for Entire Stream"

    case_number = change["new"].strip()
    logger.info(f"Case number changed to: {case_number}")

    try:
        if not case_number:
            logger.info("Empty case number - resetting dropdowns")
            _reset_selection_widgets(include_session=True)
            save_button.disabled = True
            ingest_button.disabled = True
            reingest_button.disabled = True
            replay_button.disabled = True
            _show("<p>Info: No case number entered.</p>")
            return

        base_dir = f"/stingray/{case_number}"
        base_dir_html = html.escape(base_dir)
        logger.info(f"Checking directory: {base_dir}")

        if not os.path.isdir(base_dir):
            logger.warning(f"Case directory not found: {base_dir}. Note: Path may be remote; consider SSH validation.")
            _reset_selection_widgets()
            _show(f"<p style='color: red;'>Case directory not found: {base_dir_html}</p>")
            return

        logger.info(f"Directory {base_dir} exists, scanning for files...")

        pcap_files = []
        logger.info(f"Scanning {base_dir} for capture files using enhanced detection...")

        file_count = 0
        error_count = 0

        for root, _, files in os.walk(base_dir):
            logger.debug(f"Scanning directory: {root}, found {len(files)} files")
            for f in files:
                file_count += 1
                full_path = os.path.join(root, f)
                logger.debug(f"Checking file {file_count}: {f}")
                try:
                    # Content-based detection first.
                    if is_capture_file(full_path):
                        rel_path = os.path.relpath(full_path, base_dir).replace("\\", "/")
                        pcap_files.append(rel_path)
                        logger.info(f"Added capture file: {rel_path}")
                except Exception as e:
                    error_count += 1
                    logger.warning(f"Error checking file {full_path}: {e}")
                    # Detection itself failed: accept the file on its extension alone.
                    if f.lower().endswith(('.pcap', '.pcapng', '.cap', '.trc', '.trace', '.trc0')):
                        rel_path = os.path.relpath(full_path, base_dir).replace("\\", "/")
                        pcap_files.append(rel_path)
                        logger.info(f"Added capture file (fallback): {rel_path}")

        logger.info(f"Scanned {file_count} files, found {len(pcap_files)} capture files, {error_count} errors")

        if not pcap_files:
            logger.warning(f"No capture files found in {base_dir} using enhanced detection (scanned {file_count} files, {error_count} errors)")
            _reset_selection_widgets()
            _show(f"<p style='color: red;'>No capture files found in {base_dir_html} (scanned {file_count} files)</p>")
            return

        logger.info(f"Found {len(pcap_files)} capture files in {base_dir}: {pcap_files[:5]}... (first 5 shown)")
        capture_dropdown.options = ["Select a capture"] + sorted(pcap_files)
        capture_dropdown.disabled = False
        capture_dropdown.value = "Select a capture"

        # If a previously loaded capture is one of the files just found,
        # re-select it so the user does not have to pick it again.
        loaded_capture = load_capture()
        if loaded_capture:
            try:
                loaded_rel_path = os.path.relpath(loaded_capture, base_dir).replace("\\", "/")
                if loaded_rel_path in pcap_files:
                    logger.info(f"Auto-selecting loaded capture: {loaded_rel_path}")
                    capture_dropdown.value = loaded_rel_path
                    # Populate the session widgets for the restored capture.
                    on_capture_change({"new": loaded_rel_path})
                    _show(f"<p>Info: Auto-loaded capture: {html.escape(loaded_rel_path)}</p>")
                    return
            except Exception as e:
                logger.warning(f"Error auto-loading capture {loaded_capture}: {e}")

        _show("<p>Info: Select a capture file to proceed.</p>")

    except Exception as e:
        logger.critical(f"Critical error in on_case_number_change: {e}")
        import traceback
        logger.critical(f"Traceback: {traceback.format_exc()}")
        _show(f"<p style='color: red;'>Error processing case number: {html.escape(str(e))}</p>")

def _capture_listed_under(base_dir, capture):
    """Return True when os.walk(base_dir) yields a file whose joined path equals ``capture``."""
    return any(
        os.path.join(root, name) == capture
        for root, _, files in os.walk(base_dir)
        for name in files
    )


def _apply_capture_selection(case_number, capture, base_dir, fallback=False):
    """Populate the capture dropdown for ``case_number`` and select ``capture`` in it.

    Args:
        case_number: Case identifier passed to ``on_case_number_change``.
        capture: Full path of the capture to select.
        base_dir: Case directory; the dropdown value is ``capture`` relative to it.
        fallback: True when the capture could not be validated and is selected on
            path match alone; only affects the log messages.
    """
    note = " (fallback)" if fallback else ""
    capture_rel_path = os.path.relpath(capture, base_dir).replace("\\", "/")
    logger.info(f"Auto-loading capture{note}: {capture_rel_path}")

    # Populate the dropdown options first, then select the capture, then
    # explicitly run the capture handler so the sessions get populated.
    on_case_number_change({"new": case_number})
    capture_dropdown.value = capture_rel_path
    logger.info(f"Set capture dropdown value to: {capture_rel_path}{note}")
    on_capture_change({"new": capture_rel_path})


def initialize_dashboard():
    """Initialize dashboard by loading capture if case number and capture path are pre-set.

    The case number is read from ``case_number_input``; when that is empty it is
    derived from a capture path of the form ``/stingray/<case>/<file...>`` and
    written back to the input. If the capture is found under
    ``/stingray/<case>`` it is selected in the capture dropdown; otherwise only
    the case-number handler is run so the dropdown gets populated.

    Errors raised by the change handlers or by the dropdown assignment
    propagate to the caller. Only a failure of ``is_capture_file`` is treated
    as recoverable (the capture is then selected on path match alone).
    """
    logger.info("Initializing dashboard")
    capture = load_capture()

    # Extract case number from capture path if not already set
    case_number = case_number_input.value.strip() if case_number_input else ""
    if not case_number and capture:
        parts = capture.split(os.sep)
        # '/stingray/<case>/<file>' splits into ['', 'stingray', '<case>', '<file>'];
        # require a component after the case so a file sitting directly in
        # /stingray is not mistaken for a case number.
        if len(parts) >= 4 and parts[1] == 'stingray':
            case_number = parts[2]
            logger.info(f"Extracted case number from capture path: {case_number}")
            if case_number_input:
                case_number_input.value = case_number

    logger.debug(f"Initial pcap_config: {pcap_config}")
    if not (case_number and capture):
        logger.info("No valid case number or capture path to auto-load")
        return

    logger.info(f"Pre-filled case number: {case_number}, capture path: {capture}")
    base_dir = f"/stingray/{case_number}"
    if not os.path.isdir(base_dir):
        logger.warning(f"Case directory {base_dir} not found")
        return

    # Compare paths first so the capture-file check only runs on the one
    # candidate instead of on every file in the case directory.
    if _capture_listed_under(base_dir, capture):
        try:
            usable, fallback = is_capture_file(capture), False
        except Exception as e:
            logger.warning(f"Error checking file {capture} during initialization: {e}")
            # Fallback - the full path matches exactly, assume it's the right file
            usable, fallback = True, True
        if usable:
            # Deliberately outside the try above: a handler failure must not be
            # reported as a file-check error, nor re-run the whole sequence.
            _apply_capture_selection(case_number, capture, base_dir, fallback=fallback)
            return

    logger.warning(f"Capture path {capture} not found in {base_dir} or its subdirectories")
    # Still run the case number handler so the capture dropdown is populated
    logger.info(f"Triggering case number change handler for case {case_number}")
    on_case_number_change({"new": case_number})

# Clear existing bindings and re-apply.
# Button.on_click(None) does not clear anything: it registers None as a click
# callback, which then fails with a TypeError on every click. Reset the
# button's click dispatcher instead (relies on ipywidgets' private
# `_click_handlers` attribute, hence the getattr guard).
for _button in (ingest_button, reingest_button):
    _click_dispatcher = getattr(_button, "_click_handlers", None)
    if _click_dispatcher is not None:
        _click_dispatcher.callbacks = []
ingest_button.on_click(on_ingest_button_clicked)
reingest_button.on_click(on_reingest_button_clicked)
logger.info("Ingest and Re-ingest buttons bound")

# Attach handlers
case_number_input.observe(on_case_number_change, names="value")
capture_dropdown.observe(on_capture_change, names="value")
session_dropdown.observe(on_session_change, names="value")
file_combobox.observe(on_file_change, names="value")
check_fields_select.observe(on_fields_change, names="value")
save_button.on_click(on_save_config)
replay_button.on_click(on_replay_button_clicked)

# Trigger initial load (after the observers are attached, so value changes made
# during initialization also reach the handlers)
initialize_dashboard()

In [None]:
# Cell 11: Dashboard Display for Session Visualization
from IPython.display import display
import ipywidgets as widgets
import builtins
import logging

# Pull in the shared logger, configs and widgets that Cells 1–10 placed on builtins.
# A missing name surfaces as ImportError, which is logged and re-raised.
try:
    from builtins import (
        logger,
        pcap_config,
        replay_config,
        log_output,
        output_cell,
        progress_output,
        case_number_input,
        capture_dropdown,
        session_dropdown,
        button_box,
        file_combobox,
        check_fields_select,
        server_ip_input,
        domain_input,
        username_input,
        password_input,
        tree_name_input,
        max_time_input,
        save_button,
        dashboard,
    )
except ImportError as import_error:
    # The shared logger may be the name that is missing, so use the root logging module here
    logging.critical(f"Failed to import from builtins: {import_error}. Ensure Cells 1–10 are executed.")
    raise

def update_dashboard_layout(verbose_level: int = None) -> None:
    """Rebuild ``dashboard.children`` and show or hide the log pane by verbosity.

    Args:
        verbose_level: Verbosity to render for. Any value above 0 appends
            ``log_output`` as the last child; otherwise a placeholder label is
            appended instead. When None, ``pcap_config['verbose_level']`` is
            used (0 if the key is absent).

    Raises:
        Exception: Anything raised while assigning ``dashboard.children`` is
            logged at CRITICAL level and re-raised.
    """
    if verbose_level is None:
        effective_verbose_level = pcap_config.get("verbose_level", 0)
    else:
        effective_verbose_level = verbose_level
    logger.info(f"Updating dashboard layout with verbose_level={effective_verbose_level}")

    # Replay settings: a heading, two rows of inputs and the save button in a bordered box
    replay_heading = widgets.HTML("<h3>Replay Server Configuration</h3>", layout={'margin': '10px 0'})
    connection_row = widgets.HBox(
        [server_ip_input, domain_input, username_input],
        layout={'margin': '5px 0'},
    )
    share_row = widgets.HBox(
        [password_input, tree_name_input, max_time_input],
        layout={'margin': '5px 0'},
    )
    replay_config_box = widgets.VBox(
        [replay_heading, connection_row, share_row, save_button],
        layout={'border': '1px solid #ddd', 'padding': '10px', 'margin': '10px 0'},
    )

    # The final slot is either the live log pane or a hint on how to enable it
    if effective_verbose_level > 0:
        log_slot = log_output
    else:
        log_slot = widgets.Label(value="Logs hidden (set Debug Level > 0 in Configuration Cell to show)")

    dashboard_components = [
        widgets.HTML("<h3>Session Visualization Dashboard</h3>"),
        progress_output,
        case_number_input,
        capture_dropdown,
        session_dropdown,
        button_box,
        file_combobox,
        check_fields_select,
        output_cell,
        replay_config_box,
        log_slot,
    ]

    # Swap the children in place; failures are logged and passed on to the caller
    try:
        dashboard.children = dashboard_components
        logger.debug(f"Dashboard updated with {len(dashboard_components)} components")
    except Exception as layout_error:
        logger.critical(f"Failed to update dashboard layout: {layout_error}")
        raise

    # Publish this function on builtins (runs on every successful call)
    builtins.update_dashboard_layout = update_dashboard_layout
    logger.info("Exported update_dashboard_layout to builtins")

# Build the layout for the configured verbosity, then render the dashboard.
# Any failure is logged at CRITICAL level and re-raised so the cell visibly fails.
try:
    verbose_level = pcap_config.get("verbose_level", 0)  # 0 is the CRITICAL-only default set in Cell 1
    update_dashboard_layout(verbose_level=verbose_level)
    display(dashboard)
    logger.info("Dashboard displayed successfully")
except Exception as display_error:
    logger.critical(f"Error displaying dashboard: {display_error}")
    raise

In [None]:
# Debugging cell: print the loaded capture and case number, list the
# .pcap/.pcapng files under the case directory, and force-select the loaded
# capture in the dropdown when it is one of them.
capture = load_capture()
case_number = case_number_input.value.strip()
print(f"Loaded capture: {capture}")
print(f"Case number: {case_number}")
base_dir = f"/stingray/{case_number}"
# Keep paths relative to base_dir rather than bare file names: matching on the
# basename alone would accept a same-named file in a different directory and
# then select a capture that is not actually under base_dir.
pcap_files = [
    os.path.relpath(os.path.join(root, f), base_dir).replace("\\", "/")
    for root, _, files in os.walk(base_dir)
    for f in files
    if f.endswith(('.pcap', '.pcapng'))
]
print(f"Found PCAP files: {pcap_files}")
capture_rel_path = os.path.relpath(capture, base_dir).replace("\\", "/") if capture else None
if capture_rel_path in pcap_files:
    # Replaces the dropdown options with just this capture before selecting it
    capture_dropdown.options = ["Select a capture"] + [capture_rel_path]
    capture_dropdown.value = capture_rel_path
    logger.info(f"Manually added and selected capture: {capture_dropdown.value}")

In [None]:
# case_number = "2010373016"
