# Interactive Visualization

This notebook builds interactive visualizations of the clustering results. It covers:
- UMAP and t-SNE for dimensionality reduction
- Plotly for interactive plots
- 2D and 3D cluster projections
- Cluster characteristic analysis

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from suricata_rule_clustering import viz

# Show every DataFrame column when printing (None removes the column limit)
pd.options.display.max_columns = None

## 1. Load Data

In [None]:
# Load the saved feature matrix, clustered DataFrame, and cluster labels
# from ../data (paths are relative to the notebook's working directory).
X = np.load('../data/feature_matrix.npy')
# NOTE: unpickling can execute arbitrary code; only load pickles that come
# from a trusted source.
df = pd.read_pickle('../data/clustered_rules.pkl')
labels = np.load('../data/cluster_labels.npy')

# Summarize what was loaded, one line per item.
print(
    f"Feature matrix shape: {X.shape}",
    f"DataFrame shape: {df.shape}",
    f"Number of clusters: {len(set(labels))}",
    sep="\n",
)

In [None]:
# Cluster description helpers from the project package
from suricata_rule_clustering.cluster_analysis import ClusterDescriptor
from suricata_rule_clustering import features

print("Generating cluster labels...")

# Rebuild a feature extractor for the descriptor. The returned matrix
# (X_temp) is not read again in this cell; the call is presumably kept
# because it fits the extractor's TF-IDF state -- confirm.
extractor = features.RuleFeatureExtractor()
X_temp = extractor.create_feature_matrix(df, include_tfidf=True, tfidf_max_features=100)

# Describe every cluster; only the 'label' entry of each description is used.
descriptor = ClusterDescriptor(feature_extractor=extractor)
descriptions = descriptor.describe_all_clusters(X, labels, df)

# cluster id -> generated label
cluster_labels_map = {
    cluster_id: info['label'] for cluster_id, info in descriptions.items()
}

# Attach the label to each rule through its 'cluster' column
df['cluster_label'] = df['cluster'].map(cluster_labels_map)

# Preview at most the ten lowest cluster ids
print(f"Generated labels for {len(cluster_labels_map)} clusters:")
for cid in sorted(cluster_labels_map)[:10]:
    label = cluster_labels_map[cid]
    print(f"  Cluster {cid}: {label}")
if len(cluster_labels_map) > 10:
    print(f"  ... and {len(cluster_labels_map) - 10} more")

## 1.5 Generate Cluster Labels

Generate automatic labels for clusters to make visualizations more interpretable.

# NOTE(review): this fragment reads X_umap_2d, labels_2d and df_2d, which are
# first assigned by the UMAP 2D reduction code further down; executed in file
# order it would fail with NameError. It duplicates the later UMAP 2D plotting
# code except for the extra 'cluster_label' hover column -- presumably a
# misplaced copy; confirm which version should be kept.
# Create interactive 2D plot with sampled data
fig_umap_2d = viz.plot_clusters_2d(
    X_umap_2d,
    labels_2d,
    df=df_2d,
    title="Suricata Rule Clusters - UMAP 2D",
    hover_data=['cluster_label', 'msg', 'classtype', 'priority', 'action', 'protocol']
)

fig_umap_2d.show()

In [None]:
import time

# Reduce dimensions using UMAP to 2D with optimizations
print(f"Original dataset: {X.shape[0]} samples, {X.shape[1]} features")

# Time the reduction with perf_counter(): it is monotonic and high-resolution,
# so the measured duration cannot be skewed by wall-clock adjustments the way
# a time.time() difference can.
start_time = time.perf_counter()
X_umap_2d, sample_indices_2d = viz.reduce_dimensions(
    X,
    method='umap',
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    max_samples=50000,  # Sample to 50k for faster visualization
    apply_pca=True,  # Apply PCA preprocessing
    pca_components=50,  # Reduce to 50 dims first
    n_jobs=-1  # Use all CPU cores
)
elapsed_time = time.perf_counter() - start_time

# Keep labels and rule rows aligned with the embedding. sample_indices_2d is
# None when the full dataset was used (presumably no subsampling was needed --
# confirm in viz.reduce_dimensions).
if sample_indices_2d is not None:
    labels_2d = labels[sample_indices_2d]
    df_2d = df.iloc[sample_indices_2d]
else:
    labels_2d = labels
    df_2d = df

print(f"UMAP 2D completed in {elapsed_time:.2f} seconds")
print(f"UMAP 2D shape: {X_umap_2d.shape}")

In [None]:
# Create interactive 2D plot with sampled data.
# Show the generated cluster label in the hover tooltip when the
# 'cluster_label' column is present on the sampled rows; otherwise fall back
# to the base columns so the cell still works without the labelling step.
hover_columns_2d = ['msg', 'classtype', 'priority', 'action', 'protocol']
if 'cluster_label' in df_2d.columns:
    hover_columns_2d = ['cluster_label', *hover_columns_2d]

fig_umap_2d = viz.plot_clusters_2d(
    X_umap_2d,
    labels_2d,
    df=df_2d,
    title="Suricata Rule Clusters - UMAP 2D",
    hover_data=hover_columns_2d
)

fig_umap_2d.show()

In [None]:
# Save the UMAP 2D figure via the project helper. The two string arguments are
# presumably the output base name and the target directory -- confirm against
# viz.save_figure.
viz.save_figure(fig_umap_2d, 'umap_2d_clusters', '../outputs')

# NOTE(review): this fragment reads X_umap_3d, labels_3d and df_3d, which are
# first assigned by the UMAP 3D reduction code further down; executed in file
# order it would fail with NameError. It duplicates the later UMAP 3D plotting
# code except for the extra 'cluster_label' hover column -- presumably a
# misplaced copy; confirm which version should be kept.
# Create interactive 3D plot with sampled data
fig_umap_3d = viz.plot_clusters_3d(
    X_umap_3d,
    labels_3d,
    df=df_3d,
    title="Suricata Rule Clusters - UMAP 3D",
    hover_data=['cluster_label', 'msg', 'classtype', 'priority']
)

fig_umap_3d.show()

In [None]:
# Reduce dimensions using UMAP to 3D with optimizations.
# Time the reduction with perf_counter(): it is monotonic and high-resolution,
# so the measured duration cannot be skewed by wall-clock adjustments the way
# a time.time() difference can.
start_time = time.perf_counter()
X_umap_3d, sample_indices_3d = viz.reduce_dimensions(
    X,
    method='umap',
    n_components=3,
    n_neighbors=15,
    min_dist=0.1,
    max_samples=50000,
    apply_pca=True,
    pca_components=50,
    n_jobs=-1
)
elapsed_time = time.perf_counter() - start_time

# Keep labels and rule rows aligned with the embedding. sample_indices_3d is
# None when the full dataset was used (presumably no subsampling was needed --
# confirm in viz.reduce_dimensions).
if sample_indices_3d is not None:
    labels_3d = labels[sample_indices_3d]
    df_3d = df.iloc[sample_indices_3d]
else:
    labels_3d = labels
    df_3d = df

print(f"UMAP 3D completed in {elapsed_time:.2f} seconds")
print(f"UMAP 3D shape: {X_umap_3d.shape}")

In [None]:
# Create interactive 3D plot with sampled data.
# Show the generated cluster label in the hover tooltip when the
# 'cluster_label' column is present on the sampled rows; otherwise fall back
# to the base columns so the cell still works without the labelling step.
hover_columns_3d = ['msg', 'classtype', 'priority']
if 'cluster_label' in df_3d.columns:
    hover_columns_3d = ['cluster_label', *hover_columns_3d]

fig_umap_3d = viz.plot_clusters_3d(
    X_umap_3d,
    labels_3d,
    df=df_3d,
    title="Suricata Rule Clusters - UMAP 3D",
    hover_data=hover_columns_3d
)

fig_umap_3d.show()

In [None]:
# Save the UMAP 3D figure via the project helper. The two string arguments are
# presumably the output base name and the target directory -- confirm against
# viz.save_figure.
viz.save_figure(fig_umap_3d, 'umap_3d_clusters', '../outputs')

# NOTE(review): this fragment reads X_tsne_2d, labels_tsne and df_tsne, which
# are first assigned by the t-SNE reduction code further down; executed in file
# order it would fail with NameError. It duplicates the later t-SNE plotting
# code except for the extra 'cluster_label' hover column -- presumably a
# misplaced copy; confirm which version should be kept.
# Create interactive 2D plot with sampled data
fig_tsne_2d = viz.plot_clusters_2d(
    X_tsne_2d,
    labels_tsne,
    df=df_tsne,
    title="Suricata Rule Clusters - t-SNE 2D",
    hover_data=['cluster_label', 'msg', 'classtype', 'priority', 'action', 'protocol']
)

fig_tsne_2d.show()

In [None]:
# Reduce dimensions using t-SNE to 2D with optimizations.
# Time the reduction with perf_counter(): it is monotonic and high-resolution,
# so the measured duration cannot be skewed by wall-clock adjustments the way
# a time.time() difference can.
start_time = time.perf_counter()
X_tsne_2d, sample_indices_tsne = viz.reduce_dimensions(
    X,
    method='tsne',
    n_components=2,
    perplexity=30,
    max_samples=50000,  # Sample to 50k for faster visualization
    apply_pca=True,  # Apply PCA preprocessing
    pca_components=50,
    n_jobs=-1  # Use all CPU cores
)
elapsed_time = time.perf_counter() - start_time

# Keep labels and rule rows aligned with the embedding. sample_indices_tsne is
# None when the full dataset was used (presumably no subsampling was needed --
# confirm in viz.reduce_dimensions).
if sample_indices_tsne is not None:
    labels_tsne = labels[sample_indices_tsne]
    df_tsne = df.iloc[sample_indices_tsne]
else:
    labels_tsne = labels
    df_tsne = df

print(f"t-SNE 2D completed in {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
print(f"t-SNE 2D shape: {X_tsne_2d.shape}")

In [None]:
# Create interactive 2D plot with sampled data.
# Show the generated cluster label in the hover tooltip when the
# 'cluster_label' column is present on the sampled rows; otherwise fall back
# to the base columns so the cell still works without the labelling step.
hover_columns_tsne = ['msg', 'classtype', 'priority', 'action', 'protocol']
if 'cluster_label' in df_tsne.columns:
    hover_columns_tsne = ['cluster_label', *hover_columns_tsne]

fig_tsne_2d = viz.plot_clusters_2d(
    X_tsne_2d,
    labels_tsne,
    df=df_tsne,
    title="Suricata Rule Clusters - t-SNE 2D",
    hover_data=hover_columns_tsne
)

fig_tsne_2d.show()

In [None]:
# Save the t-SNE 2D figure via the project helper. The two string arguments
# are presumably the output base name and the target directory -- confirm
# against viz.save_figure.
viz.save_figure(fig_tsne_2d, 'tsne_2d_clusters', '../outputs')

## 5. Cluster Analysis Visualizations

In [None]:
# Plot cluster sizes from the full label array (not a sampled subset), show
# the figure, then hand it to viz.save_figure.
# The second argument is presumably the plot title -- confirm against
# viz.plot_cluster_sizes.
fig_sizes = viz.plot_cluster_sizes(labels, "Cluster Sizes")
fig_sizes.show()
viz.save_figure(fig_sizes, 'cluster_sizes', '../outputs')

In [None]:
# Box plot of rule priority per cluster; skipped when df has no 'priority' column
if 'priority' in df:
    fig_priority = viz.plot_cluster_characteristics(
        df, labels, feature='priority', plot_type='box'
    )
    fig_priority.show()
    viz.save_figure(fig_priority, 'priority_by_cluster', '../outputs')

In [None]:
# Classtype distribution per cluster; skipped when df has no 'classtype' column
if 'classtype' in df:
    fig_classtype = viz.plot_classtype_distribution(df, labels, top_n=15)
    fig_classtype.show()
    viz.save_figure(fig_classtype, 'classtype_distribution', '../outputs')

In [None]:
# Violin plot of message length per cluster; skipped when df has no 'msg_length' column
if 'msg_length' in df:
    fig_msg_length = viz.plot_cluster_characteristics(
        df, labels, feature='msg_length', plot_type='violin'
    )
    fig_msg_length.show()
    viz.save_figure(fig_msg_length, 'msg_length_by_cluster', '../outputs')

## 6. Comprehensive Dashboard

Create a dashboard with multiple visualizations.

In [None]:
# Prepare metrics (using sampled data if available)
from suricata_rule_clustering import clustering

# Use the same rows the UMAP 2D embedding was computed on, so the feature
# matrix, embedding, labels and DataFrame passed below all line up.
if sample_indices_2d is None:
    X_dash = X
else:
    X_dash = X[sample_indices_2d]
labels_dash = labels_2d

# Clustering quality metrics for the dashboard
metrics = clustering.evaluate_clustering(X_dash, labels_dash)

# Build and show the combined dashboard figure
fig_dashboard = viz.create_cluster_dashboard(X_dash, X_umap_2d, labels_dash, df_2d, metrics)
fig_dashboard.show()

In [None]:
# Save the dashboard figure via the project helper. The two string arguments
# are presumably the output base name and the target directory -- confirm
# against viz.save_figure.
viz.save_figure(fig_dashboard, 'clustering_dashboard', '../outputs')

## 7. Export Cluster Examples

Export representative rules from each cluster for manual inspection.

In [None]:
# Export the first five rules of each cluster for manual inspection.
# The message column is named 'msg' when present, otherwise 'message'.
msg_col = 'msg' if 'msg' in df.columns else 'message'
example_columns = ['cluster', msg_col, 'classtype', 'priority', 'action', 'protocol', 'raw_rule']

# A single groupby pass replaces re-scanning the whole DataFrame with a boolean
# mask once per cluster. Groups are iterated in ascending cluster id and
# head(5) keeps each group's original row order, so the exported rows and
# their order match the previous per-cluster loop.
cluster_examples = [
    cluster_rules.head(5)[example_columns]
    for _, cluster_rules in df.groupby('cluster')
]

# Combine and save
examples_df = pd.concat(cluster_examples, ignore_index=True)
examples_df.to_csv('../outputs/cluster_examples.csv', index=False)

print(f"Exported {len(examples_df)} example rules to cluster_examples.csv")
examples_df.head()

## 8. Save Reduced Dimensions

Save the reduced dimension representations for future use.

In [None]:
# Write each embedding to ../data as a .npy file.
# NOTE(review): when reduce_dimensions subsampled the data, these arrays cover
# only the sampled rows and the matching sample indices are not saved here --
# confirm whether they should be persisted alongside.
for out_path, embedding in (
    ('../data/umap_2d.npy', X_umap_2d),
    ('../data/umap_3d.npy', X_umap_3d),
    ('../data/tsne_2d.npy', X_tsne_2d),
):
    np.save(out_path, embedding)

print("Reduced dimensions saved")

## Summary

We have successfully created interactive visualizations including:
- 2D and 3D UMAP projections
- 2D t-SNE projection
- Cluster size distributions
- Cluster characteristic analysis (priority, classtype, message length)
- Comprehensive dashboard

All visualizations have been saved to the `outputs/` directory as interactive HTML files that can be opened in any web browser.

## Next Steps

- Examine the cluster examples in `outputs/cluster_examples.csv`
- Open the HTML visualizations in a browser for interactive exploration
- Fine-tune clustering parameters based on the visual results
- Identify patterns and relationships between similar rules