# Clustering Analysis

This notebook applies various clustering algorithms to group similar Suricata rules.

We will:
1. Load the feature matrix
2. Apply K-Means clustering
3. Apply DBSCAN clustering
4. Apply Hierarchical clustering (currently disabled because of its long runtime)
5. Compare algorithm performance
6. Analyze cluster characteristics

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from suricata_rule_clustering import parser, clustering

# Plot styling: seaborn's whitegrid theme plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Load the saved feature matrix and the processed rules DataFrame.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that were generated locally by this project.
X = np.load('../data/feature_matrix.npy')
df = pd.read_pickle('../data/processed_rules.pkl')

# Cluster labels computed from X are later written into df as a new column,
# so both inputs must contain the same number of rows. Fail here, with a
# clear message, rather than after the expensive clustering steps.
assert X.shape[0] == len(df), (
    f"Feature matrix has {X.shape[0]} rows but DataFrame has {len(df)} rows"
)

print(f"Feature matrix shape: {X.shape}")
print(f"DataFrame shape: {df.shape}")

## 2. K-Means Clustering

First, let's find the optimal number of clusters using the elbow method and silhouette score.

In [None]:
# Elbow method: ask the helper for its suggested k over the candidate range.
# elbow_scores is consumed below as a mapping of k -> plotted score.
optimal_k_elbow, elbow_scores = clustering.find_optimal_k(
    X, k_range=range(2, 15), method='elbow'
)

# Elbow curve: one point per candidate k
k_values = list(elbow_scores.keys())
inertias = list(elbow_scores.values())

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_values, inertias, 'bo-')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.grid(True)
plt.show()

print(f"Optimal k (elbow): {optimal_k_elbow}")

In [None]:
# Silhouette method: same candidate range, scored by silhouette instead.
# silhouette_scores is consumed below as a mapping of k -> plotted score.
optimal_k_silhouette, silhouette_scores = clustering.find_optimal_k(
    X, k_range=range(2, 15), method='silhouette'
)

# Silhouette curve: one point per candidate k
k_values = list(silhouette_scores.keys())
scores = list(silhouette_scores.values())

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_values, scores, 'go-')
ax.set_title('Silhouette Score For Different k Values')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.grid(True)
plt.show()

print(f"Optimal k (silhouette): {optimal_k_silhouette}")

In [None]:
# Fit K-Means with the k suggested by the silhouette search.
# `k` stays a notebook-level name because the (disabled) hierarchical cells reuse it.
k = optimal_k_silhouette
kmeans_clusterer = clustering.RuleClusterer(algorithm='kmeans', n_clusters=k)
kmeans_clusterer.fit(X)

print(f"K-Means clustering completed with {k} clusters")

In [None]:
# Score the K-Means labels against the feature matrix and list every metric
kmeans_metrics = clustering.evaluate_clustering(X, kmeans_clusterer.labels_)

print("K-Means Evaluation Metrics:")
for metric_name, metric_value in kmeans_metrics.items():
    print(f"  {metric_name}: {metric_value}")

In [None]:
# Build a summary of the K-Means clusters from the rules DataFrame.
# NOTE(review): the summary's columns are defined by
# RuleClusterer.get_cluster_summary, which is not visible here — confirm there.
kmeans_summary = kmeans_clusterer.get_cluster_summary(df)
print("\nK-Means Cluster Summary:")
# Bare last expression so the notebook renders the summary as cell output
kmeans_summary

## 3. DBSCAN Clustering

DBSCAN is a density-based algorithm that can find clusters of arbitrary shapes and automatically detect outliers.

In [None]:
# Fit DBSCAN on the feature matrix.
# eps and min_samples below are starting values — expect to tune them for
# this dataset.
dbscan_clusterer = clustering.RuleClusterer(algorithm='dbscan', eps=0.5, min_samples=5)
dbscan_clusterer.fit(X)

print("DBSCAN clustering completed")

In [None]:
# Score the DBSCAN labels against the feature matrix and list every metric
dbscan_metrics = clustering.evaluate_clustering(X, dbscan_clusterer.labels_)

print("DBSCAN Evaluation Metrics:")
for metric_name, metric_value in dbscan_metrics.items():
    print(f"  {metric_name}: {metric_value}")

In [None]:
# Build a summary of the DBSCAN clusters from the rules DataFrame.
# NOTE(review): the summary's columns are defined by
# RuleClusterer.get_cluster_summary, which is not visible here — confirm there.
dbscan_summary = dbscan_clusterer.get_cluster_summary(df)
print("\nDBSCAN Cluster Summary:")
# Bare last expression so the notebook renders the summary as cell output
dbscan_summary

## 4. Hierarchical Clustering

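Hierarchical clustering is disabled for now: it takes a long time to run, and the last attempt was stopped after 40 minutes without finishing. The cells below are kept commented out for reference.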

In [None]:
# # Apply Hierarchical clustering
# hierarchical_clusterer = clustering.RuleClusterer(
#     algorithm='hierarchical',
#     n_clusters=k,  # Use same k as K-Means for comparison
#     linkage='ward'
# )
# hierarchical_clusterer.fit(X)
#
# print("Hierarchical clustering completed")

In [None]:
# # Evaluate Hierarchical clustering
# hierarchical_metrics = clustering.evaluate_clustering(X, hierarchical_clusterer.labels_)
# print("Hierarchical Clustering Evaluation Metrics:")
# for metric, value in hierarchical_metrics.items():
#     print(f"  {metric}: {value}")

In [None]:
# # Get cluster summary
# hierarchical_summary = hierarchical_clusterer.get_cluster_summary(df)
# print("\nHierarchical Cluster Summary:")
# hierarchical_summary

In [None]:
# # Plot dendrogram (with sampled data for performance)
# fig = clustering.plot_dendrogram(X, max_samples=500)
# plt.show()

## 5. Compare Algorithms

In [None]:
# Compare different algorithms and parameters.
# The result is used below as a DataFrame with at least the columns
# 'algorithm', 'silhouette_score' and 'n_clusters'.
# NOTE(review): which algorithms/parameters are tried is decided inside
# clustering.compare_algorithms — confirm there.
comparison = clustering.compare_algorithms(X, df)
print("\nAlgorithm Comparison:")
# Bare last expression so the notebook renders the table as cell output
comparison

In [None]:
# Visualize the comparison table side by side:
# left  — silhouette score per row of `comparison`
# right — number of clusters per row of `comparison`
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot silhouette scores; rows whose score is missing are left out of this panel
valid_silhouette = comparison.dropna(subset=['silhouette_score'])
axes[0].set_xlabel('Silhouette Score')
axes[0].set_title('Silhouette Score Comparison (Higher is Better)')
if not valid_silhouette.empty:
    axes[0].barh(valid_silhouette['algorithm'], valid_silhouette['silhouette_score'])
else:
    # Without this note the panel would render as an unexplained blank axes
    axes[0].text(
        0.5, 0.5, 'No valid silhouette scores',
        ha='center', va='center', transform=axes[0].transAxes
    )

# Plot number of clusters
axes[1].barh(comparison['algorithm'], comparison['n_clusters'])
axes[1].set_xlabel('Number of Clusters')
axes[1].set_title('Number of Clusters Found')

plt.tight_layout()
plt.show()

## 6. Analyze Best Clustering Result

Choose the best performing algorithm and analyze its clusters in detail.

In [None]:
# Select the best clusterer (adjust based on the comparison above).
# dbscan_clusterer is also available; hierarchical_clusterer only exists if
# the commented-out cells in section 4 are re-enabled and run.
best_clusterer = kmeans_clusterer
best_labels = best_clusterer.labels_

# Add cluster labels to DataFrame (re-running this cell simply overwrites the column)
df['cluster'] = best_labels

# Label -1 marks noise points rather than a real cluster (the sampling cell
# below prints it as NOISE), so it must not be counted as a cluster.
NOISE_LABEL = -1
n_clusters = len(set(best_labels) - {NOISE_LABEL})
n_noise = int(np.sum(np.asarray(best_labels) == NOISE_LABEL))

print(f"Using clustering algorithm: {best_clusterer.algorithm}")
print(f"Number of clusters: {n_clusters}")
if n_noise:
    print(f"Noise points (label {NOISE_LABEL}): {n_noise}")

In [None]:
# Number of rules per cluster, ordered by cluster label
cluster_sizes = df['cluster'].value_counts().sort_index()
print("Cluster sizes:")
print(cluster_sizes)

# Bar chart of the same counts
fig, ax = plt.subplots(figsize=(10, 5))
cluster_sizes.plot(kind='bar', ax=ax)
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Rules')
ax.set_title('Rules per Cluster')
plt.show()

In [None]:
# Print up to three example rule messages for every cluster.
# The message column is expected to be called either 'msg' or 'message'.
msg_col = 'msg' if 'msg' in df.columns else 'message'
if msg_col not in df.columns:
    # Fail with a clear message instead of an opaque KeyError inside the loop
    raise KeyError("Expected a 'msg' or 'message' column in df")

print("Sample rules from each cluster:\n")
# groupby splits the frame once and yields the clusters in sorted label
# order, instead of re-filtering the whole DataFrame for every cluster.
for cluster_id, cluster_rules in df.groupby('cluster'):
    if cluster_id == -1:
        # -1 is the label used for noise points
        print(f"Cluster {cluster_id} (NOISE):")
    else:
        print(f"Cluster {cluster_id}:")

    sample_messages = cluster_rules[msg_col].head(3).tolist()
    for i, msg in enumerate(sample_messages, 1):
        print(f"  {i}. {msg}")
    print()

## 7. Save Results

In [None]:
# Persist the labelled DataFrame twice: as a pickle and as a CSV without the index
df.to_pickle(path='../data/clustered_rules.pkl')
df.to_csv(path_or_buf='../data/clustered_rules.csv', index=False)

# Persist the bare label array as well
np.save(file='../data/cluster_labels.npy', arr=best_labels)

print("Clustering results saved")

## Summary

We have successfully:
- Applied K-Means clustering with optimal k selection
- Applied DBSCAN for density-based clustering
- Prepared Hierarchical clustering (currently disabled because of its long runtime)
- Compared algorithm performance
- Analyzed cluster characteristics

## Next Steps

Proceed to **04_visualization.ipynb** to create interactive visualizations of the clustering results using UMAP/t-SNE dimensionality reduction.