# Data Exploration

This notebook explores the Suricata rules dataset to understand:
- How many rules we have
- Rule structure and components
- Distribution of rule types, protocols, actions
- Common patterns and anomalies

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from suricata_rule_clustering import parser

# Pandas display: show every column and allow up to 100 characters per cell
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

# Plot defaults: seaborn 'whitegrid' style and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Rules

Parse all Suricata rules from the rules directory.

In [None]:
# Inputs and outputs of the parsing step, named so they are easy to tune.
# For a quick first pass, set MAX_FILES to a small integer.
# NOTE(review): None is presumed to mean "no limit on files" — confirm against
# parser.parse_all_rules.
RULES_DIR = '../rules/active'
PARSED_RULES_PATH = '../data/parsed_rules.pkl'
MAX_FILES = None

df = parser.parse_all_rules(rules_dir=RULES_DIR, max_files=MAX_FILES)

# Persist the parsed rules so they can be reused later
parser.save_parsed_rules(df, PARSED_RULES_PATH)

In [None]:
# Quick overview: number of parsed rules, DataFrame shape, and column names
rule_count = len(df)
column_names = df.columns.tolist()

print(f"Total rules parsed: {rule_count}")
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumns: {column_names}")

In [None]:
# Preview the first five parsed rules (rendered as a rich table by the notebook)
df.head(n=5)

## 2. Basic Statistics

In [None]:
# Tally how often each rule action occurs; the same tally feeds the text and the chart
action_counts = df['action'].value_counts()

print("Action distribution:")
print(action_counts)

# Bar chart of the tally; pandas returns the axes it drew on
ax = action_counts.plot(kind='bar', title='Rule Actions')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Tally how often each protocol occurs; the same tally feeds the text and the chart
protocol_counts = df['protocol'].value_counts()

print("Protocol distribution:")
print(protocol_counts)

# Bar chart of the tally; pandas returns the axes it drew on
ax = protocol_counts.plot(kind='bar', title='Protocols')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Classtype distribution (only when the parsed data has a 'classtype' column)
if 'classtype' in df.columns:
    # Drop missing values, then rank classtypes by how often they occur
    ranked_classtypes = df['classtype'].dropna().value_counts()
    classtype_counts = ranked_classtypes.head(20)

    if classtype_counts.empty:
        print("No classtype values found in the dataset (all None/NaN)")
    else:
        print("Top 20 classtypes:")
        print(classtype_counts)

        # Chart the 15 most frequent classtypes; non-empty whenever the top 20 is
        top_15 = ranked_classtypes.head(15)
        ax = top_15.plot(kind='barh', title='Top 15 Classtypes')
        ax.set_xlabel('Count')
        plt.tight_layout()
        plt.show()

In [None]:
# Priority distribution (only when the parsed data has a 'priority' column)
if 'priority' in df.columns:
    # Tally including missing values so the printed table also shows how many
    # rules have no priority at all.
    priority_counts = df['priority'].value_counts(dropna=False).sort_index()
    # Rules that actually carry a priority; these are the only ones charted.
    valid_priorities = df['priority'].dropna()

    if priority_counts.empty:
        # With dropna=False the tally is empty only when the DataFrame has no rows
        print("No priority data available")
    else:
        print("Priority distribution:")
        print(priority_counts)

        if valid_priorities.empty:
            print("\nAll rules have priority = None")
        else:
            # Chart only the non-missing priorities, ordered by priority value
            ax = valid_priorities.value_counts().sort_index().plot(
                kind='bar',
                title='Rule Priority Distribution'
            )
            ax.set_xlabel('Priority')
            ax.set_ylabel('Count')
            plt.show()

## 3. Rule Sources

In [None]:
# Rank source files by how many rules each contributes and keep the 20 largest
file_rule_counts = df['file_name'].value_counts()
rules_per_file = file_rule_counts.head(20)

print("Top 20 files by rule count:")
print(rules_per_file)

# Horizontal bars leave room for the file-name labels
ax = rules_per_file.plot(kind='barh', title='Top 20 Files by Rule Count')
ax.set_xlabel('Number of Rules')
plt.tight_layout()
plt.show()

## 4. Rule Messages Analysis

In [None]:
# Get message column (could be 'msg' or 'message'); None when neither exists
msg_col = next((col for col in ('msg', 'message') if col in df.columns), None)

if msg_col:
    # Measure only rules that actually have a message: casting a missing value
    # to str would produce the text 'None'/'nan' and count it as a real
    # 3-4 character message, skewing the statistics below.
    messages = df[msg_col].dropna().astype(str)

    # Kept as a standalone Series instead of a new df column, so running this
    # cell never changes df.
    msg_lengths = messages.str.len()

    print("Message length statistics:")
    print(msg_lengths.describe())

    # Plot distribution
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.hist(msg_lengths, bins=50, edgecolor='black')
    ax.set_title('Distribution of Rule Message Lengths')
    ax.set_xlabel('Message Length')
    ax.set_ylabel('Count')
    plt.show()

    # Sample messages. The sample size is clamped because Series.sample raises
    # when asked for more rows than exist (e.g. a quick run on a few files),
    # and the seed is fixed so re-running the notebook shows the same sample.
    sample_size = min(10, len(messages))
    print("\nSample rule messages:")
    print(messages.sample(sample_size, random_state=42).tolist())

## 5. Rule Options Analysis

In [None]:
# Peek at the first rule's 'options' value to see how options are represented
if 'options' in df.columns:
    sample_rule = df.iloc[0]
    sample_options = sample_rule['options']

    print("Sample rule options:")
    print(sample_options)
    print(f"\nType: {type(sample_options)}")

## 6. Missing Values

In [None]:
# Count nulls per column and keep only columns that have any, largest count first
null_counts = df.isna().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)

if missing.empty:
    print("No missing values found!")
else:
    print("Columns with missing values:")
    print(missing)
    # Same columns expressed as a share of all rows, rounded to two decimals
    print("\nPercentage of missing values:")
    print((missing / len(df) * 100).round(2))

## 7. Sample Rules Inspection

In [None]:
# Display a few raw rules, each preview capped at 200 characters
print("Sample raw rules:")

# Clamp the sample size because Series.sample raises when asked for more rows
# than exist, and fix the seed so re-running the notebook shows the same rules.
sampled_rules = df['raw_rule'].sample(min(5, len(df)), random_state=42)

for i, rule in enumerate(sampled_rules, 1):
    # Long rules are truncated and marked with an ellipsis
    preview = f"{rule[:200]}..." if len(rule) > 200 else rule
    print(f"\n{i}. {preview}")

## Next Steps

Now that we've explored the data, proceed to:
- **02_feature_engineering.ipynb**: Extract and engineer features for clustering
- **03_clustering.ipynb**: Apply clustering algorithms
- **04_visualization.ipynb**: Visualize clustering results