## 📚 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: stock matplotlib style combined with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Tabular output: show every column, and at most 100 rows.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

print("üì¶ Libraries imported successfully!")

## 📊 Load and Inspect Data

In [None]:
# Load one CSV file for initial analysis.
sample_file = '../data/raw_csv/02-14-2018.csv'
print(f"Loading sample data from: {sample_file}")

# Read sample data into the frame every later cell works on.
df_sample = pd.read_csv(sample_file)

print(f"‚úÖ Data loaded successfully!")
print(f"Shape: {df_sample.shape}")
# deep=True also measures the string payload of object columns (e.g. 'Label');
# the default shallow measurement only counts the object references and so
# under-reports the real footprint.
print(f"Memory usage: {df_sample.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# High-level shape of the sample: row count, column count and a numbered
# listing of every column name.
print("=== DATASET OVERVIEW ===")
n_records, n_features = df_sample.shape
print(f"Number of records: {n_records:,}")
print(f"Number of features: {n_features:,}")
print("\nColumn names:")
for position, column_name in enumerate(df_sample.columns, start=1):
    print(f"{position:2d}. {column_name}")

In [None]:
# Preview the leading rows; being the cell's last expression, the frame is
# what the notebook renders as output.
print("=== FIRST 5 ROWS ===")
df_sample.head(5)

In [None]:
# Per-column summary (dtype, non-null count, null count, null share), followed
# by the columns that actually contain nulls and a tally of dtypes.
print("=== DATA TYPES & MISSING VALUES ===")
null_counts = df_sample.isnull().sum()
null_share = (null_counts / len(df_sample) * 100).round(2)
info_df = pd.DataFrame({
    'Column': df_sample.columns,
    'Data_Type': df_sample.dtypes,
    'Non_Null_Count': df_sample.count(),
    'Null_Count': null_counts,
    'Null_Percentage': null_share,
})

# Keep only affected columns, worst offenders first.
has_nulls = info_df['Null_Count'] > 0
missing_cols = info_df.loc[has_nulls].sort_values('Null_Percentage', ascending=False)
if len(missing_cols) == 0:
    print("\n‚úÖ No missing values found!")
else:
    print("\nColumns with missing values:")
    display(missing_cols)

print("\nData type distribution:")
print(info_df['Data_Type'].value_counts())

## 🏷️ Label Analysis

In [None]:
# Class balance of the target column 'Label': absolute counts and shares.
print("=== LABEL DISTRIBUTION ===")
label_column = df_sample['Label']
label_counts = label_column.value_counts()
label_percentages = label_column.value_counts(normalize=True) * 100

label_summary = pd.DataFrame({
    'Count': label_counts,
    'Percentage': label_percentages.round(2),
})

print(label_summary)

# Everything that is not labelled exactly 'Benign' counts as attack traffic;
# a sample without any 'Benign' rows yields a benign share of 0.
benign_ratio = label_percentages.get('Benign', 0)
attack_ratio = 100 - benign_ratio

print("\nüìä Class Distribution:")
print(f"Benign Traffic: {benign_ratio:.2f}%")
print(f"Attack Traffic: {attack_ratio:.2f}%")

# Grade the imbalance by how far the benign share falls outside the
# 10-90 % band (severe) or the 20-80 % band (moderate).
if benign_ratio < 10 or benign_ratio > 90:
    balance_verdict = "‚ö†Ô∏è Highly imbalanced dataset detected!"
elif benign_ratio < 20 or benign_ratio > 80:
    balance_verdict = "‚ö° Moderately imbalanced dataset"
else:
    balance_verdict = "‚úÖ Relatively balanced dataset"
print(balance_verdict)

In [None]:
# Static view of the label distribution: pie chart of class shares on the
# left, per-class bar chart on the right.
title_style = {'fontsize': 14, 'fontweight': 'bold'}
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
pie_ax, bar_ax = axes

pie_ax.pie(
    label_counts.values,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Label Distribution (Pie Chart)', **title_style)

sns.countplot(data=df_sample, x='Label', ax=bar_ax)
bar_ax.set_title('Label Distribution (Bar Chart)', **title_style)
bar_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Interactive counterpart: plotly bar chart coloured by class count.
fig = px.bar(
    x=label_counts.index,
    y=label_counts.values,
    title='Interactive Label Distribution',
    labels={'x': 'Attack Type', 'y': 'Count'},
    color=label_counts.values,
    color_continuous_scale='viridis',
)
fig.update_layout(showlegend=False)
fig.show()

## 📈 Numerical Features Analysis

In [None]:
# Numeric feature columns; the target is excluded should it happen to be numeric.
numerical_cols = [
    column_name
    for column_name in df_sample.select_dtypes(include=[np.number]).columns
    if column_name != 'Label'
]

print(f"Found {len(numerical_cols)} numerical features")

# Summary statistics, rendered as the cell output.
print("\n=== NUMERICAL FEATURES STATISTICS ===")
stats = df_sample[numerical_cols].describe()
stats

In [None]:
# Count +/-inf entries per numeric column and report the affected columns.
print("=== INFINITE VALUES CHECK ===")
per_column_inf = (
    (column_name, np.isinf(df_sample[column_name]).sum())
    for column_name in numerical_cols
)
# Only columns with at least one infinite entry are kept.
inf_counts = {name: hits for name, hits in per_column_inf if hits > 0}

if not inf_counts:
    print("‚úÖ No infinite values found!")
else:
    inf_df = pd.DataFrame({
        'Column': list(inf_counts.keys()),
        'Infinite_Count': list(inf_counts.values()),
    })
    inf_df['Percentage'] = (inf_df['Infinite_Count'] / len(df_sample) * 100).round(2)
    print("Columns with infinite values:")
    display(inf_df.sort_values('Infinite_Count', ascending=False))

In [None]:
# Distribution of key flow-level features, shown as log-scaled histograms.
key_features = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
                'Total Length of Fwd Packets', 'Total Length of Bwd Packets']

# Alternative spellings tried when the long name has no match.
# NOTE(review): these abbreviated names are what CICFlowMeter-style exports
# commonly use -- confirm against the header of the CSV actually loaded.
key_feature_aliases = {
    'Total Fwd Packets': ['Tot Fwd Pkts'],
    'Total Backward Packets': ['Tot Bwd Pkts'],
    'Total Length of Fwd Packets': ['TotLen Fwd Pkts'],
    'Total Length of Bwd Packets': ['TotLen Bwd Pkts'],
}


def _squash(name):
    """Lower-case a column name and drop spaces/underscores for fuzzy matching."""
    return str(name).lower().replace(' ', '').replace('_', '')


# Only numeric columns are eligible: the histogram below applies log1p, which
# fails on string-typed columns.
numeric_lookup = [
    (col, _squash(col))
    for col in df_sample.select_dtypes(include=[np.number]).columns
]

# Resolve every key feature to the first matching column (substring match on
# the squashed names), never listing the same column twice.
actual_features = []
for feature in key_features:
    for candidate in [feature] + key_feature_aliases.get(feature, []):
        needle = _squash(candidate)
        match = next((col for col, squashed in numeric_lookup if needle in squashed), None)
        if match is not None:
            if match not in actual_features:
                actual_features.append(match)
            break

print(f"Analyzing distributions for: {actual_features[:5]}")

# Plot distributions
if actual_features:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()
    plotted_features = actual_features[:len(axes)]

    for ax, feature in zip(axes, plotted_features):
        # Remove infinite and missing values for visualization.
        clean_data = df_sample[feature].replace([np.inf, -np.inf], np.nan).dropna()

        # log1p is -inf at -1 and NaN below it, which would corrupt or break
        # the histogram; negative values are therefore left out of the plot.
        negative_count = int((clean_data < 0).sum())
        if negative_count:
            print(f"Skipping {negative_count:,} negative values in {feature} for the log-scale plot")
            clean_data = clean_data[clean_data >= 0]

        if len(clean_data) > 0:
            # Use log scale for better visualization
            log_data = np.log1p(clean_data)
            ax.hist(log_data, bins=50, alpha=0.7, edgecolor='black')
            ax.set_title(f'Log Distribution: {feature}', fontweight='bold')
            ax.set_xlabel('Log(value + 1)')
            ax.set_ylabel('Frequency')
        else:
            ax.text(0.5, 0.5, 'No valid data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'No Data: {feature}')

    # Hide the panels that received no feature.
    for unused_ax in axes[len(plotted_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 🔗 Feature Correlations

In [None]:
# Correlation study on the first 20 numeric columns only, to keep the matrix
# (and memory use) manageable.
sample_features = numerical_cols[:20]

print(f"Calculating correlations for {len(sample_features)} features...")

# +/-inf is converted to NaN before the correlations are computed.
df_corr = df_sample[sample_features].replace([np.inf, -np.inf], np.nan)
correlation_matrix = df_corr.corr()

# Heatmap with the upper triangle and diagonal masked, leaving the lower triangle.
plt.figure(figsize=(15, 12))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .5},
)
plt.title('Feature Correlation Heatmap (Lower Triangle)', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Visit each unordered column pair once (strict upper triangle, row by row)
# and keep those with a defined |r| above 0.8.
corr_columns = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(corr_columns), k=1)
high_corr_pairs = []
for r, c in zip(upper_rows, upper_cols):
    corr_value = correlation_matrix.iloc[r, c]
    if np.isnan(corr_value) or abs(corr_value) <= 0.8:
        continue
    high_corr_pairs.append({
        'Feature_1': corr_columns[r],
        'Feature_2': corr_columns[c],
        'Correlation': round(corr_value, 3),
    })

if not high_corr_pairs:
    print("\n‚úÖ No highly correlated features found (|r| > 0.8)")
else:
    print(f"\n‚ö†Ô∏è Found {len(high_corr_pairs)} highly correlated feature pairs (|r| > 0.8):")
    high_corr_df = pd.DataFrame(high_corr_pairs).sort_values('Correlation', key=abs, ascending=False)
    display(high_corr_df.head(10))

## 🎯 Attack vs Benign Comparison

In [None]:
# Binary target: 1 for every row whose label is not 'Benign', 0 otherwise.
df_sample['is_attack'] = df_sample['Label'].ne('Benign').astype(int)

# Features to contrast: the matched key features when there are any,
# otherwise the first numeric columns (six at most either way).
if actual_features:
    comparison_features = actual_features[:6]
else:
    comparison_features = numerical_cols[:6]

print("=== BENIGN vs ATTACK COMPARISON ===")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# One panel per feature: box plots of benign (0) against attack (1) rows.
for ax, feature in zip(axes, comparison_features):
    if feature not in df_sample.columns:
        continue

    # Rows with +/-inf or missing values are left out of the plot.
    clean_data = df_sample[[feature, 'is_attack']].replace([np.inf, -np.inf], np.nan).dropna()

    if len(clean_data) == 0:
        ax.text(0.5, 0.5, 'No valid data', ha='center', va='center', transform=ax.transAxes)
        continue

    sns.boxplot(data=clean_data, x='is_attack', y=feature, ax=ax)
    ax.set_title(f'{feature}\nBenign (0) vs Attack (1)', fontweight='bold')
    ax.set_yscale('log')  # log axis keeps heavy-tailed features readable

# Hide the panels that received no feature.
for unused_ax in axes[len(comparison_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Mean / standard deviation of each comparison feature, split by class, plus
# the attack-to-benign ratio of the means.
print("\n=== STATISTICAL COMPARISON ===")
comparison_stats = []

for feature in comparison_features:
    if feature not in df_sample.columns:
        continue

    clean_data = df_sample[[feature, 'is_attack']].replace([np.inf, -np.inf], np.nan).dropna()
    if len(clean_data) == 0:
        continue

    benign_data = clean_data.loc[clean_data['is_attack'] == 0, feature]
    attack_data = clean_data.loc[clean_data['is_attack'] == 1, feature]
    # A feature is reported only when both classes have usable rows.
    if len(benign_data) == 0 or len(attack_data) == 0:
        continue

    benign_mean = benign_data.mean()
    attack_mean = attack_data.mean()
    comparison_stats.append({
        'Feature': feature,
        'Benign_Mean': benign_mean,
        'Attack_Mean': attack_mean,
        'Benign_Std': benign_data.std(),
        'Attack_Std': attack_data.std(),
        # A zero benign mean is reported as an infinite ratio.
        'Mean_Ratio': attack_mean / benign_mean if benign_mean != 0 else np.inf,
    })

if not comparison_stats:
    print("No valid data for comparison")
else:
    comp_df = pd.DataFrame(comparison_stats).round(4)
    display(comp_df)

## 📊 Data Quality Report

In [None]:
print("=== DATA QUALITY REPORT ===")

# Dimensions of the frame as it stands when this cell runs.
total_rows, total_cols = df_sample.shape

# Null cells, as a count and as a share of all cells.
total_missing = df_sample.isnull().sum().sum()
missing_percentage = (total_missing / (total_rows * total_cols)) * 100

# Fully repeated rows.
duplicate_rows = df_sample.duplicated().sum()
duplicate_percentage = (duplicate_rows / total_rows) * 100

# +/-inf entries summed over the numeric columns.
total_inf = sum(np.isinf(df_sample[col]).sum() for col in numerical_cols)

# Numeric columns holding at most one distinct value (zero variance).
constant_cols = [col for col in numerical_cols if df_sample[col].nunique() <= 1]

# Quality summary, printed as aligned "name: value" lines.
categorical_count = len(df_sample.select_dtypes(include=['object']).columns)
quality_report = {
    'Total Records': f"{total_rows:,}",
    'Total Features': f"{total_cols:,}",
    'Missing Values': f"{total_missing:,} ({missing_percentage:.2f}%)",
    'Duplicate Rows': f"{duplicate_rows:,} ({duplicate_percentage:.2f}%)",
    'Infinite Values': f"{total_inf:,}",
    'Constant Features': f"{len(constant_cols)}",
    'Numerical Features': f"{len(numerical_cols)}",
    'Categorical Features': f"{categorical_count}",
}

for key, value in quality_report.items():
    print(f"{key:<20}: {value}")

# Score starts at 100 and loses points per problem class, applied in order:
# missing values (capped at 30), duplicates (capped at 20), any infinite
# value (flat 10), and one point per constant column.
penalties = (
    (missing_percentage > 5, min(missing_percentage * 2, 30)),
    (duplicate_percentage > 1, min(duplicate_percentage, 20)),
    (total_inf > 0, 10),
    (len(constant_cols) > 0, len(constant_cols)),
)
quality_score = 100
for applies, penalty in penalties:
    if applies:
        quality_score -= penalty

# The score never drops below zero.
quality_score = max(quality_score, 0)

print(f"\nüìä Overall Data Quality Score: {quality_score:.1f}/100")

# First band whose floor the score reaches decides the verdict.
grade_bands = (
    (90, "‚úÖ Excellent data quality!"),
    (70, "‚ú® Good data quality with minor issues"),
    (50, "‚ö†Ô∏è Moderate data quality - needs attention"),
)
verdict = next(
    (message for floor, message in grade_bands if quality_score >= floor),
    "‚ùå Poor data quality - significant preprocessing required",
)
print(verdict)

## 📋 Preprocessing Recommendations

In [None]:
print("=== PREPROCESSING RECOMMENDATIONS ===")

# (condition, advice) pairs built from the findings of the earlier cells,
# listed in the order the advice should be printed.
conditional_advice = [
    (total_missing > 0,
     "üîß Handle missing values using imputation or removal"),
    (total_inf > 0,
     "üîß Replace infinite values with NaN or large finite numbers"),
    (duplicate_rows > 0,
     "üîß Remove duplicate rows to avoid data leakage"),
    (bool(constant_cols),
     f"üîß Remove {len(constant_cols)} constant features (zero variance)"),
    (bool(high_corr_pairs),
     f"üîß Consider removing {len(high_corr_pairs)} highly correlated feature pairs"),
]
recommendations = [advice for applies, advice in conditional_advice if applies]

# Class imbalance, using the same bands as the label analysis.
if benign_ratio < 10 or benign_ratio > 90:
    recommendations.append("üîß Address severe class imbalance using SMOTE, undersampling, or weighted models")
elif benign_ratio < 20 or benign_ratio > 80:
    recommendations.append("üîß Consider techniques for moderate class imbalance")

# Advice that is given regardless of the findings.
recommendations.extend([
    "üîß Apply feature scaling (StandardScaler or MinMaxScaler)",
    "üîß Perform feature selection to reduce dimensionality",
    "üîß Create time-based windows for time series analysis",
])

# Numbered listing.
for number, rec in enumerate(recommendations, start=1):
    print(f"{number}. {rec}")

print("\nüìà Ready to proceed with Big Data preprocessing using Apache Spark!")
print("üíæ Next step: Run the Spark preprocessing pipeline to handle the full dataset")

## 🚀 Next Steps

1. **Run Spark Preprocessing**: Execute `spark/merge_clean.py` to process all CSV files
2. **Feature Engineering**: Run `spark/feature_engineering.py` for advanced features
3. **Time Series Analysis**: Create temporal windows and sequence features
4. **Model Building**: Train Random Forest, XGBoost, and LSTM models
5. **Real-time IDS**: Implement streaming detection pipeline

This EDA provides the foundation for understanding the CSE-CIC-IDS2018 dataset structure and quality.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

print("=== PCA ANALYSIS ===")

# Work on a reproducible random subset of at most 10,000 rows for performance.
sample_size = min(10000, len(df_sample))
df_pca = df_sample.sample(n=sample_size, random_state=42)

# First 20 numeric features; +/-inf and missing values are replaced by 0.
pca_features = numerical_cols[:20]
X_pca = df_pca[pca_features].replace([np.inf, -np.inf], np.nan).fillna(0)

# Standardize so that no feature dominates purely through its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 10 is clamped to what the data supports.
n_components = min(10, X_scaled.shape[0], X_scaled.shape[1])
pca = PCA(n_components=n_components)
X_pca_transformed = pca.fit_transform(X_scaled)

print(f"Original dimensions: {X_scaled.shape}")
print(f"Reduced dimensions: {X_pca_transformed.shape}")
print(f"\nExplained variance ratio:")
leading_ratios = pca.explained_variance_ratio_[:5]
for i, var in enumerate(leading_ratios, 1):
    print(f"  PC{i}: {var:.4f} ({var*100:.2f}%)")
print(f"  Cumulative (first 5): {sum(leading_ratios):.4f} ({sum(leading_ratios)*100:.2f}%)")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Scree plot: per-component variance share (bars) and running total (line).
# The x positions follow the number of components actually fitted.
component_ids = range(1, n_components + 1)
ax1 = axes[0]
ax1.bar(component_ids, pca.explained_variance_ratio_, alpha=0.7, color='steelblue', edgecolor='black')
ax1.plot(component_ids, np.cumsum(pca.explained_variance_ratio_), 'r-o', linewidth=2, label='Cumulative')
ax1.set_xlabel('Principal Component', fontsize=12)
ax1.set_ylabel('Explained Variance Ratio', fontsize=12)
ax1.set_title('PCA - Scree Plot', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2D PCA scatter coloured by the binary attack flag of the sampled rows.
ax2 = axes[1]
ax2.set_title('PCA - 2D Projection (Benign vs Attack)', fontsize=14, fontweight='bold')
if n_components >= 2:
    y_pca = df_pca['is_attack'].values
    scatter = ax2.scatter(X_pca_transformed[:, 0], X_pca_transformed[:, 1],
                          c=y_pca, cmap='coolwarm', alpha=0.6, s=20, edgecolors='k', linewidth=0.5)
    ax2.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
    ax2.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
    plt.colorbar(scatter, ax=ax2, label='Class (0=Benign, 1=Attack)')
    ax2.grid(True, alpha=0.3)
else:
    # A 2D projection needs at least two fitted components.
    ax2.text(0.5, 0.5, 'Fewer than 2 principal components available',
             ha='center', va='center', transform=ax2.transAxes)

plt.tight_layout()
plt.show()

print("\n‚úÖ PCA analysis complete")

In [None]:
from sklearn.ensemble import IsolationForest

print("=== OUTLIER DETECTION ===")

# Use at most four features; +/-inf and missing values are replaced by 0.
outlier_features = comparison_features[:4] if comparison_features else numerical_cols[:4]
X_outlier = df_sample[outlier_features].replace([np.inf, -np.inf], np.nan).fillna(0)

# Isolation Forest over the whole sample; fit_predict returns -1 for rows
# flagged as outliers and 1 otherwise.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_predictions = iso_forest.fit_predict(X_outlier)

# Count outliers
outliers_count = (outlier_predictions == -1).sum()
outliers_percentage = (outliers_count / len(df_sample)) * 100

print(f"Total samples: {len(df_sample):,}")
print(f"Detected outliers: {outliers_count:,} ({outliers_percentage:.2f}%)")

# Store the flag on the frame so it can be grouped and looked up by row.
df_sample['is_outlier'] = (outlier_predictions == -1).astype(int)

# Rows = class (is_attack), columns = outlier flag.
outlier_by_class = df_sample.groupby(['is_attack', 'is_outlier']).size().unstack(fill_value=0)
print(f"\nOutliers by class:")
print(outlier_by_class)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Outlier distribution. Both flag columns are forced to exist so that the two
# colours and the two legend entries always line up with the data.
ax1 = axes[0]
outlier_counts = outlier_by_class.reindex(columns=[0, 1], fill_value=0)
outlier_counts.plot(kind='bar', ax=ax1, color=['green', 'red'], alpha=0.7, edgecolor='black')
ax1.set_title('Outliers by Class', fontsize=14, fontweight='bold')
ax1.set_xlabel('Class (0=Benign, 1=Attack)')
ax1.set_ylabel('Count')
ax1.legend(['Normal', 'Outlier'])
# Tick labels are derived from the classes actually present, so a
# single-class sample does not end up with a mismatched label count.
class_names = {0: 'Benign', 1: 'Attack'}
ax1.set_xticklabels([class_names.get(key, str(key)) for key in outlier_counts.index], rotation=0)
ax1.grid(True, alpha=0.3)

# PCA with outliers highlighted.
# X_pca_transformed only holds the rows sampled into df_pca, whereas
# outlier_predictions covers every row of df_sample. The flags are therefore
# looked up through df_pca's index labels so that the masks have exactly one
# entry per projected point.
ax2 = axes[1]
outlier_mask = df_sample.loc[df_pca.index, 'is_outlier'].to_numpy().astype(bool)
normal_mask = ~outlier_mask

ax2.scatter(X_pca_transformed[normal_mask, 0], X_pca_transformed[normal_mask, 1],
           c='blue', alpha=0.3, s=10, label='Normal', edgecolors='none')
ax2.scatter(X_pca_transformed[outlier_mask, 0], X_pca_transformed[outlier_mask, 1],
           c='red', alpha=0.7, s=30, label='Outlier', marker='x', linewidths=2)
ax2.set_xlabel('PC1', fontsize=12)
ax2.set_ylabel('PC2', fontsize=12)
ax2.set_title('Outliers in PCA Space', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Outlier detection complete")

In [None]:
print("=== TEMPORAL ATTACK PATTERNS ===")

# Create synthetic timestamps: one row per second starting 2018-02-14 00:00:00.
# NOTE(review): because these are generated from row position, every pattern
# below reflects the row order of the CSV rather than real capture time.
df_sample['timestamp'] = pd.date_range(start='2018-02-14 00:00:00', periods=len(df_sample), freq='1s')
df_sample['hour'] = df_sample['timestamp'].dt.hour
df_sample['day_of_week'] = df_sample['timestamp'].dt.dayofweek
df_sample['date'] = df_sample['timestamp'].dt.date

attack_rows = df_sample[df_sample['is_attack'] == 1]

# Attack counts by hour. Hours without any attack are absent from the groupby
# result, so the counts are aligned to the hours present in the data with 0
# filled in; otherwise those hours would be skipped in the chart and ignored
# when looking for the quietest hour.
hourly_total = df_sample.groupby('hour').size()
hourly_attacks = attack_rows.groupby('hour').size().reindex(hourly_total.index, fill_value=0)
hourly_attack_rate = (hourly_attacks / hourly_total * 100).fillna(0)

print(f"\nAttacks by hour of day:")
if hourly_attacks.sum() > 0:
    print(f"Peak attack hour: {hourly_attacks.idxmax()} with {hourly_attacks.max()} attacks")
    print(f"Lowest attack hour: {hourly_attacks.idxmin()} with {hourly_attacks.min()} attacks")
else:
    # idxmax/idxmin would be meaningless (or fail) without any attack rows.
    print("No attack records found in the sample")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# 1. Hourly attack counts
ax1 = axes[0, 0]
hourly_attacks.plot(kind='bar', ax=ax1, color='crimson', alpha=0.7, edgecolor='black')
ax1.set_title('Attack Count by Hour of Day', fontsize=14, fontweight='bold')
ax1.set_xlabel('Hour')
ax1.set_ylabel('Attack Count')
ax1.grid(True, alpha=0.3)

# 2. Attack rate by hour
ax2 = axes[0, 1]
hourly_attack_rate.plot(kind='line', ax=ax2, color='darkred', linewidth=3, marker='o')
# Shade under the line using the hours actually present; a fixed range(24)
# does not match the data when the sample covers fewer than 24 hours.
ax2.fill_between(hourly_attack_rate.index, hourly_attack_rate.values, alpha=0.3, color='red')
ax2.set_title('Attack Rate by Hour of Day', fontsize=14, fontweight='bold')
ax2.set_xlabel('Hour')
ax2.set_ylabel('Attack Rate (%)')
ax2.set_xticks(range(24))
ax2.grid(True, alpha=0.3)

# 3. Day of week analysis
ax3 = axes[1, 0]
day_attacks = attack_rows.groupby('day_of_week').size()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_attacks.index = [day_names[i] for i in day_attacks.index]
ax3.set_title('Attack Count by Day of Week', fontsize=14, fontweight='bold')
if not day_attacks.empty:
    day_attacks.plot(kind='bar', ax=ax3, color='orange', alpha=0.7, edgecolor='black')
    ax3.set_xlabel('Day')
    ax3.set_ylabel('Attack Count')
    ax3.set_xticklabels(ax3.get_xticklabels(), rotation=45)
    ax3.grid(True, alpha=0.3)
else:
    # Nothing to draw without attack rows.
    ax3.text(0.5, 0.5, 'No attack data available', ha='center', va='center', transform=ax3.transAxes)

# 4. Attack type distribution over time (five most frequent non-benign labels)
ax4 = axes[1, 1]
top_attacks = df_sample[df_sample['Label'] != 'Benign']['Label'].value_counts().head(5)
attack_timeline = df_sample[df_sample['Label'].isin(top_attacks.index)].groupby(['hour', 'Label']).size().unstack(fill_value=0)

if not attack_timeline.empty:
    attack_timeline.plot(kind='area', stacked=True, ax=ax4, alpha=0.7)
    ax4.set_title('Attack Types Distribution by Hour', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Hour')
    ax4.set_ylabel('Attack Count')
    ax4.legend(title='Attack Type', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
    ax4.grid(True, alpha=0.3)
else:
    ax4.text(0.5, 0.5, 'No attack type data available', ha='center', va='center', transform=ax4.transAxes)

plt.tight_layout()
plt.show()

print("\n‚úÖ Temporal pattern analysis complete")

In [None]:
print("=== FEATURE DISTRIBUTIONS BY ATTACK TYPE ===")

# The six most frequent labels and the rows that carry them.
top_labels = df_sample['Label'].value_counts().index[:6].tolist()
df_attack_analysis = df_sample.loc[df_sample['Label'].isin(top_labels)]

# Up to three features: the comparison features when available, otherwise
# the leading numeric columns.
analysis_features = (comparison_features or numerical_cols)[:3]

print(f"\nAnalyzing {len(analysis_features)} features across {len(top_labels)} classes")
print(f"Classes: {', '.join(top_labels)}")

# One stacked panel per feature.
n_panels = len(analysis_features)
fig, axes = plt.subplots(n_panels, 1, figsize=(16, 5*n_panels))

# A single panel comes back as a bare Axes; wrap it so it can be iterated.
if n_panels == 1:
    axes = [axes]

for ax, feature in zip(axes, analysis_features):
    if feature not in df_attack_analysis.columns:
        continue

    # Rows with +/-inf or missing values are left out of the plot.
    plot_data = df_attack_analysis[[feature, 'Label']].replace([np.inf, -np.inf], np.nan).dropna()

    if len(plot_data) == 0:
        ax.text(0.5, 0.5, f'No valid data for {feature}',
                ha='center', va='center', transform=ax.transAxes)
        continue

    # Violin plot per label on a log-scaled value axis.
    sns.violinplot(data=plot_data, x='Label', y=feature, ax=ax, palette='Set2')
    ax.set_title(f'Distribution of {feature} by Attack Type', fontsize=14, fontweight='bold')
    ax.set_xlabel('Attack Type')
    ax.set_ylabel(feature)
    ax.tick_params(axis='x', rotation=45)
    ax.set_yscale('log')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Feature distribution analysis complete")

## 📊 Advanced Feature Analysis - Distributions by Attack Type

## ⏰ Temporal Patterns & Attack Timeline

## 🔍 Outlier Detection & Analysis

## 🎨 Advanced Visualizations - PCA & Dimensionality Reduction