# Data Exploration for Intrusion Detection System

This notebook explores the network traffic dataset, performs exploratory data analysis, and visualizes attack patterns.


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add parent directory to path
# os.path.abspath('') resolves to the current working directory (the folder
# the notebook runs from), so a single dirname() is the parent directory.
# That is the same level the '../data/...' path used below refers to, and
# it must be importable for the `src` package imports to resolve.
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)

from src.preprocessing import DataPreprocessor
from src.visualization import IDSVisualizer

# Set display options: passing None removes the pandas limit on the number
# of columns shown and on the output width.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Prefer the seaborn-flavoured matplotlib style; if matplotlib rejects the
# style name with OSError, fall back to the default style.
preferred_style = 'seaborn-v0_8'
try:
    plt.style.use(preferred_style)
except OSError:
    plt.style.use('default')

# Colour palette applied to subsequent seaborn plots.
sns.set_palette("husl")

print("Libraries imported successfully!")


## 1. Load Data


In [None]:
# Load data
data_path = '../data/raw/sample_data.csv'  # Update with your dataset path

preprocessor = DataPreprocessor()
df = preprocessor.load_data(data_path)

if df is None:
    # load_data() signalled failure by returning None. Run the preprocessing
    # module's main() and retry once.
    # NOTE(review): this presumably writes a sample CSV to data_path --
    # confirm against src/preprocessing.py.
    print("Creating sample data...")
    from src.preprocessing import main as create_sample
    create_sample()
    df = preprocessor.load_data(data_path)

if df is None:
    # Fail here with an explicit message instead of letting the lines below
    # die with "'NoneType' object has no attribute 'shape'".
    raise RuntimeError(
        f"Could not load a dataset from {data_path!r}, even after generating "
        "sample data. Check that the path is correct and the file is readable."
    )

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
# Last expression of the cell so the notebook renders the preview table.
df.head()


## 2. Data Overview


In [None]:
# Basic statistics
section_break = "\n" + "="*60

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(section_break)
print("\nBasic Statistics:")
print(df.describe())
print(section_break)
print("\nMissing Values:")
# Count of null entries per column.
print(df.isnull().sum())
print(section_break)
print("\nData Types:")
print(df.dtypes)


## 3. Class Distribution


In [None]:
# Identify target column
# Pick the first candidate label-column name that exists in the dataset;
# target_col stays None when none of them is present.
target_col = next(
    (col for col in ('label', 'attack', 'class', 'target') if col in df.columns),
    None,
)

# Create the visualizer unconditionally: the feature-distribution and
# correlation cells further down use `visualizer` even when there is no
# target column, and previously hit a NameError in that case.
visualizer = IDSVisualizer()

if target_col:
    visualizer.plot_class_distribution(df, target_column=target_col)

    # Absolute counts first, then the same counts as fractions of the total.
    print("\nClass distribution:")
    print(df[target_col].value_counts())
    print("\nClass proportions:")
    print(df[target_col].value_counts(normalize=True))
else:
    print("Target column not found in dataset")


## 4. Feature Distributions


In [None]:
# Plot feature distributions
# Collect the names of all numeric columns, leaving the target column out so
# that only features remain.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

if not numeric_cols:
    print("No numeric features found")
else:
    # Only the first six numeric features are plotted.
    preview_features = numeric_cols[:6]
    visualizer.plot_feature_distributions(df, features=preview_features)


## 5. Correlation Analysis


In [None]:
# Plot correlation matrix
# Restrict the matrix to the first 15 numeric features; nothing is plotted
# when there are no numeric features at all.
correlation_features = numeric_cols[:15]
if correlation_features:
    visualizer.plot_correlation_matrix(df, features=correlation_features)


## 6. Attack Pattern Visualization


In [None]:
# Visualize attack patterns
# Needs a target column plus at least two numeric features; the first two
# numeric features are the ones handed to the plot.
if target_col and len(numeric_cols) >= 2:
    first_feature, second_feature = numeric_cols[:2]
    visualizer.plot_attack_patterns(
        df, first_feature, second_feature, target_column=target_col
    )
