# Machine Learning Pipeline for Network Intrusion Detection - Test Version

This is a test version of the notebook. Instead of loading the real dataset, it generates 200 synthetic records so the pipeline's functionality can be verified quickly.

## 1. Setup and Imports

In [None]:
# Basic data manipulation libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Set random seed for reproducibility
# RANDOM_STATE is reused further down as the random_state of train_test_split
# and RandomForestClassifier; np.random.seed seeds NumPy's global generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 2. Load Sample Data

In [None]:
def load_sample_data(sample_size=200):
    """
    Create a synthetic sample dataset for testing purposes.

    NumPy's global random generator is re-seeded with 42 on every call, so
    two calls with the same ``sample_size`` return identical data.

    The column 'Fwd Header Length.1' is an exact copy of 'Fwd Header Length'
    so that the duplicate-column removal step of the pipeline has something
    to detect.

    Args:
        sample_size (int): Number of samples to generate

    Returns:
        pd.DataFrame: Sample DataFrame with ``sample_size`` rows and 22
            columns (numeric flow features, 'Protocol', 'Destination Port'
            and the 'Label' target).
    """
    # Fixed seed: the generated data must be reproducible across calls
    np.random.seed(42)

    # Create sample features
    sample_data = {
        'Flow Duration': np.random.randint(1, 100000, sample_size),
        'Total Fwd Packets': np.random.randint(1, 100, sample_size),
        'Total Backward Packets': np.random.randint(1, 100, sample_size),
        'Total Length of Fwd Packets': np.random.randint(1, 10000, sample_size),
        'Total Length of Bwd Packets': np.random.randint(1, 10000, sample_size),
        'Fwd Packet Length Max': np.random.randint(1, 1500, sample_size),
        'Fwd Packet Length Min': np.random.randint(0, 100, sample_size),
        'Fwd Packet Length Mean': np.random.uniform(10, 500, sample_size),
        'Bwd Packet Length Max': np.random.randint(1, 1500, sample_size),
        'Bwd Packet Length Min': np.random.randint(0, 100, sample_size),
        'Bwd Packet Length Mean': np.random.uniform(10, 500, sample_size),
        'Flow Bytes/s': np.random.uniform(0, 10000, sample_size),
        'Flow Packets/s': np.random.uniform(0, 1000, sample_size),
        'Flow IAT Mean': np.random.uniform(0, 1000, sample_size),
        'Flow IAT Std': np.random.uniform(0, 500, sample_size),
        'Flow IAT Max': np.random.uniform(0, 2000, sample_size),
        'Flow IAT Min': np.random.uniform(0, 100, sample_size),
    }

    # Draw the header lengths once and store them under both names. Two
    # independent draws would (almost surely) differ, so the ".1" column
    # would not actually be a duplicate.
    fwd_header_length = np.random.randint(20, 100, sample_size)
    sample_data['Fwd Header Length.1'] = fwd_header_length.copy()  # Duplicate column
    sample_data['Fwd Header Length'] = fwd_header_length

    sample_data['Protocol'] = np.random.choice(['TCP', 'UDP', 'ICMP'], sample_size)
    sample_data['Destination Port'] = np.random.choice([80, 443, 22, 53, 8080], sample_size)
    # Imbalanced target: 70% benign traffic, the rest split across attacks
    sample_data['Label'] = np.random.choice(
        ['BENIGN', 'DoS', 'PortScan', 'Brute Force', 'Web Attack'],
        sample_size,
        p=[0.7, 0.1, 0.1, 0.05, 0.05],
    )

    # Create DataFrame
    df = pd.DataFrame(sample_data)
    print(f"Created sample dataset: {df.shape[0]} rows, {df.shape[1]} columns")

    return df

# Load sample data
# Builds the 200-row synthetic frame that every later cell works on.
df = load_sample_data(200)
# Final expression of the cell: shows the first rows as the cell output.
df.head()

## 3. Data Preprocessing

In [None]:
# Normalize column names
def normalize_column_names(df):
    """
    Rename the columns of ``df`` in place to a normalized form.

    Each name is stripped of surrounding whitespace, has its spaces replaced
    by underscores and is lower-cased. Every name that changed is printed as
    an "old -> new" line.

    Args:
        df (pd.DataFrame): Frame whose columns are renamed (modified in place)

    Returns:
        pd.DataFrame: The same ``df`` object, with normalized column names
    """
    # Remember the names as they were so the mapping can be reported below
    before = df.columns.tolist()

    stripped = df.columns.str.strip()
    underscored = stripped.str.replace(' ', '_')
    df.columns = underscored.str.lower()

    print("Column name mapping:")
    for old_name, new_name in zip(before, df.columns):
        if old_name == new_name:
            continue
        print(f"  {old_name} -> {new_name}")

    return df

# Normalize column names
# The function renames the columns of df in place and returns the same object.
df = normalize_column_names(df)
# Final expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Remove duplicate columns
def remove_duplicate_columns(df):
    """
    Identify and remove duplicate columns from the DataFrame.

    A column counts as a duplicate when its name is ``<base>.<N>`` (N a
    positive integer without leading zeros), a column named ``<base>`` also
    exists, and both columns hold equal data according to ``Series.equals``.
    Columns whose label is not a string are never treated as duplicates.

    Args:
        df (pd.DataFrame): Input frame (not modified)

    Returns:
        pd.DataFrame: ``df`` itself when nothing was removed, otherwise a new
            frame without the duplicate columns
    """
    columns = df.columns.tolist()
    # Set lookup keeps the base-name check O(1) per column
    known_columns = set(columns)
    duplicate_columns = []

    for col in columns:
        # Non-string labels (e.g. integers) cannot carry a ".N" suffix
        if not isinstance(col, str):
            continue

        base_col, dot, suffix = col.rpartition('.')
        has_numeric_suffix = (
            bool(dot)
            and suffix.isascii()
            and suffix.isdigit()
            and not suffix.startswith('0')
        )
        if not has_numeric_suffix or base_col not in known_columns:
            continue

        # The name pattern alone is not enough: only drop when the data match
        if df[col].equals(df[base_col]):
            duplicate_columns.append(col)
            print(f"Found duplicate column: {col} (duplicate of {base_col})")

    if duplicate_columns:
        df = df.drop(columns=duplicate_columns)
        print(f"Removed {len(duplicate_columns)} duplicate columns")
    else:
        print("No duplicate columns found")

    return df

# Remove duplicate columns
# Runs after name normalization, so suffixed names look like "<name>.1".
df = remove_duplicate_columns(df)
print(f"DataFrame shape after removing duplicate columns: {df.shape}")

In [None]:
# Encode categorical features
def encode_categorical_features(df):
    """
    Label-encode the categorical columns and the target of ``df``.

    Every column of dtype object/category (other than 'label') is cast to
    str and replaced by integer codes from its own ``LabelEncoder``. The
    'label' column, when present, is encoded the same way regardless of its
    dtype. The class -> code mapping of each encoder is printed.

    Args:
        df (pd.DataFrame): Input frame (not modified)

    Returns:
        tuple: ``(df_encoded, label_encoders)`` - an encoded copy of ``df``
            and a dict mapping column name -> fitted ``LabelEncoder``
    """
    df_encoded = df.copy()
    label_encoders = {}

    def _encode_column(name):
        # Fit one encoder per column on the original values cast to str
        encoder = LabelEncoder()
        df_encoded[name] = encoder.fit_transform(df[name].astype(str))
        label_encoders[name] = encoder
        mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        print(f"Encoded {name}: {mapping}")

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # The target is encoded last, after the feature columns
    try:
        categorical_columns.remove('label')
    except ValueError:
        pass

    print(f"Found {len(categorical_columns)} categorical columns: {categorical_columns}")

    for name in categorical_columns:
        _encode_column(name)

    if 'label' in df.columns:
        _encode_column('label')

    return df_encoded, label_encoders

# Encode categorical features
# label_encoders is kept: later cells use label_encoders['label'] to turn
# numeric predictions back into class names.
df_encoded, label_encoders = encode_categorical_features(df)
# Final expression of the cell: shows the first rows as the cell output.
df_encoded.head()

In [None]:
# Convert all data types to float
def convert_to_float(df):
    """
    Return a copy of ``df`` with every non-target column cast to float.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The 'label' column, when present, is left untouched.

    Args:
        df (pd.DataFrame): Input frame (not modified)

    Returns:
        pd.DataFrame: Copy of ``df`` with float feature columns
    """
    feature_names = list(df.columns)
    if 'label' in feature_names:
        feature_names.remove('label')

    df_float = df.copy()
    for name in feature_names:
        # Coerce first so unparsable entries turn into NaN instead of raising
        numeric_values = pd.to_numeric(df[name], errors='coerce')
        df_float[name] = numeric_values.astype(float)

    return df_float

# Convert all data types to float
# Operates on the encoded frame, so former categorical columns are numeric too.
df_float = convert_to_float(df_encoded)
print("Data types after conversion:")
print(df_float.dtypes)

In [None]:
# Scale features
def scale_features(df):
    """
    Scale all feature columns using StandardScaler.

    Every column except 'label' is standardized with a ``StandardScaler``
    fitted on ``df`` itself. The 'label' column, when present, is carried
    over unchanged as the last column.

    Note: the returned frame is built from the scaled array and therefore
    has a fresh default index; the original index of ``df`` is not kept.

    Args:
        df (pd.DataFrame): Input frame with numeric feature columns
            (not modified)

    Returns:
        tuple: ``(df_scaled, scaler)`` - the scaled frame and the fitted
            ``StandardScaler``
    """
    has_label = 'label' in df.columns

    # Features only; the target must not be standardized
    X = df.drop(columns=['label']) if has_label else df

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    if has_label:
        # .values: assign by position, since df_scaled has a new index
        df_scaled['label'] = df['label'].values

    print("Features scaled using StandardScaler")

    return df_scaled, scaler

# Scale numerical features
# The fitted scaler is kept because predict_sample() needs it later.
df_scaled, scaler = scale_features(df_float)
# Final expression of the cell: shows the first rows as the cell output.
df_scaled.head()

## 4. Exploratory Data Analysis

In [None]:
# Display class distribution
# Counts and percentage share of each target class, shown with the original
# class names when the label encoder is available.
if 'label' in df_scaled.columns:
    print("Class Distribution:")
    class_counts = df_scaled['label'].value_counts()
    class_percentages = class_counts / len(df_scaled) * 100

    class_distribution = pd.DataFrame(
        {'Count': class_counts, 'Percentage': class_percentages}
    )

    target_encoder = label_encoders.get('label')
    if target_encoder is not None:
        # Decode all numeric class codes in a single call
        class_distribution.index = target_encoder.inverse_transform(
            class_distribution.index.to_numpy()
        )

    print(class_distribution)

In [None]:
# Create a correlation matrix heatmap
plt.figure(figsize=(12, 10))
# df_scaled still contains the integer-encoded 'label' column, so the target
# appears in the matrix alongside the features.
corr_matrix = df_scaled.corr()
# annot=False: no per-cell numbers are drawn; center=0 puts zero correlation
# at the midpoint of the colormap.
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## 5. Machine Learning

In [None]:
# Split dataset into features and target
# NOTE(review): df_scaled was standardized with a scaler fitted on the full
# dataset before this split, so test rows influenced the scaling statistics.
X = df_scaled.drop(columns=['label'])
y = df_scaled['label']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data into training and testing sets
# 20% of the rows are held out; stratify=y splits in a stratified fashion
# using the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

In [None]:
# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
# Weighted averages account for the class imbalance. On a small test set a
# rare class may never be predicted, which leaves its precision/F1 undefined;
# zero_division=0 scores that case as 0 explicitly instead of emitting an
# UndefinedMetricWarning (the resulting numbers are the same).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Random Forest Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Plot confusion matrix
# Computed from the Random Forest predictions (y_pred) on the test split.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
# fmt='d': annotate each cell with its integer count. No tick labels are
# passed, so the axes show positional indices rather than class names.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.show()

## 6. Model Usage

In [None]:
# Function to predict using the trained model
def predict_sample(sample, model, scaler, label_encoder=None):
    """
    Predict the class of a single data sample.

    The sample is passed through ``scaler.transform`` before prediction, so
    it must be given in the original (unscaled) feature space.

    Args:
        sample: 2-D array-like / DataFrame holding one row of features
        model: Fitted classifier exposing ``predict`` (and optionally
            ``predict_proba`` / ``classes_``)
        scaler: Fitted scaler exposing ``transform``
        label_encoder: Optional fitted encoder; when given, its
            ``inverse_transform`` converts the numeric prediction back to
            the class name

    Returns:
        tuple: ``(pred_class, confidence)`` - the predicted class (decoded
            when ``label_encoder`` is given) and the predicted probability of
            that class, or ``None`` when the model has no ``predict_proba``
    """
    # Scale the sample
    sample_scaled = scaler.transform(sample)

    # Make prediction
    pred = model.predict(sample_scaled)[0]

    # Get prediction confidence
    confidence = None
    if hasattr(model, 'predict_proba'):
        prob = model.predict_proba(sample_scaled)[0]
        classes = getattr(model, 'classes_', None)
        if classes is None:
            # No class list available: fall back to treating the label as
            # the column position
            confidence = prob[pred]
        else:
            # Probability columns follow the order of model.classes_, so the
            # label value must be mapped to its position first
            positions = np.flatnonzero(np.asarray(classes) == pred)
            if positions.size:
                confidence = prob[positions[0]]

    # Convert numeric prediction to class name
    if label_encoder is not None:
        pred_class = label_encoder.inverse_transform([pred])[0]
    else:
        pred_class = pred

    return pred_class, confidence

# Get a sample from the test set
sample_idx = 0
scaled_sample = X_test.iloc[[sample_idx]]
true_label = y_test.iloc[sample_idx]

# X_test is already standardized, while predict_sample() applies
# scaler.transform itself. Map the row back to the original feature space
# first so it is scaled exactly once, like the data the model was trained on.
sample = pd.DataFrame(
    scaler.inverse_transform(scaled_sample),
    columns=scaled_sample.columns,
    index=scaled_sample.index,
)

# Convert numeric label to class name
target_encoder = label_encoders.get('label')
if target_encoder is not None:
    true_class = target_encoder.inverse_transform([true_label])[0]
else:
    true_class = true_label

# Predict with the model
pred_class, confidence = predict_sample(sample, rf, scaler, target_encoder)

print(f"True label: {true_class}")
print(f"Predicted label: {pred_class}")
# predict_sample returns None for models without predict_proba
if confidence is not None:
    print(f"Confidence: {confidence:.4f}")
else:
    print("Confidence: n/a")