In [1]:

# Optimized IDS Notebook (Enhanced Multiclass Version) for CSE-CIC-IDS2018
# This code is designed to be run in Google Colab.

# --- Step 0: Initial Setup and Library Imports ---
# Make sure you have the necessary libraries installed.
# If you run into "ModuleNotFoundError", you might need to install them:
# !pip install pandas scikit-learn numpy seaborn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    matthews_corrcoef
)
import glob
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings that might clutter output (e.g., DtypeWarning from pandas)
warnings.filterwarnings('ignore')

# --- Step 1: Mount Google Drive ---
# print("Mounting Google Drive...")
# drive.mount('/content/drive')
# print("Google Drive mounted successfully.")

# --- Step 2: Configuration for CSE-CIC-IDS2018 Dataset ---
dataset_path = './kaggle/input/ids-intrusion-csv'
label_column_name = 'Label'
sample_size = 50000

# --- Step 3: Data Loading and Concatenation ---
print(f"\n--- Loading Data ---")
print(f"Searching for CSV files in: {dataset_path}")
all_files = glob.glob(os.path.join(dataset_path, "*.csv"))



--- Loading Data ---
Searching for CSV files in: ./kaggle/input/ids-intrusion-csv


In [2]:

if not all_files:
    print(f"ERROR: No CSV files found in {dataset_path}. Please check the path and ensure files exist.")


In [3]:
print(f"Found {len(all_files)} CSV files. Starting to load them...")
df_list = []
for f in all_files:
    try:
        df_temp = pd.read_csv(f, low_memory=False, encoding='utf-8')
        df_list.append(df_temp)
        print(f"Successfully loaded: {os.path.basename(f)}")
    except Exception as e:
        print(f"WARNING: Error loading {os.path.basename(f)} with UTF-8: {e}")
        print(f"Attempting with 'latin1' encoding...")
        try:
            df_temp = pd.read_csv(f, low_memory=False, encoding='latin1')
            df_list.append(df_temp)
            print(f"Successfully loaded with 'latin1': {os.path.basename(f)}")
        except Exception as e_latin1:
            print(f"ERROR: Failed to load {os.path.basename(f)} with 'latin1' encoding either: {e_latin1}")
            print("Skipping this file.")
            continue



Found 10 CSV files. Starting to load them...
Successfully loaded: 02-14-2018.csv
Successfully loaded: 02-15-2018.csv
Successfully loaded: 02-16-2018.csv
Successfully loaded: 02-20-2018.csv
Successfully loaded: 02-21-2018.csv
Successfully loaded: 02-22-2018.csv
Successfully loaded: 02-23-2018.csv
Successfully loaded: 02-28-2018.csv
Successfully loaded: 03-01-2018.csv
Successfully loaded: 03-02-2018.csv


In [4]:
if df_list:
    df_combined = pd.concat(df_list, axis=0, ignore_index=True)
    print("\nAll datasets loaded and concatenated successfully.")
    print(f"Original combined shape: {df_combined.shape}")
    print("\nFirst 5 rows of the combined dataframe:")
    print(df_combined.head())
    print(f"\nColumns in the combined dataframe (showing first/last for brevity):")
    print(df_combined.columns.tolist()[:5], "...", df_combined.columns.tolist()[-5:])
    
else:
    print("ERROR: No dataframes were successfully loaded. Cannot proceed with analysis.")


MemoryError: Unable to allocate 16.0 MiB for an array with shape (2, 1048575) and data type object

In [None]:
# --- Step 4: Data Preprocessing ---
print(f"\n--- Data Preprocessing ---")
if label_column_name not in df_combined.columns:
    print(f"ERROR: Label column '{label_column_name}' not found. Checking for common alternatives...")
    if 'Attack' in df_combined.columns:
        label_column_name = 'Attack'
        print(f"Using 'Attack' as the label column.")
    else:
        raise KeyError(f"Required label column ('{label_column_name}' or 'Attack') not found in the dataset.")

print(f"Value counts for the original '{label_column_name}' column:")
print(df_combined[label_column_name].value_counts())


In [None]:

initial_rows = df_combined.shape[0]
df_combined.dropna(subset=[label_column_name], inplace=True)
rows_dropped_label_na = initial_rows - df_combined.shape[0]
if rows_dropped_label_na > 0:
    print(f"Dropped {rows_dropped_label_na} rows with NA values in the '{label_column_name}' column.")
print(f"Shape after dropping NA in label column: {df_combined.shape}")

if df_combined.shape[0] > sample_size:
    df = df_combined.sample(n=sample_size, random_state=42).reset_index(drop=True)
    print(f"Sampled down to {df.shape[0]} rows for faster processing.")
else:
    df = df_combined.copy()
    print("Dataset size is smaller than or equal to sample_size, no sampling performed.")

columns_to_drop_initial = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'SimillarHTTP', 'Unnamed: 0']
columns_to_drop_initial = [col for col in columns_to_drop_initial if col in df.columns]
df.drop(columns=columns_to_drop_initial, errors='ignore', inplace=True)
print(f"Shape after dropping identifier/non-informative columns: {df.shape}")

X = df.drop(columns=[label_column_name])
y = df[label_column_name]


In [None]:

print("Converting feature columns to numeric types...")
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        X[col] = pd.to_numeric(X[col], errors='coerce')

print("Replacing infinity values with NaN and dropping corresponding rows...")
X.replace([np.inf, -np.inf], np.nan, inplace=True)
initial_rows_X = X.shape[0]
X.dropna(inplace=True)
rows_dropped_X_na = initial_rows_X - X.shape[0]
if rows_dropped_X_na > 0:
    print(f"Dropped {rows_dropped_X_na} rows from features (X) due to NA/infinity values.")

y = y[X.index]
print(f"Shape after dropping NAs/Infinities in features and aligning y: X={X.shape}, y={y.shape}")


In [None]:

print("\nMapping target labels to numerical values (Multiclass: BENIGN=0, others as unique classes)...")
y = y.astype(str).str.strip().str.upper()
unique_labels_before_map = y.unique()
print(f"Unique labels before mapping: {unique_labels_before_map}")
label_mapping = {label: idx for idx, label in enumerate(y.unique())}
y = y.map(label_mapping)
print(f"Label mapping: {label_mapping}")
print(f"Value counts for target after mapping:\n{y.value_counts()}")

X = X.select_dtypes(include=[np.number])
if X.empty or X.shape[1] == 0:
    raise ValueError("ERROR: No numeric features remaining after preprocessing.")
else:
    print(f"Final feature set (X) shape before split: {X.shape}")

# --- Step 5: Anomaly Detection Layer (Isolation Forest) ---
print(f"\n--- Running Anomaly Detection with Isolation Forest ---")
iso_forest = IsolationForest(contamination=0.1, random_state=42)
y_pred_anomaly = iso_forest.fit_predict(X)
X_anomaly_filtered = X[y_pred_anomaly == 1]  # Keep only inliers
y_anomaly_filtered = y[y_pred_anomaly == 1]
print(f"Removed {sum(y_pred_anomaly == -1)} outliers. Shape after anomaly filtering: X={X_anomaly_filtered.shape}, y={y_anomaly_filtered.shape}")

# --- Step 6: Feature Selection (SelectKBest) ---
k_features = 20
if X_anomaly_filtered.shape[1] > k_features:
    print(f"\n--- Performing Feature Selection (SelectKBest with k={k_features}) ---")
    selector = SelectKBest(mutual_info_classif, k=k_features)
    if X_anomaly_filtered.shape[0] == 0 or y_anomaly_filtered.shape[0] == 0:
        print("WARNING: No data available for feature selection after cleaning. Skipping.")
        X_selected = X_anomaly_filtered.values
    else:
        X_selected = selector.fit_transform(X_anomaly_filtered, y_anomaly_filtered)
        selected_feature_indices = selector.get_support(indices=True)
        selected_feature_names = X_anomaly_filtered.columns[selected_feature_indices].tolist()
        print(f"Shape after feature selection: {X_selected.shape}")
        print(f"Selected features: {selected_feature_names}")
else:
    print(f"\n--- Skipping Feature Selection ---")
    print(f"Number of available features ({X_anomaly_filtered.shape[1]}) is less than or equal to k_features ({k_features}). Using all numeric features.")
    X_selected = X_anomaly_filtered.values

# --- Step 7: Train-Test Split ---
print(f"\n--- Performing Train-Test Split ---")
if X_selected.shape[0] == 0 or X_selected.shape[1] == 0:
    print("ERROR: No valid data available for splitting after all preprocessing steps.")
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y_anomaly_filtered, test_size=0.2, random_state=42, stratify=y_anomaly_filtered
    )
    print(f"Train-test split complete:")
    print(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"  X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    print(f"  y_train value counts:\n{y_train.value_counts(normalize=True)}")
    print(f"  y_test value counts:\n{y_test.value_counts(normalize=True)}")

    # --- Step 8: Model Training and Evaluation (Multiclass Decision Tree with OneVsRest) ---
    print(f"\n--- Training Multiclass Decision Tree with OneVsRest ---")
    clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
    clf.fit(X_train, y_train)
    print("Model training complete.")

    print(f"\n--- Evaluating Model Performance ---")
    y_pred = clf.predict(X_test)
    # Ensure no NaN in predictions
    y_pred = np.nan_to_num(y_pred, nan=0)  # Replace NaN with 0 (default class)
    y_pred_proba = clf.predict_proba(X_test)
    # Ensure no NaN in probabilities
    y_pred_proba = np.nan_to_num(y_pred_proba, nan=0.0)

    print("--- Standard Metrics ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Attack Type Analysis: Confusion Matrix Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_mapping.keys(), yticklabels=label_mapping.keys())
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix Heatmap by Attack Type')
    plt.show()

    print("\nClassification Report (Precision, Recall, F1-Score for each class):")
    print(classification_report(y_test, y_pred, target_names=list(label_mapping.keys())))

    print("\n--- Realistic Measures for Real-World Scenarios ---")
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"Precision (Weighted Average): {precision:.4f}")
    print(f"Recall (Weighted Average): {recall:.4f}")
    print(f"F1-Score (Weighted Average): {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

    # ROC AUC for multiclass (one-vs-rest)
    try:
        from sklearn.preprocessing import label_binarize
        y_test_bin = label_binarize(y_test, classes=range(len(label_mapping)))
        # Ensure no NaN in y_test_bin or y_pred_proba
        y_test_bin = np.nan_to_num(y_test_bin, nan=0)
        y_pred_proba = np.nan_to_num(y_pred_proba, nan=0.0)
        roc_auc_ovr = roc_auc_score(y_test_bin, y_pred_proba, multi_class='ovr', average='weighted')
        print(f"ROC AUC Score (One-vs-Rest): {roc_auc_ovr:.4f}")

        # Plot ROC Curve for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(len(label_mapping)):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        plt.figure(figsize=(10, 8))
        colors = ['blue', 'red', 'green', 'orange', 'purple']
        for i, color in zip(range(len(label_mapping)), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve class {list(label_mapping.keys())[i]} (area = {roc_auc[i]:.2f})')
        plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR) / Recall')
        plt.title('Receiver Operating Characteristic (ROC) Curve (Multiclass)')
        plt.legend(loc="lower right")
        plt.grid(True)
        plt.show()

    except ValueError as e:
        print(f"Could not calculate ROC AUC or plot ROC curve: {e}")
