<a href="https://colab.research.google.com/github/SianC7/LAIDS/blob/main/Models_K_fold_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load in CICIDS2017 dataset

In [1]:
import pandas as pd

# --- Data Collection ---
# Loads the CICIDS2017 CSV into `cicids2017_df`, prints its shape and class
# balance, then replaces the textual 'Attack Type' column with integer labels.

# Set pandas display options for wide output (no column/width truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


# Get Data file path (uncomment the variant that matches the runtime)
# file_path = '/content/drive/MyDrive/Colab Notebooks/Honours Project/Datasets/cicids2017_cleaned.csv'
file_path = '/content/drive/MyDrive/Colab Notebooks/Honours Project/Datasets/CICIDS2017 ADASYN Dataset/ADASYN_CICIDS2017_Dataset6.csv'

#file_path = '/Users/siancaine/Downloads/cicids2017_cleaned.csv'
# file_path = '/Users/siancaine/Downloads/ADASYN_CICIDS2017_Dataset6.csv'

cicids2017_df = pd.read_csv(file_path, sep=",", comment="#", header=0)
cicids2017_df.columns = cicids2017_df.columns.str.strip()  # Strip whitespace from column names

print("\nInitial samples:")
print(f"cicids2017_df shape: {cicids2017_df.shape}")
# print(cicids2017_df.head().to_string())
# print(cicids2017_df.info())

# Print unique values and their counts for 'Attack Type'
print("\nAttack Type Distribution:")
print(cicids2017_df['Attack Type'].value_counts())

# --- Label Encoding ---

# Get unique attack types actually present in the file
attack_types = cicids2017_df['Attack Type'].unique()

# Fixed mapping from attack type to integer label (0 is the benign class)
attack_type_map = {'Normal Traffic': 0, 'Port Scanning': 1, 'Web Attacks': 2, 'Brute Force': 3, 'DDoS': 4, 'Bots': 5, 'DoS': 6} # Use the specified mapping

# Series.map() turns every value that is missing from the mapping into NaN
# without complaint; fail here, with a clear message, instead of deep inside
# the cross-validation cells.
unmapped_attack_types = [label for label in attack_types if label not in attack_type_map]
if unmapped_attack_types:
    raise ValueError(f"Attack types missing from attack_type_map: {unmapped_attack_types}")

# Apply label encoding
cicids2017_df['Attack Type'] = cicids2017_df['Attack Type'].map(attack_type_map)

print("\nLabel Encoding Mapping:")
print(attack_type_map)


Initial samples:
cicids2017_df shape: (4397849, 53)

Attack Type Distribution:
Attack Type
Normal Traffic    2000000
DoS                400853
Brute Force        400059
Web Attacks        400007
Bots               399828
DDoS               399757
Port Scanning      397345
Name: count, dtype: int64

Label Encoding Mapping:
{'Normal Traffic': 0, 'Port Scanning': 1, 'Web Attacks': 2, 'Brute Force': 3, 'DDoS': 4, 'Bots': 5, 'DoS': 6}


# Instantiate StandardScaler

In [2]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import numpy as np

# Shared StandardScaler instance. Later cells call scaler.fit_transform(...) on
# each fold's training rows, so it is refit per fold rather than fitted here.
scaler = StandardScaler()

# BASELINE K-FOLD CV

In [3]:
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# Baseline 1D-CNN evaluated with stratified 5-fold cross-validation.
# Reads `cicids2017_df`, `attack_type_map` and the shared `scaler` from earlier
# cells; leaves the per-fold accuracies in `accuracy_scores`.

X = cicids2017_df.drop('Attack Type', axis=1)
y = cicids2017_df['Attack Type']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # 5-fold CV that keeps the class ratios in every fold
accuracy_scores = [] # One test accuracy per fold

splits = list(skf.split(X, y)) # Each element is a (train_index, test_index) tuple of positional row indices

for fold, (train_index, test_index) in enumerate(splits): # Start folding

    # A fresh model is built every fold; clear Keras' global state first so
    # memory does not keep growing across folds.
    tf.keras.backend.clear_session()

    # Sanity check
    print(f"Fold {fold+1}:")
    print(f"Train_index: {train_index}")
    print(f"Test_index: {test_index}")

    # Split data into the training and testing folds (iloc because the indices are positional)
    x_train_fold = X.iloc[train_index]  # Feature rows for training
    y_train_fold = y.iloc[train_index]  # Label rows for training
    x_test_fold = X.iloc[test_index]    # Feature rows for testing
    y_test_fold = y.iloc[test_index]    # Label rows for testing

    # Standardise: statistics come from the TRAINING fold only and are then
    # applied unchanged to the test fold.
    x_train_fold = scaler.fit_transform(x_train_fold)
    x_test_fold = scaler.transform(x_test_fold)

    # Shuffle the data (features and labels are permuted together)
    x_train_fold, y_train_fold = shuffle(x_train_fold, y_train_fold, random_state=42)
    x_test_fold, y_test_fold = shuffle(x_test_fold, y_test_fold, random_state=42)

    # Conv1D expects 3D input (samples, timesteps, channels): add a single channel axis
    x_train_fold = x_train_fold.reshape((x_train_fold.shape[0], x_train_fold.shape[1], 1))
    x_test_fold = x_test_fold.reshape((x_test_fold.shape[0], x_test_fold.shape[1], 1))

    input_shape = (x_train_fold.shape[1], 1) # Shape of one sample: (timesteps, channels)
    num_classes = len(attack_type_map)

    # Sanity check
    print(f"Training fold input shape: {x_train_fold.shape}")
    print(f"Testing fold input shape: {x_test_fold.shape}")
    print(f"Single sample's input shape: {input_shape}")
    print(f"Number of classes: {num_classes}")

    # Define the Baseline CNN model
    baseline_model = Sequential([
        Input(shape=input_shape),

        Conv1D(filters=32, kernel_size=2, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),

        Conv1D(filters=16, kernel_size=3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),

        Flatten(),
        Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0)),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    baseline_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=['accuracy']
    )

    # Early stopping on the validation loss
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=3,
                                   restore_best_weights=True)

    # Train the baseline model
    history = baseline_model.fit(
        x_train_fold, y_train_fold,
        epochs=5,
        batch_size=64,
        validation_split=0.1,   # Reserve 10% of training for validation
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate the baseline model on the held-out fold
    y_pred_probs = baseline_model.predict(x_test_fold, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)  # most probable class per sample

    acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    accuracy_scores.append(acc)

    # Show the confusion matrix and the classification report under their own labels
    cm = confusion_matrix(y_test_fold, y_pred)
    cm_report = classification_report(y_test_fold, y_pred)
    print(f"Fold {fold+1} Confusion Matrix:\n{cm}")
    print(f"Fold {fold+1} Classification report:\n{cm_report}")


# Final results
print('\nList of accuracy scores:', accuracy_scores)
print('Maximum Accuracy: {:.2f}%'.format(max(accuracy_scores) * 100))
print('Minimum Accuracy: {:.2f}%'.format(min(accuracy_scores) * 100))
print('Mean Accuracy: {:.2f}%'.format(mean(accuracy_scores) * 100))
print('Standard Deviation: {:.4f}'.format(stdev(accuracy_scores)))

Fold 1:
Train_index: [      0       1       3 ... 4397845 4397847 4397848]
Test_index: [      2      11      14 ... 4397841 4397842 4397846]
Training fold input shape: (3518279, 52, 1)
Testing fold input shape: (879570, 52, 1)
Single sample's input shape: (52, 1)
Number of classes: 7
Epoch 1/5
[1m49476/49476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 3ms/step - accuracy: 0.8980 - loss: 0.2555 - val_accuracy: 0.9720 - val_loss: 0.1020
Epoch 2/5
[1m49476/49476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 3ms/step - accuracy: 0.9534 - loss: 0.1399 - val_accuracy: 0.7881 - val_loss: 0.6012
Epoch 3/5
[1m49476/49476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 3ms/step - accuracy: 0.9628 - loss: 0.1189 - val_accuracy: 0.9785 - val_loss: 0.0701
Epoch 4/5
[1m49476/49476[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 3ms/step - accuracy: 0.9677 - loss: 0.1074 - val_accuracy: 0.9507 - val_loss: 0.1252
Epoch 5/5
[1m49476/49476[0m [32m━━━━━━━━━━━━

# PCA-CNN K-FOLD CV

In [4]:
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# PCA + 1D-CNN evaluated with stratified 5-fold cross-validation.
# Reads `cicids2017_df`, `attack_type_map` and the shared `scaler` from earlier
# cells; leaves the per-fold accuracies in `accuracy_scores`.

X = cicids2017_df.drop('Attack Type', axis=1)
y = cicids2017_df['Attack Type']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # 5-fold CV that keeps the class ratios in every fold
accuracy_scores = [] # One test accuracy per fold

splits = list(skf.split(X, y)) # Each element is a (train_index, test_index) tuple of positional row indices

for fold, (train_index, test_index) in enumerate(splits): # Start folding

    # A fresh model is built every fold; clear Keras' global state first so
    # memory does not keep growing across folds.
    tf.keras.backend.clear_session()

    # Sanity check
    print(f"Fold {fold+1}:")
    print(f"Train_index: {train_index}")
    print(f"Test_index: {test_index}")

    # Split data into the training and testing folds (iloc because the indices are positional)
    x_train_fold = X.iloc[train_index]  # Feature rows for training
    y_train_fold = y.iloc[train_index]  # Label rows for training
    x_test_fold = X.iloc[test_index]    # Feature rows for testing
    y_test_fold = y.iloc[test_index]    # Label rows for testing

    # Standardise: statistics come from the TRAINING fold only and are then
    # applied unchanged to the test fold.
    x_train_fold = scaler.fit_transform(x_train_fold)
    x_test_fold = scaler.transform(x_test_fold)

    # Apply PCA: components are learned on the training fold and reused on the test fold
    pca = PCA(n_components = 25)
    x_train_fold = pca.fit_transform(x_train_fold)
    x_test_fold = pca.transform(x_test_fold)

    # Shuffle the data (features and labels are permuted together)
    x_train_fold, y_train_fold = shuffle(x_train_fold, y_train_fold, random_state=42)
    x_test_fold, y_test_fold = shuffle(x_test_fold, y_test_fold, random_state=42)

    # Conv1D expects 3D input (samples, timesteps, channels): add a single channel axis
    x_train_fold = x_train_fold.reshape((x_train_fold.shape[0], x_train_fold.shape[1], 1))
    x_test_fold = x_test_fold.reshape((x_test_fold.shape[0], x_test_fold.shape[1], 1))

    input_shape = (x_train_fold.shape[1], 1) # Shape of one sample: (timesteps, channels)
    num_classes = len(attack_type_map)

    # Sanity check
    print(f"Training fold input shape: {x_train_fold.shape}")
    print(f"Testing fold input shape: {x_test_fold.shape}")
    # input_shape is a plain tuple, so print it directly (it has no .shape attribute)
    print(f"Single sample's input shape: {input_shape}")
    print(f"Number of classes: {num_classes}")

    # Define the CNN model that consumes the PCA components
    cnn_model = Sequential([
        Input(shape=input_shape),

        Conv1D(filters=32, kernel_size=2, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),

        Conv1D(filters=16, kernel_size=3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),

        Flatten(),
        Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0)),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    cnn_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=['accuracy']
    )

    # Early stopping on the validation loss
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=3,
                                   restore_best_weights=True)

    # Train the PCA-CNN model
    history = cnn_model.fit(
        x_train_fold, y_train_fold,
        epochs=5,
        batch_size=64,
        validation_split=0.1,   # Reserve 10% of training for validation
        callbacks=[early_stopping],
        verbose=0
    )

    # Evaluate the PCA-CNN model on the held-out fold
    y_pred_probs = cnn_model.predict(x_test_fold)
    y_pred = np.argmax(y_pred_probs, axis=1)  # most probable class per sample

    acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    accuracy_scores.append(acc)

    # Show the confusion matrix and the classification report under their own labels
    cm = confusion_matrix(y_test_fold, y_pred)
    cm_report = classification_report(y_test_fold, y_pred)
    print(f"Fold {fold+1} Confusion Matrix:\n{cm}")
    print(f"Fold {fold+1} Classification report:\n{cm_report}")


# Final results
print('\nList of accuracy scores:', accuracy_scores)
print('Maximum Accuracy: {:.2f}%'.format(max(accuracy_scores) * 100))
print('Minimum Accuracy: {:.2f}%'.format(min(accuracy_scores) * 100))
print('Mean Accuracy: {:.2f}%'.format(mean(accuracy_scores) * 100))
print('Standard Deviation: {:.4f}'.format(stdev(accuracy_scores)))

Fold 1:
Train_index: [      0       1       3 ... 4397845 4397847 4397848]
Test_index: [      2      11      14 ... 4397841 4397842 4397846]


KeyboardInterrupt: 

# AE K-FOLD CV

In [None]:
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Autoencoder anomaly detector evaluated with 5-fold cross-validation.
# The autoencoder is trained on benign traffic only; each fold is tested on its
# held-out benign rows plus ALL malicious rows, scored as benign (0) vs attack (1).
# Reads `cicids2017_df` from the loading cell; leaves per-fold accuracies in
# `accuracy_scores`.

# --- Separate Benign and Malicious Traffic ---

# Separate benign and malicious samples (label 0 is 'Normal Traffic')
benign_df = cicids2017_df[cicids2017_df['Attack Type'] == 0]
malicious_df = cicids2017_df[cicids2017_df['Attack Type'] != 0]

# Separate features (X) and labels (y) for benign data
X_benign = benign_df.drop('Attack Type', axis=1)
y_benign = benign_df['Attack Type']

# Separate features (X) and labels (y) for malicious data
X_malicious = malicious_df.drop('Attack Type', axis=1)
y_malicious = malicious_df['Attack Type']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # 5-fold split over the benign rows only
accuracy_scores = [] # One test accuracy per fold

splits = list(skf.split(X_benign, y_benign)) # Each element is a (train_index, test_index) tuple of positions within X_benign

for fold, (train_index, test_index) in enumerate(splits): # Start folding

    # A fresh model is built every fold; clear Keras' global state first so
    # memory does not keep growing across folds.
    tf.keras.backend.clear_session()

    # Sanity check
    print(f"Fold {fold+1}:")

    # The indices were generated from X_benign, so they must index X_benign /
    # y_benign (not the full-dataset X / y left over from earlier cells).
    x_train_fold = X_benign.iloc[train_index]  # Benign feature rows for training
    y_train_fold = y_benign.iloc[train_index]  # Benign label rows for training
    x_test_fold = X_benign.iloc[test_index]    # Held-out benign feature rows
    y_test_fold = y_benign.iloc[test_index]    # Held-out benign label rows

    # Test set = held-out benign rows + every malicious row
    x_test_fold = pd.concat([x_test_fold, X_malicious], ignore_index=True)
    y_test_fold = pd.concat([y_test_fold, y_malicious], ignore_index=True)

    # Shuffle the data (features and labels are permuted together)
    x_train_fold, y_train_fold = shuffle(x_train_fold, y_train_fold, random_state=42)
    x_test_fold, y_test_fold = shuffle(x_test_fold, y_test_fold, random_state=42)

    # Reset indexes
    x_test_fold = x_test_fold.reset_index(drop=True)
    y_test_fold = y_test_fold.reset_index(drop=True)

    # Normalise data to [0, 1] using benign training statistics only. The
    # output layer below is a sigmoid, which cannot reproduce the negative /
    # unbounded values a StandardScaler would give, so a per-fold MinMaxScaler
    # is used here instead of the shared `scaler`.
    ae_scaler = MinMaxScaler()
    x_train_fold = ae_scaler.fit_transform(x_train_fold)
    x_test_fold = ae_scaler.transform(x_test_fold)

    # Define the autoencoder model
    autoencoder_input_dim = x_train_fold.shape[1]

    ae_model = Sequential([
        Input(shape=(autoencoder_input_dim,)),
        Dense(64, activation='relu'),   # Encoder layer
        Dense(32, activation='relu'),   # Encoder layer
        Dense(16, activation='relu'),   # Encoder layer
        Dense(8, activation='relu'),    # Encoder layer
        Dense(4, activation='relu'),    # Bottleneck layer
        Dense(8, activation='relu'),    # Decoder layer
        Dense(16, activation='relu'),   # Decoder layer
        Dense(32, activation='relu'),   # Decoder layer
        Dense(64, activation='relu'),   # Decoder layer
        Dense(autoencoder_input_dim, activation='sigmoid')  # Output layer, values in (0, 1)
    ])

    ae_model.compile(optimizer='adam', loss='mse')

    # Early stopping on the validation loss
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=3,
                                   restore_best_weights=True)

    # Train the autoencoder: the target is the input itself (reconstruction),
    # not the class labels.
    history = ae_model.fit(
        x_train_fold, x_train_fold,
        epochs=5,
        batch_size=64,
        validation_split=0.1,   # Reserve 10% of training for validation
        callbacks=[early_stopping],
        verbose=0
    )

    # Get threshold from how well the benign training data is reconstructed
    train_reconstructions = ae_model.predict(x_train_fold, verbose=0)
    train_reconstruction_errors = np.abs(train_reconstructions - x_train_fold) # absolute error per feature, per sample

    # Mean per feature thresholds ---
    mean_feature_errors = np.mean(train_reconstruction_errors, axis=0)  # mean reconstruction error per feature (averaged over samples)
    std_feature_errors = np.std(train_reconstruction_errors, axis=0)    # std of reconstruction error per feature

    pre_feature_thresholds = mean_feature_errors + std_feature_errors

    # Evaluate AE: a sample is flagged as an attack (1) when ANY feature's
    # reconstruction error exceeds that feature's threshold.
    test_reconstructions = ae_model.predict(x_test_fold, verbose=1)
    test_reconstruction_errors = np.abs(test_reconstructions - x_test_fold)
    AE_y_pred = (test_reconstruction_errors > pre_feature_thresholds).any(axis=1).astype(int)

    # Ground truth comes from the labels: 0 = benign, 1 = any attack class
    y_test_binary = (y_test_fold != 0).astype(int)

    acc = accuracy_score(y_test_binary, AE_y_pred)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    accuracy_scores.append(acc)

    # Show the confusion matrix and the classification report under their own labels
    cm = confusion_matrix(y_test_binary, AE_y_pred)
    cm_report = classification_report(y_test_binary, AE_y_pred)
    print(f"Fold {fold+1} Confusion Matrix:\n{cm}")
    print(f"Fold {fold+1} Classification report:\n{cm_report}")


# Final results
print('\nList of accuracy scores:', accuracy_scores)
print('Maximum Accuracy: {:.2f}%'.format(max(accuracy_scores) * 100))
print('Minimum Accuracy: {:.2f}%'.format(min(accuracy_scores) * 100))
print('Mean Accuracy: {:.2f}%'.format(mean(accuracy_scores) * 100))
print('Standard Deviation: {:.4f}'.format(stdev(accuracy_scores)))
