# Mount Google Drive

In [None]:
# Colab-only step: attach Google Drive at /content/drive, the root used by the dataset save paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Imports

In [None]:
# ---------------
# --- Imports ---
# ---------------

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import ADASYN
from sklearn.utils import resample
from collections import Counter
from imblearn.utils import check_sampling_strategy


# File Paths

In [None]:
# Location of the original preprocessed CICIDS2017 CSV (by ERIC ANACLETO RIBEIRO).
# Set this to the full path of the file in your environment.
file_path = 'cicids2017_cleaned.csv'

# Output directories for the generated splits. A dedicated PCA-CNN save path is not needed:
# PCA is applied to the baseline's dataset, which is then saved as separate PCA-CNN datasets.
baseline_save_path = "/content/drive/MyDrive/CNXSIA001_LAIDS_SOURCE_CODE/Baseline 1D CNN Model Files/Datasets"
ae_mlp_save_path = "/content/drive/MyDrive/CNXSIA001_LAIDS_SOURCE_CODE/AE-MLP Model Files/Datasets and Numpy Arrays"

# Create both output directories up front; directories that already exist are left as they are.
for save_dir in (baseline_save_path, ae_mlp_save_path):
    os.makedirs(save_dir, exist_ok=True)

# Load in Dataset

Dataset developed by ERIC ANACLETO RIBEIRO can be downloaded from: https://www.kaggle.com/datasets/ericanacletoribeiro/cicids2017-cleaned-and-preprocessed

In [None]:
# --------------------------
# --- Load in CICIDS2017 ---
# --------------------------

# Read the preprocessed CICIDS2017 CSV into a pandas DataFrame (first row is the header).
cicids2017_df = pd.read_csv(file_path, sep=",", comment="#", header=0)
cicids2017_df.columns = cicids2017_df.columns.str.strip()  # Strip stray whitespace from column names

print("\nAttack Type Distribution:")
attack_type_counts = cicids2017_df['Attack Type'].value_counts()
print(attack_type_counts)

# Total benign vs. malicious samples ('Normal Traffic' is the benign class)
total_benign = attack_type_counts.get('Normal Traffic', 0)
total_malicious = attack_type_counts.sum() - total_benign

print(f"\nTotal Samples: {total_benign + total_malicious}")
print(f"Total Benign Samples: {total_benign}")
print(f"Total Malicious Samples: {total_malicious}")

# ----------------------
# --- Label Encoding ---
# ----------------------

# Mapping from attack type name to integer label (0 = benign)
attack_type_map = {'Normal Traffic': 0, 'Port Scanning': 1, 'Web Attacks': 2, 'Brute Force': 3, 'DDoS': 4, 'Bots': 5, 'DoS': 6}

# Series.map() turns every value that is missing from the dict into NaN without warning.
# A NaN label would later be counted as "malicious" (NaN != 0) and corrupt the splits,
# so fail fast and name the offending labels instead of encoding silently.
encoded_attack_type = cicids2017_df['Attack Type'].map(attack_type_map)
unmapped_mask = encoded_attack_type.isna()
if unmapped_mask.any():
    unmapped_labels = cicids2017_df.loc[unmapped_mask, 'Attack Type'].unique().tolist()
    raise ValueError(
        f"'Attack Type' contains labels that are missing from attack_type_map: {unmapped_labels}"
    )

cicids2017_df['Attack Type'] = encoded_attack_type  # Apply the label encoding

# Display the encodings
print("\nLabel Encoding Mapping:")
print(attack_type_map)

# -----------------------------------------
# --- Split the Labels from the Samples ---
# -----------------------------------------
X = cicids2017_df.drop('Attack Type', axis=1)  # Feature columns
y = cicids2017_df['Attack Type']               # Encoded labels



Attack Type Distribution:
Attack Type
Normal Traffic    2095057
DoS                193745
DDoS               128014
Port Scanning       90694
Brute Force          9150
Web Attacks          2143
Bots                 1948
Name: count, dtype: int64

Total Samples: 2520751
Total Benign Samples: 2095057
Total Malicious Samples: 425694

Label Encoding Mapping:
{'Normal Traffic': 0, 'Port Scanning': 1, 'Web Attacks': 2, 'Brute Force': 3, 'DDoS': 4, 'Bots': 5, 'DoS': 6}


In [None]:
# ---------------------------------------------
# --- Separate Benign and Malicious Traffic ---
# ---------------------------------------------

# Label 0 is 'Normal Traffic'; every other label is treated as malicious.
is_benign = cicids2017_df['Attack Type'] == 0

benign_df = cicids2017_df[is_benign]
malicious_df = cicids2017_df[~is_benign]

# Benign data: feature columns (samples) and the label column
X_benign = benign_df.drop(columns='Attack Type')
y_benign = benign_df['Attack Type']

# Malicious data: feature columns (samples) and the label column
X_malicious = malicious_df.drop(columns='Attack Type')
y_malicious = malicious_df['Attack Type']

In [None]:
# ---------------------------------
# --- Train / Val / Test splits ---
# ---------------------------------

def _join_benign_and_malicious(benign_part, malicious_part):
    """Stack a benign split on top of a malicious split under a fresh 0..n-1 index."""
    return pd.concat([benign_part, malicious_part], ignore_index=True)


# Benign data: 10% is held out for the test set.
X_benign_temp, X_test_benign, y_benign_temp, y_test_benign = train_test_split(
    X_benign, y_benign, test_size=0.10, random_state=42
)

# Rest of the benign data: 80:20 split between the AE and the classifier sets
# (possible because the benign dataset is so large).
X_benign_temp, X_classifier_train_benign, y_benign_temp, y_classifier_train_benign = train_test_split(
    X_benign_temp, y_benign_temp, test_size=0.20, random_state=42
)

# Classifier's benign allocation: 80:20 split between its training and validation sets.
X_classifier_train_benign, X_classifier_val_benign, y_classifier_train_benign, y_classifier_val_benign = train_test_split(
    X_classifier_train_benign, y_classifier_train_benign, test_size=0.20, random_state=42
)

# AE's benign allocation: 90:10 split between its training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_benign_temp, y_benign_temp, test_size=0.10, random_state=42
)

# Malicious data: 20% goes to the test set, stratified by attack type.
X_mal_temp, X_malicious_test, y_mal_temp, y_malicious_test = train_test_split(
    X_malicious, y_malicious, test_size=0.2, random_state=42, stratify=y_malicious
)

# Remaining malicious data: 80:20 split between the classifier's training and
# validation sets, again stratified by attack type.
X_mal_train, X_mal_val, y_mal_train, y_mal_val = train_test_split(
    X_mal_temp, y_mal_temp, test_size=0.20, random_state=42, stratify=y_mal_temp
)

# Combine the benign and malicious parts of the test and classifier sets.
X_test = _join_benign_and_malicious(X_test_benign, X_malicious_test)
y_test = _join_benign_and_malicious(y_test_benign, y_malicious_test)

X_classifier_train = _join_benign_and_malicious(X_classifier_train_benign, X_mal_train)
y_classifier_train = _join_benign_and_malicious(y_classifier_train_benign, y_mal_train)

X_classifier_val = _join_benign_and_malicious(X_classifier_val_benign, X_mal_val)
y_classifier_val = _join_benign_and_malicious(y_classifier_val_benign, y_mal_val)


In [None]:
# ------------------------
# --- Shuffle the data ---
# ------------------------
# Each features/labels pair is shuffled together so rows and labels stay aligned.

# Autoencoder sets
AE_X_train, AE_y_train = shuffle(
    X_train, y_train, random_state=42
)
AE_X_val, AE_y_val = shuffle(
    X_val, y_val, random_state=42
)

# Classifier sets
Classifier_X_train, Classifier_y_train = shuffle(
    X_classifier_train, y_classifier_train, random_state=42
)
Classifier_X_val, Classifier_y_val = shuffle(
    X_classifier_val, y_classifier_val, random_state=42
)

# Test set (shuffled in place of the concatenated benign + malicious order)
X_test, y_test = shuffle(
    X_test, y_test, random_state=42
)

In [None]:
# -----------------------
# ---- Reset Indexes ----
# -----------------------
# After shuffling, replace each split's index with a clean 0..n-1 range
# (the old index is dropped, not kept as a column).

# Autoencoder sets
AE_X_train, AE_y_train = (part.reset_index(drop=True) for part in (AE_X_train, AE_y_train))
AE_X_val, AE_y_val = (part.reset_index(drop=True) for part in (AE_X_val, AE_y_val))

# Classifier sets
Classifier_X_train, Classifier_y_train = (
    part.reset_index(drop=True) for part in (Classifier_X_train, Classifier_y_train)
)
Classifier_X_val, Classifier_y_val = (
    part.reset_index(drop=True) for part in (Classifier_X_val, Classifier_y_val)
)

# Test set
X_test, y_test = (part.reset_index(drop=True) for part in (X_test, y_test))

In [None]:
# --------------------------------
# ---- Show Data Distribution ----
# --------------------------------

# Inverse of attack_type_map: integer label -> attack type name
reverse_attack_type_map = dict((code, name) for name, code in attack_type_map.items())

# Translate the encoded labels of every split back to the original label names
AE_y_train_named = AE_y_train.map(reverse_attack_type_map)
AE_y_val_named = AE_y_val.map(reverse_attack_type_map)

Classifier_y_train_named = Classifier_y_train.map(reverse_attack_type_map)
Classifier_y_val_named = Classifier_y_val.map(reverse_attack_type_map)

y_test_named = y_test.map(reverse_attack_type_map)

# Print the per-class sample counts of each split, one report per split
for header, named_labels in (
    ("AE Training set class distribution:", AE_y_train_named),
    ("AE Validation set class distribution:", AE_y_val_named),
    ("Classifier Training set class distribution:", Classifier_y_train_named),
    ("Classifier Validation set class distribution:", Classifier_y_val_named),
    ("Test set class distribution:", y_test_named),
):
    print(header)
    print(named_labels.value_counts(), "\n")


AE Training set class distribution:
Attack Type
Normal Traffic    1357596
Name: count, dtype: int64 

AE Validation set class distribution:
Attack Type
Normal Traffic    150844
Name: count, dtype: int64 

Classifier Training set class distribution:
Attack Type
Normal Traffic    301688
DoS               123997
DDoS               81929
Port Scanning      58044
Brute Force         5856
Web Attacks         1371
Bots                1247
Name: count, dtype: int64 

Classifier Validation set class distribution:
Attack Type
Normal Traffic    75423
DoS               30999
DDoS              20482
Port Scanning     14511
Brute Force        1464
Web Attacks         343
Bots                312
Name: count, dtype: int64 

Test set class distribution:
Attack Type
Normal Traffic    209506
DoS                38749
DDoS               25603
Port Scanning      18139
Brute Force         1830
Web Attacks          429
Bots                 389
Name: count, dtype: int64 



# Apply ADASYN to the Classifier Training Set

In [None]:
# Aliases for the classifier training split (same objects, no copy is made)
X_down, y_down = Classifier_X_train, Classifier_y_train

# Per-class sample counts and the benign (label 0) count.
# NOTE(review): neither value is read again in the cells visible here.
class_counts = y_down.value_counts().to_dict()
majority_class_count = class_counts.get(0, max(class_counts.values()))  # 0 = benign

# ---------------------------------
# --- Apply ADASYN oversampling ---
# ---------------------------------
adasyn = ADASYN(
    sampling_strategy='auto',  # imblearn's default strategy for choosing which classes to oversample
    n_neighbors=5,             # Number of neighbours used by ADASYN
    random_state=42,
)

# Generate synthetic samples for the classifier training split
X_res, y_res = adasyn.fit_resample(X_down, y_down)

print("\nClass distribution after ADASYN (numeric):")
print(Counter(y_res))

# Wrap the resampled features/labels in DataFrames with explicit column names
X_resampled = pd.DataFrame(X_res, columns=X_down.columns)
y_resampled = pd.DataFrame(y_res, columns=['Attack Type'])

print(f"\nFinal dataset shape: {X_resampled.shape}")
print("\nFinal class distribution:")
print(Counter(y_resampled['Attack Type']))

# Names used by the normalisation / saving cells further down
Adasyn_Classifier_X_train, Adasyn_Classifier_y_train = X_resampled, y_resampled



Class distribution after ADASYN (numeric):
Counter({6: 301889, 4: 301715, 5: 301702, 2: 301702, 3: 301697, 0: 301688, 1: 301212})

Final dataset shape: (2111605, 52)

Final class distribution:
Counter({6: 301889, 4: 301715, 5: 301702, 2: 301702, 3: 301697, 0: 301688, 1: 301212})


# Normalise and Save Baseline and PCA-CNN Datasets

(normalise using StandardScaler)

In [None]:
# --- Shuffle the ADASYN training set, then give it a fresh 0..n-1 index --- (just in case)
# Features and labels are shuffled together so every row keeps its own label.
Adasyn_Classifier_X_train, Adasyn_Classifier_y_train = (
    part.reset_index(drop=True)
    for part in shuffle(Adasyn_Classifier_X_train, Adasyn_Classifier_y_train, random_state=42)
)

print("Adasyn datasets shuffled and re-indexed!")

Adasyn datasets shuffled and re-indexed!


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --------------------------------------
# --- StandardScaler Standardisation ---
# --------------------------------------

scaler_standard = StandardScaler()

# Fit the scaler on the ADASYN-resampled training features only, then reuse
# those fitted statistics to transform the validation and test features.
adasyn_baseline_X_train = scaler_standard.fit_transform(Adasyn_Classifier_X_train)
baseline_X_val, baseline_X_test = (
    scaler_standard.transform(features) for features in (Classifier_X_val, X_test)
)

# Labels are passed through unchanged under the baseline naming scheme
adasyn_baseline_y_train, baseline_y_val, baseline_y_test = (
    Adasyn_Classifier_y_train,
    Classifier_y_val,
    y_test,
)

# ----------------------------------
# --- Save datasets ---
# ----------------------------------

# Written in order: ADASYN-resampled training set, validation set, test set.
# Every CSV is written without the index column.
for csv_name, split_data in (
    ("adasyn_baseline_X_train.csv", adasyn_baseline_X_train),
    ("adasyn_baseline_y_train.csv", adasyn_baseline_y_train),
    ("baseline_X_val.csv", baseline_X_val),
    ("baseline_y_val.csv", baseline_y_val),
    ("baseline_X_test.csv", baseline_X_test),
    ("baseline_y_test.csv", baseline_y_test),
):
    pd.DataFrame(split_data).to_csv(os.path.join(baseline_save_path, csv_name), index=False)

print("Baseline datasets saved successfully!")

Baseline datasets saved successfully!


# Normalise and Save AE-MLP Datasets

(normalise using MinMaxScaler)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd

# ----------------------------------
# --- MinMaxScaler Normalisation ---
# ----------------------------------
scaler_minmax = MinMaxScaler()

# The scaler is fitted on the AE training features only (all benign, per the split above);
# every other split is transformed with those same fitted min/max values.

# AE sets
ae_X_train = scaler_minmax.fit_transform(AE_X_train)
ae_X_val = scaler_minmax.transform(AE_X_val)

# Classifier sets
adasyn_mlp_X_train = scaler_minmax.transform(Adasyn_Classifier_X_train)
mlp_X_val = scaler_minmax.transform(Classifier_X_val)

# Test set
ae_mlp_X_test = scaler_minmax.transform(X_test)

# Labels are passed through unchanged under the AE-MLP naming scheme
ae_y_train, ae_y_val = AE_y_train, AE_y_val
adasyn_mlp_y_train, mlp_y_val = Adasyn_Classifier_y_train, Classifier_y_val
ae_mlp_y_test = y_test

# ---------------------
# --- Save datasets ---
# ---------------------

# Written in order: AE sets, ADASYN-resampled classifier training set,
# classifier validation set, test set. Every CSV is written without the index column.
for csv_name, split_data in (
    ("ae_x_train.csv", ae_X_train),
    ("ae_y_train.csv", ae_y_train),
    ("ae_x_val.csv", ae_X_val),
    ("ae_y_val.csv", ae_y_val),
    ("adasyn_mlp_x_train.csv", adasyn_mlp_X_train),
    ("adasyn_mlp_y_train.csv", adasyn_mlp_y_train),
    ("mlp_x_val.csv", mlp_X_val),
    ("mlp_y_val.csv", mlp_y_val),
    ("ae_mlp_x_test.csv", ae_mlp_X_test),
    ("ae_mlp_y_test.csv", ae_mlp_y_test),
):
    pd.DataFrame(split_data).to_csv(os.path.join(ae_mlp_save_path, csv_name), index=False)

print("All AE-MLP and ADASYN MLP datasets saved successfully!")

All AE-MLP and ADASYN MLP datasets saved successfully!
