In [1]:
import pandas as pd
import tensorflow as tf
tf.config.run_functions_eagerly(True)
from tensorflow import keras
from imblearn.over_sampling import SMOTE
!pip install keras-self-attention
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
import os
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras_self_attention import SeqSelfAttention
import logging
from keras.callbacks import Callback
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
import seaborn as sns
import matplotlib.pyplot as plt
import collections
from keras.optimizers import SGD

# Source parquet files on the mounted Drive, one per capture file.
parquet_files = [
    '/content/drive/MyDrive/Parquetfile/Botnet-Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/Bruteforce-Wednesday-14-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/DDoS1-Tuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/DDoS2-Wednesday-21-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/DoS1-Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/DoS2-Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/Infil1-Wednesday-28-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/Infil2-Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/Web1-Thursday-22-02-2018_TrafficForML_CICFlowMeter.parquet',
    '/content/drive/MyDrive/Parquetfile/Web2-Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet',
]

# Destination directory for the converted CSV files. The trailing slash is kept
# because the CSV path list further down is built with plain string
# concatenation on this value.
csv_directory = '/content/drive/MyDrive/CSV_Data/'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# destination exists before the first write.
os.makedirs(csv_directory, exist_ok=True)

# Convert every parquet file to a CSV that keeps the same base name.
for parquet_file in parquet_files:
    # splitext swaps only the final extension; str.replace would also rewrite
    # any other '.parquet' occurrence inside the file name.
    file_name = os.path.splitext(os.path.basename(parquet_file))[0] + '.csv'
    df = pd.read_parquet(parquet_file)
    df.to_csv(os.path.join(csv_directory, file_name), index=False)



In [8]:
#
# Raw label strings grouped by the integer class id they collapse into.
# The groups are listed in ascending id order so the resulting dictionary keeps
# the same key order as a hand-written literal would.
_LABEL_GROUPS = (
    (0, ('Benign',)),
    (1, ('SSH-Bruteforce', 'FTP-BruteForce')),
    (2, ('Brute Force -Web', 'Brute Force -XSS', 'SQL Injection')),
    (3, ('DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DDoS attacks-LOIC-HTTP')),
    (4, ('DoS attacks-Slowloris', 'DoS attacks-Hulk', 'DoS attacks-GoldenEye', 'DoS attacks-SlowHTTPTest')),
    (5, ('Bot',)),
    (6, ('Infilteration',)),
)

# Flat lookup: raw label string -> class id in the range 0..6.
LABELS_7 = {
    raw_label: class_id
    for class_id, raw_labels in _LABEL_GROUPS
    for raw_label in raw_labels
}

def preprocess_labels(df):
    """Replace the textual ``Label`` values of ``df`` with integer class ids.

    The mapping is taken from the module-level ``LABELS_7`` dictionary. The
    frame is modified in place and nothing is returned.
    """
    raw_labels = list(LABELS_7.keys())
    class_ids = list(LABELS_7.values())
    df['Label'] = df['Label'].replace(raw_labels, class_ids)

def load_and_preprocess(file_path):
    """Read one CSV file and return it with ``Label`` converted to class ids."""
    frame = pd.read_csv(file_path)
    # preprocess_labels rewrites the 'Label' column of the frame in place.
    preprocess_labels(frame)
    return frame

# Full paths of the CSV files, each one the bare file name appended to
# csv_directory.
csv_files = [
    csv_directory + csv_name
    for csv_name in (
        'Botnet-Friday-02-03-2018_TrafficForML_CICFlowMeter.csv',
        'Bruteforce-Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv',
        'DDoS1-Tuesday-20-02-2018_TrafficForML_CICFlowMeter.csv',
        'DDoS2-Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv',
        'DoS1-Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv',
        'DoS2-Friday-16-02-2018_TrafficForML_CICFlowMeter.csv',
        'Infil1-Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv',
        'Infil2-Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv',
        'Web1-Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv',
        'Web2-Friday-23-02-2018_TrafficForML_CICFlowMeter.csv',
    )
]

# Define the load_dataset function
def load_dataset(filenames, class_num=7):
    """Load the given CSV files into one feature matrix and one-hot labels.

    Args:
        filenames: Iterable of CSV paths. Each file is read with
            ``load_and_preprocess`` and the frames are stacked row-wise.
        class_num: Number of classes used for the one-hot encoding. Defaults
            to 7, matching the class ids 0..6 in ``LABELS_7``.

    Returns:
        A ``(train_x, train_y)`` tuple: the values of every column except
        ``Label``, and the one-hot encoded ``Label`` column.

    Raises:
        ValueError: If ``filenames`` is empty.
    """
    # Read the paths that were passed in; the previous version ignored this
    # argument and always loaded the module-level csv_files list.
    filenames = list(filenames)
    if not filenames:
        raise ValueError("load_dataset needs at least one CSV file path")

    dataframes = [load_and_preprocess(csv_file) for csv_file in filenames]
    combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

    # pop() removes the label column, so combined_df holds features only.
    train_labels = combined_df.pop("Label")
    train_x = combined_df.values
    train_y = tf.keras.utils.to_categorical(train_labels, class_num)
    return train_x, train_y




In [None]:
# Count how many rows of the one-hot encoded train_y belong to each class.
# NOTE(review): in this notebook train_y is first assigned by the load_dataset
# call in a later cell, so on a fresh kernel this cell has to run after it.
labels = train_y.argmax(axis=1)  # one-hot row -> integer class id

# Tally the occurrences of every class id.
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

In [9]:
# Load every CSV into one feature matrix and one one-hot label matrix.
train_x, train_y = load_dataset(csv_files)


# Classes to oversample. Classes 0 and 3 are left out: the recorded label
# distribution shows both already hold more than 300,000 rows, and SMOTE only
# adds samples.
classes_to_oversample = [1, 2, 4, 5, 6]

# Bring each selected class up to 300,000 rows. The fixed seed makes the
# synthetic samples identical on every run (42 matches the seed used for the
# train/test split).
oversample = SMOTE(
    sampling_strategy={cls: 300000 for cls in classes_to_oversample},
    random_state=42,
)

# NOTE(review): oversampling happens before the train/validation/test split, so
# synthetic rows can land in the validation and test sets. Consider splitting
# first and resampling only the training portion.
train_x_resampled, train_y_resampled = oversample.fit_resample(train_x, train_y)


In [10]:

# Class counts after oversampling: collapse each one-hot row of
# train_y_resampled to its integer class id.
labels = train_y_resampled.argmax(axis=1)

# Tally the occurrences of every class id.
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Label Distribution:
Label 0: Count 5329008
Label 5: Count 300000
Label 1: Count 300000
Label 3: Count 775955
Label 4: Count 300000
Label 6: Count 300000
Label 2: Count 300000


In [11]:
# Create a logger
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

from sklearn.model_selection import train_test_split

# Integer class id per row of the resampled one-hot labels. Used both for
# stratification and for the distribution report at the end of the cell.
# Stratifying on 1-D ids gives the same class grouping as the 2-D one-hot
# matrix, but avoids scikit-learn converting every row to a string first.
# The exact rows drawn for a given seed may differ from a one-hot stratified
# split; the class proportions do not.
labels = np.argmax(train_y_resampled, axis=1)

# Hold out 20% of the data as the test set; the remaining 80% is temporary.
temp_x, test_x, temp_y, test_y = train_test_split(
    train_x_resampled,
    train_y_resampled,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

# Split the temporary 80% into training (80% of it, 64% overall) and
# validation (20% of it, 16% overall).
train_x, val_x, train_y, val_y = train_test_split(
    temp_x,
    temp_y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=np.argmax(temp_y, axis=1),
)

# train_x/train_y, val_x/val_y and test_x/test_y now hold the stratified splits.


# Debugging: Print shapes
print("Train Data Shapes:")
print("train_x shape:", train_x.shape)
print("train_y shape:", train_y.shape)

print("Validation Data Shapes:")
print("val_x shape:", val_x.shape)
print("val_y shape:", val_y.shape)

print("Test Data Shapes:")
print("test_x shape:", test_x.shape)
print("test_y shape:", test_y.shape)

# Class counts of the full resampled label set (before splitting).
label_distribution = collections.Counter(labels)

# Print the label distribution
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Train Data Shapes:
train_x shape: (4867176, 77)
train_y shape: (4867176, 7)
Validation Data Shapes:
val_x shape: (1216794, 77)
val_y shape: (1216794, 7)
Test Data Shapes:
test_x shape: (1520993, 77)
test_y shape: (1520993, 7)
Label Distribution:
Label 0: Count 5329008
Label 5: Count 300000
Label 1: Count 300000
Label 3: Count 775955
Label 4: Count 300000
Label 6: Count 300000
Label 2: Count 300000


In [12]:
# Give every split a trailing axis of size 1: (rows, 77) -> (rows, 77, 1).
train_x, val_x, test_x = (
    split.reshape(-1, 77, 1) for split in (train_x, val_x, test_x)
)

# Debugging: report the shape of each feature/label pair.
for heading, x_caption, x_split, y_caption, y_split in (
    ("Train Data Shapes:", "train_x shape:", train_x, "train_y shape:", train_y),
    ("Validation Data Shapes:", "val_x shape:", val_x, "val_y shape:", val_y),
    ("Test Data Shapes:", "test_x shape:", test_x, "test_y shape:", test_y),
):
    print(heading)
    print(x_caption, x_split.shape)
    print(y_caption, y_split.shape)

# Class counts of the full resampled label matrix (train_y_resampled), not of
# an individual split.
labels = train_y_resampled.argmax(axis=1)
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Train Data Shapes:
train_x shape: (4867176, 77, 1)
train_y shape: (4867176, 7)
Validation Data Shapes:
val_x shape: (1216794, 77, 1)
val_y shape: (1216794, 7)
Test Data Shapes:
test_x shape: (1520993, 77, 1)
test_y shape: (1520993, 7)
Label Distribution:
Label 0: Count 5329008
Label 5: Count 300000
Label 1: Count 300000
Label 3: Count 775955
Label 4: Count 300000
Label 6: Count 300000
Label 2: Count 300000


In [58]:

# Report shape and dtype of the held-out arrays.
for caption, value in (
    ("test_x shape:", test_x.shape),
    ("test_y shape:", test_y.shape),
    ("test_x data type:", test_x.dtype),
    ("test_y data type:", test_y.dtype),
):
    print(caption, value)

# An array is clean when every element is finite (neither NaN nor +/-inf).
test_x_is_clean = np.isfinite(test_x).all()
print(
    "test_x does not contain NaN or infinite values."
    if test_x_is_clean
    else "test_x contains NaN or infinite values."
)

test_y_is_clean = np.isfinite(test_y).all()
print(
    "test_y does not contain NaN or infinite values."
    if test_y_is_clean
    else "test_y contains NaN or infinite values."
)


test_x shape: (1520993, 77, 1)
test_y shape: (1520993, 7)
test_x data type: float64
test_y data type: int64
test_x does not contain NaN or infinite values.
test_y does not contain NaN or infinite values.


In [13]:
# Data reduction: keep only the leading 70% of the training rows.
# NOTE(review): train_x/train_y are overwritten with the slice, so running this
# cell again shrinks the training set a second time.
sample_fraction = 0.7
num_samples_train = int(len(train_x) * sample_fraction)
train_x = train_x[:num_samples_train]
train_y = train_y[:num_samples_train]

# Debugging: report the shape of each feature/label pair.
for heading, x_caption, x_split, y_caption, y_split in (
    ("Train Data Shapes:", "train_x shape:", train_x, "train_y shape:", train_y),
    ("val Data Shapes:", "val_x shape:", val_x, "val_y shape:", val_y),
    ("Test Data Shapes:", "test_x shape:", test_x, "test_y shape:", test_y),
):
    print(heading)
    print(x_caption, x_split.shape)
    print(y_caption, y_split.shape)

# Class counts of the full resampled label matrix (train_y_resampled), not of
# the reduced training subset.
labels = train_y_resampled.argmax(axis=1)
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Train Data Shapes:
train_x shape: (3407023, 77, 1)
train_y shape: (3407023, 7)
val Data Shapes:
val_x shape: (1216794, 77, 1)
val_y shape: (1216794, 7)
Test Data Shapes:
test_x shape: (1520993, 77, 1)
test_y shape: (1520993, 7)
Label Distribution:
Label 0: Count 5329008
Label 5: Count 300000
Label 1: Count 300000
Label 3: Count 775955
Label 4: Count 300000
Label 6: Count 300000
Label 2: Count 300000


In [14]:
# Sampling settings: one single-entry dict per candidate, each mapping the key 2
# to a sample count (presumably a per-class target for class id 2 - TODO confirm).
sampling_parameters = [
    {2: target_count} for target_count in (100000, 150000, 200000, 250000)
]

# Layer sizes for each network variant, largest first. The values of every row
# line up with the keys in _LAYER_KEYS.
_LAYER_KEYS = ('lstm1', 'lstm2', 'att', 'lstm3', 'dense1', 'dense2')
DNN_parameters = [
    dict(zip(_LAYER_KEYS, layer_sizes))
    for layer_sizes in (
        (256, 256, 256, 128, 100, 80),
        (128, 128, 128, 92, 80, 80),
        (64, 64, 64, 64, 64, 32),
    )
]

In [61]:
# Class counts of the current training labels: collapse each one-hot row of
# train_y to its integer class id.
labels = train_y.argmax(axis=1)

# Tally the occurrences of every class id.
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Label Distribution:
Label 0: Count 2386814
Label 2: Count 134697
Label 3: Count 347521
Label 1: Count 134676
Label 4: Count 134460
Label 6: Count 134519
Label 5: Count 134336


In [42]:

# Containers keyed by model name: predicted probabilities and built models.
predictions = {}
models = {}


def _build_lstm_attention_net(net_par):
    """Assemble the uncompiled LSTM + self-attention classifier for one size set.

    ``net_par`` supplies the layer widths under the keys 'lstm1', 'lstm2',
    'att', 'lstm3', 'dense1' and 'dense2'. The network takes (77, 1) inputs and
    ends in a 7-way softmax.
    """
    return Sequential([
        LSTM(net_par['lstm1'], input_shape=(77, 1), return_sequences=True),
        LSTM(net_par['lstm2'], return_sequences=True, dropout=0.1),
        SeqSelfAttention(attention_width=net_par['att'], attention_activation='sigmoid', name='Attention'),
        LSTM(net_par['lstm3'], dropout=0.1),
        Dense(net_par['dense1'], activation='relu'),
        Dense(net_par['dense2'], activation='relu'),
        Dense(7, activation='softmax')
    ])


# Build and compile one model per (sampling setting, network size) pair.
# NOTE(review): samp_par is never read inside the loop, so the sampling index
# only affects the log directory and the model name.
# NOTE(review): model, model_name, tensorboard_callback, num1 and num2 keep
# their last-iteration values after the loop; later cells rely on them.
for num1, samp_par in enumerate(sampling_parameters):
    for num2, net_par in enumerate(DNN_parameters):
        # TensorBoard callback writing to a per-combination log directory.
        log_dir = os.path.join('logs/long_fit_GRU/', "s%s_n%s/" % (num1, num2))
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=10000)

        model = _build_lstm_attention_net(net_par)

        # SGD with momentum; the model is compiled to run eagerly.
        sgdm_optimizer = SGD(learning_rate=0.1, momentum=0.9)
        model.compile(
            optimizer=sgdm_optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy'],
            run_eagerly=True,
        )

        # Register the model under a name unique to this combination.
        model_name = "GRU_model_s%s_n%s" % (num1, num2)
        models[model_name] = model

# Debugging: shapes of the arrays used to fit and evaluate the model.
for caption, split in (
    ("Train X Shape:", train_x),
    ("Train Y Shape:", train_y),
    ("Val X Shape:", val_x),
    ("Val Y Shape:", val_y),
    ("Test X Shape:", test_x),
    ("Test Y Shape:", test_y),
):
    print(caption, split.shape)

# Class counts of the full resampled label matrix (train_y_resampled).
labels = train_y_resampled.argmax(axis=1)
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")




Train X Shape: (3407023, 77, 1)
Train Y Shape: (3407023, 7)
Val X Shape: (1216794, 77, 1)
Val Y Shape: (1216794, 7)
Test X Shape: (1520993, 77, 1)
Test Y Shape: (1520993, 7)
Label Distribution:
Label 0: Count 5329008
Label 5: Count 300000
Label 1: Count 300000
Label 3: Count 775955
Label 4: Count 300000
Label 6: Count 300000
Label 2: Count 300000


In [None]:
# Fit the current `model` for a single epoch, validating on the validation
# split and logging through the current TensorBoard callback.
fit_options = dict(
    epochs=1,
    batch_size=512,
    validation_data=(val_x, val_y),
    callbacks=[tensorboard_callback],
)
history = model.fit(train_x, train_y, **fit_options)

# Persist the trained model in the .keras format under save_dir, creating the
# directory first if it does not exist yet.
save_dir = 'GRU_models'
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name + '.keras')
model.save(model_path)







In [71]:
# Class counts of the test labels: collapse each one-hot row of test_y to its
# integer class id.
labels = test_y.argmax(axis=1)

# Tally the occurrences of every class id.
label_distribution = collections.Counter()
label_distribution.update(labels)

# Report the counts in first-seen order.
print("Label Distribution:")
for label, count in label_distribution.items():
    print(f"Label {label}: Count {count}")

Label Distribution:
Label 3: Count 155191
Label 0: Count 1065802
Label 5: Count 60000
Label 6: Count 60000
Label 1: Count 60000
Label 2: Count 60000
Label 4: Count 60000


In [None]:
# Reload the saved model. SeqSelfAttention is passed via custom_objects so the
# custom attention layer can be deserialized.
model = load_model(os.path.join(save_dir, model_name + '.keras'), custom_objects={'SeqSelfAttention': SeqSelfAttention})


# Predict class probabilities for the test set and keep them under the model name.
predictions[model_name] = model.predict(test_x)

# Integer class ids, computed once and reused by every report below.
true_classes = test_y.argmax(axis=1)
predicted_classes = predictions[model_name].argmax(axis=1)

# Test-set metrics derived from the predictions. The previous version printed
# the training-epoch loss/accuracy from `history`, which is not an evaluation
# of the reloaded model and is undefined when the model is loaded without
# having been trained in the same session.
# Loss: mean categorical cross-entropy, i.e. -log of the probability assigned
# to the true class (clipped away from 0 to keep the log finite).
true_class_probs = predictions[model_name][np.arange(len(true_classes)), true_classes]
test_loss = -np.mean(np.log(np.clip(true_class_probs, 1e-7, 1.0)))
test_accuracy = np.mean(predicted_classes == true_classes)

# Print evaluation metrics
print('===============')
print("Loss:", test_loss)
print("Accuracy:", test_accuracy)
print(confusion_matrix(true_classes, predicted_classes))
print(classification_report(true_classes, predicted_classes))
# Indices of the sampling/network combination the evaluated model belongs to
# (left over from the model-building loop).
print(num1, num2)

In [None]:
# Integer class ids for the ground truth and for the model's predictions.
true_labels = test_y.argmax(axis=1)
predicted_labels = predictions[model_name].argmax(axis=1)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(true_labels, predicted_labels)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()

# Per-class precision, recall and F1.
print(classification_report(true_labels, predicted_labels))