In [1]:
# preproccess
import pandas as pd 
import os 
import glob 
from IPython.display import clear_output

# Model
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
import time

# Loading the Dataset

In [2]:
rel_path = '/datasets/CICIoT2023/'          # If your dataset is within your python project directory, change this to the relative path to your dataset
path = os.getcwd() + rel_path   # If your dataset is somewhere else, change this to that path
csv_filepaths = glob.glob(os.path.join(path, "*.csv"))  # Makes a list of all CSVs within the directory above

# Features that hold values 0/1.
column_datatypes = { 'fin_flag_number': 'bool', 'syn_flag_number': 'bool', 'rst_flag_number': 'bool',
                     'psh_flag_number': 'bool', 'ack_flag_number': 'bool', 'ece_flag_number': 'bool',
                     'cwr_flag_number': 'bool', 
                     'HTTP': 'bool', 'HTTPS': 'bool', 'DNS': 'bool', 'Telnet': 'bool', 'SMTP': 'bool',
                     'SSH': 'bool',  'IRC': 'bool',   'TCP': 'bool', 'UDP': 'bool',    'DHCP': 'bool',
                     'ARP': 'bool',  'ICMP': 'bool',  'IPv': 'bool', 'LLC': 'bool'
                   }

# Load the first csv file
df = pd.read_csv(csv_filepaths[0]).astype(column_datatypes)

# Load csv files in 10-file batches 
batch_size = 10

for i in range(1, len(csv_filepaths)):
    clear_output(wait=False) # Pretty output
    print(f'Loading CSV {i}')
    
    # First file of each batch, restart the batch list
    if i % batch_size == 1:
        batch = [df]
    
    batch.append(pd.read_csv(csv_filepaths[i]).astype(column_datatypes))    # Load a CSV and change relevant columns to bools
    
    # every #batch_size# file, add it to the df dataframe
    if i % batch_size == 0:
        df = pd.concat(batch)
        batch.clear()   # Get rid of old batch files to free memory
        print(f'Loaded to {i}')

# Load any remaining data in batch
if len(batch) != 0:
    print("Loading data from final batch.")
    df = pd.concat(batch)

clear_output(wait=False)
del batch

# Dataframe Memory Size

In [3]:
tot_mem = df.memory_usage().sum()
print(f'{tot_mem / 1000000000} gb')

11.064719223 gb


# Encoding labels 

In [4]:
label_maps = { 'Backdoor_Malware': '0',         'BenignTraffic': '1',           'BrowserHijacking': '2',
               'CommandInjection': '3',         'DDoS-ACK_Fragmentation': '4',  'DDoS-HTTP_Flood': '5',
               'DDoS-ICMP_Flood': '6',          'DDoS-ICMP_Fragmentation': '7', 'DDoS-PSHACK_Flood': '8',
               'DDoS-RSTFINFlood': '9',         'DDoS-SYN_Flood': '10',         'DDoS-SlowLoris': '11',
               'DDoS-SynonymousIP_Flood': '12', 'DDoS-TCP_Flood': '13',         'DDoS-UDP_Flood': '14',
               'DDoS-UDP_Fragmentation': '15',  'DNS_Spoofing': '16',           'DictionaryBruteForce': '17',
               'DoS-HTTP_Flood': '18',          'DoS-SYN_Flood': '19',          'DoS-TCP_Flood': '20',
               'DoS-UDP_Flood': '21',           'MITM-ArpSpoofing': '22',       'Mirai-greeth_flood': '23',
               'Mirai-greip_flood': '24',       'Mirai-udpplain': '25',         'Recon-HostDiscovery': '26',
               'Recon-OSScan': '27',            'Recon-PingSweep': '28',        'Recon-PortScan': '29',
               'SqlInjection': '30',            'Uploading_Attack': '31',       'VulnerabilityScan': '32', 
               'XSS': '33'
             }

df['label'] = df['label'].map(label_maps)

df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.037456,15099.00,17.0,64.0,10001.102371,10001.102371,0.0,False,False,False,...,0.000000,50.0,8.310215e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,14
1,0.000000,54.00,6.0,64.0,0.000000,0.000000,0.0,False,False,False,...,0.000000,54.0,8.333177e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,8
2,0.010346,9662.50,17.0,64.0,21380.056228,21380.056228,0.0,False,False,False,...,0.000000,50.0,8.309879e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,14
3,0.000000,54.00,6.0,64.0,241.333973,241.333973,0.0,False,False,False,...,0.000000,54.0,8.295112e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,20
4,0.195109,95.58,6.0,64.0,6.762174,6.762174,0.0,False,True,False,...,0.000000,54.0,8.336540e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224084,0.000000,54.00,6.0,64.0,12.305208,12.305208,0.0,True,False,True,...,0.000000,54.0,8.334406e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,9
224085,0.260599,195.28,17.0,64.0,5.019235,5.019235,0.0,False,False,False,...,6.256217,91.6,8.300770e+07,9.5,13.225017,8.802365,287.083940,0.14,141.55,21
224086,59.963741,67179.40,12.0,102.8,5.632240,5.632240,0.0,False,False,False,...,66.941103,160.3,1.665198e+08,13.5,14.871934,94.837019,4892.474222,1.00,244.60,1
224087,0.537183,85.86,6.0,64.0,2.198275,2.198275,0.0,False,True,False,...,0.000000,54.0,8.336535e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,12


# Model

### Hyper-Parameters

In [9]:
# Define the number of neurons in the initial layer of the generator
input_shape = 47
num_epochs = 2
batch_size = 256 # Define your batch size here
num_samples = 100
epochs = 7000
critic_updates = 5  # Number of critic updates per generator update
attack_classes = ["DDoS","DoS","Recon","Brute Force", "Web-Based", "Spoofing", "Mirai"]
num_classes = len(attack_classes)

result = df

In [13]:
# GAN class
# This class contains the generator and discriminator models, as well as the training loop for the GAN
class GAN:
    def __init__(self, hidden1, hidden2, hidden3, layer0_num_neurons, num_classes):
        # store the parameters as instance variables
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.hidden3 = hidden3
        self.layer0_num_neurons = layer0_num_neurons
        self.num_classes = num_classes

        # build the generator and discriminator
        self.generator = self.build_generator(self.hidden1, self.hidden2, self.hidden3, self.layer0_num_neurons)
        self.discriminator = self.build_discriminator()

        # compile the generator and discriminator
        optimizer = Adam(0.0002, 0.5)
        self.generator.compile(optimizer=optimizer, loss='binary_crossentropy')
        self.discriminator.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


    def build_generator(self, hidden1, hidden2, hidden3, input_dim):
        model = Sequential()
        model.add(Dense(hidden1, input_dim=input_dim))  
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden3))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(input_dim, activation='relu'))  # Changed from output_dim to input_dim

        noise = Input(shape=(input_dim,))
        attack = model(noise)
        return Model(noise, attack)

    def build_discriminator(self):
        model = Sequential()
        model.add(Dense(input_shape, input_dim=input_shape, activation='relu'))  
        model.add(Dense(30, activation='relu'))
        model.add(Dense(15, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  

        attack = Input(shape=(input_shape,))
        validity = model(attack)

        return Model(attack, validity)
    
    # define baseline model
    def baseline_model():
        # create model
        model = Sequential()
        inputs = input_shape
        hidden_layer1 = 10
        hidden_layer2 = 5
        hidden_layer3 = 0
        outputs = num_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
        
        model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
        if hidden_layer2 != 0:
            model.add(Dense(hidden_layer2, activation='relu'))
        if hidden_layer3 != 0:
            model.add(Dense(hidden_layer3, activation='relu'))
        model.add(Dense(outputs, activation='softmax'))
        
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
        return model
    
   
    def discriminator_loss(self, real_output, fake_output):
        return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

    def generator_loss(self, fake_output):
        return -tf.reduce_mean(fake_output)


    def trainGAN(self, gen_hidden1, gen_hidden2, gen_hidden3, input_dim):
        optimizer = Adam(0.0002, 0.5)
        
        # Directly use 'result' DataFrame. Ensure it's accessible within this scope.
        # Sampling 500 data points randomly from 'result'
        sampled_df = result.sample(500)


        # Splitting the data into features and labels
        X_train = sampled_df.values.astype(float)
        Y_train = sampled_df['label'].values

        # Setting up labels for valid (real) and fake data for training
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        # Building the discriminator
        discriminator = self.build_discriminator()
        discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Building the generator
        generator = self.build_generator(gen_hidden1, gen_hidden2, gen_hidden3, input_dim)

        # Setting up the combined model
        z = Input(shape=(input_shape,))
        attack = generator(z)
        validity = discriminator(attack)
        combined = Model(z, validity)
        combined.compile(loss='binary_crossentropy', optimizer=optimizer)

        for epoch in range(epochs):
            clear_output(wait=False)
            # Train Discriminator
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            real_attacks = X_train[idx]

            noise = tf.random.normal((batch_size, input_shape))
            gen_attacks = generator.predict(noise)

            d_loss_real = discriminator.train_on_batch(real_attacks, valid)
            d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            g_loss = combined.train_on_batch(noise, valid)

            if epoch % 100 == 0:
                print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")

            # if our generator loss icreased this iteration, increment the counter by 1
            if (g_loss - prev_g_loss) > 0:
                loss_increase_count = loss_increase_count + 1
            else: 
                loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
                
            prev_g_loss = g_loss
                
            if loss_increase_count > 5:
                print('Stoping on iteration: ', epoch)
                break
                
            if epoch % 20 == 0:
                f = open("Results/GANresultsportsweep.txt", "a")
                np.savetxt("Results/GANresultsportsweep.txt", gen_attacks, fmt="%.0f")
                f.close()

        # peek at our results
        results = np.loadtxt("Results/GANresultsportsweep.txt")
        print("Generated portsweep attacks: ")
        print(results[:2])

In [14]:
# Randomly select hidden layer sizes for the generator
gen_hidden1 = np.random.randint(1, 101)
gen_hidden2 = np.random.randint(1, 101)
gen_hidden3 = np.random.randint(1, 101)

# Create the GAN with the selected hidden layer sizes
gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape, num_classes)

clear_output(wait=False)

print("Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

Hidden Layers:  51 11 74


### Train

In [17]:
# Call the trainGAN function directly to start training
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

# Start the timer
start_time = time.time()

gan.trainGAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)

end_time = time.time()

clear_output(wait=False)
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
print("Training Complete in {:.2f} seconds!!!".format(end_time - start_time))

Training GAN with hidden layers:  51 11 74
Training Complete in 322.37 seconds!!!


In [16]:
def getAccuracies()  :
    accuracy_scores = []
    f1_scores = []
    for i in range(100) :
        # Generate samples from the trained generator
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)

        # Pass the generated samples through the discriminator
        discriminator_predictions = gan.discriminator.predict(generated_samples)

        # The ideal output for generated samples is 1
        ideal_output = np.ones((num_samples,))

        # Correcting the prediction rounding
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()

        # Now, calculating the accuracy should not throw an error
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
    
    accuracy = np.mean(accuracy_scores)
    f1 = np.mean(f1_scores)
    return accuracy, f1

### Results

In [18]:
accuracy,f1 = getAccuracies()

clear_output(wait=False)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)

Accuracy:  0.92
F1 Score:  0.9583333333333334


### Save Model

In [None]:
generator_save_path = "../model/comp_sec/generator"
discriminator_save_path = "../model/comp_sec/discriminator"

# Save the generator
gan.generator.save(generator_save_path)
# Save the discriminator
gan.discriminator.save(discriminator_save_path)

### Load Model

In [None]:
generator_load_path = "../model/comp_sec/generator"
discriminator_load_path = "../model/comp_sec/discriminator"

gan.generator = load_model(generator_load_path)
gan.discriminator = load_model(discriminator_load_path)

gan.generator.summary()
gan.discriminator.summary()

### Best Model Loop

Will continue to run until a better model is found

In [15]:
class Looper:
    def random_numbers():
        gen_hidden1 = np.random.randint(1, 101)
        gen_hidden2 = np.random.randint(1, 101)
        gen_hidden3 = np.random.randint(1, 101)
        return [gen_hidden1, gen_hidden2, gen_hidden3]
    
    def evaluate(gan):
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)
        discriminator_predictions = gan.discriminator.predict(generated_samples)
        ideal_output = np.ones((num_samples,))
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()
        ideal_output = np.ones((num_samples,))
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        return accuracy, f1
    
    def save(gan):
        generator_save_path = "model/best_generator"
        discriminator_save_path = "model/best_discriminator"
        gan.generator.save(generator_save_path)
        gan.discriminator.save(discriminator_save_path)

In [None]:
if gan is None:
    best_accuracy = 0
    best_f1 = 0
else:
    best_accuracy, best_f1 = Looper.evaluate(gan)

while (True):
    # Randomly select hidden layer sizes for the generator
    [gen_hidden1, gen_hidden2, gen_hidden3] = Looper.random_numbers()

    # Create the GAN with the selected hidden layer sizes
    gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape, num_classes)
    # Call the trainGAN function directly to start training
    gan.trainGAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)
    accuracy, f1 = getAccuracies(gan)   
    print("Accuracy: ", accuracy, "F1 Score: ", f1, "Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_f1 = f1
        Looper.save(gan)
        print("Saved New Model")
        break
    

clear_output(wait=False)
print("Accuracy: ", best_accuracy, "F1 Score: ", best_f1)