In [1]:
# preproccess
import pandas as pd 
import os 
import glob 
from IPython.display import clear_output

# Model
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
import time


# gpus = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(gpus[0], True)

# Loading the Dataset

In [3]:
rel_path = '/datasets/CICIoT2023/'  
path = os.path.join(os.path.dirname(os.getcwd()), rel_path.lstrip('/'))  
csv_filepaths = glob.glob(os.path.join(path, "*.csv"))  

# Features that hold values 0/1.
column_datatypes = { 'fin_flag_number': 'bool', 'syn_flag_number': 'bool', 'rst_flag_number': 'bool',
                     'psh_flag_number': 'bool', 'ack_flag_number': 'bool', 'ece_flag_number': 'bool',
                     'cwr_flag_number': 'bool', 
                     'HTTP': 'bool', 'HTTPS': 'bool', 'DNS': 'bool', 'Telnet': 'bool', 'SMTP': 'bool',
                     'SSH': 'bool',  'IRC': 'bool',   'TCP': 'bool', 'UDP': 'bool',    'DHCP': 'bool',
                     'ARP': 'bool',  'ICMP': 'bool',  'IPv': 'bool', 'LLC': 'bool'
                   }

# Load the first csv file
df = pd.read_csv(csv_filepaths[0]).astype(column_datatypes)

# Load csv files in 10-file batches 
batch_size = 10

for i in range(1, len(csv_filepaths)):
    clear_output(wait=False) # Pretty output
    print(f'Loading CSV {i}')
    
    # First file of each batch, restart the batch list
    if i % batch_size == 1:
        batch = [df]
    
    batch.append(pd.read_csv(csv_filepaths[i]).astype(column_datatypes))    # Load a CSV and change relevant columns to bools
    
    # every #batch_size# file, add it to the df dataframe
    if i % batch_size == 0:
        df = pd.concat(batch)
        batch.clear()   # Get rid of old batch files to free memory
        print(f'Loaded to {i}')

# Load any remaining data in batch
if len(batch) != 0:
    print("Loading data from final batch.")
    df = pd.concat(batch)

clear_output(wait=False)
del batch

# Dataframe Memory Size

In [4]:
tot_mem = df.memory_usage().sum()
print(f'{tot_mem / 1000000000} gb')

11.064719223 gb


# Encoding labels 

In [5]:
label_maps = { 'Backdoor_Malware': '0',         'BenignTraffic': '1',           'BrowserHijacking': '2',
               'CommandInjection': '3',         'DDoS-ACK_Fragmentation': '4',  'DDoS-HTTP_Flood': '5',
               'DDoS-ICMP_Flood': '6',          'DDoS-ICMP_Fragmentation': '7', 'DDoS-PSHACK_Flood': '8',
               'DDoS-RSTFINFlood': '9',         'DDoS-SYN_Flood': '10',         'DDoS-SlowLoris': '11',
               'DDoS-SynonymousIP_Flood': '12', 'DDoS-TCP_Flood': '13',         'DDoS-UDP_Flood': '14',
               'DDoS-UDP_Fragmentation': '15',  'DNS_Spoofing': '16',           'DictionaryBruteForce': '17',
               'DoS-HTTP_Flood': '18',          'DoS-SYN_Flood': '19',          'DoS-TCP_Flood': '20',
               'DoS-UDP_Flood': '21',           'MITM-ArpSpoofing': '22',       'Mirai-greeth_flood': '23',
               'Mirai-greip_flood': '24',       'Mirai-udpplain': '25',         'Recon-HostDiscovery': '26',
               'Recon-OSScan': '27',            'Recon-PingSweep': '28',        'Recon-PortScan': '29',
               'SqlInjection': '30',            'Uploading_Attack': '31',       'VulnerabilityScan': '32', 
               'XSS': '33'
             }

df['label'] = df['label'].map(label_maps)

df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,54.00,6.00,64.00,0.329807,0.329807,0.0,True,False,True,...,0.000000,54.00,8.334383e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,9
1,0.000000,57.04,6.33,64.00,4.290556,4.290556,0.0,False,False,False,...,2.822973,57.04,8.292607e+07,9.5,10.464666,4.010353,160.987842,0.05,141.55,20
2,0.000000,0.00,1.00,64.00,33.396799,33.396799,0.0,False,False,False,...,0.000000,42.00,8.312799e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,6
3,0.328175,76175.00,17.00,64.00,4642.133010,4642.133010,0.0,False,False,False,...,0.000000,50.00,8.301570e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,21
4,0.117320,101.73,6.11,65.91,6.202211,6.202211,0.0,False,True,False,...,23.113111,57.88,8.297300e+07,9.5,11.346876,32.716243,3016.808286,0.19,141.55,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234740,0.000000,54.00,6.00,64.00,25.224003,25.224003,0.0,False,False,False,...,0.000000,54.00,8.307636e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,13
234741,0.000000,0.00,1.00,64.00,0.469918,0.469918,0.0,False,False,False,...,0.000000,42.00,8.312488e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,6
234742,4.274490,107.85,6.00,64.00,0.463217,0.463217,0.0,False,False,False,...,0.172084,54.39,8.294658e+07,9.5,10.396218,0.246316,1.516787,0.02,141.55,20
234743,0.130775,44335.00,17.00,64.00,6981.093452,6981.093452,0.0,False,False,False,...,0.000000,50.00,8.309869e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,14


# Model

### Hyper-Parameters

In [14]:
# Define the number of neurons in the initial layer of the generator
input_shape = 46
num_epochs = 2
batch_size = 256 # Define your batch size here
num_samples = 100
epochs = 7000
critic_updates = 5  # Number of critic updates per generator update
attack_classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
num_classes = len(attack_classes)

result = df

In [15]:
# GAN class
# This class contains the generator and discriminator models, as well as the training loop for the GAN
class GAN:
    def __init__(self, hidden1, hidden2, hidden3, layer0_num_neurons, num_classes):
        # store the parameters as instance variables
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.hidden3 = hidden3
        self.layer0_num_neurons = layer0_num_neurons
        self.num_classes = num_classes

        # build the generator and discriminator
        self.generator = self.build_generator(self.hidden1, self.hidden2, self.hidden3, self.layer0_num_neurons)
        self.discriminator = self.build_discriminator()

        # compile the generator and discriminator
        optimizer = Adam(0.0002, 0.5)
        self.generator.compile(optimizer=optimizer, loss='binary_crossentropy')
        self.discriminator.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


    def build_generator(self, hidden1, hidden2, hidden3, input_dim):
        model = Sequential()
        model.add(Dense(hidden1, input_dim=input_dim))  
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden3))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(input_dim, activation='relu'))  # Changed from output_dim to input_dim

        noise = Input(shape=(input_dim,))
        attack = model(noise)
        return Model(noise, attack)

    def build_discriminator(self):
        model = Sequential()
        model.add(Dense(input_shape, input_dim=input_shape, activation='relu'))  
        model.add(Dense(30, activation='relu'))
        model.add(Dense(15, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  

        attack = Input(shape=(input_shape,))
        validity = model(attack)

        return Model(attack, validity)
    
    # define baseline model
    def baseline_model():
        # create model
        model = Sequential()
        inputs = input_shape
        hidden_layer1 = 10
        hidden_layer2 = 5
        hidden_layer3 = 0
        outputs = num_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
        
        model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
        if hidden_layer2 != 0:
            model.add(Dense(hidden_layer2, activation='relu'))
        if hidden_layer3 != 0:
            model.add(Dense(hidden_layer3, activation='relu'))
        model.add(Dense(outputs, activation='softmax'))
        
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
        return model
    
   
    def discriminator_loss(self, real_output, fake_output):
        return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

    def generator_loss(self, fake_output):
        return -tf.reduce_mean(fake_output)


    def trainGAN(self, gen_hidden1, gen_hidden2, gen_hidden3, input_dim):
        optimizer = Adam(0.0002, 0.5)
        
        # Directly use 'result' DataFrame. Ensure it's accessible within this scope.
        # Sampling 500 data points randomly from 'result'
        sampled_df = result.sample(500)

        # Encode labels if not already encoded. Assuming 'label' needs encoding.
        # le = LabelEncoder()
        # sampled_df['label'] = le.fit_transform(sampled_df['label'])
    
        # Splitting the data into features and labels
        X_train = sampled_df.drop('label', axis=1).values.astype(float)
        Y_train = sampled_df['label'].values

        # Setting up labels for valid (real) and fake data for training
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        # Building the discriminator
        discriminator = self.build_discriminator()
        discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Building the generator
        generator = self.build_generator(gen_hidden1, gen_hidden2, gen_hidden3, input_dim)

        # Setting up the combined model
        z = Input(shape=(input_shape,))
        attack = generator(z)
        validity = discriminator(attack)
        combined = Model(z, validity)
        combined.compile(loss='binary_crossentropy', optimizer=optimizer)
        
        #break condition for training (when diverging)
        loss_increase_count = 0
        prev_g_loss = 0
        
        for epoch in range(epochs):
            # Train Discriminator
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            real_attacks = X_train[idx]

            # run the generator
            noise = tf.random.normal((batch_size, input_shape))
            gen_attacks = generator.predict(noise)
            
            # get the discriminator loss
            d_loss_real = discriminator.train_on_batch(real_attacks, valid)
            d_loss_fake = discriminator.train_on_batch(gen_attacks, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            g_loss = combined.train_on_batch(noise, valid)
            
            # at the end of 100 epochs print the losses and accuracy
            if epoch % 100 == 0:
                print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")
            
            # Conditions to stop the loop if generator loss increases 5 times
            if (g_loss - prev_g_loss) > 0:
                loss_increase_count = loss_increase_count + 1
            else: 
                loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
                
            prev_g_loss = g_loss
                
            if loss_increase_count > 5:
                print('Stoping on iteration: ', epoch)
                break
            
            # saving the generated output
            if epoch % 20 == 0:
                f = open("Results/GeneratedAttackResults.txt", "a")
                np.savetxt("Results/GeneratedAttackResults.txt", gen_attacks, fmt="%.0f")
                f.close()
        clear_output(wait=False)
        # peek at our results
        results = np.loadtxt("Results/GANresultsportsweep.txt")  # save final output
        print("Generated attacks: ")
        print(results[:2])

       


In [16]:
# Randomly select hidden layer sizes for the generator
gen_hidden1 = np.random.randint(1, 101)
gen_hidden2 = np.random.randint(1, 101)
gen_hidden3 = np.random.randint(1, 101)

# Create the GAN with the selected hidden layer sizes
gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape, num_classes)

print("Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

Hidden Layers:  73 66 31


### Train

In [17]:
# Call the trainGAN function directly to start training
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

# Start the timer
start_time = time.time()

gan.trainGAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)

end_time = time.time()

clear_output(wait=False)
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
print("Training Complete in {:.2f} seconds!!!".format(end_time - start_time))

Training GAN with hidden layers:  73 66 31
Training Complete in 285.23 seconds!!!


In [18]:
def getAccuracies()  :
    accuracy_scores = []
    f1_scores = []
    for i in range(100) :
        # Generate samples from the trained generator
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)

        # Pass the generated samples through the discriminator
        discriminator_predictions = gan.discriminator.predict(generated_samples)

        # The ideal output for generated samples is 1
        ideal_output = np.ones((num_samples,))

        # Correcting the prediction rounding
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()

        # Now, calculating the accuracy should not throw an error
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        accuracy_scores.append(accuracy)
        f1_scores.append(f1)
    
    accuracy = np.mean(accuracy_scores)
    f1 = np.mean(f1_scores)
    return accuracy, f1

### Results

In [19]:
accuracy,f1 = getAccuracies()

clear_output(wait=False)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)

Accuracy:  0.6809000000000001
F1 Score:  0.809419910501734


### Save Model

In [36]:
generator_save_path = "../model/comp_sec/generator"
discriminator_save_path = "../model/comp_sec/discriminator"

# Save the generator
gan.generator.save(generator_save_path)
# Save the discriminator
gan.discriminator.save(discriminator_save_path)

INFO:tensorflow:Assets written to: ../model/comp_sec/generator\assets
INFO:tensorflow:Assets written to: ../model/comp_sec/discriminator\assets


### Load Model

In [37]:
generator_load_path = "../model/comp_sec/generator"
discriminator_load_path = "../model/comp_sec/discriminator"

gan.generator = load_model(generator_load_path)
gan.discriminator = load_model(discriminator_load_path)

gan.generator.summary()
gan.discriminator.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 46)]              0         
                                                                 
 sequential_4 (Sequential)   (None, 46)                8958      
                                                                 
Total params: 8,958
Trainable params: 8,686
Non-trainable params: 272
_________________________________________________________________
Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 46)]              0         
                                                                 
 sequential_5 (Sequential)   (None, 1)                 4053      
                                                                 
Total params: 4,053
Trainable params: 

### Best Model Loop

Will continue to run until a better model is found

In [15]:
class Looper:
    def random_numbers():
        gen_hidden1 = np.random.randint(1, 101)
        gen_hidden2 = np.random.randint(1, 101)
        gen_hidden3 = np.random.randint(1, 101)
        return [gen_hidden1, gen_hidden2, gen_hidden3]
    
    def evaluate(gan):
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)
        discriminator_predictions = gan.discriminator.predict(generated_samples)
        ideal_output = np.ones((num_samples,))
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()
        ideal_output = np.ones((num_samples,))
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        return accuracy, f1
    
    def save(gan):
        generator_save_path = "model/best_generator"
        discriminator_save_path = "model/best_discriminator"
        gan.generator.save(generator_save_path)
        gan.discriminator.save(discriminator_save_path)

In [None]:
if gan is None:
    best_accuracy = 0
    best_f1 = 0
else:
    best_accuracy, best_f1 = Looper.evaluate(gan)

while (True):
    # Randomly select hidden layer sizes for the generator
    [gen_hidden1, gen_hidden2, gen_hidden3] = Looper.random_numbers()

    # Create the GAN with the selected hidden layer sizes
    gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape, num_classes)
    # Call the trainGAN function directly to start training
    gan.trainGAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)
    accuracy, f1 = getAccuracies(gan)   
    print("Accuracy: ", accuracy, "F1 Score: ", f1, "Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_f1 = f1
        Looper.save(gan)
        print("Saved New Model")
        break
    

print("Accuracy: ", best_accuracy, "F1 Score: ", best_f1)