In [1]:
#Important Libraries
from zipfile import ZipFile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

#Data Encoding
from sklearn.preprocessing import  OneHotEncoder

#Preprocessing
from sklearn.preprocessing import StandardScaler

#Classifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
#Synthetic dataset
synth_train = pd.read_csv('/content/drive/MyDrive/Degree Project/train.zip', delimiter=',')
synth_test = pd.read_csv('/content/drive/MyDrive/Degree Project/test.zip', delimiter=',')

def data(df, encoder=None, scaler=None, fit=True):
  """Turn a synthetic-dataset frame into model-ready float32 arrays.

  The 'Label' column is the target. Of the remaining columns (in frame
  order) the first two are one-hot encoded as categorical features and all
  others are used as numerical features. Encoded and numerical columns are
  concatenated and then standardized together.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a 'Label' column plus the feature columns.
  encoder : OneHotEncoder, optional
      Encoder to use for the two categorical columns. A new default
      OneHotEncoder is created when omitted.
  scaler : StandardScaler, optional
      Scaler to use for the full feature matrix. A new StandardScaler is
      created when omitted.
  fit : bool, default True
      When True the encoder and scaler are fitted on `df` (fit_transform).
      When False they are only applied (transform), so a test frame gets
      exactly the column layout and scaling learned from the training frame.

  Returns
  -------
  X : numpy.ndarray of float32, standardized features
  Y : numpy.ndarray of float32, labels

  Raises
  ------
  ValueError
      If fit is False but no fitted encoder/scaler was passed in.
  """
  if not fit and (encoder is None or scaler is None):
    raise ValueError('fit=False requires an already fitted encoder and scaler')
  if encoder is None:
    encoder = OneHotEncoder()
  if scaler is None:
    scaler = StandardScaler()

  Y = df['Label'].to_numpy()                       #Labels
  # Remove the label by name (it is also read by name) so it can never end up
  # among the features, whatever its position in the frame.
  X = df.drop(columns='Label').to_numpy()
  X1 = X[:, 0:2]                                   #Categorical Features
  X2 = X[:, 2:]                                    #Numerical Features

  #Encode categorical features
  X1 = encoder.fit_transform(X1) if fit else encoder.transform(X1)
  if hasattr(X1, 'toarray'):                       #Sparse output -> dense array
    X1 = X1.toarray()

  X = np.concatenate((X1, X2), axis=1)
  X = scaler.fit_transform(X) if fit else scaler.transform(X)  #Standardizes data
  X = np.asarray(X).astype('float32')
  Y = np.asarray(Y).astype('float32')
  return X, Y

# Fit the preprocessing on the training set only and reuse it for the test set,
# so both matrices share the same columns and the same scaling.
# handle_unknown='ignore' keeps a category that only occurs in the test set
# from raising during transform (it is encoded as all zeros).
synth_encoder = OneHotEncoder(handle_unknown='ignore')
synth_scaler = StandardScaler()
X_train, Y_train = data(synth_train, encoder=synth_encoder, scaler=synth_scaler)
X_test, Y_test = data(synth_test, encoder=synth_encoder, scaler=synth_scaler, fit=False)

In [None]:
#KDD-Cup 99 dataset
# Column names for both KDD files: 41 features followed by the 'outcome' label.
KDD_COLUMNS = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

# Every column name is supplied here, so the files are read as header-less.
# Without header=None pandas would use the first record of each file as the
# header and that record would be silently lost once the names are replaced.
# NOTE(review): assumes neither .gz file starts with a header row - confirm.
KDD_train = pd.read_csv('/content/drive/MyDrive/Degree Project/kddcup.data_10_percent.gz', compression='gzip', header=None, names=KDD_COLUMNS).dropna()
KDD_test = pd.read_csv('/content/drive/MyDrive/Degree Project/corrected.gz', compression='gzip', header=None, names=KDD_COLUMNS).dropna()

#Remove duplicates
# NOTE(review): keep=False discards every row that has a duplicate, including
# its first occurrence. Confirm this is intended rather than keep='first'.
KDD_train = KDD_train.drop_duplicates(keep=False)
KDD_test = KDD_test.drop_duplicates(keep=False)

#Change outcome to 0 if normal and 1 if anomalous
KDD_train['outcome'] = (KDD_train['outcome'] != 'normal.').astype('int64')
KDD_test['outcome'] = (KDD_test['outcome'] != 'normal.').astype('int64')
def data(df, encoder=None, scaler=None, fit=True):
  """Turn a KDD frame into standardized float32 arrays, split by class.

  The last column is the 0/1 label. From the remaining columns, column 0 is
  kept as is, column 1 is one-hot encoded, columns 2 and 3 are discarded and
  columns 4 onwards are kept as is. With the column names assigned to the KDD
  frames in this notebook these are 'duration', 'protocol_type',
  'service'/'flag' and the numeric features respectively. The resulting
  matrix is standardized.

  Parameters
  ----------
  df : pandas.DataFrame
      KDD frame whose last column holds 0 (normal) / 1 (anomalous).
  encoder : OneHotEncoder, optional
      Encoder for column 1. A new default OneHotEncoder is created when
      omitted.
  scaler : StandardScaler, optional
      Scaler for the feature matrix. A new StandardScaler is created when
      omitted.
  fit : bool, default True
      When True the encoder and scaler are fitted on `df`; when False they
      are only applied, so a test frame is encoded and scaled exactly like
      the training frame they were fitted on.

  Returns
  -------
  X, X_normal, X_anomaly, Y, Y_normal, Y_anomaly : numpy.ndarray
      All features and labels, followed by the rows/labels with Y == 0
      (normal) and with Y == 1 (anomalous).

  Raises
  ------
  ValueError
      If fit is False but no fitted encoder/scaler was passed in.
  """
  if not fit and (encoder is None or scaler is None):
    raise ValueError('fit=False requires an already fitted encoder and scaler')
  if encoder is None:
    encoder = OneHotEncoder()
  if scaler is None:
    scaler = StandardScaler()

  raw = df.to_numpy()
  Y = np.asarray(raw[:, -1]).astype('float32')      #Labels (last column)
  features = raw[:, :-1]

  X1 = features[:, 0:1]                             #Column 0, kept as a column vector
  X2 = features[:, 1:2]                             #Column 1, categorical
  X2 = encoder.fit_transform(X2) if fit else encoder.transform(X2)
  if hasattr(X2, 'toarray'):                        #Sparse output -> dense array
    X2 = X2.toarray()
  X3 = features[:, 4:]                              #Columns 2 and 3 are skipped

  X = np.concatenate((X1, X2, X3), axis=1)
  X = np.asarray(X).astype('float32')
  X = scaler.fit_transform(X) if fit else scaler.transform(X)

  normal_mask = Y == 0                              #Normal observations
  anomaly_mask = Y == 1                             #Anomalies
  return X, X[normal_mask], X[anomaly_mask], Y, Y[normal_mask], Y[anomaly_mask]

# Fit the preprocessing on the training set only and reuse it for the test set,
# so both matrices share the same columns and the same scaling.
kdd_encoder = OneHotEncoder(handle_unknown='ignore')
kdd_scaler = StandardScaler()
X_train,X_normal_train,X_anomaly_train,Y_train,Y_normal_train,Y_anomaly_train = data(KDD_train, encoder=kdd_encoder, scaler=kdd_scaler)
X_test,X_normal_test,X_anomaly_test,Y_test,Y_normal_test,Y_anomaly_test       = data(KDD_test, encoder=kdd_encoder, scaler=kdd_scaler, fit=False)

In [None]:
class GAN(keras.Model):
    """Generative adversarial network made of a dense generator and a dense discriminator."""

    # __init__ sits at the same indentation as the method definitions that
    # follow it, so those are members of the class and not functions nested
    # inside __init__ (where self.Generator() could not find them).
    def __init__(self,latent_dim=20, init_kernel=keras.initializers.GlorotNormal, init_bias=keras.initializers.constant(0), data_dim=None):
        """Store the hyper-parameters and build both networks.

        Parameters
        ----------
        latent_dim : int
            Size of the noise vector z fed to the generator.
        init_kernel : initializer
            Kernel initializer handed to every Dense layer.
        init_bias : initializer
            Bias initializer handed to every Dense layer.
        data_dim : int
            Number of features of one observation, i.e. the width of the
            generator output and of the discriminator input. Required.

        Raises
        ------
        ValueError
            If data_dim is not given.
        """
        # Keras needs the base Model initialised before sub-models are assigned.
        super().__init__()
        if data_dim is None:
            raise ValueError('data_dim is required, e.g. GAN(data_dim=X_train.shape[1])')
        self.latent_dim    = latent_dim
        self.data_dim      = data_dim
        # The network builders read these two attributes.
        self.init_kernel   = init_kernel
        self.init_bias     = init_bias
        self.generator     = self.Generator()
        # Call the builder: the discriminator must be a model, not the bound method.
        self.discriminator = self.Discriminator()

    def Generator(self):
      """Build the generator G: z -> G(z).

      A stack of ReLU Dense layers (128, 256, 128, 64 units) followed by a
      linear Dense layer of width self.data_dim. Reads self.latent_dim,
      self.data_dim, self.init_kernel and self.init_bias.

      Returns
      -------
      keras.Model named 'Generator'.
      """
      # Keras documents `shape` as a tuple; a bare int is not accepted by every version.
      generator_input  = keras.Input(shape=(self.latent_dim,), name='z') #z
      generator        = generator_input
      for units in (128, 256, 128, 64):
        generator      = layers.Dense(units, activation='relu', kernel_initializer=self.init_kernel, bias_initializer=self.init_bias)(generator)
      generator_output = layers.Dense(self.data_dim, activation='linear', kernel_initializer=self.init_kernel, bias_initializer=self.init_bias)(generator) #G(z)
      return keras.Model(generator_input, generator_output, name='Generator')

    def Discriminator(self):
      """Build the discriminator D: x -> probability in (0, 1).

      The input width is taken from the generator output. A stack of ReLU
      Dense layers (128, 256, 128 units) is followed by a single sigmoid
      unit. Reads self.generator, self.init_kernel and self.init_bias.

      Returns
      -------
      keras.Model named 'Discriminator'.
      """
      # Keras documents `shape` as a tuple; a bare int is not accepted by every version.
      D_input  = keras.Input(shape=(self.generator.output.shape[1],))
      D        = D_input
      for units in (128, 256, 128):
        D      = layers.Dense(units, activation='relu', kernel_initializer=self.init_kernel, bias_initializer=self.init_bias)(D)
      D_output = layers.Dense(1, activation='sigmoid', kernel_initializer=self.init_kernel, bias_initializer=self.init_bias)(D)
      # The model input is D_input (the original referenced an undefined name).
      return keras.Model(D_input, D_output, name='Discriminator')

    def compile(self,optimizer = None, loss = keras.losses.BinaryCrossentropy(), train_metric = keras.metrics.BinaryCrossentropy):
      """Store the optimizer and loss used by the training loop.

      Parameters
      ----------
      optimizer : keras optimizer, optional
          Defaults to a new Adam(learning_rate=10**-5, beta_1=0.5). It is
          created per call: an optimizer carries state, so a single default
          instance built at definition time would be shared by every GAN.
      loss : keras loss
          Loss applied to the discriminator predictions.
      train_metric :
          Accepted for interface compatibility; not used by this method.
      """
      if optimizer is None:
        optimizer = keras.optimizers.Adam(learning_rate=10**-5, beta_1=0.5)
      self.optimizer    = optimizer
      self.loss         = loss

    def train(self,X_train,Epochs=50,batch_size=128):
      """Train generator and discriminator adversarially on X_train.

      For every batch x, noise z is sampled and mapped to generated samples
      G(z). The discriminator is trained to output 1 on x and 0 on G(z); the
      generator is trained so that the discriminator outputs 1 on G(z).
      Uses self.optimizer and self.loss, so compile() must be called first.

      Parameters
      ----------
      X_train : array-like
          Training observations, one row per observation.
      Epochs : int
          Number of passes over X_train.
      batch_size : int
          Number of observations per update step.
      """
      # Built once: shuffle() reshuffles on every iteration by default, so each
      # epoch still sees the data in a new order.
      X = tf.data.Dataset.from_tensor_slices(X_train)
      X = X.shuffle(buffer_size=1024).batch(batch_size) #Shuffles data and divides the dataset in batches
      for Epoch in range(Epochs):
        pbar = tqdm(X, position=0, leave=True)            #Progressbar
        for x in pbar:
          z = tf.random.normal(shape=(x.shape[0], self.latent_dim))     #Sample normal distributed noise
          # Persistent tape: two separate gradients are taken from the same forward pass.
          with tf.GradientTape(persistent=True) as tape:
            x_ = self.generator(z)                                      #Generate x from noise
            real_pred = self.discriminator(x)                           #D on real data
            fake_pred = self.discriminator(x_)                          #D on generated data
            d_loss = self.loss(tf.ones_like(real_pred), real_pred)+self.loss(tf.zeros_like(fake_pred), fake_pred) #Discriminator loss
            g_loss = self.loss(tf.ones_like(fake_pred), fake_pred) #Generator loss

          d_gradients = tape.gradient(d_loss, self.discriminator.trainable_weights)               #Discriminator gradients
          g_gradients = tape.gradient(g_loss, self.generator.trainable_weights)                   #Generator gradients
          del tape                                                                                #Release the persistent tape
          # NOTE(review): one optimizer instance updates both networks - confirm
          # the installed Keras optimizer accepts two different variable sets.
          self.optimizer.apply_gradients(zip(d_gradients, self.discriminator.trainable_weights))  #Update Discriminator parameters
          self.optimizer.apply_gradients(zip(g_gradients, self.generator.trainable_weights))      #Update Generator parameters
    
    def sample(self,X_train):