In [8]:
import os
import tensorflow as tf
import numpy as np
from tensorflow import keras
import random
import time
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import tree
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
%matplotlib inline

random.seed(5)


#Prevent Tensorflow messages from showing up
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
# Neural network with properties specified
def makeDNN(numOfLayers, numOfNeurons, activationFunc):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten(input_dim=122))
    model.add(tf.keras.layers.Dense(numOfNeurons, activation = activationFunc))
    model.add(tf.keras.layers.BatchNormalization())
    for i in range(numOfLayers):
        model.add(tf.keras.layers.Dense(numOfNeurons, activation = activationFunc,kernel_regularizer='l2')) # Regularization added
        model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dense(5, activation = 'softmax'))
    model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = [keras.metrics.SparseCategoricalAccuracy()])
    return model

class NeuralNet:
    def __init__(self, layers, nodes, activation):
        self.layers = layers
        self.nodes = nodes
        self._testAccuracy = 0
        self._testLoss = 1
        self.activation = activation
        self.model = makeDNN(layers, nodes, activation)

    def setFitness(self, testX, testY):
        self._testLoss, self._testAccuracy = self.model.evaluate(testX, testY)

    def getAccuracy(self):
        return self._testAccuracy

    def getLoss(self):
        return self._testLoss

    def train(self, trainX, trainY, epoch):
        self.model.fit(trainX, trainY, epochs = epoch)

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
class NeuroEvolution:
    
    population = []
    newPopulation = []
    populationSize = 0
    maxLayers = 0
    maxNodes = 0
    maxIterations = 0
    threshold = 0
    activations = ['relu', 'sigmoid', 'tanh']
    trend = []

    def __init__(self, populationSize = 10, maxLayers = 7, maxNodes = 16, maxIterations = 5, threshold = 1):
        self.populationSize = populationSize
        self.maxLayers = maxLayers
        self.maxNodes = maxNodes
        self.maxIterations = maxIterations
        self.threshold = threshold

    def initialize(self): # Generate random population
        for i in range(self.populationSize):
            self.newPopulation.append(NeuralNet(random.randint(1, self.maxLayers), random.randint(1, self.maxNodes), self.activations[random.randint(0, len(self.activations) - 1)]))

    # Mutation
    def mutate(self, nn):
        chance = random.randint(1, 1000)
        if(chance > 990):
            nn.layers = random.randint(1, self.maxLayers)
        chance = random.randint(1, 1000)
        if(chance > 990):
            nn.nodes = random.randint(1, self.maxNodes)
        chance = random.randint(1, 1000)
        if(chance > 990):
            nn.activation = self.activations[random.randint(0, len(self.activations) - 1)]
        return nn

    # Crossover
    def crossover(self, index1, index2):
        randLayers = index1 if random.randint(0, 2) == 0 else index2
        randNodes = index1 if random.randint(0, 2) == 0 else index2
        randActivation = index1 if random.randint(0, 2) == 0 else index2
        return NeuralNet(self.population[randLayers].layers, self.population[randNodes].nodes, self.population[randActivation].activation)


    def trainAll(self, trainX, trainY, epoch):
        for nn in self.newPopulation:
            nn.train(trainX, trainY, epoch)

    def setAllFitness(self, testX, testY):
        for nn in self.newPopulation:
            nn.setFitness(testX, testY)

    # Print intermediate best result for an iteration
    def printStatus(self, i):
        print("Iteration " + str(i))
        print("\tBest Accuracy = " + str(self.population[0].getAccuracy()))
        print("\tBest Loss = " + str(self.population[0].getLoss()))
        for individual in self.population:
            print(individual.getAccuracy())
        self.trend.append(self.population[0].getAccuracy())

    # Print final best result
    def printBestResult(self):
        print("Total number of iterations = " + str(self.maxIterations))
        print("Best Accuracy = " + str(self.population[0].getAccuracy()))
        print("Best Loss = " + str(self.population[0].getLoss()))
        print("Number of Layers = " + str(self.population[0].layers))
        print("Number of Nodes = " + str(self.population[0].nodes))
        print("Activation Function = " + str(self.population[0].activation))
        for ind in self.trend:
            print(ind)

    def run(self, trainX, trainY, testX, testY, epoch):
        self.initialize() # Population initialization
        tempStart = start
        for i in range(self.maxIterations):
            self.trainAll(trainX, trainY, epoch)
            self.setAllFitness(testX, testY)
            self.newPopulation = sorted(self.newPopulation, key = lambda net: net.getAccuracy(), reverse = True)
            self.population = self.newPopulation[:self.populationSize]
            self.newPopulation = []
            self.printStatus(i)
            if self.population[0].getAccuracy() >= self.threshold:
                break
            for j in range(int(self.populationSize/2)):
                parent1 = 0
                parent2 = 0
                while parent1 == parent2:
                    parent1 = random.randint(0, int(self.populationSize/2))
                    parent2 = random.randint(0, int(self.populationSize/2))
                self.newPopulation.append(self.crossover(parent1, parent2))
            self.newPopulation += self.population[:int(self.populationSize/2)]
            for nn in self.newPopulation:
                nn = self.mutate(nn) # Mutation
        self.printBestResult()

In [9]:
column_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class','difficulty']

In [10]:
class_dict = {'DOS':0, 'R2L':1, 'PROBE':2, 'U2R':3, 'NORMAL':4}  # custom labels hard coded

# Reaing and Preprocessing
def read():
    train = pd.read_csv(r'../../Datasets/NSL_KDD/KDDTrain+.txt', sep=',',header = None, names = column_names) 
    train.drop(['difficulty'],axis=1,inplace=True)
    change_label(train)
    train_x = train[train.columns[:-1]]
    normalization(train_x)
    train_x = one_hot(train_x)
    train_y = train[train.columns[-1]]
    test = pd.read_csv(r'../../Datasets/NSL_KDD/KDDTest+.txt', sep=',',header = None, names = column_names)
    test.drop(['difficulty'],axis=1,inplace=True)
    change_label(test)
    test_x = test[test.columns[:-1]]
    normalization(test_x)
    test_x = one_hot(test_x)
    test_y = test[test.columns[-1]]
    total_columns = list(set(train_x).union(set(test_x)))
    total_columns.sort() 
    for j in set(total_columns)-set(train_x):
        train_x[j] = 0.0
    for j in set(total_columns)-set(test_x):
        test_x[j] = 0.0
    train_x = train_x[total_columns]
    test_x = test_x[total_columns]
    return train_x,train_y,test_x,test_y

def change_label(df): # 5-classes including normal
    df['class'].replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm',
                         'worm'],'DOS',inplace=True)
    df['class'].replace(['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail',
       'snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],'R2L',inplace=True)
    df['class'].replace(['ipsweep','mscan','nmap','portsweep','saint','satan'],'PROBE',inplace=True)
    df['class'].replace(['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],'U2R',inplace=True)
    df['class'].replace(['normal'],'NORMAL',inplace=True)
    df['class'] = [class_dict[i] for i in df['class']]
    
def one_hot(df): # 3 categorical variables
    category_columns = ['protocol_type','service','flag']
    categorical = df[category_columns]
    categorical = pd.get_dummies(categorical,columns = category_columns)
    df = pd.concat([df, categorical], axis=1, join='inner')
    df = df[list(set(df.columns) - set(category_columns))]
    return df
    
def normalization(df): #Normalization 
    std_scaler = StandardScaler()
    numeric_col = df.select_dtypes(include='float').columns
    df[numeric_col] = StandardScaler().fit_transform(df[numeric_col])
    print("finished")

In [11]:
train_x,train_y,test_x,test_y = read()
train_x,train_y,test_x,test_y = train_x.to_numpy(),train_y.to_numpy(),test_x.to_numpy(),test_y.to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished
finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [None]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [None]:
smt = SMOTETomek(random_state=10,n_jobs=-1)
train_x, train_y = smt.fit_resample(train_x, train_y)

In [None]:
populationSize = 25
maxLayers = 15
maxNodes = 35
maxIterations = 5
threshold = 0.85

Evolutor = NeuroEvolution(populationSize, maxLayers, maxNodes, maxIterations, threshold)
Evolutor.run(train_x,train_y,test_x,test_y,5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5

In [None]:
# Result obtained from Neuroevolutionary algorithm
model = makeDNN(2, 50, 'relu')

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
history2 = model.fit(train_x, train_y, validation_split=0.1, epochs=50,callbacks=[callback], verbose=2)

Epoch 1/50
9471/9471 - 37s - loss: 0.3448 - accuracy: 0.9242 - val_loss: 2.1973 - val_accuracy: 0.1006 - 37s/epoch - 4ms/step
Epoch 2/50
9471/9471 - 35s - loss: 0.1678 - accuracy: 0.9573 - val_loss: 3.7711 - val_accuracy: 0.0000e+00 - 35s/epoch - 4ms/step
Epoch 3/50
9471/9471 - 36s - loss: 0.1485 - accuracy: 0.9618 - val_loss: 3.5293 - val_accuracy: 0.0231 - 36s/epoch - 4ms/step
Epoch 4/50
9471/9471 - 35s - loss: 0.1326 - accuracy: 0.9664 - val_loss: 2.8838 - val_accuracy: 0.0000e+00 - 35s/epoch - 4ms/step


In [None]:
print(classification_report(test_y,np.argmax(model2.predict(test_x), axis=-1)))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      7460
           1       0.79      0.11      0.19      2885
           2       0.73      0.86      0.79      2421
           3       0.66      0.52      0.58        67
           4       0.76      0.95      0.84      9711

    accuracy                           0.81     22544
   macro avg       0.64      0.56      0.55     22544
weighted avg       0.81      0.81      0.77     22544



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
