## Pre-processing and Visualization

In [1]:
# Include the required libraries.
import random
import pandas as pd
from deap import creator, base, tools, algorithms
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support, mutual_info_score 
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

random.seed(21)

In [2]:
# Read the CSV file data.
df = pd.read_csv('20 Percent Training Set.csv',
                 names = ["feat_{0}".format(i) for i in range(1, 42)] + ["label", "NA"])
df.drop('NA', axis = 1, inplace = True)

In [3]:
df.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_33,feat_34,feat_35,feat_36,feat_37,feat_38,feat_39,feat_40,feat_41,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [4]:
# Read the txt file data.
df_test = pd.read_csv('KDDTest-21.txt',
                      names = ["feat_{0}".format(i) for i in range(1, 42)] + ["label", "NA"])
df_test.drop('NA', axis = 1, inplace = True)

In [5]:
df_test.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_33,feat_34,feat_35,feat_36,feat_37,feat_38,feat_39,feat_40,feat_41,label
0,13,tcp,telnet,SF,118,2425,0,0,0,0,...,10,0.38,0.12,0.04,0.0,0.0,0.0,0.12,0.3,guess_passwd
1,0,udp,private,SF,44,0,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpguess
2,0,tcp,telnet,S3,0,44,0,0,0,0,...,79,0.31,0.61,0.0,0.0,0.21,0.68,0.6,0.0,processtable
3,0,udp,private,SF,53,55,0,0,0,0,...,255,1.0,0.0,0.87,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,private,SH,0,0,0,0,0,0,...,1,0.06,1.0,1.0,0.0,1.0,1.0,0.0,0.0,nmap


## Feature Encoding for Decision Tree Classifier

In [6]:
# Prepare the test and training datasets.
X_train = df.iloc[:, : -1]
Y_train = df.loc[:, ["label"]]

X_test = df_test.iloc[:, : -1]
Y_test = df_test.loc[:, ["label"]]

In [7]:
# Encode the features for Decision Tree Classifier
le1 = LabelEncoder()
le1.fit(X_train.feat_2.tolist() + X_test.feat_2.tolist())
X_train.feat_2 = le1.transform(X_train.feat_2.tolist())
X_test.feat_2 = le1.transform(X_test.feat_2.tolist())

le2 = LabelEncoder()
le2.fit(X_train.feat_3.tolist() + X_test.feat_3.tolist())
X_train.feat_3 = le2.transform(X_train.feat_3.tolist())
X_test.feat_3 = le2.transform(X_test.feat_3.tolist())

le3 = LabelEncoder()
le3.fit(X_train.feat_4.tolist() + X_test.feat_4.tolist())
X_train.feat_4 = le3.transform(X_train.feat_4.tolist())
X_test.feat_4 = le3.transform(X_test.feat_4.tolist())

In [8]:
# Print the unique class labels.
Y_train.label.unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back',
       'guess_passwd', 'ftp_write', 'multihop', 'rootkit',
       'buffer_overflow', 'imap', 'warezmaster', 'phf', 'land',
       'loadmodule', 'spy'], dtype=object)

## NSGA-II Implementation

In [9]:
creator.create("FitnessMulti", base.Fitness, weights = (-1.0, 1.0))
creator.create("Individual", list, fitness = creator.FitnessMulti)

# Function for emulating the objective function.
def evalFitness(individual):
    feat = ["feat_{0}".format(i) for i in range(1, 42) if individual[i - 1] == 1]
    pred = DecisionTreeClassifier().fit(X_train.loc[:, feat], Y_train).predict(X_train.loc[:, feat])
    return mutual_info_score(Y_train.label, pred), sum(individual)

# Initialization
toolbox = base.Toolbox()
toolbox.register("bit", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.bit, n = 41)
toolbox.register("population", tools.initRepeat, list, toolbox.individual, n = 25)
toolbox.register("evaluate", evalFitness)
toolbox.register("mate", tools.cxUniform, indpb = 0.1)
toolbox.register("mutate", tools.mutFlipBit, indpb = 0.05)
toolbox.register("select", tools.selNSGA2)

population = toolbox.population()
fits = toolbox.map(toolbox.evaluate, population)

for fit, ind in zip(fits, population):
    ind.fitness.values = fit

for gen in range(50):
    offspring = algorithms.varOr(population, toolbox, lambda_ = 25, cxpb = 0.5, mutpb = 0.1)
    fits = toolbox.map(toolbox.evaluate, offspring)
    
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    
    population = toolbox.select(offspring + population, k = 25)

In [10]:
# Print the training results.
for individual in population:
    feat = ["feat_{0}".format(i) for i in range(1, 42) if individual[i - 1] == 1]
    print('Accuracy Score: ', accuracy_score(Y_train, DecisionTreeClassifier().fit(X_train.loc[:, feat], Y_train).predict(X_train.loc[:, feat])), 'Individual: ', sum(individual))

Accuracy Score:  0.9891235312797714 Individual:  33
Accuracy Score:  0.9891235312797714 Individual:  33
Accuracy Score:  0.9921006668783741 Individual:  34
Accuracy Score:  0.9921006668783741 Individual:  34
Accuracy Score:  0.9921006668783741 Individual:  34
Accuracy Score:  0.9921006668783741 Individual:  34
Accuracy Score:  0.9891235312797714 Individual:  31
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Score:  0.9983328040647824 Individual:  34
Accuracy Sco

In [11]:
# Print the test results.
for individual in population:
    feat = ["feat_{0}".format(i) for i in range(1, 42) if individual[i - 1] == 1]
    print('Accuracy: ', accuracy_score(Y_test, DecisionTreeClassifier().fit(X_test.loc[:, feat], Y_test).predict(X_test.loc[:, feat])), 'Individual: ', sum(individual))

Accuracy:  0.9893670886075949 Individual:  33
Accuracy:  0.9893670886075949 Individual:  33
Accuracy:  0.9935864978902954 Individual:  34
Accuracy:  0.9935864978902954 Individual:  34
Accuracy:  0.9935864978902954 Individual:  34
Accuracy:  0.9935864978902954 Individual:  34
Accuracy:  0.9893670886075949 Individual:  31
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Individual:  34
Accuracy:  0.9919831223628692 Indi