In [1]:
# Original Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# import stats for ANOVA feature selection
from scipy import stats

In [2]:
KDDTrain = pd.read_csv('C:/Users/Nimish Bhatt/Downloads/archive (34)/KDDTrain+.txt', header = None) # Data with difficulty level
# Column Headings
KDDTrain.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class', 'difficulty']

# We will not utilize the 'difficulty' feature for now
KDDTrain.drop('difficulty', axis=1, inplace=True)

In [3]:
KDDTest = pd.read_csv('C:/Users/Nimish Bhatt/Downloads/archive (34)/KDDTest+.txt', header = None) # Data with difficulty level
# Column Headings
KDDTest.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class', 'difficulty']

# We will not utilize the 'difficulty' feature for now
KDDTest.drop('difficulty', axis=1, inplace=True)

In [4]:
from sdv.single_table import CTGANSynthesizer

synthesizer = CTGANSynthesizer.load(
    filepath="C:/Users/Nimish Bhatt/Downloads/my_final_synthesizer.pkl"
)

In [5]:
KDDTrainsyn = synthesizer.sample(
    num_rows=300000,
    batch_size=1000
)

Sampling rows: 100%|██████████| 300000/300000 [01:25<00:00, 3502.14it/s]


In [6]:
KDDTrainsyn.rename(columns={'outcome': 'class'}, inplace=True)

# We will not utilize the 'difficulty' feature for now
KDDTrainsyn.drop('difficulty', axis=1, inplace=True)

KDDTrainsyn

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,IRC,SF,9400,5314,0,0,0,0,...,14,1.00,0.00,1.00,0.52,0.00,0.00,0.00,0.0,buffer_overflow
1,0,tcp,uucp,REJ,0,0,0,0,0,0,...,13,0.06,0.07,0.00,0.00,0.00,0.00,1.00,1.0,neptune
2,21,tcp,ftp,SF,1889,0,0,0,0,0,...,20,0.21,0.06,0.00,0.00,0.00,0.00,0.00,0.0,normal
3,1,tcp,http,SF,0,3927,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,normal
4,1,tcp,courier,S0,0,6478,0,0,0,0,...,14,0.02,0.05,0.00,0.00,0.99,1.00,0.00,0.0,neptune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,icmp,ecr_i,SF,7087,693,0,0,0,0,...,253,1.00,0.00,0.99,0.00,0.00,0.00,0.00,0.0,smurf
299996,5,udp,private,RSTR,4663,0,0,0,0,0,...,0,0.00,0.41,0.10,0.00,0.00,0.00,0.34,0.0,normal
299997,24,tcp,ftp,SF,8049,4486,0,0,0,4,...,21,0.18,0.02,0.47,0.00,0.00,0.00,0.00,0.0,warezclient
299998,7,tcp,telnet,SF,1969,43398,0,0,0,2,...,17,1.00,0.00,0.03,0.18,0.00,0.00,0.00,0.0,normal


In [7]:
# Use Label Encoding for categorical features (including 'class')

# Encode class label with LabelEncoder
label_encoder = preprocessing.LabelEncoder()

# Define the columns to LabelEncode
categorical_columns=['protocol_type', 'service', 'flag']

# Encode categorical columns using LabelEncoder
label_encoder = preprocessing.LabelEncoder()
for column in categorical_columns:
    KDDTrain[column] = label_encoder.fit_transform(KDDTrain[column])
    KDDTest[column] = label_encoder.transform(KDDTest[column])
    
    
# Encode categorical columns using LabelEncoder
label_encoder = preprocessing.LabelEncoder()
for column in categorical_columns:
    KDDTrainsyn[column] = label_encoder.fit_transform(KDDTrainsyn[column])

In [8]:
# We replace all instances with a value of 2 to 1 because the feature should be a binary value (0 or 1)
KDDTrain['su_attempted'] = KDDTrain['su_attempted'].replace(2, 1)
KDDTest['su_attempted'] = KDDTest['su_attempted'].replace(2, 1)
KDDTrainsyn['su_attempted'] = KDDTrainsyn['su_attempted'].replace(2, 1)

In [9]:
# We drop 'num_outbound_cmds' from both training and testing dataset because every instance is equal to 0 in both datasets
KDDTrain.drop("num_outbound_cmds",axis=1,inplace=True)
KDDTest.drop("num_outbound_cmds",axis=1,inplace=True)
KDDTrainsyn.drop("num_outbound_cmds",axis=1,inplace=True)

In [10]:
KDDTrainsyn

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,1,0,9,9400,5314,0,0,0,0,...,14,1.00,0.00,1.00,0.52,0.00,0.00,0.00,0.0,buffer_overflow
1,0,1,66,1,0,0,0,0,0,0,...,13,0.06,0.07,0.00,0.00,0.00,0.00,1.00,1.0,neptune
2,21,1,19,9,1889,0,0,0,0,0,...,20,0.21,0.06,0.00,0.00,0.00,0.00,0.00,0.0,normal
3,1,1,24,9,0,3927,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,normal
4,1,1,6,5,0,6478,0,0,0,0,...,14,0.02,0.05,0.00,0.00,0.99,1.00,0.00,0.0,neptune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,15,9,7087,693,0,0,0,0,...,253,1.00,0.00,0.99,0.00,0.00,0.00,0.00,0.0,smurf
299996,5,2,49,4,4663,0,0,0,0,0,...,0,0.00,0.41,0.10,0.00,0.00,0.00,0.34,0.0,normal
299997,24,1,19,9,8049,4486,0,0,0,4,...,21,0.18,0.02,0.47,0.00,0.00,0.00,0.00,0.0,warezclient
299998,7,1,60,9,1969,43398,0,0,0,2,...,17,1.00,0.00,0.03,0.18,0.00,0.00,0.00,0.0,normal


In [11]:
# Define the columns to scale
columns_to_scale=['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']

# Scale numerical columns using MinMax
scaler = MinMaxScaler()
for column in columns_to_scale:
    KDDTrain[column] = scaler.fit_transform(KDDTrain[[column]])
    KDDTest[column] = scaler.transform(KDDTest[[column]])

In [12]:
scaler = MinMaxScaler()
for column in columns_to_scale:
    KDDTrainsyn[[column]] = scaler.fit_transform(KDDTrainsyn[[column]])

In [13]:
KDDTrainsyn

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.000000,1,0,9,0.147389,0.107341,0,0.0,0.0,0.0,...,0.054902,1.00,0.00,1.00,0.52,0.00,0.00,0.00,0.0,buffer_overflow
1,0.000000,1,66,1,0.000000,0.000000,0,0.0,0.0,0.0,...,0.050980,0.06,0.07,0.00,0.00,0.00,0.00,1.00,1.0,neptune
2,0.000963,1,19,9,0.029619,0.000000,0,0.0,0.0,0.0,...,0.078431,0.21,0.06,0.00,0.00,0.00,0.00,0.00,0.0,normal
3,0.000046,1,24,9,0.000000,0.079324,0,0.0,0.0,0.0,...,1.000000,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,normal
4,0.000046,1,6,5,0.000000,0.130853,0,0.0,0.0,0.0,...,0.054902,0.02,0.05,0.00,0.00,0.99,1.00,0.00,0.0,neptune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.000000,0,15,9,0.111122,0.013998,0,0.0,0.0,0.0,...,0.992157,1.00,0.00,0.99,0.00,0.00,0.00,0.00,0.0,smurf
299996,0.000229,2,49,4,0.073114,0.000000,0,0.0,0.0,0.0,...,0.000000,0.00,0.41,0.10,0.00,0.00,0.00,0.34,0.0,normal
299997,0.001100,1,19,9,0.126205,0.090615,0,0.0,0.0,1.0,...,0.082353,0.18,0.02,0.47,0.00,0.00,0.00,0.00,0.0,warezclient
299998,0.000321,1,60,9,0.030873,0.876621,0,0.0,0.0,0.5,...,0.066667,1.00,0.00,0.03,0.18,0.00,0.00,0.00,0.0,normal


In [14]:
# Change training attack labels to their respective attack class for multiclass classification
KDDTrain['class'].replace(['neptune', 'smurf', 'back', 'teardrop', 'pod', 'land'],'DoS',inplace=True) # 6 sub classes of DoS
KDDTrain['class'].replace(['satan', 'ipsweep', 'portsweep', 'nmap'],'Probe',inplace=True) # 4 sub classes of Probe
KDDTrain['class'].replace(['warezclient', 'guess_passwd', 'warezmaster', 'imap', 'ftp_write', 'multihop', 'phf','spy'],'R2L',inplace=True) # 8 sub classes of R2L
KDDTrain['class'].replace(['buffer_overflow', 'rootkit', 'loadmodule','perl'],'U2R',inplace=True) # 4 sub classes of U2R

# Change training synthetic attack labels to their respective attack class for multiclass classification
KDDTrainsyn['class'].replace(['neptune', 'smurf', 'back', 'teardrop', 'pod', 'land'],'DoS',inplace=True) # 6 sub classes of DoS
KDDTrainsyn['class'].replace(['satan', 'ipsweep', 'portsweep', 'nmap'],'Probe',inplace=True) # 4 sub classes of Probe
KDDTrainsyn['class'].replace(['warezclient', 'guess_passwd', 'warezmaster', 'imap', 'ftp_write', 'multihop', 'phf','spy'],'R2L',inplace=True) # 8 sub classes of R2L
KDDTrainsyn['class'].replace(['buffer_overflow', 'rootkit', 'loadmodule','perl'],'U2R',inplace=True) # 4 sub classes of U2R

# Change testing attack labels to their respective attack class for multiclass classification
KDDTest['class'].replace(['neptune', 'apache2', 'processtable', 'smurf', 'back', 'mailbomb', 'pod', 'teardrop', 'land', 'udpstorm'],'DoS',inplace=True) # 10 sub classes of DoS
KDDTest['class'].replace(['mscan', 'satan', 'saint', 'portsweep', 'ipsweep', 'nmap'],'Probe',inplace=True) # 6 sub classes of Probe
KDDTest['class'].replace(['guess_passwd', 'warezmaster', 'snmpguess', 'snmpgetattack', 'httptunnel', 'multihop', 'named', 'sendmail', 'xlock', 'xsnoop', 'ftp_write', 'worm', 'phf', 'imap'],'R2L',inplace=True) # 14 sub classes of R2L
KDDTest['class'].replace(['buffer_overflow', 'ps', 'rootkit', 'xterm', 'loadmodule', 'perl', 'sqlattack'],'U2R',inplace=True) # 7 sub classes of U2R

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  KDDTrain['class'].replace(['neptune', 'smurf', 'back', 'teardrop', 'pod', 'land'],'DoS',inplace=True) # 6 sub classes of DoS
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  KDDTrainsyn['class'].replace(['neptune', 'smurf', 'back', 'teardrop', 'pod', 'land'],'DoS',inplace=True) # 

In [15]:
# Distribution of attack classes in training dataset
KDDTrain['class'].value_counts()


class
normal    67343
DoS       45927
Probe     11656
R2L         995
U2R          52
Name: count, dtype: int64

In [16]:
# Distribution of attack classes in testing dataset
KDDTest['class'].value_counts()

class
normal    9711
DoS       7458
R2L       2887
Probe     2421
U2R         67
Name: count, dtype: int64

In [17]:
# Distribution of attack classes in training dataset
KDDTrainsyn['class'].value_counts()


class
normal    152952
DoS        99343
Probe      31961
R2L        11582
U2R         4162
Name: count, dtype: int64

In [18]:
# Encode class label with LabelEncoder
label_encoder = preprocessing.LabelEncoder()
KDDTrain['class'] = label_encoder.fit_transform(KDDTrain['class'])
KDDTest['class'] = label_encoder.fit_transform(KDDTest['class'])
KDDTrainsyn['class'] = label_encoder.fit_transform(KDDTrainsyn['class'])

In [19]:
# Distribution of attack classes in training dataset
KDDTrain['class'].value_counts()


class
4    67343
0    45927
1    11656
2      995
3       52
Name: count, dtype: int64

In [20]:
# Distribution of attack classes in testing dataset
KDDTest['class'].value_counts()

class
4    9711
0    7458
2    2887
1    2421
3      67
Name: count, dtype: int64

In [21]:
# Distribution of attack classes in training dataset
KDDTrainsyn['class'].value_counts()

class
4    152952
0     99343
1     31961
2     11582
3      4162
Name: count, dtype: int64

In [22]:
# There is already a large sample size for class '4' and '0', no need for additional synthetic data
# Drop rows with class '4' or '0'
KDDTrainsyn = KDDTrainsyn[(KDDTrainsyn['class'] != 4) & (KDDTrainsyn['class'] != 0)]

In [23]:
KDDTrainsyn['class'].value_counts()

class
1    31961
2    11582
3     4162
Name: count, dtype: int64

In [24]:
# Drop rows with class '4' 
KDDTrain = KDDTrain[(KDDTrain['class'] != 4)]
KDDTest = KDDTest[(KDDTest['class'] != 4)]

In [25]:
# Assuming synth_data and KDDTrain are both pandas DataFrames
# If they are not, you can convert them to DataFrames using pd.DataFrame()

# Concatenate the synthetic samples to the original dataset
concatenated_data = pd.concat([KDDTrain, KDDTrainsyn], ignore_index=True)

In [26]:
# Now, 'concatenated_data' contains both the original dataset and the synthetic samples
concatenated_data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.000000,1,49,5,0.000000,0.000000,0,0.0,0.0,0.0,...,0.101961,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.0,0
1,0.000000,1,49,1,0.000000,0.000000,0,0.0,0.0,0.0,...,0.074510,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.0,0
2,0.000000,1,49,5,0.000000,0.000000,0,0.0,0.0,0.0,...,0.035294,0.04,0.05,0.00,0.00,1.00,1.00,0.00,0.0,0
3,0.000000,1,49,5,0.000000,0.000000,0,0.0,0.0,0.0,...,0.058824,0.06,0.07,0.00,0.00,1.00,1.00,0.00,0.0,0
4,0.000000,1,51,5,0.000000,0.000000,0,0.0,0.0,0.0,...,0.090196,0.09,0.05,0.00,0.00,1.00,1.00,0.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106330,0.000275,1,14,1,0.122097,0.000000,0,0.0,0.0,0.0,...,0.070588,1.00,0.00,1.00,0.52,0.00,0.00,1.00,1.0,1
106331,0.028560,1,19,9,0.057497,0.000000,0,0.0,0.0,1.0,...,0.337255,0.28,0.02,0.00,0.00,0.00,0.00,0.00,0.0,2
106332,0.000000,0,14,9,0.132070,0.000000,0,0.0,0.0,0.0,...,0.274510,1.00,0.00,0.99,0.50,0.00,0.00,0.00,0.0,1
106333,0.001100,1,19,9,0.126205,0.090615,0,0.0,0.0,1.0,...,0.082353,0.18,0.02,0.47,0.00,0.00,0.00,0.00,0.0,2


In [27]:
# Distribution of classes in dataset after synthetic concatenation
concatenated_data['class'].value_counts()

class
0    45927
1    43617
2    12577
3     4214
Name: count, dtype: int64

In [28]:
KDDTest['class'].value_counts()

class
0    7458
2    2887
1    2421
3      67
Name: count, dtype: int64

In [29]:
# Use this code for Synthetic + Real Samples
X_train = concatenated_data.iloc[:, :-1].values.astype('float32')
y_train = concatenated_data.iloc[:, -1].values
X_test = KDDTest.iloc[:, :-1].values.astype('float32')
y_test = KDDTest.iloc[:, -1].values

In [30]:
# Deep Neural Network for 4 class classification

# Import necessary libraries
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.models import Model
from keras.utils import plot_model, to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as pyplot


# Number of classes 0 = DoS, 1 = Probe, 2 = R2L, 3 = U2R | lexicographic order | 4 class classification
n_classes = 4
y_train_encoded = to_categorical(y_train, num_classes=n_classes)
y_test_encoded = to_categorical(y_test, num_classes=n_classes)

# Number of features in the input data (40 total features)
n_inputs = 40

# Define the input layer
visible = Input(shape=(n_inputs,))

# Hidden Layer 1
e = Dense(80, activation='relu')(visible)  # 80 neurons with ReLU activation

# Hidden layer 2
e = Dense(40, activation='relu')(e) # 40 neurons with ReLU activation

# Hidden Layer 3
e = Dense(4, activation='relu')(e) # 4 neurons with ReLU activation

# Output Layer
output = Dense(4, activation='softmax')(e) # Condensed to 4 neurons (for 4 classes)

# Define the Deep Neural Network model
model = Model(inputs=visible, outputs=output)

# Cast the input data to float32
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Compile the model with a suitable loss function for classification, e.g., categorical cross-entropy
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Possible Better performance when a fixed learning rate is NOT used with Adam Optimizer, however not as stable/consistent overall
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping with a patience of 6 steps
early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

# Fit the Deep Neural Network model to reconstruct input with batch size of 32 and 9 epochs
history = model.fit(X_train, y_train_encoded, epochs=50, batch_size=32, verbose=2, validation_split=0.15, callbacks=[early_stopping])

Epoch 1/50
2825/2825 - 4s - 1ms/step - accuracy: 0.8634 - loss: 0.4331 - val_accuracy: 0.8454 - val_loss: 0.4094
Epoch 2/50
2825/2825 - 2s - 765us/step - accuracy: 0.9477 - loss: 0.1562 - val_accuracy: 0.8762 - val_loss: 0.3254
Epoch 3/50
2825/2825 - 2s - 730us/step - accuracy: 0.9530 - loss: 0.1399 - val_accuracy: 0.8803 - val_loss: 0.3289
Epoch 4/50
2825/2825 - 2s - 745us/step - accuracy: 0.9555 - loss: 0.1317 - val_accuracy: 0.8820 - val_loss: 0.3128
Epoch 5/50
2825/2825 - 2s - 686us/step - accuracy: 0.9568 - loss: 0.1267 - val_accuracy: 0.8776 - val_loss: 0.3634
Epoch 6/50
2825/2825 - 2s - 743us/step - accuracy: 0.9579 - loss: 0.1209 - val_accuracy: 0.8808 - val_loss: 0.3253
Epoch 7/50
2825/2825 - 2s - 689us/step - accuracy: 0.9588 - loss: 0.1171 - val_accuracy: 0.8899 - val_loss: 0.3309
Epoch 8/50
2825/2825 - 2s - 705us/step - accuracy: 0.9600 - loss: 0.1138 - val_accuracy: 0.8937 - val_loss: 0.2945
Epoch 9/50
2825/2825 - 2s - 701us/step - accuracy: 0.9613 - loss: 0.1099 - val_acc

In [31]:
!pip install graphviz





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
# Define a deep network model
import tensorflow as tf
from tensorflow import keras

neural_network = Model(inputs=visible, outputs=output)
#plot_model(neural_network, 'DNN_CTGAN.png', show_shapes=True) #Have to recheck

# Save the neural_network model in Keras format
neural_network.save('DNN_CTGAN_AUG.keras')

neural_network = tf.keras.models.load_model('DNN_CTGAN_AUG.keras')


# Data cleaning, preprocessing, sampling, and neural network has been applied prior to training
# SoftMax Regression Multiclass Classification (4 class)

from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

# Make predictions on the test data
y_pred = neural_network.predict(X_test)

# Convert the predicted probabilities to class labels
y_pred_classes = np.argmax(y_pred, axis=1)

# Convert one-hot encoded true labels to class labels
y_test_classes = np.argmax(y_test_encoded, axis=1)

# Print classification report and confusion matrix on the test set
class_names = ["DoS", "Probe", "R2L", "U2R"]
print("Classification Report:")
print(classification_report(y_test_classes, y_pred_classes, target_names=class_names))

print("Confusion Matrix:")
print(confusion_matrix(y_test_classes, y_pred_classes), "\n")

# Calculate MCC
mcc_score = matthews_corrcoef(y_test_classes, y_pred_classes)
print("MCC Score:", mcc_score)

[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step
Classification Report:
              precision    recall  f1-score   support

         DoS       0.93      0.86      0.89      7458
       Probe       0.57      0.94      0.71      2421
         R2L       0.88      0.57      0.69      2887
         U2R       0.36      0.46      0.41        67

    accuracy                           0.80     12833
   macro avg       0.68      0.71      0.67     12833
weighted avg       0.84      0.80      0.81     12833

Confusion Matrix:
[[6397  862  199    0]
 [ 154 2264    3    0]
 [ 357  838 1638   54]
 [   0    8   28   31]] 

MCC Score: 0.6859039271716596
