In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
# Column headers applied to the header-less NSL-KDD CSV files, in file order.
col_names = [
    # basic connection features
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    # content features
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    # traffic features
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    # destination-host traffic features
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    # target column
    "label",
]

In [33]:
# Both CSV files ship without a header row, so the column names are supplied explicitly.
csv_read_options = dict(header=None, names=col_names)
train_set = pd.read_csv('./Dataset/NSL_KDD_Train.csv', **csv_read_options)
test_set = pd.read_csv('./Dataset/NSL_KDD_Test.csv', **csv_read_options)

In [34]:
# Raw label names grouped by the integer class code they are collapsed into.
label_groups = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm', 'worm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
        'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'sqlattack',
        'xterm'],
}

# Flatten the groups into the {label name: class code} dict that `replace` expects.
label_to_code = {
    label_name: class_code
    for class_code, label_names in label_groups.items()
    for label_name in label_names
}

train_set_label = train_set['label']
newlabeldf = train_set_label.replace(label_to_code)

# Overwrite the string labels in the training frame with their class codes.
train_set['label'] = newlabeldf

In [35]:
# Raw label names grouped by the integer class code they are collapsed into.
label_groups = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm', 'worm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
        'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'sqlattack',
        'xterm'],
}

# Flatten the groups into the {label name: class code} dict that `replace` expects.
label_to_code = {
    label_name: class_code
    for class_code, label_names in label_groups.items()
    for label_name in label_names
}

test_set_label = test_set['label']
newlabeldf_test = test_set_label.replace(label_to_code)

# Overwrite the string labels in the test frame with their class codes.
test_set['label'] = newlabeldf_test

In [36]:
from sklearn.preprocessing import OneHotEncoder

In [37]:
# The three string-valued features that are one-hot encoded further down.
categorical_columns = ['protocol_type', 'service', 'flag']

# Take the same column subset from both splits.
train_categorical_values, test_categorical_values = (
    frame[categorical_columns] for frame in (train_set, test_set)
)
train_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [38]:
# protocol type
unique_protocol=sorted(train_set.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol_train=[string1 + x for x in unique_protocol]
print(unique_protocol_train)

# service
unique_service=sorted(train_set.service.unique())
string2 = 'service_'
unique_service_train=[string2 + x for x in unique_service]
print(unique_service_train)


# flag
unique_flag=sorted(train_set.flag.unique())
string3 = 'flag_'
unique_flag_train=[string3 + x for x in unique_flag]
print(unique_flag_train)


# put together
dumcols_train=unique_protocol_train + unique_service_train + unique_flag_train


#do it for test set
unique_service_test=sorted(test_set.service.unique())
unique_service_test=[string2 + x for x in unique_service_test]
dumcols_test=unique_protocol_train + unique_service_test + unique_flag_train

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

In [39]:
# Fit the one-hot encoder on the TRAINING categories only. The fitted encoder
# defines the feature space, so the test set must be pushed through the same
# mapping instead of re-fitting (re-fitting yields a different column set
# whenever the test data has a different set of categories).
# handle_unknown='ignore' encodes a category never seen in training as an
# all-zero group instead of raising at transform time.
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
train_categorical_values_enc = encoder.fit_transform(train_categorical_values)
train_cat_data = pd.DataFrame(
    train_categorical_values_enc.toarray(),
    columns=dumcols_train,
    index=train_categorical_values.index,  # keep row alignment for the later join
)


# test set: reuse the fitted encoder, so the columns are exactly the training
# columns (same names, same order). Categories absent from the test set simply
# come out as all-zero columns.
testdf_categorical_values_enc = encoder.transform(test_categorical_values)
testdf_cat_data = pd.DataFrame(
    testdf_categorical_values_enc.toarray(),
    columns=dumcols_train,
    index=test_categorical_values.index,
)

train_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
# Service values that occur in the training set but never in the test set,
# expressed as their one-hot column names.
string = 'service_'
trainservice = list(train_set['service'])
testservice = list(test_set['service'])
train_only_services = set(trainservice) - set(testservice)
difference = [string + service_name for service_name in list(train_only_services)]
difference

['service_http_8001',
 'service_http_2784',
 'service_aol',
 'service_red_i',
 'service_harvest',
 'service_urh_i']

In [41]:
# Give the test frame a constant-zero column for every train-only service so
# both encoded frames expose the same set of columns.
testdf_cat_data = testdf_cat_data.assign(**{col: 0 for col in difference})

for encoded_frame in (train_cat_data, testdf_cat_data):
    print(encoded_frame.shape)

(125973, 84)
(22544, 84)


In [42]:
# Raw categorical columns that are replaced by their one-hot encodings.
encoded_source_columns = ['flag', 'protocol_type', 'service']

# Attach the one-hot columns (index-aligned join) and drop the raw columns.
newdf_train = train_set.join(train_cat_data).drop(columns=encoded_source_columns)

# test data
newdf_test = test_set.join(testdf_cat_data).drop(columns=encoded_source_columns)

for merged_frame in (newdf_train, newdf_test):
    print(merged_frame.shape)

(125973, 123)
(22544, 123)


In [43]:
# Order the columns alphabetically in both frames so train and test share one
# column layout.
newdf_train = newdf_train.sort_index(axis=1)
newdf_test = newdf_test.sort_index(axis=1)

4. Ensemble

Hard Voting

In [44]:
# Split each frame into the target column and the remaining feature columns.
y_train3 = newdf_train['label']
x_train3 = newdf_train.drop(columns=['label'])

y_test3 = newdf_test['label']
x_test3 = newdf_test.drop(columns=['label'])

In [45]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler's default feature_range is (0, 1), so each feature is rescaled to [0, 1].
scaler = MinMaxScaler()

In [46]:
# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits. Both results are NumPy arrays.
scaler.fit(x_train3)
x_train3 = scaler.transform(x_train3)
x_test3 = scaler.transform(x_test3)

In [50]:
# Reshape for CNN and LSTM if necessary
X_train_cnn = x_train3.reshape(-1, 122, 1, 1)  # For CNN
X_test_cnn = x_train3.reshape(-1, 122, 1, 1)

X_train_lstm = x_train3.reshape(-1, 122, 1)    # For LSTM
X_test_lstm = x_train3.reshape(-1, 122, 1)

In [51]:
# Custom wrapper class for Keras models
class KerasModelWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model_path, input_shape):
        self.model_path = model_path
        self.model = tf.keras.models.load_model(model_path)
        self.input_shape = input_shape
    
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        X = X.reshape(self.input_shape)
        predictions = self.model.predict(X)
        return np.argmax(predictions, axis=1)

In [56]:
# Load saved models
ann_model = KerasModelWrapper('./Model/model_ANN.h5', (-1, 122))             # ANN: Input shape for the flat array
cnn_model = KerasModelWrapper('./Model/model_CNN.h5', (-1, 1, 122, 1))       # CNN: Input shape for 4D array
lstm_model = KerasModelWrapper('./Model/model_LSTM.h5', (-1, 1, 122))        # LSTM: Input shape for 3D array



In [57]:
# Create VotingClassifier
ensemble_model = VotingClassifier(
    estimators=[('ann', ann_model), ('cnn', cnn_model), ('lstm', lstm_model)],
    voting='hard'
)

In [58]:
# Fit on the TRAINING split, never on the test split. The wrapped Keras models
# are not retrained by their `fit`, but VotingClassifier still fits its label
# encoder on `y` here, so using the test labels would leak information from the
# evaluation data and is inconsistent with the soft-voting cell.
ensemble_model.fit(x_train3, y_train3)



In [59]:
# Majority-vote predictions on the held-out test features, scored against the
# true test labels.
y_pred = ensemble_model.predict(X=x_test3)
accuracy = accuracy_score(y_true=y_test3, y_pred=y_pred)

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step


In [60]:
# Report the hard-voting ensemble accuracy as a percentage with two decimals.
print(f"Ensemble Model Accuracy: {accuracy * 100:.2f}%")

Ensemble Model Accuracy: 63.77%


Soft Voting

In [87]:
class KerasModelWrapper2(ClassifierMixin, BaseEstimator):
    """Scikit-learn wrapper around a saved Keras model, with ``predict_proba``.

    ``predict_proba`` is what ``VotingClassifier(voting='soft')`` calls on each
    member. The mixin is listed before ``BaseEstimator`` as scikit-learn's
    estimator guidelines require; with the reverse order recent scikit-learn
    releases may not recognise the wrapper as a classifier.

    Parameters
    ----------
    model_path : str
        Path passed to ``tf.keras.models.load_model``.
    input_shape : tuple of int
        Target shape for ``reshape`` before the data is handed to the model
        (for example ``(-1, 122)``).

    Notes
    -----
    ``__init__`` only stores its arguments. The Keras model is loaded on first
    use (``fit``, ``predict``, ``predict_proba`` or the ``model`` property), so
    cloning the estimator does not load the file an extra time.
    """

    def __init__(self, model_path, input_shape):
        self.model_path = model_path
        self.input_shape = input_shape

    @property
    def model(self):
        """Return the underlying Keras model, loading it from ``model_path`` once."""
        if getattr(self, "model_", None) is None:
            self.model_ = tf.keras.models.load_model(self.model_path)
        return self.model_

    def _as_model_input(self, X):
        """Convert any array-like ``X`` to a NumPy array shaped ``input_shape``."""
        return np.asarray(X).reshape(self.input_shape)

    def fit(self, X, y=None):
        """Load the saved model; no training takes place.

        ``X`` and ``y`` are accepted only for scikit-learn API compatibility.
        Returns ``self``.
        """
        self.model  # trigger the one-time load; weights are left untouched
        return self

    def predict(self, X):
        """Return the arg-max index of the model output for every row of ``X``."""
        predictions = self.model.predict(self._as_model_input(X))
        return np.argmax(predictions, axis=1)

    def predict_proba(self, X):
        """Return the raw model output for every row of ``X``.

        NOTE(review): soft voting averages these values as class probabilities;
        this assumes the saved models end in a softmax layer - confirm.
        """
        return self.model.predict(self._as_model_input(X))

# One probability-capable wrapper per saved network; `input_shape` is the
# layout each wrapper reshapes its input to before calling the model.
ann_model2 = KerasModelWrapper2(model_path='./Model/model_ANN.h5', input_shape=(-1, 122))           # flat 2D input
cnn_model2 = KerasModelWrapper2(model_path='./Model/model_CNN.h5', input_shape=(-1, 1, 122, 1))     # 4D input
lstm_model2 = KerasModelWrapper2(model_path='./Model/model_LSTM.h5', input_shape=(-1, 1, 122))      # 3D input

# Soft voting: the members' predict_proba outputs are averaged and the class
# with the highest mean wins.
soft_voting_members = [
    ('ann', ann_model2),
    ('cnn', cnn_model2),
    ('lstm', lstm_model2),
]
ensemble_model2 = VotingClassifier(estimators=soft_voting_members, voting='soft')

# Fit on the training split (the wrappers do not retrain the Keras models).
ensemble_model2.fit(x_train3, y_train3)

# Evaluate on the flat, scaled test features; each wrapper reshapes them itself.
y_pred2 = ensemble_model2.predict(x_test3)
accuracy2 = accuracy_score(y_test3, y_pred2)

print(f"Ensemble Model Accuracy: {accuracy2 * 100:.2f}%")



[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Ensemble Model Accuracy: 68.25%
