## Lib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering


## Dataset loading

In [2]:
# Column names collected for removal from the feature frame.
# NOTE: 'ct_src_ ltm' contains a space and is kept verbatim — presumably it
# matches the CSV header; verify against the file.
todrop = [
    'srcip', 'sport', 'dstip', 'dsport',
    'sloss', 'dloss', 'stcpb', 'dtcpb',
    'trans_depth', 'Stime', 'Ltime',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
    'attack_cat',
]

In [3]:
# Column names to retain: flow attributes plus the 'Label' target column.
keep = [
    'proto', 'state', 'dur',
    'sbytes', 'dbytes', 'sttl', 'dttl',
    'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin',
    'smeansz', 'dmeansz', 'res_bdy_len',
    'Sjit', 'Djit', 'Sintpkt', 'Dintpkt',
    'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl',
    'Label',
]


In [2]:
# Read the train and test CSV files; both are decoded as cp1252.
train = pd.read_csv(
    r'D:\Project Phase II\Dataset\finaltrain.csv',
    encoding='cp1252',
)
test = pd.read_csv(
    r'D:\Project Phase II\Dataset\finaltest.csv',
    encoding='cp1252',
)

  train = pd.read_csv(r'D:\Project Phase II\Dataset\finaltrain.csv',encoding='cp1252')
  test = pd.read_csv(r'D:\Project Phase II\Dataset\finaltest.csv',encoding='cp1252')


In [93]:
# Report the class balance of each split.
# The original counted Label == 0 for BOTH figures, so "Attack" merely repeated
# the "Non attack" count.
# NOTE(review): assumes Label == 1 marks attack traffic and Label == 0 benign
# traffic — confirm against the dataset description.
for split_title, frame in (("Training set:", train), ("Testing set:", test)):
    attack_count = len(frame[frame['Label'] == 1])
    benign_count = len(frame[frame['Label'] == 0])
    print(split_title)
    print('Attack: ', attack_count, ' Non attack: ', benign_count, 'Total: ', len(frame))

Training set:
Attack:  40502  Non attack:  40502 Total:  81004
Testing set:
Attack:  17358  Non attack:  17358 Total:  34716


## Data preprocessing

Extract edge features alone from train and test

In [3]:
# Columns removed to obtain the feature frames; unlike a plain identifier list
# this one also contains the 'Label' target so it cannot leak into the features.
todrop = [
    'srcip', 'sport', 'dstip', 'dsport',
    'sloss', 'dloss', 'stcpb', 'dtcpb',
    'trans_depth', 'Stime', 'Ltime',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
    'attack_cat', 'Label',
]

# Targets are taken from the untouched frames.
trainLabel = train['Label']
testLabel = test['Label']

# Feature frames: everything that is not listed in todrop.
trainAttributes = train.drop(columns=todrop)
testAttributes = test.drop(columns=todrop)

Use LDA (Linear Discriminant Analysis) to reduce the edge features to a single dimension

In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit the discriminant on the training features only, then project both
# splits onto the single requested component.
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(trainAttributes, trainLabel)
lda_x_train = lda.transform(trainAttributes)
lda_x_test = lda.transform(testAttributes)

Compute nodes and edges

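Keep only 7,000 training records (the first 3,500 and the last 3,500): the dense adjacency matrix grows quadratically with the number of nodes, so using every record would exhaust memory.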

In [6]:
# Keep the first 3500 and the last 3500 rows, and slice the LDA scores the
# same way so row i of `train` still pairs with lda_x_train[i].
# NOTE: this rebinds `train`, so re-running the cell samples the sample.
train = pd.concat([train.iloc[:3500], train.iloc[-3500:]])
lda_x_train = np.concatenate((lda_x_train[:3500], lda_x_train[-3500:]))

In [7]:
# Collect the graph of the sampled training flows: every "ip:port" endpoint is
# a node, every (source endpoint, destination endpoint) pair is a directed edge.
nodes = set()
edges = set()

# Walk the rows that are actually present instead of a hard-coded 7000 (which
# raises IndexError on a shorter frame), and pull each column out once rather
# than calling .iloc for every cell.
endpoint_columns = [train[column].to_numpy() for column in ('srcip', 'sport', 'dstip', 'dsport')]
for srcip, sport, dstip, dsport in zip(*endpoint_columns):
    src = str(srcip) + ':' + str(sport)
    dst = str(dstip) + ':' + str(dsport)
    nodes.add(src)
    nodes.add(dst)
    edges.add((src, dst))

In [9]:
# Number of distinct "ip:port" endpoints collected from the training rows.
len(nodes)

8963

Train Data

In [8]:
import numpy as np

# Dense node-by-node matrices: adjacency_matrix stores the LDA score of the
# flow on each (src, dst) edge, label_matrix stores that flow's label.
# Cells with no edge stay 0 in both matrices.
adjacency_matrix = np.zeros((len(nodes), len(nodes)), dtype=float)
label_matrix = np.zeros((len(nodes), len(nodes)))

# Give every node a fixed row/column once. The original rebuilt list(nodes)
# and scanned it linearly twice for every row.
node_index = {node: position for position, node in enumerate(nodes)}

# Flatten the LDA output so each row yields a true scalar; writing a length-1
# array into a single matrix cell is deprecated in recent NumPy releases.
edge_scores = np.ravel(lda_x_train)
edge_labels = train['Label'].to_numpy()
endpoint_columns = [train[column].to_numpy() for column in ('srcip', 'sport', 'dstip', 'dsport')]

# Iterate over the rows actually present rather than a hard-coded 7000.
for i, (srcip, sport, dstip, dsport) in enumerate(zip(*endpoint_columns)):
    src_index = node_index[str(srcip) + ':' + str(sport)]
    dst_index = node_index[str(dstip) + ':' + str(dsport)]
    # A repeated (src, dst) pair overwrites the value of the earlier row.
    adjacency_matrix[src_index, dst_index] = edge_scores[i]
    label_matrix[src_index, dst_index] = edge_labels[i]

# One sample per matrix cell, each with a single feature column.
adjacency_matrix_flat = adjacency_matrix.reshape(-1, 1)
label_flat = label_matrix.reshape(-1, 1)

  adjacency_matrix[src_index, dst_index] = lda_x_train[i]


Test Dataset

Keep only 5,000 test records (the first 2,500 and the last 2,500) to limit the size of the test adjacency matrix.

In [9]:
# Keep the first 2500 and the last 2500 rows, and slice the LDA scores the
# same way so row i of `test` still pairs with lda_x_test[i].
# NOTE: this rebinds `test`, so re-running the cell samples the sample.
test = pd.concat([test.iloc[:2500], test.iloc[-2500:]])
lda_x_test = np.concatenate((lda_x_test[:2500], lda_x_test[-2500:]))

In [10]:
# Test-split graph: "ip:port" endpoints are nodes, (source, destination)
# endpoint pairs are directed edges.
tenodes = set()
teedges = set()
test_endpoint_columns = [test[column].to_numpy() for column in ('srcip', 'sport', 'dstip', 'dsport')]
for srcip, sport, dstip, dsport in zip(*test_endpoint_columns):
    src = str(srcip) + ':' + str(sport)
    dst = str(dstip) + ':' + str(dsport)
    tenodes.update((src, dst))
    teedges.add((src, dst))

In [11]:
import numpy as np

# Dense node-by-node matrices for the test split: adjacency_matrix_test stores
# the LDA score of the flow on each (src, dst) edge, label_matrix_test stores
# that flow's label. Cells with no edge stay 0 in both matrices.
adjacency_matrix_test = np.zeros((len(tenodes), len(tenodes)), dtype=float)
label_matrix_test = np.zeros((len(tenodes), len(tenodes)))

# Give every node a fixed row/column once. The original rebuilt list(tenodes)
# and scanned it linearly twice for every row.
test_node_index = {node: position for position, node in enumerate(tenodes)}

# Flatten the LDA output so each row yields a true scalar; writing a length-1
# array into a single matrix cell is deprecated in recent NumPy releases.
test_edge_scores = np.ravel(lda_x_test)
test_edge_labels = test['Label'].to_numpy()
test_endpoint_columns = [test[column].to_numpy() for column in ('srcip', 'sport', 'dstip', 'dsport')]

for i, (srcip, sport, dstip, dsport) in enumerate(zip(*test_endpoint_columns)):
    src_index = test_node_index[str(srcip) + ':' + str(sport)]
    dst_index = test_node_index[str(dstip) + ':' + str(dsport)]
    # A repeated (src, dst) pair overwrites the value of the earlier row.
    adjacency_matrix_test[src_index, dst_index] = test_edge_scores[i]
    label_matrix_test[src_index, dst_index] = test_edge_labels[i]

# One sample per matrix cell, each with a single feature column.
adjacency_matrix_test_flat = adjacency_matrix_test.reshape(-1, 1)
label_test_flat = label_matrix_test.reshape(-1, 1)

  adjacency_matrix_test[src_index, dst_index] = lda_x_test[i]


## LSTM Model

In [16]:
# Create an instance of the model
# Hyperparameters
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Subclassed Keras model: an LSTM layer followed by a sigmoid Dense layer.
class LSTMClassifier(tf.keras.Model):
    """LSTM layer feeding a sigmoid-activated Dense output layer.

    Args:
        input_shape: Accepted by the constructor but not used by it.
        hidden_units: Number of units in the LSTM layer.
        output_units: Number of units in the Dense output layer.
    """

    def __init__(self, input_shape, hidden_units, output_units):
        super().__init__()
        # return_sequences=False: only the last timestep's output is kept.
        self.lstm = LSTM(hidden_units, return_sequences=False)
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        """Append a trailing feature axis to `inputs`, then apply LSTM and Dense."""
        sequence = tf.expand_dims(inputs, axis=-1)
        return self.dense(self.lstm(sequence))
    

# Hyperparameters
hidden_units = 16
output_units = 1  # Binary classification
# Per-sample shape of the flattened matrix (everything after the sample axis).
input_shape = adjacency_matrix_flat.shape[1:]

# Build and compile the classifier.
lstmmodel = LSTMClassifier(input_shape, hidden_units, output_units)
lstmmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Feed Keras a float32 tensor copy of the flattened adjacency values.
adjacency_tensor = tf.convert_to_tensor(adjacency_matrix_flat, dtype=tf.float32)

# Two epochs, with the last 20% of the samples held out for validation.
lstmhistory = lstmmodel.fit(
    adjacency_tensor,
    label_flat,
    epochs=2,
    batch_size=32,
    validation_split=0.2,
)





Epoch 1/2


Epoch 2/2


In [17]:
# Write the LSTM model's weights to disk; a later cell reads this same file
# back with load_weights.
lstmmodel.save_weights("lstmfinalldaweights.h5")

Load model

In [12]:
# Create an instance of the model
# Hyperparameters
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Subclassed Keras model: an LSTM layer followed by a sigmoid Dense layer.
class LSTMClassifier(tf.keras.Model):
    """LSTM layer feeding a sigmoid-activated Dense output layer.

    Args:
        input_shape: Accepted by the constructor but not used by it.
        hidden_units: Number of units in the LSTM layer.
        output_units: Number of units in the Dense output layer.
    """

    def __init__(self, input_shape, hidden_units, output_units):
        super().__init__()
        # return_sequences=False: only the last timestep's output is kept.
        self.lstm = LSTM(hidden_units, return_sequences=False)
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        """Append a trailing feature axis to `inputs`, then apply LSTM and Dense."""
        sequence = tf.expand_dims(inputs, axis=-1)
        return self.dense(self.lstm(sequence))
    
hidden_units = 16
output_units = 1  # Binary classification
# Per-sample shape of the flattened matrix (everything after the sample axis).
input_shape = adjacency_matrix_flat.shape[1:]
lstmmodel = LSTMClassifier(input_shape, hidden_units, output_units)

# All-zero placeholder data: a 4x4 grid flattened to 16 single-feature samples.
dummy_input = np.zeros((4, 4), dtype=float)
dummy_label = np.zeros_like(dummy_input)
dummy_flat = dummy_input.reshape(-1, 1)
dummy_label_flat = dummy_label.reshape(-1, 1)

# The short fit on placeholder data is only here to initialise the model.
# NOTE(review): a single forward pass on dummy_flat would presumably create
# the same variables without training — confirm before changing.
lstmmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstmhistory = lstmmodel.fit(
    dummy_flat,
    dummy_label_flat,
    epochs=2,
    batch_size=32,
    validation_split=0.2,
)




Epoch 1/2


Epoch 2/2


In [13]:
# Load the saved weights, replacing whatever the dummy fit above produced.
# The file name must match the one passed to save_weights.
lstmmodel.load_weights("lstmfinalldaweights.h5")

In [14]:
# Score every cell of the flattened test matrix, then threshold the sigmoid
# output at 0.5 to obtain 0/1 class predictions.
lstmpredictions = (lstmmodel.predict(adjacency_matrix_test_flat) > 0.5).astype(int)

  16740/1350547 [..............................] - ETA: 41:28

In [None]:
from sklearn.metrics import classification_report

# Compare the thresholded LSTM predictions with the flattened test labels.
report = classification_report(label_test_flat, lstmpredictions)

print("Classification Report for LSTM:", report, sep="\n")

## GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense

# Subclassed Keras model: a GRU layer followed by a sigmoid Dense layer.
class GRUClassifier(tf.keras.Model):
    """GRU layer feeding a sigmoid-activated Dense output layer.

    Args:
        input_shape: Accepted by the constructor but not used by it.
        hidden_units: Number of units in the GRU layer.
        output_units: Number of units in the Dense output layer.
    """

    def __init__(self, input_shape, hidden_units, output_units):
        super().__init__()
        # return_sequences=False: only the last timestep's output is kept.
        self.gru = GRU(hidden_units, return_sequences=False)
        self.dense = Dense(output_units, activation='sigmoid')

    def call(self, inputs):
        """Append a trailing feature axis to `inputs`, then apply GRU and Dense."""
        sequence = tf.expand_dims(inputs, axis=-1)
        return self.dense(self.gru(sequence))

# Hyperparameters
input_shape = adjacency_matrix_flat.shape[1:]  # Per-sample shape of the flattened matrix
hidden_units = 16
output_units = 1  # Binary classification

grumodel = GRUClassifier(input_shape, hidden_units, output_units)
grumodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Convert adjacency_matrix_flat to a TensorFlow tensor in this cell.
# Previously the tensor was only created by the LSTM training cell, so this
# cell raised NameError whenever the LSTM weights were loaded from disk
# instead of retrained.
adjacency_tensor = tf.convert_to_tensor(adjacency_matrix_flat, dtype=tf.float32)

# Two epochs, with the last 20% of the samples held out for validation.
gruhistory = grumodel.fit(adjacency_tensor, label_flat, epochs=2, batch_size=32, validation_split=0.2)


In [None]:
# Write the GRU model's weights to disk.
grumodel.save_weights("grufinalldaweights.h5")

In [None]:
# Score every cell of the flattened test matrix, then threshold the sigmoid
# output at 0.5 to obtain 0/1 class predictions.
grupredictions = (grumodel.predict(adjacency_matrix_test_flat) > 0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report

# Compare the thresholded GRU predictions with the flattened test labels.
report = classification_report(label_test_flat, grupredictions)

print("Classification Report for GRU:", report, sep="\n")