In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/5G NIDD/updated_Combined.csv')

In [4]:
print ('object_columns:{}', df.select_dtypes(include=[object]).columns)
print('float_columns: {}', df.select_dtypes(include=[float]).columns)
print('int_columns:{}',  df.select_dtypes(include=[int]).columns)

object_columns:{} Index(['Proto', 'Cause', 'State', 'Label', 'Attack Type', 'Attack Tool'], dtype='object')
float_columns: {} Index(['Dur', 'sTtl', 'dTtl', 'sHops', 'dHops', 'sMeanPktSz', 'dMeanPktSz',
       'Load', 'SrcLoad', 'DstLoad', 'pLoss', 'Rate', 'SrcRate', 'DstRate',
       'SrcWin', 'DstWin', 'SrcTCPBase', 'DstTCPBase', 'TcpRtt', 'SynAck',
       'AckDat'],
      dtype='object')
int_columns:{} Index(['Unnamed: 0', 'Seq', 'TotPkts', 'SrcPkts', 'DstPkts', 'TotBytes',
       'SrcBytes', 'DstBytes', 'Offset', 'Loss', 'SrcLoss', 'DstLoss'],
      dtype='object')


In [5]:
df.shape

(1215890, 39)

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Input, concatenate
from keras.models import Model

# Assuming 'df' is your cleaned dataset loaded into a pandas DataFrame
# Assuming 'Label' is the column containing the labels (Benign/Malicious)

# Remove unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'Seq', 'Attack Type', 'Attack Tool', 'Cause'])

# One-hot encode categorical columns 'Proto' and 'State'
df = pd.get_dummies(df, columns=['Proto', 'State'])

# Convert categorical labels to numerical labels
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Separate features and target variable
X = df.drop(columns=['Label'])
y = df['Label']

# Normalize only the numeric columns
numeric_cols = X.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Reshape the data for CNN input
X = X.values.reshape(-1, X.shape[1], 1)
y = y.values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the CNN model
input_shape = (X_train.shape[1], 1)
input1 = Input(shape=input_shape)
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(input1)
pool1 = MaxPooling1D(pool_size=2)(conv1)
flat1 = Flatten()(pool1)

# Define additional input for one-hot encoded features
input2 = Input(shape=(X_train.shape[1],))  # Shape matches the number of features after one-hot encoding
dense1 = Dense(50, activation='relu')(input2)

# Concatenate the output of the CNN layers and the dense layer
concat = concatenate([flat1, dense1])

# Add more layers as needed
dense2 = Dense(10, activation='relu')(concat)

# Output layer
output = Dense(1, activation='sigmoid')(dense2)

# Combine inputs and outputs into a model
model = Model(inputs=[input1, input2], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([X_train, X_train[:, :, 0]], y_train, epochs=10, batch_size=64, validation_data=([X_test, X_test[:, :, 0]], y_test))


# Evaluate the model
loss, accuracy = model.evaluate([X_test, X_test[:, :, 0]], y_test)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9812359809875488


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict probabilities for test set
y_pred_probs = model.predict([X_test, X_test[:, :, 0]])

# Convert probabilities to binary predictions
y_pred = (y_pred_probs > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.9812359670693895
Precision: 0.9732086649393334
Recall: 0.9964718015524073
F1-score: 0.9847028572769544


In [None]:
print("X_train dtype:", X_train.dtype)
print("X_train shape:", X_train.shape)
print("X_train[:, :, 0] dtype:", X_train[:, :, 0].dtype)
print("X_train[:, :, 0] shape:", X_train[:, :, 0].shape)

print("X_test dtype:", X_test.dtype)
print("X_test shape:", X_test.shape)
print("X_test[:, :, 0] dtype:", X_test[:, :, 0].dtype)
print("X_test[:, :, 0] shape:", X_test[:, :, 0].shape)


X_train dtype: object
X_train shape: (972712, 51, 1)
X_train[:, :, 0] dtype: object
X_train[:, :, 0] shape: (972712, 51)
X_test dtype: object
X_test shape: (243178, 51, 1)
X_test[:, :, 0] dtype: object
X_test[:, :, 0] shape: (243178, 51)


In [None]:
# Replace non-numeric values with NaN
X_train = np.where(X_train == 'Start', np.nan, X_train)
X_test = np.where(X_test == 'Start', np.nan, X_test)

# Convert to numeric values
X_train = np.array([[pd.to_numeric(cell, errors='coerce') for cell in row] for row in X_train])
X_test = np.array([[pd.to_numeric(cell, errors='coerce') for cell in row] for row in X_test])

# Fill NaN values with a specific value or strategy (for example, median)
X_train = np.nan_to_num(X_train, nan=np.nanmedian(X_train))
X_test = np.nan_to_num(X_test, nan=np.nanmedian(X_test))

# Reshape the data for CNN input
X_train = X_train.reshape(-1, X_train.shape[1], 1)
X_test = X_test.reshape(-1, X_test.shape[1], 1)



In [None]:
df = df.drop(columns=['Cause'])