In [372]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight
import numpy as np
from tensorflow.keras.optimizers import SGD

In [373]:
#df = pd.read_csv('all_data.csv')
df = pd.read_csv('cleaned_data_drop.csv')

In [249]:
#print(df.isnull().sum())

In [374]:
df = df.dropna()

In [251]:
#print(df.isnull().sum())

In [None]:
# if using clean data
print("Original Attack Distribution:")
print(df['Attack'].value_counts())

# dropping cases to balance the target column
num_normal_cases_to_drop = 272000  
num_neptune_cases_to_drop = 45000  

# Filter the DataFrame for the "normal" and "neptune" classes
normal_cases = df[df['Attack'] == 'normal']
neptune_cases = df[df['Attack'] == 'neptune']

# Randomly drop specified cases from the "normal" and "neptune" classes
normal_cases_dropped = normal_cases.sample(n=len(normal_cases) - num_normal_cases_to_drop, random_state=42)
neptune_cases_dropped = neptune_cases.sample(n=len(neptune_cases) - num_neptune_cases_to_drop, random_state=42)

# Combine the modified cases with the other classes
modified_df = pd.concat([normal_cases_dropped, neptune_cases_dropped, df[df['Attack'] != 'normal'][df['Attack'] != 'neptune']])

# Print the new distribution after dropping cases
print("\nModified Attack Distribution:")
print(modified_df['Attack'].value_counts())

print (df.shape)
print (modified_df.shape)

# if using all_data file to test the model

# print("Original Attack Distribution:")
# print(df['Attack'].value_counts())

# num_normal_cases_to_drop = 589000  # Change this number as needed for "normal"
# num_neptune_cases_to_drop = 200000   # Change this number as needed for "neptune"
# num_smurf_cases_to_drop = 220000   # Change this number as needed for "smurf"

# normal_cases = df[df['Attack'] == 'normal']
# neptune_cases = df[df['Attack'] == 'neptune']
# smurf_cases = df[df['Attack'] == 'smurf']

# normal_cases_dropped = normal_cases.sample(n=len(normal_cases) - num_normal_cases_to_drop, random_state=42)
# neptune_cases_dropped = neptune_cases.sample(n=len(neptune_cases) - num_neptune_cases_to_drop, random_state=42)
# smurf_cases_dropped = smurf_cases.sample(n=len(smurf_cases) - num_smurf_cases_to_drop, random_state=42)

# modified_df = pd.concat([
#     normal_cases_dropped,
#     neptune_cases_dropped,
#     smurf_cases_dropped,  # Use the dropped smurf cases here
#     df[~df['Attack'].isin(['normal', 'neptune', 'smurf'])]  # Filter out the other classes])

# print("\nModified Attack Distribution:")
# print(modified_df['Attack'].value_counts())

In [None]:
# Select a specific categorical column
column = 'Attack'

# Bar Plot to see distribution before
plt.figure(figsize=(12, 6))
sns.countplot(y=df[column], order=df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

# Bar Plot to see distribution after
plt.figure(figsize=(12, 6))
sns.countplot(y=modified_df[column], order=modified_df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

In [None]:
column = 'protocol_typesymbolic'

plt.figure(figsize=(12, 6))
sns.countplot(y=df[column], order=df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

plt.figure(figsize=(12, 6))
sns.countplot(y=modified_df[column], order=modified_df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

In [None]:
column = 'servicesymbolic'

plt.figure(figsize=(12, 6))
sns.countplot(y=modified_df[column], order=modified_df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

plt.figure(figsize=(12, 6))
sns.countplot(y=df[column], order=df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of {column}')
plt.show()

In [None]:
column = 'flagsymbolic'

plt.figure(figsize=(12, 6))
sns.countplot(y=modified_df[column], order=modified_df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of balanced {column}')
plt.show()

plt.figure(figsize=(12, 6))
sns.countplot(y=df[column], order=df[column].value_counts().index[:10])  # Top 10 most frequent values
plt.title(f'Count Plot of {column}')
plt.show()

In [None]:
# Encoding the 'Attack' column whihc is our target column
label_encoder = LabelEncoder()
modified_df['Attack'] = label_encoder.fit_transform(modified_df['Attack'])

#Encoding the other categorical columns
modified_df = pd.get_dummies(modified_df, columns=['protocol_typesymbolic', 'servicesymbolic', 'flagsymbolic'])

# Feature Selection
X = modified_df.drop(columns=['Attack'])
y = modified_df['Attack']

# Standardizing the features 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test you can change according to accuracy
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Convert target variable to categorical (one-hot encoded for multi-class classification)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

model = Sequential()

# Input layer
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

# Hidden layers with dropout these can vary aswell
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=16, activation='relu'))
model.add(Dropout(0.3))

# Output layer all factors can vary
model.add(Dense(units=y_train_cat.shape[1], activation='sigmoid'))

# Compile the model all factors can vary
model.compile(optimizer=SGD(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model epochs can vary
history = model.fit(X_train, y_train_cat, epochs=50, batch_size=32, validation_data=(X_test, y_test_cat))

# Perform predictions on the test set
y_pred = model.predict(X_test)

# Check the shape of predictions
print("Shape of y_pred:", y_pred.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {test_acc:.2f}")

In [380]:
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of y_pred:", y_pred.shape)

Shape of X_test: (3016, 103)
Shape of y_test: (3016,)


In [None]:
# Plotting Training and Validation Loss
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(train_loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Learning Curve')
plt.show()

In [383]:
# Get the predicted class labels from the predicted probabilities for RMSE
y_pred_classes = np.argmax(y_pred, axis=1)

In [None]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE, not relevent here
rmse = np.sqrt(mean_squared_error(y_test, y_pred_classes))
print("Root Mean Square Error (RMSE):", rmse)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the output (one-hot encoding)
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
y_pred_bin = model.predict(X_test)

# Calculate ROC
fpr_micro, tpr_micro, _ = roc_curve(y_test_bin.ravel(), y_pred_bin.ravel())
roc_auc_micro = auc(fpr_micro, tpr_micro)

plt.figure()
plt.plot(fpr_micro, tpr_micro, label='Micro-average ROC curve (area = {0:0.2f})'.format(roc_auc_micro))

plt.plot([0, 1], [0, 1], 'k--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Micro-Averaged Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Make predictions we have made above already made here again does not matter
y_pred_classes = np.argmax(y_pred_bin, axis=1)

cm = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [387]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9360079575596817
Precision: 0.914271261902086
Recall: 0.9360079575596817
F1 Score: 0.923878598490224


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# # Save the trained model if you want to not required here
# model.save('ann_model.keras')
# model.save('ann_model.h5')


In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    # Plot training & validation accuracy values
    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.grid()

    # Plot training & validation loss values
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.grid()

    plt.tight_layout()
    plt.show()
    
#history is the model
plot_training_history(history)
