In [None]:
# importing required libraries
import pandas as pd
import numpy as np
# importing library for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# importing required libraries for normalizing data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

from keras.models import Sequential #importing Sequential layer
from keras.layers import Dense
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay # for generating a classification report,confusion matrix of model

import joblib

In [None]:
# Relative paths of the training and test CSV files.
file_path = 'train_data.csv'
file_path_test = 'test_data.csv'

In [None]:
# Load both files into DataFrames; the column names are overwritten in a later cell.
df = pd.read_csv(file_path)
test_df = pd.read_csv(file_path_test)

In [None]:
columns = ([
    'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 
    'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 
    'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 
    'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 
    'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 
    'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 
    'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 
    'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 
    'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 
    'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 
    'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 
    'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 
    'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 
    'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 
    'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 
    'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts', 
    'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 
    'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts', 
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 
    'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 
    'Label'
])

df.columns = columns
test_df.columns = columns


df.head()


In [None]:
# Preview the first rows of the renamed test frame.
test_df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Summary statistics of the training frame.
df.describe()

In [None]:
# True if the training frame contains at least one missing value.
df.isnull().values.any()

In [None]:
# True if the test frame contains at least one missing value.
test_df.isnull().values.any()

In [None]:
# Distribution of the raw labels in the training frame.
df['Label'].value_counts()

In [None]:
# Distribution of the raw labels in the test frame.
test_df['Label'].value_counts()

In [None]:
# Raw `Label` values grouped by the coarse attack category they are mapped to.
# Any label that appears in none of these lists is treated as 'normal' by the
# mapping cell that follows.
dos_attacks = ['DoS', 'DDoS']  # labels mapped to the 'dos' category
probe_attacks = ['Probe']      # labels mapped to the 'probe' category
privilege_attacks = ['U2R']    # labels mapped to the 'privilege' category
access_attacks = ['BFA', 'Web-Attack', 'BOTNET']  # labels mapped to the 'access' category

In [None]:
def _attack_category_for(label):
    """Return the coarse attack category for a single raw ``Label`` value.

    The category lists are checked in the order dos, probe, privilege, access;
    a label found in none of them is classified as 'normal'.
    """
    if label in dos_attacks:
        return 'dos'
    if label in probe_attacks:
        return 'probe'
    if label in privilege_attacks:
        return 'privilege'
    if label in access_attacks:
        return 'access'
    return 'normal'


# Derive the `attack_category` column for the training frame ...
df['attack_category'] = df['Label'].apply(_attack_category_for)

# ... and for the test frame, using exactly the same rule.
test_df['attack_category'] = test_df['Label'].apply(_attack_category_for)

In [None]:
# Distribution of the derived categories in the training frame.
df['attack_category'].value_counts()

In [None]:
# Distribution of the derived categories in the test frame.
test_df['attack_category'].value_counts()

In [None]:
# Display the training frame.
df

In [None]:
# Display the test frame.
test_df

In [None]:
# Remove the raw 'Label' column now that 'attack_category' replaces it.
# The drop is in place, so re-running this cell raises KeyError because the
# column is already gone.
df.drop(['Label'],axis=1,inplace=True)
test_df.drop(['Label'],axis=1,inplace=True)

In [None]:
# Shape of the training frame after the drop.
df.shape

In [None]:
# Shape of the test frame after the drop.
test_df.shape

In [None]:
lab = LabelEncoder()

# Integer-encode the string-valued columns of the training frame. The single
# encoder is refitted for every column, so once the loop finishes `lab` holds
# the classes of the last column processed, 'attack_category'.
for column in ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'attack_category']:
    df[column] = lab.fit_transform(df[column])

df.info()

In [None]:




# Encode the test target with the encoder that was fitted on the training
# 'attack_category' column (the last fit performed on `lab` in the previous
# cell). Refitting on the test frame could assign different integer codes
# whenever the two frames do not contain the same set of categories, and it
# would also change what `lab.classes_` / `lab.inverse_transform` return later.
# `transform` raises ValueError if the test frame contains a category that was
# never seen in training, which is preferable to silently mislabelled rows.
test_df['attack_category'] = lab.transform(test_df['attack_category'])

# The identifier-like columns get a separate encoder so that `lab` keeps the
# training label mapping. Their codes are fitted on the test frame alone and
# are therefore not comparable with the codes in `df`.
test_feature_encoder = LabelEncoder()
for column in ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']:
    test_df[column] = test_feature_encoder.fit_transform(test_df[column])

test_df.info()


In [None]:
# Classes held by `lab` after its most recent fit; the position of each entry
# is the integer code the encoder assigns to it.
lab.classes_

In [None]:
# Integer code -> original category, in the order stored by the encoder.
mapping = dict(enumerate(lab.classes_))

# Show the lookup table.
print(mapping)

In [None]:
# Distribution of the integer-encoded categories in the training frame.
df['attack_category'].value_counts()

In [None]:
# Distribution of the integer-encoded categories in the test frame.
test_df['attack_category'].value_counts()

In [None]:

# Annotated heatmap of the pairwise correlations between all columns of the
# training frame, drawn on a large square canvas so the cell values fit.
plt.figure(figsize=(25, 25))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

In [None]:
# Correlation of every column with the encoded target; the target's own entry
# is removed.
corr = df.corr()['attack_category'].drop('attack_category')

# Cut-offs applied to the absolute correlation coefficient.
high_threshold = 0.5  # above this a feature counts as highly correlated
low_threshold = 0.1   # below this a feature counts as weakly correlated

abs_corr = corr.abs()

# Names of the features on either side of the two cut-offs.
highly_correlated_features = corr[abs_corr > high_threshold].index.tolist()
low_correlated_features = corr[abs_corr < low_threshold].index.tolist()

In [None]:
# Despite its name, `selected_features` is the list of columns that gets
# REMOVED: it is passed to `df.drop` a few cells below.
# NOTE(review): this also removes the features whose absolute correlation with
# the target exceeds the high threshold — confirm that dropping them is intended.
selected_features = highly_correlated_features + low_correlated_features

In [None]:
# Show the two groups separately.
print(highly_correlated_features)
print(low_correlated_features)

In [None]:
# Combined list of columns to drop.
selected_features

In [None]:
# Modelling frame: the training frame without the columns listed above.
data = df.drop(columns=selected_features)

In [None]:
# Columns that remain.
data.columns

In [None]:
# (rows, columns) of the modelling frame.
data.shape

In [None]:
# Display the modelling frame.
data

In [None]:
# Feature matrix: every column except the last one.
# This relies on 'attack_category' being the last column of `data`; it was
# appended last to `df` and is excluded from the drop list because it was
# removed from `corr` before the thresholds were applied.
X = data.iloc[:,:-1]
X.shape

In [None]:
# Target vector: the last column of `data`.
y = data.iloc[:,-1]
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. No `stratify` argument is passed.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [None]:
# Persist the four splits to files named after the variables (current
# working directory, no file extension).
joblib.dump(X_train, 'X_train')
joblib.dump(X_test,'X_test')
joblib.dump(y_train,'y_train')
joblib.dump(y_test, 'y_test')

In [None]:
# Shapes of the four splits.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Standardise the features. The scaler learns its mean and standard deviation
# from the training split only and the same parameters are then applied to the
# test split; refitting on the test split would scale the two splits
# differently and let test statistics leak into the evaluation.
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [None]:
# Unfitted support-vector classifier with default hyper-parameters. It is not
# fitted in this cell sequence (the fit call below is commented out); it is
# used later as a base estimator of the stacking ensemble.
svm = SVC()

In [None]:
# Disabled PCA experiment: none of the names it would define (pca,
# X_train_pca, X_test_pca) exist at run time.
# from sklearn.decomposition import PCA

# # Specify a number of components less than or equal to the minimum of n_samples and n_features
# n_components = min(10, X_scaled_train.shape[1])
# pca = PCA(n_components=n_components, svd_solver='arpack')
# X_train_pca = pca.fit_transform(X_scaled_train)
# X_test_pca = pca.transform(X_scaled_test)

In [None]:
# Disabled: training of the SVM on the scaled training split.
# svm.fit(X_scaled_train,y_train)

In [None]:
# Disabled: persisting the trained SVM.
# joblib.dump(svm, 'svm_model')

In [None]:
# Load a previously saved SVM from the file 'svm_model' and predict on the
# scaled test split. Because the fit/dump cells above are commented out, this
# cell only works if that file already exists.
# NOTE(review): presumably the file was produced by an earlier run of the
# disabled cells — confirm it matches the current feature set and split.
svm_loaded = joblib.load('svm_model')
y_pred_svm = svm_loaded.predict(X_scaled_test)
# y_pred_svm = svm.predict(X_pca_test)

In [None]:
print(classification_report(y_test,y_pred_svm))

In [None]:
print(ConfusionMatrixDisplay.from_predictions(y_test,y_pred_svm))

In [None]:
print(confusion_matrix(y_test,y_pred_svm))

In [None]:
from keras.utils import to_categorical
# One-hot encode the integer targets for the softmax network. The width is
# fixed at 5 so both arrays have the same number of columns even if a split
# does not contain every class.
y_train_cat = to_categorical(y_train, num_classes=5)  # Ensure 5 classes
y_test_cat = to_categorical(y_test, num_classes=5)


In [None]:
num_classes = 5

# Fully connected network: four ReLU hidden layers, each as wide as the
# feature matrix, followed by a softmax layer with one unit per class.
hidden_units = X_train.shape[1]

ann = Sequential()
for _ in range(4):
    ann.add(Dense(hidden_units, activation='relu'))
ann.add(Dense(num_classes, activation='softmax'))

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy (matching the
# one-hot targets) and accuracy as the reported metric.
ann.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])

In [None]:
# Disabled: training of the network on the scaled training split.
# ann.fit(X_scaled_train, y_train_cat, batch_size=50, epochs=50)

In [None]:
# Disabled: persisting the trained network.
# joblib.dump(ann, 'ann_model')

In [None]:
# Loss and accuracy of `ann` on the scaled training split.
# NOTE(review): the fit call above is commented out, so in a top-to-bottom run
# `ann` has not been trained when it is evaluated here — confirm whether the
# persisted model was meant to be evaluated instead.
loss,accuracy = ann.evaluate(X_scaled_train,y_train_cat)
print('Loss',loss)
print('Accuracy',accuracy)

In [None]:
# Load a previously saved network from the file 'ann_model' and predict class
# scores for the scaled test split. The fit/dump cells are commented out, so
# this cell only works if that file already exists.
ann_loaded = joblib.load('ann_model')
y_pred_ann = ann_loaded.predict(X_scaled_test)
y_pred_ann

In [None]:
# One-hot test targets, shown for comparison with the predictions.
y_test_cat

In [None]:
# One-hot training targets.
y_train_cat

In [None]:
# argmax over axis 1 converts the one-hot targets and the predicted score
# rows back to integer class codes before they are compared.
print(classification_report(y_test_cat.argmax(axis=1),y_pred_ann.argmax(axis=1)))

In [None]:
# Confusion matrix on the same integer class codes.
print(confusion_matrix(y_test_cat.argmax(axis=1),y_pred_ann.argmax(axis=1)))

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class KerasWrapper(BaseEstimator, ClassifierMixin):
    """Minimal scikit-learn classifier adapter around a Keras model.

    Parameters
    ----------
    model : object
        Keras model providing ``fit`` and ``predict``; ``predict`` is expected
        to return one row of per-class scores for each sample.
    num_classes : int or None, default=None
        Number of columns of the one-hot target built in ``fit``. When None,
        the ``units`` attribute of the model's last layer is used if it
        exists; if it does not, ``to_categorical`` infers the width from ``y``.
    """

    def __init__(self, model, num_classes=None):
        # Stored unmodified under the parameter names so that
        # BaseEstimator.get_params / clone can reproduce the wrapper.
        self.model = model
        self.num_classes = num_classes

    def _target_width(self):
        """Return the one-hot width to use, or None to let Keras infer it."""
        if self.num_classes is not None:
            return self.num_classes
        layers = getattr(self.model, 'layers', None)
        if layers:
            return getattr(layers[-1], 'units', None)
        return None

    def fit(self, X, y):
        """Train the wrapped model on ``X`` and integer class codes ``y``.

        The one-hot width is pinned explicitly: inferring it from ``y`` alone
        gives ``max(y) + 1`` columns, which is narrower than the network
        output whenever the subset being fitted (for example a
        cross-validation fold) lacks the highest class code.

        Training continues from the model's current weights; the model is not
        re-initialised between calls. Returns ``self``.
        """
        y_cat = to_categorical(y, num_classes=self._target_width())
        self.model.fit(X, y_cat, batch_size=50, epochs=50)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the index of the highest score."""
        return np.argmax(self.model.predict(X), axis=1)

# Wrap the compiled Keras network so it can be used as a scikit-learn estimator.
ann_estimator = KerasWrapper(ann)

In [None]:
# Repeats the assignment from the previous cell and creates a second,
# equivalent wrapper around the same `ann` object.
ann_estimator = KerasWrapper(ann)

In [None]:
# Base learners of the ensemble as (name, estimator) pairs: the unfitted SVC
# and the wrapped Keras network.
estimators = [('svm', svm),('ann', ann_estimator)]

In [None]:
# Stacking ensemble with a logistic-regression meta-learner on top of the
# base learners.
stacking_ensemble = StackingClassifier(estimators= estimators, final_estimator= LogisticRegression())

In [None]:
# Fit the ensemble on the scaled training split and the integer class codes.
stacking_ensemble.fit(X_scaled_train, y_train)

# takes almost 1 hour

In [None]:
# Persist the fitted ensemble to the file 'stacking_ensemble_model'.
joblib.dump(stacking_ensemble, 'stacking_ensemble_model')

In [None]:
# Reload the persisted ensemble and predict on the scaled test split. The
# ensemble was fitted on `X_scaled_train`, so it must be given features in the
# same scaled space; the PCA-projected matrix is never created because the
# PCA cell is commented out.
ensemble_loaded = joblib.load('stacking_ensemble_model')
y_pred_stack = ensemble_loaded.predict(X_scaled_test)

In [None]:
# Integer class codes predicted by the ensemble.
y_pred_stack

In [None]:
# Per-class precision / recall / F1 of the ensemble, labelled by integer code.
print(classification_report(y_test,y_pred_stack))

In [None]:
# LabelEncoder numbers the categories in sorted order of their strings:
# 'access' -> 0, 'dos' -> 1, 'normal' -> 2, 'privilege' -> 3, 'probe' -> 4.
# classification_report pairs target_names with the sorted class codes, so
# the display names have to be listed in that same order.
target_names = ["Access", "DOS", "Normal", "Privilege", "Probe"]
report = classification_report(y_test, y_pred_stack, target_names=target_names)
print(report)

Model Evaluation

In [None]:
# First rows of the encoded training frame.
df.head()

In [None]:
# Last rows of the encoded test frame.
test_df.tail()

In [None]:
# `test_data` is a second name for the same DataFrame object, not a copy.
test_data = test_df
# Rows of the test frame whose encoded category equals 3.
# NOTE(review): which category code 3 stands for depends on the encoder
# mapping printed earlier — confirm before interpreting these rows.
true_values = test_data[test_data['attack_category'] == 3]
true_values

In [None]:
# Feature values of the row with index label 101584 in the held-out split.
# NOTE(review): hard-coded label; raises KeyError if that row is not in X_test.
X_test.loc[101584].values

In [None]:
# All values of the row with index label 3172 in the test frame.
test_data.loc[3172].values

In [None]:
# One hand-entered sample with 45 feature values, used for the single-sample
# predictions below.
# NOTE(review): the number and order of values must match the columns of
# X_train — confirm.
X_new=[[0, 9, 0, 0, 1, 0, 0, 1, 0, 0, 1, 16, 1, 0, 0, 1, 0, 0, 0, 9, 0, 0, 1, 0, 0, 1, 0, 0, 1, 16, 0, 0, 1, 1, 0, 0, 0, 9, 0, 0, 0, 0, 0, 1, 0]]

In [None]:
X_new = np.array(X_new)  # convert list to NumPy array

# Scale the sample with the scaler fitted on the training split. A new
# StandardScaler fitted on this single row would subtract the row from itself
# and turn every feature into 0, so every sample would look identical to the
# models.
X_new_scaled = scaler.transform(X_new)

In [None]:
# Integer class code predicted by the loaded ensemble for the scaled sample.
y_new_stack = ensemble_loaded.predict(X_new_scaled)

In [None]:
# Decode the integer code to a category name using the classes currently held
# by `lab` (whatever column it was fitted on last).
pred_stack = lab.inverse_transform(y_new_stack)

In [None]:
# `.item()` extracts the single decoded value; it raises if more than one
# sample was predicted.
print('Class', pred_stack.item())

SVM Model Evaluation

In [None]:
# Scale the sample with the scaler fitted on the training split. Fitting a
# new StandardScaler on a single row would map every feature to 0, so the
# model would always receive the same all-zero vector.
scaled_data = scaler.transform(X_new)

# Predict the integer class code with the loaded SVM.
y_new = svm_loaded.predict(scaled_data)

# Decode the integer code to its category name.
pred = lab.inverse_transform(y_new)

print("Class:", pred.item())

ANN Model Evaluation

In [None]:
# Class scores produced by the loaded network for the scaled sample.
y_new2 = ann_loaded.predict(scaled_data)

# Position of the largest score in the flattened score array; for a single
# sample this is the predicted class code.
ind = np.argmax(y_new2)

# Decode the class code to its category name and report it.
pred_ann = lab.inverse_transform([ind])
print("Class:", pred_ann.item())
