# Step-1: Loading the Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
import glob 
filePaths = glob.glob("/home/prakhar/Desktop/College/3rdYear/SEM-VI/MinorProject/Datasets/archive-2/*.csv")
df_list = [pd.read_csv(file,low_memory=False) for file in filePaths]
df=pd.concat(df_list,ignore_index=True)
print(df.shape)
print(df.head())

(2830743, 79)
   Destination Port  Flow Duration  ...  Idle Min   Label
0             54865              3  ...         0  BENIGN
1             55054            109  ...         0  BENIGN
2             55055             52  ...         0  BENIGN
3             46236             34  ...         0  BENIGN
4             54863              3  ...         0  BENIGN

[5 rows x 79 columns]


In [3]:
df.columns = df.columns.str.strip()
df['Label'].unique()

array(['BENIGN', 'DDoS', 'PortScan', 'Bot', 'Infiltration',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'FTP-Patator', 'SSH-Patator',
       'DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye',
       'Heartbleed'], dtype=object)

# Step-2: Pre-Processing

In [4]:
df.dtypes #Viewing Datatype

Destination Port                 int64
Flow Duration                    int64
Total Fwd Packets                int64
Total Backward Packets           int64
Total Length of Fwd Packets      int64
                                ...   
Idle Mean                      float64
Idle Std                       float64
Idle Max                         int64
Idle Min                         int64
Label                           object
Length: 79, dtype: object

In [5]:
is_inf = df.isin([np.inf, -np.inf])

inf_counts = is_inf.sum()

# Filter to show only columns with infinite values
columns_with_inf = inf_counts[inf_counts > 0]
print("Columns with infinite values:\n", columns_with_inf)

nan_counts = df.isnull().sum()

# Filter to show only columns with NaN values
columns_with_nan = nan_counts[nan_counts > 0]
print("\nColumns with NaN values:\n", columns_with_nan)

combined_counts = nan_counts + inf_counts

# Filter to show only columns with any NaN or inf values
columns_with_nan_or_inf = combined_counts[combined_counts > 0]
print("\nColumns with NaN or infinite values:\n", columns_with_nan_or_inf)

duplicate_rows_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_rows_count)

Columns with infinite values:
 Flow Bytes/s      1509
Flow Packets/s    2867
dtype: int64

Columns with NaN values:
 Flow Bytes/s    1358
dtype: int64

Columns with NaN or infinite values:
 Flow Bytes/s      2867
Flow Packets/s    2867
dtype: int64

Number of duplicate rows: 308381


In [6]:
df.replace([np.inf, -np.inf], np.nan, inplace=True) #Replacing inf values
df.dropna(inplace=True) #Replacing NaN values
df.drop_duplicates(inplace=True) #Removing duplicates to improve accuracy

In [7]:
x = df.drop(columns='Label')
y = df['Label']

In [8]:
category_mapping = {
    'BENIGN': 'BENIGN',
    'Bot': 'BOTNET',
    'DDoS': 'DOS',
    'DoS GoldenEye': 'DOS',
    'DoS Hulk': 'DOS',
    'DoS Slowhttptest': 'DOS',
    'DoS slowloris': 'DOS',
    'FTP-Patator': 'BRUTE_FORCE',
    'SSH-Patator': 'BRUTE_FORCE',
    'Heartbleed': 'WEB_ATTACK',
    'Infiltration': 'WEB_ATTACK',
    'PortScan': 'RECONNAISSANCE',
    'Web Attack � Brute Force': 'WEB_ATTACK',
    'Web Attack � Sql Injection': 'WEB_ATTACK',
    'Web Attack � XSS': 'WEB_ATTACK'
}
y = y.map(category_mapping) #Simplifying the columns to improve model accuracy
y.unique()

array(['BENIGN', 'DOS', 'RECONNAISSANCE', 'BOTNET', 'WEB_ATTACK',
       'BRUTE_FORCE'], dtype=object)

In [9]:
from sklearn.model_selection import train_test_split #Splitting dataset into training and splitting parts
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42,shuffle=True)

# Step-3: Applying Classifier Algorithms

In [10]:
from sklearn.metrics import classification_report

#### Step-3.1: Applying Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier #Importing Random Forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test) #Performing Random Forest

In [None]:
print(classification_report(y_test, y_pred)) #Printing Classification Report

                precision    recall  f1-score   support

        BENIGN       1.00      1.00      1.00    523384
        BOTNET       0.86      0.74      0.80       481
   BRUTE_FORCE       1.00      1.00      1.00      2215
           DOS       1.00      1.00      1.00     80840
RECONNAISSANCE       0.99      0.99      0.99     22725
    WEB_ATTACK       1.00      0.96      0.98       555

      accuracy                           1.00    630200
     macro avg       0.97      0.95      0.96    630200
  weighted avg       1.00      1.00      1.00    630200



#### Step-3.2: Applying KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier #Importing KNN
knn = KNeighborsClassifier(n_neighbors=15) #Changing Value Manually (Due to Performance Reasons)
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test) #Performing KNN

In [14]:
print(classification_report(y_test, y_pred)) #Printing Classification Report

                precision    recall  f1-score   support

        BENIGN       1.00      0.99      0.99    523384
        BOTNET       0.75      0.46      0.57       481
   BRUTE_FORCE       0.98      0.96      0.97      2215
           DOS       0.98      0.99      0.99     80840
RECONNAISSANCE       0.91      0.97      0.94     22725
    WEB_ATTACK       0.97      0.89      0.93       555

      accuracy                           0.99    630200
     macro avg       0.93      0.88      0.90    630200
  weighted avg       0.99      0.99      0.99    630200



#### Step-3.3: Applying LSTM

In [15]:
from tensorflow import keras #Importing For LSTM
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import InputLayer

In [17]:
model = Sequential() #Creating LSTM Model
time_step = 1
features = 78
model.add(InputLayer(input_shape=(time_step,features)))
model.add(LSTM(64))
model.add(LSTM(64))
model.add(Dense(8,'relu'))
model.add(Dense(1,'linear'))
model.summay()

W0000 00:00:1739697885.156022   26061 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


ValueError: Input 0 of layer "lstm_1" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 64)