# **DDoS detection notebook for preprocessing**

## This file will perform these list of actions on the CIS dataset

*   Data cleaning
*   Feature extracting
*   Feature formmatting
*   Removing null values

In [1]:
"""
Data preprocessing Jupiter notebook for the CICDDoS2019 dataset

Author: Thomas Roethenbaugh
Date: 2025-01-29
Version: 0.8
"""

# Data science libraries
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mounting my google drive to pull in the datasets
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

# Verify by listing the files in the drive
#!ls /content/drive/My\ Drive/ | grep .parquet

# List of Parquet files
files = [
    "/content/drive/MyDrive/DNS-testing.parquet", "/content/drive/MyDrive/LDAP-testing.parquet", "/content/drive/MyDrive/LDAP-training.parquet", "/content/drive/MyDrive/MSSQL-testing.parquet",
    "/content/drive/MyDrive/MSSQL-training.parquet", "/content/drive/MyDrive/NetBIOS-testing.parquet", "/content/drive/MyDrive/NetBIOS-training.parquet", "/content/drive/MyDrive/NTP-testing.parquet",
    "/content/drive/MyDrive/Portmap-training.parquet", "/content/drive/MyDrive/SNMP-testing.parquet", "/content/drive/MyDrive/Syn-testing.parquet", "/content/drive/MyDrive/Syn-training.parquet",
    "/content/drive/MyDrive/TFTP-testing.parquet", "/content/drive/MyDrive/UDPLag-testing.parquet", "/content/drive/MyDrive/UDPLag-training.parquet", "/content/drive/MyDrive/UDP-testing.parquet",
    "/content/drive/MyDrive/UDP-training.parquet"
]

# Verify files are uploaded
#df = pd.read_parquet("DNS-testing.parquet")
#print(df.head(3))  # Display first 3 rows

# Loop through each file and display the first 3 rows
for file in files:
    if file in files:
        print(f"\n=== {file} ===")
        df = pd.read_parquet(file)
        #print("Columns:", df.columns.tolist())  # Display column namess

        column_name = 'Label'

        # Get unique values from the specified colum
        unique_values = df[column_name].unique()
        print(unique_values)





Mounted at /content/drive/

=== /content/drive/MyDrive/DNS-testing.parquet ===
Unique values in column 'Label':
['DrDoS_DNS', 'Benign']
Categories (2, object): ['Benign', 'DrDoS_DNS']

=== /content/drive/MyDrive/LDAP-testing.parquet ===
Unique values in column 'Label':
['DrDoS_LDAP', 'Benign']
Categories (2, object): ['Benign', 'DrDoS_LDAP']

=== /content/drive/MyDrive/LDAP-training.parquet ===
Unique values in column 'Label':
['NetBIOS', 'LDAP', 'Benign']
Categories (3, object): ['Benign', 'LDAP', 'NetBIOS']

=== /content/drive/MyDrive/MSSQL-testing.parquet ===
Unique values in column 'Label':
['DrDoS_MSSQL', 'Benign']
Categories (2, object): ['Benign', 'DrDoS_MSSQL']

=== /content/drive/MyDrive/MSSQL-training.parquet ===
Unique values in column 'Label':
['MSSQL', 'LDAP', 'Benign']
Categories (3, object): ['Benign', 'LDAP', 'MSSQL']

=== /content/drive/MyDrive/NetBIOS-testing.parquet ===
Unique values in column 'Label':
['DrDoS_NetBIOS', 'Benign']
Categories (2, object): ['Benign', 'D

In [None]:
"""
Analysis of the NSL-KDD dataset
"""

train_data = pd.read_csv('/content/drive/MyDrive/KDDTrain+.txt', header=None)
test_data = pd.read_csv('/content/drive/MyDrive/KDDTest+.txt', header=None)

print(train_data.tail)  # Display column names

print(test_data.head)  # Display column names



<bound method NDFrame.tail of         0    1         2   3     4     5   6   7   8   9   ...    33    34  \
0        0  tcp  ftp_data  SF   491     0   0   0   0   0  ...  0.17  0.03   
1        0  udp     other  SF   146     0   0   0   0   0  ...  0.00  0.60   
2        0  tcp   private  S0     0     0   0   0   0   0  ...  0.10  0.05   
3        0  tcp      http  SF   232  8153   0   0   0   0  ...  1.00  0.00   
4        0  tcp      http  SF   199   420   0   0   0   0  ...  1.00  0.00   
...     ..  ...       ...  ..   ...   ...  ..  ..  ..  ..  ...   ...   ...   
125968   0  tcp   private  S0     0     0   0   0   0   0  ...  0.10  0.06   
125969   8  udp   private  SF   105   145   0   0   0   0  ...  0.96  0.01   
125970   0  tcp      smtp  SF  2231   384   0   0   0   0  ...  0.12  0.06   
125971   0  tcp    klogin  S0     0     0   0   0   0   0  ...  0.03  0.05   
125972   0  tcp  ftp_data  SF   151     0   0   0   0   0  ...  0.30  0.03   

          35    36    37    38   