In [3]:
import numpy as np
import pandas as pd
import glob, os

In [4]:
# Folder that holds the raw CSV files (relative to the notebook's working directory)
folder_path = "Dataset"

# Collect every CSV in the folder. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the row order of the
# merged frame reproducible from run to run and machine to machine.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail early with a clear message instead of letting pd.concat raise
# "No objects to concatenate" when the folder is missing or empty.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in folder '{folder_path}'")

# Read each file into its own DataFrame
dfs = [pd.read_csv(file) for file in csv_files]

# Stack all files row-wise into a single DataFrame with a fresh 0..n-1 index
merged_df = pd.concat(dfs, ignore_index=True)

# Display first few rows to confirm the merge
display(merged_df.head())

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3268,112740690,32,16,6448,1152,403,0,201.5,204.724205,...,32,359.4286,11.99802,380,343,16100000.0,498804.8,16400000,15400000,BENIGN
1,389,112740560,32,16,6448,5056,403,0,201.5,204.724205,...,32,320.2857,15.74499,330,285,16100000.0,498793.7,16400000,15400000,BENIGN
2,0,113757377,545,0,0,0,0,0,0.0,0.0,...,0,9361829.0,7324646.0,18900000,19,12200000.0,6935824.0,20800000,5504997,BENIGN
3,5355,100126,22,0,616,0,28,28,28.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,0,54760,4,0,0,0,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


### Data Inspection

In [5]:
# Structural overview: per-column dtype and non-null count (printed to stdout)
merged_df.info()

# Summary statistics for every column; include='all' also summarises the
# non-numeric columns (count / unique / top / freq) alongside the numeric ones.
summary_stats = merged_df.describe(include='all')
summary_stats

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416778 entries, 0 to 416777
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             416778 non-null  int64  
 1    Flow Duration                416778 non-null  int64  
 2    Total Fwd Packets            416778 non-null  int64  
 3    Total Backward Packets       416778 non-null  int64  
 4   Total Length of Fwd Packets   416778 non-null  int64  
 5    Total Length of Bwd Packets  416778 non-null  int64  
 6    Fwd Packet Length Max        416778 non-null  int64  
 7    Fwd Packet Length Min        416778 non-null  int64  
 8    Fwd Packet Length Mean       416778 non-null  float64
 9    Fwd Packet Length Std        416778 non-null  float64
 10  Bwd Packet Length Max         416778 non-null  int64  
 11   Bwd Packet Length Min        416778 non-null  int64  
 12   Bwd Packet Length Mean       416778 non-nul

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,...,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778.0,416778
unique,,,,,,,,,,,...,,,,,,,,,,3
top,,,,,,,,,,,...,,,,,,,,,,BENIGN
freq,,,,,,,,,,,...,,,,,,,,,,286785
mean,7906.276274,14134740.0,8.977957,10.000149,783.8437,16239.2,371.776399,26.055864,113.055645,139.40983,...,23.363232,139511.4,31678.75,196815.8,121827.5,7386322.0,2036628.0,8830855.0,5917528.0,
std,18446.81082,31233660.0,743.301976,1001.993632,5876.146,2244220.0,1433.866753,123.520052,384.08184,602.451016,...,5.678366,832299.7,368434.0,1120016.0,776893.0,18899400.0,9655660.0,22469420.0,17409560.0,
min,0.0,-12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,53.0,7729.0,2.0,1.0,26.0,6.0,6.0,0.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
50%,80.0,196446.5,3.0,2.0,52.0,156.0,30.0,6.0,18.5,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
75%,443.0,6010748.0,5.0,5.0,92.0,4799.0,49.0,33.0,46.0,10.263203,...,32.0,695.0,0.0,695.0,666.0,5588871.0,0.0,5588911.0,5346825.0,


In [7]:
# Distinct class labels present in the data.
# Note: the column name used here starts with a space (' Label').
label_values = merged_df[' Label'].unique()
print(label_values)

['BENIGN' 'Bot' 'DDoS']


### Check for null and Infinity values in Columns and drop rows with null values

In [8]:
# Report which features/columns contain at least one null value.
# A single vectorized pass yields one boolean flag per column.
null_flags = merged_df.isnull().any()

for col, has_null in null_flags.items():
    if has_null:
        print(f"{col}: {has_null}")

Flow Bytes/s: True


In [10]:
# Drop every row that has a null in any column.
# Reassign instead of using inplace=True: the result is the same, but the
# statement reads as a plain transformation and avoids mutating the frame
# object that earlier cells displayed.
merged_df = merged_df.dropna()

# Invariant check: no nulls may remain after this step
assert not merged_df.isnull().values.any(), "null values remain after dropna()"

#### Remove rows with infinity values

In [11]:
# Identify numeric columns only (skip non-numerical ones like the label)
numeric_cols = merged_df.select_dtypes(include=[np.number]).columns

# Boolean frame marking every +/-inf cell. Computed once and reused for both
# the per-column report and the row filter below (the mask spans the whole
# numeric part of the frame, so building it twice is wasted work).
inf_mask = np.isinf(merged_df[numeric_cols])

# Number of infinite values per numeric column
inf_counts = inf_mask.sum()

# Keep only the columns that actually contain infinity
inf_columns = inf_counts[inf_counts > 0]

if not inf_columns.empty:
    print("Columns containing infinity values:")
    for col, count in inf_columns.items():
        print(f" - {col}: {count} infinite values")
else:
    print("No infinite values found in numeric columns.")

# Drop rows that contain an infinity in any numeric column.
# Boolean indexing builds a new frame, so merged_df itself is not modified
# here and no defensive copy of the full frame is needed beforehand; the
# trailing .copy() makes the filtered result an independent frame.
rows_before = len(merged_df)
df = merged_df[~inf_mask.any(axis=1)].copy()
rows_after = len(df)

print(f"Rows with infinity values removed: {rows_before - rows_after}")
print(f"Remaining rows: {rows_after}")

# Continue the pipeline with the filtered frame
merged_df = df


No infinite values found in numeric columns.
Rows with infinity values removed: 0
Remaining rows: 416622


### Check and remove duplicates

In [12]:
# Count fully duplicated rows. The count must be printed explicitly: a bare
# expression is only displayed when it is the last statement of the cell,
# so it would otherwise be computed and silently discarded.
n_duplicates = merged_df.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")

# Keep the first occurrence of each duplicated row (drop_duplicates default).
# Reassign instead of using inplace=True so the step is a plain transformation.
merged_df = merged_df.drop_duplicates()

# Confirm the new row count and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 405401 entries, 0 to 416777
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             405401 non-null  int64  
 1    Flow Duration                405401 non-null  int64  
 2    Total Fwd Packets            405401 non-null  int64  
 3    Total Backward Packets       405401 non-null  int64  
 4   Total Length of Fwd Packets   405401 non-null  int64  
 5    Total Length of Bwd Packets  405401 non-null  int64  
 6    Fwd Packet Length Max        405401 non-null  int64  
 7    Fwd Packet Length Min        405401 non-null  int64  
 8    Fwd Packet Length Mean       405401 non-null  float64
 9    Fwd Packet Length Std        405401 non-null  float64
 10  Bwd Packet Length Max         405401 non-null  int64  
 11   Bwd Packet Length Min        405401 non-null  int64  
 12   Bwd Packet Length Mean       405401 non-null  fl

### Save the cleaned data

In [14]:
# NOTE(review): "begnin" in the file name looks like a typo for "benign"; it is
# kept unchanged because other code may read this exact path — confirm before renaming.
output_path = "clean_data/cleaned_DDoS_and_begnin_traffic_dataset.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index is not written to the file
merged_df.to_csv(output_path, index=False)