In [None]:
import os

import numpy as np
import pandas as pd


# Location of the raw flow CSV on Google Drive.
# NOTE(review): this path is under /content/drive, which the notebook only
# mounts in its final cell — make sure Drive is mounted before running this.
path = '/content/drive/MyDrive/DOS_Project/data_raw/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'
df = pd.read_csv(path)

# Trim surrounding whitespace from the header names so later cells can refer
# to columns by their clean names.
df.columns = df.columns.str.strip()

# Quick look at what was loaded: size, a few rows, and the leading column names.
print(f"Initial shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())
print("\nColumns (first 30):")
print(df.columns[:30].tolist())


Initial shape: (286467, 79)

First 5 rows:


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,22,160,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,35396,77,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN



Columns (first 30):
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min']


In [None]:
#  Check missing values
# Per-column NaN tally, plus the same figure expressed as a share of all rows.
missing_counts = df.isna().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Only columns that actually have gaps are worth showing, worst first.
affected_counts = missing_counts[missing_counts.gt(0)].sort_values(ascending=False)
affected_percent = missing_percent[missing_percent.gt(0)].sort_values(ascending=False)

print("Columns with missing values (>0):")
print(affected_counts)
print("\nPercentage (only >0):")
print(affected_percent)
print(f"\nTotal missing cells: {missing_counts.sum()}")


Columns with missing values (>0):
Flow Bytes/s    15
dtype: int64

Percentage (only >0):
Flow Bytes/s    0.005236
dtype: float64

Total missing cells: 15


In [None]:
# Drop rows that contain any missing values
# (row-wise: a single NaN in any column removes the whole row; df is modified in place)
df.dropna(axis=0, how='any', inplace=True)

# Confirm the frame is now free of NaN cells
remaining_missing = df.isna().sum().sum()
print(f"Shape after removing missing values: {df.shape}")
print(f"Any missing left: {remaining_missing}")


Shape after removing missing values: (286452, 79)
Any missing left: 0


In [None]:
#  Check and remove duplicates
# A row counts as a duplicate when every column matches an earlier row.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates == 0:
    print("No duplicates found.")
else:
    # Keep the first occurrence of each repeated row; df is modified in place.
    df.drop_duplicates(keep='first', inplace=True)
    print("Duplicates removed.")

print(f"Shape after duplicate removal: {df.shape}")


Number of duplicate rows: 72353
Duplicates removed.
Shape after duplicate removal: (214099, 79)


In [None]:
#  Replace infinite values with NaN and remove them
# (numpy is already imported as `np` in the notebook's first cell, so it is
#  not re-imported here)

# isinf is only defined for numeric data, so count over the numeric columns only.
numeric_view = df.select_dtypes(include=[np.number])
inf_count = np.isinf(numeric_view).sum().sum()
print("Total infinite numeric cells before replacement:", inf_count)

# Replace ±inf with NaN, then drop every row that now contains a NaN.
# Both steps modify df in place.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Verify
print("Any NaN left after dropping infinities:", df.isnull().sum().sum())
print("Shape after cleaning infinities:", df.shape)


Total infinite numeric cells before replacement: 644
Any NaN left after dropping infinities: 0
Shape after cleaning infinities: (213777, 79)


In [None]:
#  Fix Data Types
def _to_numeric_if_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if it cannot be parsed.

    This replaces `pd.to_numeric(..., errors='ignore')`, which is deprecated
    and emits a FutureWarning; catching the exception explicitly gives the
    same "leave non-numeric columns alone" result.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert numeric-looking columns; columns that fail to parse are kept as they are.
df = df.apply(_to_numeric_if_possible)

# Ensure label is string type
df['Label'] = df['Label'].astype(str)

# Show summary of data types
print("Column type summary:")
print(df.dtypes.value_counts())

# Show first 10 columns and their types for a quick check
print("\nSample of column data types:")
print(df.dtypes.head(10))


Column type summary:
int64      54
float64    24
object      1
Name: count, dtype: int64

Sample of column data types:
Destination Port                 int64
Flow Duration                    int64
Total Fwd Packets                int64
Total Backward Packets           int64
Total Length of Fwd Packets      int64
Total Length of Bwd Packets      int64
Fwd Packet Length Max            int64
Fwd Packet Length Min            int64
Fwd Packet Length Mean         float64
Fwd Packet Length Std          float64
dtype: object


  df = df.apply(pd.to_numeric, errors='ignore')  # convert numeric-looking columns


In [None]:
#  Correct Labeling
print("Unique labels before encoding:")
print(df['Label'].unique())

# Define DoS attack names (add others if needed)
dos_names = ['DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DDoS', 'DoS GoldenEye']

# Keep only BENIGN + DoS-related rows; rows with any other label are dropped.
# .copy() makes the result an independent frame, so the column assignment
# below never writes into a view of the unfiltered data.
df = df[df['Label'].isin(['BENIGN'] + dos_names)].copy()

# Encode: 0 = BENIGN, 1 = DoS
# Vectorised membership test instead of a per-row Python lambda; after the
# filter above every remaining label is either BENIGN or one of dos_names.
df['Label'] = df['Label'].isin(dos_names).astype('int64')

# Check distribution
print("\nLabel distribution after encoding:")
print(df['Label'].value_counts())


Unique labels before encoding:
['BENIGN' 'PortScan']

Label distribution after encoding:
Label
0    123083
Name: count, dtype: int64


In [None]:
# Keep only BENIGN rows (the labeling step encodes BENIGN as 0)
benign_mask = df['Label'].eq(0)
df = df.loc[benign_mask]

print(f"After removing PortScan, dataset shape: {df.shape}")
print("Label distribution:\n", df['Label'].value_counts())


After removing PortScan, dataset shape: (123083, 79)
Label distribution:
 Label
0    123083
Name: count, dtype: int64


In [None]:
#  Outlier and impossible value check

# 1) Negative values are impossible for these duration / count / length
#    columns: report them per column, then drop every row that has one.
#    (Previously they were only counted, so they survived into the saved data.)
check_cols = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
              'Packet Length Mean', 'Fwd Packet Length Max']
present_cols = [col for col in check_cols if col in df.columns]
for col in present_cols:
    negatives = (df[col] < 0).sum()
    print(f"Negative values in {col}: {negatives}")

if present_cols:
    has_negative = (df[present_cols] < 0).any(axis=1)
    df = df[~has_negative]
    print(f"Rows removed for negative values: {has_negative.sum()}")

# 2) Trim extreme values: rows above the 99th percentile of each column are
#    REMOVED (not clipped). The filters run one after another, so each
#    percentile is computed on the rows that survived the previous filter.
for col in ['Total Fwd Packets', 'Total Backward Packets', 'Flow Duration']:
    if col in df.columns:
        q99 = df[col].quantile(0.99)
        df = df[df[col] <= q99]
        print(f"{col}: capped at 99th percentile value = {q99}")

print("\nShape after removing extreme outliers:", df.shape)


Negative values in Flow Duration: 35
Negative values in Total Fwd Packets: 0
Negative values in Total Backward Packets: 0
Negative values in Packet Length Mean: 0
Negative values in Fwd Packet Length Max: 0
Total Fwd Packets: capped at 99th percentile value = 63.0
Total Backward Packets: capped at 99th percentile value = 44.0
Flow Duration: capped at 99th percentile value = 117930795.9

Shape after removing extreme outliers: (119484, 79)


In [None]:
# Final sanity pass: the cleaned frame should have no NaN cells and no repeated rows.
remaining_nulls = df.isna().sum().sum()
remaining_duplicates = df.duplicated().sum()
print(f"Any remaining nulls: {remaining_nulls}")
print(f"Any duplicates left: {remaining_duplicates}")

print(f"\nFinal shape: {df.shape}")
# Summary statistics for the first six numeric columns only, to keep the output short.
print("\nNumeric summary (top 6 cols):")
numeric_head = df.select_dtypes(include=[np.number]).iloc[:, :6]
display(numeric_head.describe())


Any remaining nulls: 0
Any duplicates left: 0

Final shape: (119484, 79)

Numeric summary (top 6 cols):


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets
count,119484.0,119484.0,119484.0,119484.0,119484.0,119484.0
mean,7671.870594,10017370.0,4.367447,3.673488,394.798124,1344.082856
std,17820.288315,28441670.0,6.112847,6.047281,1704.723145,4788.716729
min,0.0,-13.0,1.0,0.0,0.0,0.0
25%,53.0,195.0,2.0,1.0,31.0,6.0
50%,80.0,31018.0,2.0,2.0,68.0,142.0
75%,443.0,337662.5,4.0,2.0,140.0,320.0
max,65300.0,117930600.0,63.0,44.0,112944.0,83704.0


In [None]:
# Size and class balance of the frame that is about to be saved.
print(f"Final dataset shape: {df.shape}")
print("\nClass distribution (binary):")
print(df['Label'].value_counts())

# Memory usage
# deep=True also measures the contents of object columns, not just their pointers.
bytes_used = df.memory_usage(deep=True).sum()
print("\nMemory usage (MB):")
print(bytes_used / (2**20))


Final dataset shape: (119484, 79)

Class distribution (binary):
Label
0    119484
Name: count, dtype: int64

Memory usage (MB):
72.92724609375


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save the clean BENIGN dataset
benign_path = '/content/drive/MyDrive/DOS_Project/data_cleaned/Friday2.csv'

# Create the folder the file is actually written to (data_cleaned), including
# any missing parents. Creating only the project root is not enough: to_csv
# fails if the destination directory does not exist.
os.makedirs(os.path.dirname(benign_path), exist_ok=True)

df.to_csv(benign_path, index=False)

print("\n BENIGN dataset saved successfully at:", benign_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

 BENIGN dataset saved successfully at: /content/drive/MyDrive/DOS_Project/data_cleaned/Friday2.csv
