In [None]:
import pandas as pd
import numpy as np

# Read the raw Wednesday capture from the mounted Drive folder.
raw_csv_path = '/content/drive/MyDrive/DOS_Project/data_raw/Wednesday-workingHours.pcap_ISCX.csv'
df = pd.read_csv(raw_csv_path)

# Strip leading/trailing whitespace from every header so columns can be
# addressed by their plain names.
df.columns = [header.strip() for header in df.columns]

print("Initial shape:", df.shape)

Initial shape: (692703, 79)


In [None]:
# Count missing cells per column once; both reports below reuse this result
# instead of scanning the whole frame a second time.
missing = df.isnull().sum()
print("Missing values per column:\n", missing[missing > 0])

# Same counts expressed as a percentage of the number of rows.
missing_percent = (missing / len(df)) * 100
print("\nMissing percentage per column:\n", missing_percent[missing_percent > 0])

# Drop every row that contains at least one missing value. dropna() defaults
# to how='any', so no "too many missing" threshold is applied here.
df = df.dropna()


Missing values per column:
 Flow Bytes/s    1008
dtype: int64

Missing percentage per column:
 Flow Bytes/s    0.145517
dtype: float64


In [None]:
# Flag rows that repeat an earlier row (the first occurrence is not flagged)
# and report how many there are.
is_repeat = df.duplicated()
duplicates = is_repeat.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Keep only the first occurrence of each distinct row.
df.drop_duplicates(keep='first', inplace=True)


Number of duplicate rows: 80962


In [None]:
# Turn +/- infinity into NaN so that one dropna pass removes those rows
# together with any other rows that hold NaN.
infinite_values = [np.inf, -np.inf]
df.replace(to_replace=infinite_values, value=np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
def _to_numeric_if_possible(column):
    """Return `column` converted by pd.to_numeric, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Column holds values that cannot be parsed as numbers: keep it as is.
        return column


# pd.to_numeric(errors='ignore') is deprecated, so conversion failures are
# caught explicitly per column; the outcome is the same (unparseable columns
# are returned unchanged).
df = df.apply(_to_numeric_if_possible)

# Split the column names by dtype. This is computed before the Label cast
# below, so the counts describe the frame as it is right after coercion.
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

print(f"\nNumeric columns: {len(num_cols)}")
print(f"Categorical columns: {len(cat_cols)}")

# Convert label column to string type
df['Label'] = df['Label'].astype(str)

  df = df.apply(pd.to_numeric, errors='ignore')



Numeric columns: 78
Categorical columns: 1


In [None]:
# Keep only rows whose flow duration is non-negative.
if 'Flow Duration' in df.columns:
    df = df[df['Flow Duration'] >= 0]

# Trim the upper tail of the packet-count columns, one column after the
# other (the second cut-off is computed on the already-trimmed frame).
# NOTE(review): this runs on all traffic classes before label filtering, so
# it can also remove attack flows -- confirm that is intended.
UPPER_QUANTILE = 0.99
for col in ['Total Fwd Packets', 'Total Backward Packets']:
    if col in df.columns:
        upper_bound = df[col].quantile(UPPER_QUANTILE)
        # Inclusive comparison: rows equal to the cut-off are kept. A strict
        # '<' drops every tie at the threshold, which removes more than the
        # intended 1% and empties the frame when the cut-off equals the
        # column minimum.
        df = df[df[col] <= upper_bound]

In [None]:
dos_names = ['DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DDoS', 'DoS GoldenEye']

# Keep only BENIGN + DoS; rows with any other label are dropped.
df = df[df['Label'].isin(['BENIGN'] + dos_names)]

# Encode labels: BENIGN=0, DoS=1. The membership test is vectorised instead
# of calling a Python lambda once per row.
# NOTE: this cell is not re-runnable. Once Label holds 0/1, the string-based
# filter above matches nothing and would leave an empty frame.
df['Label'] = df['Label'].isin(dos_names).astype('int64')

print("\nLabel distribution:\n", df['Label'].value_counts())


Label distribution:
 Label
0    404448
1    193745
Name: count, dtype: int64


In [None]:
# Count negative entries in the packet-count and duration columns.
for col in ['Total Fwd Packets', 'Total Backward Packets', 'Flow Duration']:
    if col in df.columns:
        neg_count = (df[col] < 0).sum()
        print(f"Negative values in {col}: {neg_count}")

# Verify consistent label count (shown as proportions, not raw counts).
print("\nFinal Label distribution:\n", df['Label'].value_counts(normalize=True))

# Confirm no remaining NaN or Inf. isnull() does not flag +/-inf, so the
# numeric columns are checked for infinities separately.
print("Null values remaining:", df.isnull().sum().sum())
numeric_part = df.select_dtypes(include=[np.number])
inf_count = int(np.isinf(numeric_part).to_numpy().sum())
print("Inf values remaining:", inf_count)

Negative values in Total Fwd Packets: 0
Negative values in Total Backward Packets: 0
Negative values in Flow Duration: 0

Final Label distribution:
 Label
0    0.676116
1    0.323884
Name: proportion, dtype: float64
Null values remaining: 0


In [None]:


# Write the cleaned frame to the data_cleaned folder; index=False leaves the
# row index out of the file.
cleaned_csv_path = '/content/drive/MyDrive/DOS_Project/data_cleaned/wednesday_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("\n Saved final cleaned dataset to Google Drive.")



 Saved final cleaned dataset to Google Drive.
