In [1]:
import pandas as pd
import numpy as np

# STEP 1: Load dataset
df = pd.read_csv('/content/drive/MyDrive/DOS_Project/data_raw/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
print("Initial shape:", df.shape)

# STEP 2: Fix column name spaces
df.columns = df.columns.str.strip()

# STEP 3: Handle missing values
print("\nMissing values before cleaning:")
print(df.isnull().sum().sum(), "total missing cells")

df.dropna(inplace=True)  # removes rows with any NaN values
print("Shape after removing missing values:", df.shape)

# STEP 4: Remove duplicates
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")
df.drop_duplicates(inplace=True)
print("Shape after removing duplicates:", df.shape)

# STEP 5: Handle infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
print("Shape after removing infinite values:", df.shape)

# STEP 6: Fix data types
df = df.apply(pd.to_numeric, errors='ignore')
df['Label'] = df['Label'].astype(str)

# STEP 7: Correct Labeling / Target Variable
dos_names = ['DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DDoS', 'DoS GoldenEye']

# Keep only BENIGN + DoS rows
df = df[df['Label'].isin(['BENIGN'] + dos_names)]

# Encode labels
df['Label'] = df['Label'].apply(lambda x: 1 if x in dos_names else 0)

print("\nLabel distribution:\n", df['Label'].value_counts())

# STEP 8: Remove outliers and invalid values
if 'Flow Duration' in df.columns:
    df = df[df['Flow Duration'] >= 0]

# Optional: cap extreme packet counts
for col in ['Total Fwd Packets', 'Total Backward Packets']:
    if col in df.columns:
        df = df[df[col] < df[col].quantile(0.99)]

# STEP 9: Validate
print("\nValidation checks:")
print("Null values:", df.isnull().sum().sum())
print("Duplicates:", df.duplicated().sum())
print("Final shape:", df.shape)

# STEP 10: Save cleaned dataset
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p /content/drive/MyDrive/DOS_Project
df.to_csv('/content/drive/MyDrive/DOS_Project/data_cleaned/Friday1.csv', index=False)

print("\n Cleaning complete! File saved as 'Friday1.csv' in Google Drive.")


Initial shape: (225745, 79)

Missing values before cleaning:
4 total missing cells
Shape after removing missing values: (225741, 79)

Duplicate rows: 2633
Shape after removing duplicates: (223108, 79)
Shape after removing infinite values: (223082, 79)


  df = df.apply(pd.to_numeric, errors='ignore')



Label distribution:
 Label
1    128014
0     95068
Name: count, dtype: int64

Validation checks:
Null values: 0
Duplicates: 0
Final shape: (218504, 79)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

âœ… Cleaning complete! File saved as 'Friday1.csv' in Google Drive.
