In [12]:
import pandas as pd

# List of dataset file paths to check shape of the dataset
file_paths = [
    r"C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv",
]

# Check attack label counts in each file
print("🔍 Checking Labels in Individual Files Before Merging...\n")
pre_merge_labels = {}

for file in file_paths:
    df = pd.read_csv(file, usecols=['Label'], dtype=str)  # Ensure 'Label' column is read properly
    label_counts = df['Label'].value_counts()
    pre_merge_labels[file] = label_counts
    print(f"✅ {file}:\n{label_counts}\n")

# Store unique labels found across all datasets
unique_labels_before = set()
for labels in pre_merge_labels.values():
    unique_labels_before.update(labels.index)


🔍 Checking Labels in Individual Files Before Merging...

✅ C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv:
Label
Benign           544200
Infilteration     68871
Label                33
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv:
Label
DoS attacks-GoldenEye    41508
DoS attacks-Slowloris    10990
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv:
Label
DDoS attacks-LOIC-HTTP    576191
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv:
Label
Benign              1048009
Brute Force -Web        362
Brute Force -XSS        151
Name: count, dtype: int64



In [13]:
import pandas as pd

# List of dataset file paths
file_paths = [
    r"C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv"
]

# Store column structures
columns_dict = {}

for file in file_paths:
    df = pd.read_csv(file, nrows=5, low_memory=False)  # Load small portion to check structure
    columns_dict[file] = set(df.columns)
    print(f"✅ {file}: {len(df.columns)} columns")

# Compare all column names
common_cols = set.intersection(*columns_dict.values())
for file, cols in columns_dict.items():
    extra_cols = cols - common_cols
    missing_cols = common_cols - cols
    if extra_cols or missing_cols:
        print(f"⚠️ {file}: Extra Columns: {extra_cols} | Missing Columns: {missing_cols}")

✅ C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv: 80 columns
✅ C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv: 80 columns
✅ C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv: 80 columns
✅ C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv: 80 columns


In [14]:
for file in file_paths:
    df = pd.read_csv(file, usecols=['Label'], low_memory=False)
    label_counts = df['Label'].value_counts()
    print(f"✅ {file}: \n{label_counts}\n")

✅ C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv: 
Label
Benign           544200
Infilteration     68871
Label                33
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv: 
Label
DoS attacks-GoldenEye    41508
DoS attacks-Slowloris    10990
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv: 
Label
DDoS attacks-LOIC-HTTP    576191
Name: count, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv: 
Label
Benign              1048009
Brute Force -Web        362
Brute Force -XSS        151
Name: count, dtype: int64



In [15]:
for file in file_paths:
    df = pd.read_csv(file, low_memory=False)
    print(f"✅ {file}: Missing Values\n{df.isnull().sum()}\n")

✅ C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv: Missing Values
Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv: Missing Values
Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, dtype: int64

✅ C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv: Missing Values
Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, 

In [16]:
import pandas as pd

# List of dataset file paths
file_paths = [
    r"C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv",
    r"C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv",
    r"C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv"
]

# Output file for the merged dataset
output_file = r"C:\Users\S569652\Documents\INADS\data\Merged-Dataset.csv"

# Define chunk size to process data efficiently
chunk_size = 100000  # Load 100,000 rows at a time
first_chunk = True   # Flag to ensure header is written only once

# Merge and write in chunks
for file in file_paths:
    print(f"✅ Merging: {file}")
    for chunk in pd.read_csv(file, chunksize=chunk_size, low_memory=False):
        chunk.to_csv(output_file, mode='a', header=first_chunk, index=False)
        first_chunk = False  # Only write header for the first file

print(f"✅ Merged dataset saved at: {output_file}")

# ✅ Load merged dataset for final verification
df = pd.read_csv(output_file, usecols=['Label'], low_memory=False)
print("\n🔍 Final Label Distribution After Merging:")
print(df['Label'].value_counts())

✅ Merging: C:\Users\S569652\Documents\INADS\data\benign\Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv
✅ Merging: C:\Users\S569652\Documents\INADS\data\ddos\DoS_Attacks_Filtered.csv
✅ Merging: C:\Users\S569652\Documents\INADS\data\ddos\Friday-16-02-2018_TrafficForML_CICFlowMeter.csv
✅ Merging: C:\Users\S569652\Documents\INADS\data\mixed_attacks\Friday-23-02-2018_TrafficForML_CICFlowMeter.csv
✅ Merged dataset saved at: C:\Users\S569652\Documents\INADS\data\Merged-Dataset.csv

🔍 Final Label Distribution After Merging:
Label
Benign                    1592209
DDoS attacks-LOIC-HTTP     576191
Infilteration               68871
DoS attacks-GoldenEye       41508
DoS attacks-Slowloris       10990
Brute Force -Web              362
Brute Force -XSS              151
Label                          33
Name: count, dtype: int64


In [22]:
import pandas as pd

# Load the merged dataset
file_path = r"C:\Users\S569652\Documents\INADS\data\Merged-Dataset.csv"
df = pd.read_csv(file_path)

# Display basic dataset info
print("🔍 Dataset Info:")
print(df.info())  # Checks for data types, missing values, and memory usage

print("\n🔍 First Few Rows:")
print(df.head())  # Displays first few rows to check overall structure


  df = pd.read_csv(file_path)


🔍 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2290315 entries, 0 to 2290314
Data columns (total 80 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Dst Port           object
 1   Protocol           object
 2   Timestamp          object
 3   Flow Duration      object
 4   Tot Fwd Pkts       object
 5   Tot Bwd Pkts       object
 6   TotLen Fwd Pkts    object
 7   TotLen Bwd Pkts    object
 8   Fwd Pkt Len Max    object
 9   Fwd Pkt Len Min    object
 10  Fwd Pkt Len Mean   object
 11  Fwd Pkt Len Std    object
 12  Bwd Pkt Len Max    object
 13  Bwd Pkt Len Min    object
 14  Bwd Pkt Len Mean   object
 15  Bwd Pkt Len Std    object
 16  Flow Byts/s        object
 17  Flow Pkts/s        object
 18  Flow IAT Mean      object
 19  Flow IAT Std       object
 20  Flow IAT Max       object
 21  Flow IAT Min       object
 22  Fwd IAT Tot        object
 23  Fwd IAT Mean       object
 24  Fwd IAT Std        object
 25  Fwd IAT Max        object
 26

In [23]:
import pandas as pd

# Reload dataset (ensure low_memory=False)
file_path = r"C:\Users\S569652\Documents\INADS\data\Merged-Dataset.csv"
df = pd.read_csv(file_path, low_memory=False)

# Detect numeric columns that should NOT be objects
numeric_columns = [col for col in df.columns if col not in ['Timestamp', 'Label']]
problematic_columns = []

for col in numeric_columns:
    try:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    except Exception as e:
        print(f"⚠️ Error converting {col}: {e}")
        problematic_columns.append(col)

print(f"✅ Successfully converted {len(numeric_columns) - len(problematic_columns)} columns to numeric.")
print(f"⚠️ Columns that still have issues: {problematic_columns}")

✅ Successfully converted 78 columns to numeric.
⚠️ Columns that still have issues: []


In [24]:
cleaned_path = r"C:\Users\S569652\Documents\INADS\data\Merged-Dataset-Cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved at: {cleaned_path}")

✅ Cleaned dataset saved at: C:\Users\S569652\Documents\INADS\data\Merged-Dataset-Cleaned.csv


In [27]:
import pandas as pd

# Load cleaned dataset
cleaned_file_path = r"C:\Users\S569652\Documents\INADS\data\Merged-Dataset-Cleaned.csv"
df = pd.read_csv(cleaned_file_path)

# 🔍 1. Verify column count
print(f"✅ Dataset Shape: {df.shape}")
print("✅ Column Count:", len(df.columns))

# 🔍 2. Check missing values
print("🔍 Missing Values per Column:")
print(df.isnull().sum().sum())

# 🔍 3. Ensure all numerical columns are converted correctly
print("🔍 Data Types Summary:")
print(df.dtypes.value_counts())

# 🔍 4. Check final label distribution
print("🔍 Final Label Distribution After Cleaning:\n")
print(df["Label"].value_counts())

✅ Dataset Shape: (2290315, 80)
✅ Column Count: 80
🔍 Missing Values per Column:
10369
🔍 Data Types Summary:
float64    78
object      2
Name: count, dtype: int64
🔍 Final Label Distribution After Cleaning:

Label
Benign                    1592209
DDoS attacks-LOIC-HTTP     576191
Infilteration               68871
DoS attacks-GoldenEye       41508
DoS attacks-Slowloris       10990
Brute Force -Web              362
Brute Force -XSS              151
Label                          33
Name: count, dtype: int64


In [28]:
print("🔍 Columns with Missing Values:\n")
print(df.isnull().sum()[df.isnull().sum() > 0])

🔍 Columns with Missing Values:

Dst Port         33
Protocol         33
Flow Duration    33
Tot Fwd Pkts     33
Tot Bwd Pkts     33
                 ..
Active Min       33
Idle Mean        33
Idle Std         33
Idle Max         33
Idle Min         33
Length: 78, dtype: int64


In [29]:
print(df[df.isnull().any(axis=1)])

         Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
6         49688.0       6.0  28/02/2018 08:22:21            0.0           2.0   
138       49749.0       6.0  28/02/2018 08:29:33            0.0           2.0   
162       49753.0       6.0  28/02/2018 08:29:37            0.0           2.0   
163       49767.0       6.0  28/02/2018 08:29:37            0.0           2.0   
246       49836.0       6.0  28/02/2018 08:34:05            0.0           2.0   
...           ...       ...                  ...            ...           ...   
2285565   50632.0       6.0  23/02/2018 11:53:30            0.0           2.0   
2286474   52042.0       6.0  23/02/2018 03:34:08            0.0           2.0   
2287538   52102.0       6.0  23/02/2018 03:43:01            0.0           2.0   
2288132   49491.0       6.0  23/02/2018 08:17:04            0.0           2.0   
2288411   50894.0       6.0  23/02/2018 12:15:28            0.0           2.0   

         Tot Bwd Pkts  TotL

In [30]:
df = df[df["Flow Duration"] != 0]
print("✅ Removed rows with zero Flow Duration.")

✅ Removed rows with zero Flow Duration.


In [31]:
print("🔍 Missing Values After Cleanup:\n", df.isnull().sum().sum())  # Should return 0

🔍 Missing Values After Cleanup:
 2574


In [33]:
missing_values = df.isnull().sum()
print("🔍 Columns with Remaining Missing Values:\n", missing_values[missing_values > 0])

🔍 Columns with Remaining Missing Values:
 Dst Port         33
Protocol         33
Flow Duration    33
Tot Fwd Pkts     33
Tot Bwd Pkts     33
                 ..
Active Min       33
Idle Mean        33
Idle Std         33
Idle Max         33
Idle Min         33
Length: 78, dtype: int64


In [34]:
df_missing = df[df.isnull().any(axis=1)]
print(df_missing.head())
print(f"🔍 Total Rows with Missing Values: {len(df_missing)}")

        Dst Port  Protocol  Timestamp  Flow Duration  Tot Fwd Pkts  \
21838        NaN       NaN  Timestamp            NaN           NaN   
43117        NaN       NaN  Timestamp            NaN           NaN   
63291        NaN       NaN  Timestamp            NaN           NaN   
84013        NaN       NaN  Timestamp            NaN           NaN   
107719       NaN       NaN  Timestamp            NaN           NaN   

        Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
21838            NaN              NaN              NaN              NaN   
43117            NaN              NaN              NaN              NaN   
63291            NaN              NaN              NaN              NaN   
84013            NaN              NaN              NaN              NaN   
107719           NaN              NaN              NaN              NaN   

        Fwd Pkt Len Min  ...  Fwd Seg Size Min  Active Mean  Active Std  \
21838               NaN  ...               NaN       

In [35]:
df = df.dropna()
print("✅ Removed all rows with missing values.")

✅ Removed all rows with missing values.


In [36]:
missing_values_after = df.isnull().sum().sum()
print(f"🔍 Missing Values After Cleanup: {missing_values_after}")  

🔍 Missing Values After Cleanup: 0


In [37]:
df.to_csv("C:\\Users\\S569652\\Documents\\INADS\\data\\Merged-Dataset-Final.csv", index=False)
print("✅ Final cleaned dataset saved successfully!")

✅ Final cleaned dataset saved successfully!


In [38]:
import pandas as pd
import shutil

# ✅ Load the cleaned dataset
file_path = "C:\\Users\\S569652\\Documents\\INADS\\data\\Merged-Dataset-Final.csv"
df = pd.read_csv(file_path)

# 🔍 Step 1: Basic Dataset Information
print("\n✅ Dataset Overview")
print(f"Shape: {df.shape}")  # Rows and columns
print(f"Total Columns: {len(df.columns)}")
print("\n🔍 Data Types Summary:\n", df.dtypes.value_counts())  # Check dtype distribution

# 🔍 Step 2: Check for Missing Values
missing_values = df.isnull().sum()
total_missing = missing_values.sum()
if total_missing > 0:
    print(f"\n⚠️ Total Missing Values: {total_missing}")
    print("\n🔍 Columns with Missing Values:\n", missing_values[missing_values > 0])
else:
    print("\n✅ No Missing Values Found!")

# 🔍 Step 3: Label Distribution
print("\n🔍 Final Label Distribution:")
print(df["Label"].value_counts())

# 🔍 Step 4: Detect Anomalous Rows (like 'Label' in the Label column)
if "Label" in df["Label"].unique():
    print("\n⚠️ Anomalous Entries Found with Label 'Label':")
    print(df[df["Label"] == "Label"])
else:
    print("\n✅ No Anomalous Label Entries Found!")

# 🔍 Step 5: Check for Duplicates
duplicates = df.duplicated().sum()
if duplicates > 0:
    print(f"\n⚠️ Duplicate Rows Found: {duplicates}")
else:
    print("\n✅ No Duplicates Found!")

# ✅ Step 6: Backup the Final Dataset (Avoid Reprocessing)
backup_path = "C:\\Users\\S569652\\Documents\\INADS\\data\\Backup-Merged-Dataset-Final.csv"
shutil.copy(file_path, backup_path)
print(f"\n✅ Backup created at: {backup_path}")

# 🔍 Step 7: Decide Next Steps
print("\n🚀 Next Steps to Consider:")
print("1️⃣ Feature Selection: Identify the most important features for training.")
print("2️⃣ Data Normalization: Ensure all numerical features are scaled appropriately.")
print("3️⃣ Data Balancing: Address class imbalance if needed.")

print("\n✅ Dataset Inspection & Backup Complete! 🎯 Ready for Next Steps.")



✅ Dataset Overview
Shape: (2278405, 80)
Total Columns: 80

🔍 Data Types Summary:
 float64    78
object      2
Name: count, dtype: int64

✅ No Missing Values Found!

🔍 Final Label Distribution:
Label
Benign                    1580967
DDoS attacks-LOIC-HTTP     576191
Infilteration               68236
DoS attacks-GoldenEye       41508
DoS attacks-Slowloris       10990
Brute Force -Web              362
Brute Force -XSS              151
Name: count, dtype: int64

✅ No Anomalous Label Entries Found!

⚠️ Duplicate Rows Found: 8088

✅ Backup created at: C:\Users\S569652\Documents\INADS\data\Backup-Merged-Dataset-Final.csv

🚀 Next Steps to Consider:
1️⃣ Feature Selection: Identify the most important features for training.
2️⃣ Data Normalization: Ensure all numerical features are scaled appropriately.
3️⃣ Data Balancing: Address class imbalance if needed.

✅ Dataset Inspection & Backup Complete! 🎯 Ready for Next Steps.


In [40]:
import pandas as pd
import numpy as np

# 📌 Load the dataset
file_path = "C:\\Users\\S569652\\Documents\\INADS\\data\\Merged-Dataset-Final.csv"
df = pd.read_csv(file_path)

# 📌 Display basic dataset information
print(f"✅ Dataset Loaded - Shape: {df.shape}")
print("\n🔍 Column Names & Data Types:")
print(df.dtypes)

# 📌 Check for missing values
missing_values = df.isnull().sum()
print("\n🔍 Missing Values Per Column:")
print(missing_values[missing_values > 0])

# 📌 Check unique labels
print("\n🔍 Unique Labels & Counts:")
print(df["Label"].value_counts())

# 📌 Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print(f"\n⚠️ Duplicate Rows: {duplicate_rows}")

✅ Dataset Loaded - Shape: (2278405, 80)

🔍 Column Names & Data Types:
Dst Port         float64
Protocol         float64
Timestamp         object
Flow Duration    float64
Tot Fwd Pkts     float64
                  ...   
Idle Mean        float64
Idle Std         float64
Idle Max         float64
Idle Min         float64
Label             object
Length: 80, dtype: object

🔍 Missing Values Per Column:
Series([], dtype: int64)

🔍 Unique Labels & Counts:
Label
Benign                    1580967
DDoS attacks-LOIC-HTTP     576191
Infilteration               68236
DoS attacks-GoldenEye       41508
DoS attacks-Slowloris       10990
Brute Force -Web              362
Brute Force -XSS              151
Name: count, dtype: int64

⚠️ Duplicate Rows: 8088


In [41]:
# Count the exact duplicate rows (all columns match)
duplicate_rows = df[df.duplicated(keep=False)]  # Keep=False shows all duplicates, not just first occurrence
print(f"🔍 Total Identical Duplicates Found: {duplicate_rows.shape[0]}")

# Display a sample of duplicate rows for verification
print("\n🔍 Sample Duplicate Rows:")
print(duplicate_rows.head(10))

🔍 Total Identical Duplicates Found: 15292

🔍 Sample Duplicate Rows:
     Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
42    49689.0       6.0  28/02/2018 08:25:11            1.0           2.0   
53      443.0       6.0  28/02/2018 08:25:42          128.0           2.0   
55      443.0       6.0  28/02/2018 08:26:06          250.0           2.0   
188   49853.0       6.0  28/02/2018 08:31:22            1.0           2.0   
213     443.0       6.0  28/02/2018 08:32:46          128.0           2.0   
245   49832.0       6.0  28/02/2018 08:34:06            1.0           2.0   
293   49903.0       6.0  28/02/2018 08:36:12           20.0           2.0   
315   49894.0       6.0  28/02/2018 08:37:05            1.0           2.0   
316   49889.0       6.0  28/02/2018 08:37:05            1.0           2.0   
325   49902.0       6.0  28/02/2018 08:37:06            1.0           2.0   

     Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
42          

In [46]:
import pandas as pd

# Load the cleaned dataset (Modify path if necessary)
file_path = "C:\\Users\\S569652\\Documents\\INADS\\data\\Merged-Dataset-Final.csv"
df_cleaned = pd.read_csv(file_path)

print("✅ Dataset Loaded Successfully!")
print(f"📊 Dataset Shape: {df_cleaned.shape}")
print(f"📌 Total Columns: {len(df_cleaned.columns)}\n")

# 1️⃣ Missing Values Check
missing_values = df_cleaned.isnull().sum()
missing_summary = missing_values[missing_values > 0]

if missing_summary.empty:
    print("✅ No Missing Values Found!\n")
else:
    print("⚠️ Missing Values Present:")
    print(missing_summary, "\n")

# 2️⃣ Label Distribution Check
print("🔍 Final Label Distribution:")
print(df_cleaned["Label"].value_counts(), "\n")

# 3️⃣ Data Types & Structure Check
print("🔍 Data Types Summary:\n")
print(df_cleaned.dtypes.value_counts())  # Ensures correct numerical/object types

print("\n🔍 Sample Data (5 Random Rows):")
print(df_cleaned.sample(5))  # View random sample rows

# 4️⃣ Confirm Dataset Shape & Columns
print(f"✅ Final Dataset Shape: {df_cleaned.shape}")  # Confirm expected row count
print(f"✅ Total Columns: {len(df_cleaned.columns)}\n")  # Ensure 80 columns remain

# 5️⃣ Detect Any Remaining Duplicates
duplicate_count = df_cleaned.duplicated().sum()
if duplicate_count == 0:
    print("✅ No Remaining Duplicate Rows!\n")
else:
    print(f"⚠️ Warning: {duplicate_count} Duplicate Rows Still Present!\n")


✅ Dataset Loaded Successfully!
📊 Dataset Shape: (2278405, 80)
📌 Total Columns: 80

✅ No Missing Values Found!

🔍 Final Label Distribution:
Label
Benign                    1580967
DDoS attacks-LOIC-HTTP     576191
Infilteration               68236
DoS attacks-GoldenEye       41508
DoS attacks-Slowloris       10990
Brute Force -Web              362
Brute Force -XSS              151
Name: count, dtype: int64 

🔍 Data Types Summary:

float64    78
object      2
Name: count, dtype: int64

🔍 Sample Data (5 Random Rows):
                   Dst Port           Protocol            Timestamp  \
874177   80.000000000000000  6.000000000000000  20/02/2018 10:20:03   
2251056 443.000000000000000  6.000000000000000  23/02/2018 11:30:49   
551241   53.000000000000000 17.000000000000000  28/02/2018 10:06:09   
404695   53.000000000000000 17.000000000000000  28/02/2018 08:22:10   
595507   53.000000000000000 17.000000000000000  28/02/2018 03:56:23   

                    Flow Duration       Tot Fwd Pkts 

In [53]:
# Drop completely identical duplicates
df_cleaned = df.drop_duplicates()

# Save the cleaned dataset
cleaned_file_path = "C:\\Users\\S569652\\Documents\\INADS\\data\\Merged-Dataset-Final.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

# Display updated stats
print(f"✅ Duplicates Removed! New Dataset Shape: {df_cleaned.shape}")
print("\n🔍 Final Label Distribution:\n", df_cleaned["Label"].value_counts())


✅ Duplicates Removed! New Dataset Shape: (2270317, 80)

🔍 Final Label Distribution:
 Label
Benign                    1573665
DDoS attacks-LOIC-HTTP     576175
Infilteration               68224
DoS attacks-GoldenEye       41455
DoS attacks-Slowloris       10285
Brute Force -Web              362
Brute Force -XSS              151
Name: count, dtype: int64
