In [None]:
import pandas as pd

# Merge the per-day cleaned CSV exports into one de-duplicated dataset and save it.
# The directory is named once so the input and output paths cannot drift apart.
DATA_DIR = "/content/drive/MyDrive/DOS_Project/data_cleaned"

# Input files; they are concatenated in exactly this order.
files = [
    f"{DATA_DIR}/monday_cleaned.csv",
    f"{DATA_DIR}/tuesday_cleaned.csv",
    f"{DATA_DIR}/wednesday_cleaned.csv",
    f"{DATA_DIR}/Friday1.csv",
    f"{DATA_DIR}/Friday2.csv",
    f"{DATA_DIR}/Friday3.csv",
]

dfs = [pd.read_csv(f) for f in files]

# ignore_index=True gives the stacked frame a fresh 0..n-1 index; rows that are
# exact duplicates across all columns are then dropped (first occurrence kept).
# Reassigning instead of mutating in place keeps the cell safe to re-run.
combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates()

print(" Combined shape:", combined_df.shape)

# index=False: the row index carries no information, so it is not written out.
combined_df.to_csv(f"{DATA_DIR}/final_combined_dataset.csv", index=False)
print(" Saved: final_combined_dataset.csv")


 Combined shape: (1911227, 79)
 Saved: final_combined_dataset.csv


# Load dataset


In [None]:
import pandas as pd

# Read the merged dataset back from Drive, report its size and preview it.
combined_path = "/content/drive/MyDrive/DOS_Project/data_cleaned/final_combined_dataset.csv"
df = pd.read_csv(combined_path)
print(df.shape)  # (rows, columns)
df.head()


(1911227, 79)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,49486,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,49486,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,88,609,7,4,484,414,233,0,69.142857,111.967895,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


## Check class distribution


In [None]:
# Count how many rows carry each distinct Label value.
label_counts = df["Label"].value_counts()
label_counts


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,1589468
1,321759


# List of Column Names

In [None]:
# Print a structural summary of the frame: index range plus each column's name and dtype.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911227 entries, 0 to 1911226
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

**Checking for missing values**

In [None]:
# Per-column NaN tally, computed once and reused for the grand total.
missing_per_column = df.isnull().sum()

print("Missing values per column:\n")
print(missing_per_column)

print("\nTotal missing values:", missing_per_column.sum())


Missing values per column:

Destination Port               0
Flow Duration                  0
Total Fwd Packets              0
Total Backward Packets         0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
Idle Std                       0
Idle Max                       0
Idle Min                       0
Label                          0
Length: 79, dtype: int64

Total missing values: 56


In [None]:
# Count "hidden" nulls: cells whose string form is empty, whitespace-only, or a
# textual null marker. Genuine float NaNs stringify to "nan", so they are
# included in these counts as well.
NULL_TOKENS = ["", " ", "  ", "NaN", "nan", "None"]

# Stringify one column at a time instead of the whole frame at once, so peak
# memory is a single column of Python strings rather than a full string copy of df.
hidden_nulls = df.apply(lambda col: col.astype(str).isin(NULL_TOKENS).sum())
print("Hidden nulls per column:\n", hidden_nulls[hidden_nulls > 0])

print("\nTotal hidden nulls:", hidden_nulls.sum())


In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("Total duplicate rows:", duplicates)


Total duplicate rows: 0


# Checking negative values


In [None]:
# Restrict the check to the int64 and float64 columns.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Per-column count of strictly negative entries; only non-zero counts are printed.
neg_counts = df[numeric_cols].lt(0).sum()
print("Negative values in each column:\n", neg_counts[neg_counts.gt(0)])


Negative values in each column:
 Flow Duration                  65
Flow Bytes/s                   43
Flow Packets/s                 65
Flow IAT Mean                  65
Flow IAT Max                   65
Flow IAT Min                 1515
Fwd IAT Min                    17
Fwd Header Length              35
Bwd Header Length              22
Fwd Header Length.1            35
Init_Win_bytes_forward     665711
Init_Win_bytes_backward    907061
min_seg_size_forward           35
dtype: int64


In [None]:
numeric_cols = df.select_dtypes(include=['int64','float64']).columns

# Drop every row that has at least one strictly negative numeric value.
# The mask is deliberately built from `< 0`: NaN fails every comparison, so a
# `>= 0 ... all(axis=1)` filter would also discard rows that merely contain NaN
# and report them as "negative". Missing values are left for an explicit NaN step.
# NOTE(review): the negative-count output above shows Init_Win_bytes_forward and
# Init_Win_bytes_backward account for almost all negative rows; presumably -1 is
# a "not available" sentinel there. Confirm that discarding those rows (roughly
# half of the dataset) is intended rather than handling the sentinel separately.
has_negative = (df[numeric_cols] < 0).any(axis=1)
df = df[~has_negative]

print(" Negative values removed. New shape:", df.shape)


 Negative values removed. New shape: (1002628, 79)


In [None]:
# Report the frame's dimensions on two separate lines.
row_total, column_total = df.shape
print(f"Total number of rows: {row_total}")
print(f"Total number of columns: {column_total}")

Total number of rows: 1002628
Total number of columns: 79


In [None]:
# Re-count rows per Label value on the current (filtered) frame.
label_counts = df["Label"].value_counts()
label_counts

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,744162
1,258466


In [None]:
# Re-count negatives per numeric column; an empty Series means none remain.
negative_check = df[numeric_cols].lt(0).sum()
print("Remaining negative values:", negative_check[negative_check.gt(0)])


Remaining negative values: Series([], dtype: int64)


In [None]:
# List any columns still stored with the generic object dtype.
column_dtypes = df.dtypes
column_dtypes[column_dtypes == 'object']


Unnamed: 0,0


In [None]:
# Class balance of the current frame, printed as plain text.
class_counts = df['Label'].value_counts()
print("Class distribution:\n")
print(class_counts)


Class distribution:

Label
0    744162
1    258466
Name: count, dtype: int64


In [None]:
import numpy as np

# Count +/-inf cells. Only numeric columns are inspected, since np.isinf is not
# defined for non-numeric dtypes.
print("Infinite values:", np.isinf(df.select_dtypes(include="number")).sum().sum())

# Turn infinities into NaN so one dropna() call removes both kinds of bad cell.
df = df.replace([np.inf, -np.inf], np.nan)
print(" Inf replaced with NaN")

print("Missing after replacing inf:", df.isnull().sum().sum())
# Reassign instead of dropna(inplace=True): df may be a filtered selection of an
# earlier frame, and mutating such a selection in place can raise
# SettingWithCopyWarning; reassignment also keeps the cell safe to re-run.
# Note that this removes every row containing any NaN, not only those that
# originated from an infinite value.
df = df.dropna()
print(" Dropped NaN caused by inf. New shape:", df.shape)


Infinite values: 0
 Inf replaced with NaN
Missing after replacing inf: 0
 Dropped NaN caused by inf. New shape: (1002478, 79)


In [None]:
# Print the structural summary (index range, column names, dtypes) of the current frame.
# NOTE(review): the recorded output for this cell still reports 1911227 entries,
# which is the size before filtering; it looks stale. Re-run the notebook top to
# bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1911227 entries, 0 to 1911226
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

In [None]:
# Persist the cleaned frame to Google Drive as CSV, leaving out the row index.
cleaned_path = "/content/drive/MyDrive/DOS_Project/data_cleaned/Final_cleaned_dataset.csv"
df.to_csv(cleaned_path, index=False)

print(" Cleaned dataset saved to Google Drive")

 Cleaned dataset saved to Google Drive
