In [1]:
import gc

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Let wide/long summary tables render in full (up to 200 columns and 200 rows).
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200


In [2]:
# Absolute Windows path to the raw CICIDS CSV.
# NOTE(review): machine-specific path — adjust when running elsewhere.
DATA_PATH = r"D:\Coding\VanetUAV\data\processed\cicids_v1_raw.csv"

# Load the entire CSV into memory as a single DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Display (n_rows, n_columns) of the loaded frame.
df.shape


(2830743, 83)

In [3]:
# Derive the binary target: 0 where Label is "BENIGN", 1 for every other label.
# The guard leaves an already-present "label_bin" column untouched.
if "label_bin" not in df:
    df["label_bin"] = df["Label"].ne("BENIGN").astype(int)

# Class balance, shown as absolute counts and as proportions.
(
    df["label_bin"].value_counts(),
    df["label_bin"].value_counts(normalize=True),
)


(label_bin
 0    2273097
 1     557646
 Name: count, dtype: int64,
 label_bin
 0    0.803004
 1    0.196996
 Name: proportion, dtype: float64)

In [4]:
# Share of rows per original (multi-class) label, most frequent first.
# NOTE(review): the "Web Attack" labels show a replacement character in the
# recorded output — possibly an encoding problem upstream; confirm.
df.loc[:, "Label"].value_counts(normalize=True)


Label
BENIGN                        0.803004
DoS Hulk                      0.081630
PortScan                      0.056144
DDoS                          0.045227
DoS GoldenEye                 0.003636
FTP-Patator                   0.002804
SSH-Patator                   0.002083
DoS slowloris                 0.002048
DoS Slowhttptest              0.001943
Bot                           0.000695
Web Attack � Brute Force      0.000532
Web Attack � XSS              0.000230
Infiltration                  0.000013
Web Attack � Sql Injection    0.000007
Heartbleed                    0.000004
Name: proportion, dtype: float64

In [5]:
RANDOM_STATE = 42  # fixed seed so the same subsample is drawn on every run

# Draw a 10% subsample whose benign/attack ratio matches the full dataset.
#
# Only the first element of the returned pair is kept. Unpacking the 90%
# complement into `_` would keep that ~2.5M-row frame referenced for the rest
# of the session (so deleting `df` later would not release it) and would also
# shadow IPython's `_` last-output variable.
#
# NOTE(review): stratification uses the binary label only, so the shares of
# individual attack classes are matched only approximately and very rare
# classes may be under-represented or missing — consider stratifying on
# "Label" if per-class coverage matters.
df_10pct = train_test_split(
    df,
    test_size=0.90,               # keep 10%
    stratify=df["label_bin"],     # preserve benign/attack ratio
    random_state=RANDOM_STATE,
)[0]

# Display (n_rows, n_columns) of the subsample.
df_10pct.shape


(283074, 84)

In [6]:
# Sanity check: the binary class proportions should be near-identical
# before and after subsampling.
print("FULL DATASET", df["label_bin"].value_counts(normalize=True), sep="\n")

print("\n10% DATASET", df_10pct["label_bin"].value_counts(normalize=True), sep="\n")


FULL DATASET
label_bin
0    0.803004
1    0.196996
Name: proportion, dtype: float64

10% DATASET
label_bin
0    0.803002
1    0.196998
Name: proportion, dtype: float64


In [7]:
# Per-class proportions of the full dataset and the 10% subsample, side by side.
# A label absent from one of the two frames gets 0 instead of NaN.
comparison = pd.concat(
    objs=[
        df["Label"].value_counts(normalize=True),
        df_10pct["Label"].value_counts(normalize=True),
    ],
    keys=["full_dataset", "10pct_dataset"],
    axis="columns",
).fillna(value=0)

comparison


Unnamed: 0_level_0,full_dataset,10pct_dataset
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
BENIGN,0.803004,0.803002
DoS Hulk,0.08163,0.081491
PortScan,0.056144,0.056353
DDoS,0.045227,0.045077
DoS GoldenEye,0.003636,0.003646
FTP-Patator,0.002804,0.002692
SSH-Patator,0.002083,0.002187
DoS slowloris,0.002048,0.002028
DoS Slowhttptest,0.001943,0.002067
Bot,0.000695,0.000742


In [8]:
# Numeric features used to compare the full and subsampled distributions.
cols_to_check = [
    "Flow Duration",
    "Total Fwd Packets",
    "Total Backward Packets",
    "Packet Length Mean",
]

# Summary statistics on the full dataset with 5th/50th/95th percentiles.
# NOTE(review): "Flow Duration" shows a negative minimum in the recorded
# output — looks like a data-quality issue in the source data; confirm.
df.loc[:, cols_to_check].describe(percentiles=[0.05, 0.50, 0.95])


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Packet Length Mean
count,2830743.0,2830743.0,2830743.0,2830743.0
mean,14785660.0,9.36116,10.39377,171.9444
std,33653740.0,749.6728,997.3883,305.4915
min,-13.0,1.0,0.0,0.0
5%,4.0,1.0,0.0,0.0
50%,31316.0,2.0,2.0,57.2
95%,101114600.0,19.0,17.0,917.4615
max,120000000.0,219759.0,291922.0,3337.143


In [9]:
# Same summary statistics on the 10% subsample, to compare against the
# full-dataset figures.
df_10pct.loc[:, cols_to_check].describe(percentiles=[0.05, 0.50, 0.95])


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Packet Length Mean
count,283074.0,283074.0,283074.0,283074.0
mean,14732590.0,9.758099,10.961858,170.947655
std,33572050.0,838.130197,1131.55195,304.436024
min,-1.0,1.0,0.0,0.0
5%,4.0,1.0,0.0,0.0
50%,31271.5,2.0,2.0,57.2
95%,100959700.0,19.0,17.0,917.076923
max,120000000.0,207964.0,284602.0,2595.555556


In [10]:
# Destination file for the 10% subsample.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
OUTPUT_PATH = r"D:\Coding\VanetUAV\data\processed\cicids_10pct_stratified.csv"

# Write the subsample without the DataFrame index column.
df_10pct.to_csv(path_or_buf=OUTPUT_PATH, index=False)

# Report how many rows were written and where.
print(f"Saved {df_10pct.shape[0]} rows to:", OUTPUT_PATH, sep="\n")


Saved 283074 rows to:
D:\Coding\VanetUAV\data\processed\cicids_10pct_stratified.csv


In [11]:
# Drop the reference to the full dataset and force a garbage-collection pass
# so its memory can be reclaimed. `gc` is imported in the notebook's import
# cell rather than here, so all dependencies are visible at the top.
del df
gc.collect()


0