In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit


# Draw a stratified sample of the malicious traffic so the reduced set keeps
# the original mix of protocol, flow-duration range and traffic subtype.

# Target number of malicious rows to keep after undersampling.
malicious_sample_size = 5000

# Work on an independent copy so the helper columns added below never
# end up in `df`.
malicious_df = df[df['Label'] == 'Malicious'].copy()

# Quantile-bin Flow Duration into up to 5 integer-coded bins;
# duplicates='drop' merges bins whose edges coincide instead of raising.
malicious_df['duration_bin'] = pd.qcut(
    malicious_df['Flow Duration'], q=5, labels=False, duplicates='drop'
)

# Composite stratification key: "<protocol>_<duration bin>_<subtype>".
malicious_df['strata_key'] = (
    malicious_df['Protocol'].astype(str) + '_' +
    malicious_df['duration_bin'].astype(str) + '_' +
    malicious_df['Traffic Subtype'].astype(str)
)

# StratifiedShuffleSplit rejects classes with fewer than 2 members, so
# strata that occur only once are dropped before splitting.
strata_counts = malicious_df['strata_key'].value_counts()
valid_keys = strata_counts[strata_counts >= 2].index
malicious_df = malicious_df[malicious_df['strata_key'].isin(valid_keys)]

splitter = StratifiedShuffleSplit(
    n_splits=1, test_size=malicious_sample_size, random_state=42
)
if len(malicious_df) <= malicious_sample_size:
    # An integer test_size must be smaller than the number of rows, otherwise
    # the splitter raises ValueError. There is nothing to undersample here,
    # so keep every remaining malicious row.
    sampled_malicious = malicious_df.copy()
else:
    # Only the "test" side of the single split is used as the sample; the
    # first argument merely tells the splitter how many rows there are.
    # NOTE: scikit-learn still raises ValueError if the number of distinct
    # strata exceeds the sample size or the size of the remainder.
    _, sample_idx = next(
        splitter.split(malicious_df.index, malicious_df['strata_key'])
    )
    sampled_malicious = malicious_df.iloc[sample_idx].copy()

# Remove the helper columns from the sample (malicious_df keeps them).
sampled_malicious = sampled_malicious.drop(columns=['duration_bin', 'strata_key'])