# <b>Import Data</b>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("./train.csv")
df.shape

(1783356, 79)

In [3]:
df.columns = df.columns.str.strip()

In [4]:
features =['Destination Port', 'Total Fwd Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Mean', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd IAT Std',
       'Fwd Header Length', 'Bwd Header Length', 'Bwd Packets/s',
       'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'PSH Flag Count', 'Average Packet Size',
       'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Header Length.1',
       'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes',
       'Init_Win_bytes_forward', 'Init_Win_bytes_backward','Label']

In [5]:
df["Label"].value_counts()

Label
BENIGN                        1432050
DoS Hulk                       145575
PortScan                       100125
DDoS                            80656
DoS GoldenEye                    6484
FTP-Patator                      5000
SSH-Patator                      3714
DoS slowloris                    3651
DoS Slowhttptest                 3464
Bot                              1238
Web Attack � Brute Force          949
Web Attack � XSS                  410
Infiltration                       22
Web Attack � Sql Injection         12
Heartbleed                          6
Name: count, dtype: int64

In [6]:
df = df[features]

In [7]:
X = df.drop(columns=['Label'])
y = df["Label"]
print(X.shape)
print(y.shape)

(1783356, 27)
(1783356,)


### Seperating into different dataframes

In [8]:
high_classes = ['BENIGN', 'DoS Hulk', 'PortScan', 'DDoS']
middle_classes = ['DoS GoldenEye', 'FTP-Patator', 'SSH-Patator', 'DoS slowloris', 'DoS Slowhttptest','Bot']
low_classes = ['Web Attack � Brute Force', 'Web Attack � XSS', 'Infiltration', 'Web Attack � Sql Injection', 'Heartbleed']

In [9]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame)
    
    # Drop rows with missing values
    df.dropna(inplace=True)
    
    # Drop rows with infinity or negative infinity
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df = df[indices_to_keep]

    return df

# <b>Working with Low Samples</b>

In [10]:
low_df = df[df['Label'].isin(low_classes)]
low_df = clean_dataset(low_df)

# Encode categorical labels
label_encoder = LabelEncoder()
low_df['Label'] = label_encoder.fit_transform(low_df['Label'])

oversampling_ratios = {
    label_encoder.transform(['Web Attack � Brute Force'])[0]: 950, 
    label_encoder.transform(['Web Attack � XSS'])[0]: 800,        
    label_encoder.transform(['Infiltration'])[0]: 250,            
    label_encoder.transform(['Web Attack � Sql Injection'])[0]: 250, 
    label_encoder.transform(['Heartbleed'])[0]: 250
    
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [11]:
X = low_df.drop(columns=['Label'])
y = low_df['Label']

# Apply SMOTE to balance classes with desired ratios
smote = SMOTE(sampling_strategy=oversampling_ratios, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame for the resampled data
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['Label'] = y_resampled

low_df['data_type'] = 'Original'
resampled_df['data_type'] = 'Augmented'

low_final_df = pd.concat([low_df, resampled_df], ignore_index=True)

# Decode the categorical labels back to original values
low_final_df['Label'] = label_encoder.inverse_transform(low_final_df['Label'])
low_final_df = low_final_df.drop(low_final_df[(low_final_df['Label'] == 'Web Attack � Brute Force') & (low_final_df['data_type'] == 'Augmented')].index)

low_final_df = low_final_df.drop(columns=['data_type'])
low_final_df["Label"].value_counts()



Label
Web Attack � XSS              1210
Web Attack � Brute Force       949
Infiltration                   272
Web Attack � Sql Injection     262
Heartbleed                     256
Name: count, dtype: int64

# <b>Working with High Samples</b>

In [12]:
high_df = df[df['Label'].isin(high_classes)]
high_df = clean_dataset(high_df)
# Encode categorical labels
high_df['Label'] = label_encoder.fit_transform(high_df['Label'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [13]:
from sklearn.utils import resample

majority_class = label_encoder.transform(['BENIGN'])[0]
minority_classes = label_encoder.transform(['DoS Hulk', 'PortScan', 'DDoS'])
majority_samples = high_df[high_df['Label'] == majority_class]
minority_samples = high_df[high_df['Label'].isin(minority_classes)]

# Downsample majority class
downsampled_majority = resample(majority_samples,
                                replace=False,  # Without replacement
                                n_samples=150000,  # Desired number of samples
                                random_state=42)  # For reproducibility

high_final_df = pd.concat([downsampled_majority, minority_samples])
# Shuffle the dataset to ensure randomness
high_final_df = high_final_df.sample(frac=1, random_state=42)
high_final_df['Label'] = label_encoder.inverse_transform(high_final_df['Label'])

high_final_df["Label"].value_counts()


Label
BENIGN      150000
DoS Hulk    145575
PortScan    100125
DDoS         80656
Name: count, dtype: int64

# <b>Concating Dataframes</b>

In [14]:
mid_df = df[df['Label'].isin(middle_classes)]
mid_df = clean_dataset(mid_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [15]:
concat_df = pd.concat([high_final_df, mid_df, low_final_df], ignore_index=True)
concat_df.to_csv('data.csv', index=False)