This notebook aims to use pycaret on the CICDDoS2019 dataset split according to the original release paper.
the original release paper here uses:

01-12 folder as the training data

03-11 folder as the testing data

and attempts to use all categories

In [1]:
import dask.dataframe as dd
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
%matplotlib inline 

Function to efficiently read a CSV file into a DataFrame

In [2]:
def read_csv_efficiently(file_path, blocksize='default'):
    """
    Lazily read a CSV file into a Dask DataFrame.

    Dask partitions a CSV by byte size (``blocksize``) and does not accept the
    pandas ``chunksize`` argument, so the file is opened with a single
    ``dd.read_csv`` call and no manual concatenation of chunks is needed.

    Parameters
    ----------
    file_path : str
        Path (or glob pattern) of the CSV file(s) to read.
    blocksize : str or int, optional
        Partition size forwarded to ``dd.read_csv``; ``'default'`` lets Dask
        choose.

    Returns
    -------
    dask.dataframe.DataFrame
        Lazy frame; nothing is loaded until it is computed.
    """
    return dd.read_csv(file_path, blocksize=blocksize)

strictly type and use dask to minimize RAM usage

In [3]:
def reduce_mem_usage(df, int_cast=False, obj_to_category=True, subset=None):
    """
    Shrink a DataFrame's memory footprint by downcasting column dtypes.

    Accepts a Dask DataFrame (all column statistics are evaluated in a single
    ``dask.compute`` call instead of two passes per column) or a plain pandas
    DataFrame. Columns are reassigned on ``df`` itself and ``df`` is returned.

    Parameters
    ----------
    df : dask.dataframe.DataFrame or pandas.DataFrame
        Frame to optimise.
    int_cast : bool, default False
        When True, float columns whose values are all finite whole numbers
        are converted to the smallest integer type that holds them.
    obj_to_category : bool, default True
        When True, object / string / categorical columns are cast to
        ``category``.
    subset : iterable of column labels, optional
        Restrict the optimisation to these columns (default: all columns).

    Returns
    -------
    Same type as ``df``
        The frame with downcast columns.

    Notes
    -----
    * Signed and unsigned NumPy integer columns are both treated as integers.
    * Bounds are inclusive, so a column whose maximum is exactly 127 fits int8.
    * A column is only recast when the new dtype is strictly smaller.
    * float16 is never chosen: its 11-bit significand silently rounds values
      such as 3001 to 3000.
    * bool, datetime and pandas nullable extension columns are left untouched.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64,
                      np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float32, np.float64)

    def _evaluate(*values):
        # Dask objects expose ``.compute``; evaluating them together shares
        # one pass over the underlying data. pandas values pass straight through.
        if any(hasattr(value, 'compute') for value in values):
            import dask
            return dask.compute(*values)
        return values

    def _first_fitting(candidates, limits, low, high):
        # Smallest-first search; NaN bounds (empty / all-NaN column) and
        # infinities compare False and therefore match nothing.
        for candidate in candidates:
            bounds = limits(candidate)
            if low >= bounds.min and high <= bounds.max:
                return np.dtype(candidate)
        return None

    (start_bytes,) = _evaluate(df.memory_usage(deep=True).sum())
    start_mem = start_bytes / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    cols = list(df.columns) if subset is None else list(subset)

    # Classify columns by dtype only; this never touches the data.
    int_cols, float_cols, text_cols = [], [], []
    for col in cols:
        col_type = df[col].dtype
        if isinstance(col_type, np.dtype) and col_type.kind in 'iu':
            int_cols.append(col)
        elif isinstance(col_type, np.dtype) and col_type.kind == 'f':
            float_cols.append(col)
        elif isinstance(col_type, pd.CategoricalDtype) or pd.api.types.is_string_dtype(col_type):
            text_cols.append(col)

    numeric_cols = int_cols + float_cols
    if numeric_cols:
        numeric = df[numeric_cols]
        reductions = [numeric.min(), numeric.max()]
        if int_cast and float_cols:
            # NaN % 1 and inf % 1 are NaN, so such columns are never "whole".
            reductions.append((df[float_cols] % 1 == 0).all())
        results = _evaluate(*reductions)
        col_min, col_max = results[0], results[1]
        whole_valued = results[2] if len(results) > 2 else {}

        integer_names = set(int_cols)
        for col in numeric_cols:
            low, high = col_min[col], col_max[col]
            current = df[col].dtype
            is_int = col in integer_names

            target = None
            if is_int or bool(whole_valued.get(col, False)):
                target = _first_fitting(int_candidates, np.iinfo, low, high)
            if not is_int and (target is None or target.itemsize >= current.itemsize):
                target = _first_fitting(float_candidates, np.finfo, low, high)

            if target is not None and target.itemsize < current.itemsize:
                df[col] = df[col].astype(target)

    if obj_to_category:
        for col in text_cols:
            df[col] = df[col].astype('category')

    (end_bytes,) = _evaluate(df.memory_usage(deep=True).sum())
    end_mem = end_bytes / 1024 ** 2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

listing columns names

In [4]:
# Column headers exactly as they appear in the CICDDoS2019 CSV files.
# Many names carry a leading space; that space is part of the header and
# must be preserved for look-ups to succeed.
columns = [
    # flow identity
    'Unnamed: 0', 'Flow ID',
    ' Source IP', ' Source Port',
    ' Destination IP', ' Destination Port',
    ' Protocol', ' Timestamp',
    # duration and packet / byte totals
    ' Flow Duration',
    ' Total Fwd Packets', ' Total Backward Packets',
    'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
    # per-direction packet length statistics
    ' Fwd Packet Length Max', ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
    'Bwd Packet Length Max', ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean', ' Bwd Packet Length Std',
    # flow rates
    'Flow Bytes/s', ' Flow Packets/s',
    # inter-arrival times
    ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
    'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
    ' Fwd IAT Max', ' Fwd IAT Min',
    'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
    ' Bwd IAT Max', ' Bwd IAT Min',
    # per-direction flags and header lengths
    'Fwd PSH Flags', ' Bwd PSH Flags',
    ' Fwd URG Flags', ' Bwd URG Flags',
    ' Fwd Header Length', ' Bwd Header Length',
    'Fwd Packets/s', ' Bwd Packets/s',
    # overall packet length statistics
    ' Min Packet Length', ' Max Packet Length',
    ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
    # TCP flag counters
    'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count',
    ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count',
    ' CWE Flag Count', ' ECE Flag Count',
    # ratios and averages
    ' Down/Up Ratio', ' Average Packet Size',
    ' Avg Fwd Segment Size', ' Avg Bwd Segment Size',
    ' Fwd Header Length.1',
    # bulk transfer statistics
    'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
    # subflows
    'Subflow Fwd Packets', ' Subflow Fwd Bytes',
    ' Subflow Bwd Packets', ' Subflow Bwd Bytes',
    # window sizes and segment information
    'Init_Win_bytes_forward', ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd', ' min_seg_size_forward',
    # active / idle periods
    'Active Mean', ' Active Std', ' Active Max', ' Active Min',
    'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min',
    # remaining metadata and the class label
    'SimillarHTTP', ' Inbound', ' Label',
]

function to combine the df

In [5]:
def combine_df(dir, categories):
    """
    Read several CSV files from one directory and stack them into one Dask DataFrame.

    Each file is read lazily with ``dd.read_csv`` (using the notebook-level
    ``dtype`` overrides), passed through ``reduce_mem_usage``, and all frames
    are concatenated in a single ``dd.concat`` call. The previous version
    seeded the concat with an empty, all-object pandas frame and re-concatenated
    inside the loop, which adds a spurious partition and can pull numeric
    columns towards ``object``.

    Parameters
    ----------
    dir : str
        Directory that contains the CSV files.
    categories : iterable of str
        File names (relative to ``dir``) to load.

    Returns
    -------
    dask.dataframe.DataFrame
        The concatenated frame. For an empty ``categories`` an empty frame
        with the expected ``columns`` is returned.
    """
    frames = [
        reduce_mem_usage(dd.read_csv(os.path.join(dir, ddos_type), dtype=dtype))
        for ddos_type in categories
    ]

    if not frames:
        # Nothing to load: hand back an empty frame with the expected schema.
        return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

    df = dd.concat(frames, ignore_index=True)

    # Explicitly evaluate the lazy reduction; this scans the data once.
    if df.isnull().any().any().compute():
        print("Warning: DataFrame contains missing values. Consider handling them.")
    return df

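Pre-define the dtypes of the text columns so that dask does not have to infer them from a sample of each file; this avoids dtype-mismatch errors while reading.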

In [6]:
# Explicit dtypes for the text columns, passed to dd.read_csv.
# Keys must match the CSV headers exactly (including the leading space that
# many headers carry); the previous keys 'Flow_ID', ' Source_IP' and
# ' TimeStamp' matched no column in the `columns` list.
dtype = {
    'Flow ID': 'object',
    ' Source IP': 'object',
    ' Destination IP': 'object',
    ' Timestamp': 'object',
    'SimillarHTTP': 'object',
    ' Label': 'object',
}

Read all of the testing CSVs (03-11 folder) into one DataFrame

In [None]:
# CSV directory and file names for the testing split (03-11 capture day).
# NOTE: absolute, machine-specific path; adjust it to your local copy of the dataset.
# Every backslash is escaped (the previous literal contained the invalid escape '\k').
test_dir = 'C:\\Users\\ktv07101\\Desktop\\BHNI Anomaly Detection Related\\DDoS Training Data\\CIC_DDoS2019\\publicCSV\\CSV-03-11\\03-11'
test_ddos_categories = ['LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'Portmap.csv', 'Syn.csv', 'UDP.csv', 'UDPLag.csv']
test_df = combine_df(test_dir, test_ddos_categories)

Memory usage of dataframe is 1550.14 MB


Read all of the training CSVs (01-12 folder) into one DataFrame

In [None]:
# CSV directory and file names for the training split (01-12 capture day).
# NOTE: absolute, machine-specific path; adjust it to your local copy of the dataset.
train_dir = 'C:\\Users\\ktv07101\\Desktop\\BHNI Anomaly Detection Related\\DDoS Training Data\\CIC_DDoS2019\\publicCSV\\CSV-01-12\\01-12'
train_ddos_categories = ['DrDoS_SSDP.csv', 'DrDoS_NTP.csv', 'TFTP.csv', 'UDPLag.csv', 'DrDoS_UDP.csv', 'Syn.csv', 'DrDoS_MSSQL.csv', 'DrDoS_SNMP.csv', 'DrDoS_DNS.csv', 'DrDoS_LDAP.csv']
# Stored under its own name so the training frame does not overwrite `test_df`.
train_df = combine_df(train_dir, train_ddos_categories)

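Sanitize the column names so that they contain only letters, digits and underscores. (Note: the cell below only renames columns; it does not itself verify that there are no NaN or inf values.)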

In [None]:
import re

# Replace every run of characters outside [A-Za-z0-9_] with one underscore,
# e.g. ' Label' -> '_Label' and 'Flow Bytes/s' -> 'Flow_Bytes_s'. These are the
# names the setup() cell passes as `target` and `ignore_features`; deleting the
# characters instead (the previous behaviour) produced 'Label' / 'FlowBytess',
# which setup() would not find.
# NOTE(review): `combined_df` is not assigned by any earlier cell visible in this
# notebook; it must exist before this cell runs. Confirm where it is created.
combined_df = combined_df.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '_', name))

In [None]:
%%capture
!pip install pycaret

In [None]:
# Import only the PyCaret entry points this notebook calls, not `*`,
# so it is clear where each name comes from.
from pycaret.classification import compare_models, plot_model, predict_model, setup

# Initialise the PyCaret experiment on `combined_df` with '_Label' as target.
# `session_id` fixes the random seed so repeated runs are reproducible.
s = setup(
    combined_df,
    target='_Label',
    ignore_features=['Flow_Bytes_s', '_Flow_Packets_s'],
    feature_selection=True,
    n_features_to_select=5,
    session_id=42,
)

In [None]:
# Compare the candidate classifiers and keep what compare_models returns as `best`.
# NOTE(review): PyCaret documents `budget_time` in minutes, so 420 would be a
# 7-hour budget. Confirm that 420 seconds (budget_time=7) was not intended.
best = compare_models(budget_time=420)

In [None]:
# Generate predictions with `best`; no `data` argument is passed here
# (presumably PyCaret then scores its internal hold-out split; verify in the docs).
preds = predict_model(best)

In [None]:
# Confusion matrix for `best`.
plot_model(best, plot='confusion_matrix')

In [None]:
# AUC plot for `best`.
plot_model(best, plot='auc')

In [None]:
# Classification report plot for `best`.
plot_model(best, plot='class_report')

In [None]:
# Feature plot for `best`.
plot_model(best, plot='feature')

In [None]:
# Score every row of `combined_df` with `best`.
# NOTE(review): `combined_df` is the same frame that was passed to setup(), so
# these predictions include rows the model was fitted on; for the split described
# in the introduction, scoring the separate testing frame may be intended. Confirm.
result = predict_model(best, data=combined_df)

In [None]:
# Preview the first rows of the scored frame.
result.head()

In [None]:
# Shape of the subset of rows whose prediction matches the true label.
# The target column of this dataset is '_Label' (see setup()); the previous
# query referenced a 'Type' column that this dataset does not have.
result.query('_Label == prediction_label').shape

In [None]:
# Shape of the full scored frame, for comparison with the matching-rows count.
result.shape

In [None]:
# Keep only the true label and the prediction columns.
# The previous selection listed column names that are not part of this dataset
# ('TimeDateStamp', 'AddressOfEntryPoint', 'SizeOfCode', 'Type', ...) and would
# raise a KeyError.
result = result[['_Label', 'prediction_label', 'prediction_score']]