Reading the labeled CSV files from the CIC_2019 dataset folder. Because the dataset is large, each CSV file is read and reduced individually: columns related to ports, IPs, MAC addresses, and payloads are dropped, rows with missing values or with the label 'No Match' are removed, and at most 5000 rows per class label are kept from each file. The reduced files are then combined into a single DataFrame, which is exported to CSV. Users can adjust these settings based on their system requirements.

In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Columns removed from every file before merging (per the notebook intro:
# ports, IPs, MAC addresses and payloads, plus the other non-feature fields
# listed here). Every name must exist in a file, otherwise that file is
# reported as failed and skipped.
_DROPPED_COLUMNS = (
    'id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui', 'src_port',
    'dst_ip', 'dst_mac', 'dst_oui', 'dst_port',
    'protocol', 'ip_version', 'vlan_id', 'tunnel_id',
    'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms',
    'src2dst_first_seen_ms', 'src2dst_last_seen_ms',
    'dst2src_first_seen_ms', 'dst2src_last_seen_ms',
    'application_name', 'application_category_name',
    'application_is_guessed', 'application_confidence',
    'requested_server_name', 'client_fingerprint', 'server_fingerprint',
    'user_agent', 'content_type',
    'udps.payload_data', 'udps.delta_time', 'udps.packet_direction',
    'udps.ip_size', 'udps.transport_size', 'udps.payload_size',
    'udps.syn', 'udps.cwr', 'udps.ece', 'udps.urg', 'udps.ack', 'udps.psh',
    'udps.rst', 'udps.fin', 'file',
)


# Function to process all CSV files in a folder
def process_csvs_in_folder(input_folder, max_per_label=5000):
    """Load, reduce and merge every ``.csv`` file found in ``input_folder``.

    Each file is processed on its own to keep memory bounded:

    1. the columns in ``_DROPPED_COLUMNS`` are removed,
    2. rows containing any missing value are removed,
    3. rows whose ``label`` is ``'No Match'`` are removed,
    4. at most ``max_per_label`` rows are kept per ``label`` value
       (the first ones in file order).

    Parameters
    ----------
    input_folder : str
        Directory that is scanned (non-recursively) for ``*.csv`` files.
    max_per_label : int, default 5000
        Per-file cap on the number of rows kept for each label. The cap is
        applied to every file separately, so the merged result can hold more
        than ``max_per_label`` rows of one label.

    Returns
    -------
    pandas.DataFrame
        The reduced files concatenated with a fresh 0..n-1 index. Within each
        file the rows are grouped by label (labels in sorted order). An empty
        DataFrame is returned when no file could be processed.

    Notes
    -----
    A file that raises during processing is reported with ``print`` and
    skipped; the remaining files are still processed.
    """
    # Sort the names: os.listdir() order is filesystem-dependent, and the row
    # order of the result feeds the seeded train/test split downstream.
    csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the growing result for every file.
    frames = []

    for csv_file in tqdm(csv_files, desc="Processing CSV files"):
        try:
            file_path = os.path.join(input_folder, csv_file)
            df = pd.read_csv(file_path, low_memory=False)

            # Drop unnecessary columns, then rows with missing values
            df = df.drop(columns=list(_DROPPED_COLUMNS)).dropna()

            # Remove rows where the label is 'No Match'
            df = df[df['label'] != 'No Match']

            # Cap the rows per label. GroupBy.head keeps the 'label' column
            # (GroupBy.apply on the grouping column is deprecated); the stable
            # sort reproduces the label-grouped row order apply() produced.
            df = (
                df.groupby('label')
                .head(max_per_label)
                .sort_values('label', kind='stable')
                .reset_index(drop=True)
            )

            frames.append(df)

        except Exception as e:
            # Best effort: report the bad file and continue with the rest.
            print(f"Failed to process {csv_file}: {e}")

    # pd.concat raises on an empty list, so keep the "nothing processed" case
    # returning an empty DataFrame.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Example usage: point `input_folder` at the directory holding the labeled
# CIC_2019 CSV files and build the combined, reduced DataFrame `df` that the
# rest of the notebook works on.
# NOTE(review): this is an absolute, user-specific path - adjust it for your system.
input_folder = "/scratch/user/syedwali/Datasets/CIC_2019/processed/labeled_csv"
df = process_csvs_in_folder(input_folder)




Processing CSV files: 100%|██████████| 929/929 [1:07:20<00:00,  4.35s/it]


In [2]:
# Marker cell: its output confirms that execution has reached this point,
# i.e. the long-running preprocessing cell has finished.
print('processing completed')

processing completed


In [3]:
# If needed, save the combined DataFrame to a CSV.
# index=False leaves the row index out of the written file.
# NOTE(review): this is an absolute, user-specific path - adjust it for your system.
output_file = "/scratch/user/syedwali/Datasets/undersampled_CIC2019_dataset.csv"
df.to_csv(output_file, index=False)

In [4]:
# Class distribution of the combined dataset. Counts can exceed 5000 because
# the per-label cap is applied to each CSV file separately, before merging.
df['label'].value_counts()

label
TFTP             950295
DrDoS_NTP        917607
DrDoS_DNS        893762
DrDoS_SNMP       415000
LDAP             290000
DrDoS_LDAP       290000
MSSQL            190000
DrDoS_MSSQL      150000
UDP              150000
DrDoS_SSDP       106650
DrDoS_UDP        105000
DrDoS_NetBIOS     65000
NetBIOS           55057
BENIGN            53277
Syn               42004
Portmap            5608
UDP-lag            5000
UDPLag             1203
WebDDoS             148
Name: count, dtype: int64

In [5]:
# List the remaining column names, one per line.
for column_name in df.columns:
    print(column_name)

bidirectional_duration_ms
bidirectional_packets
bidirectional_bytes
src2dst_duration_ms
src2dst_packets
src2dst_bytes
dst2src_duration_ms
dst2src_packets
dst2src_bytes
bidirectional_min_ps
bidirectional_mean_ps
bidirectional_stddev_ps
bidirectional_max_ps
src2dst_min_ps
src2dst_mean_ps
src2dst_stddev_ps
src2dst_max_ps
dst2src_min_ps
dst2src_mean_ps
dst2src_stddev_ps
dst2src_max_ps
bidirectional_min_piat_ms
bidirectional_mean_piat_ms
bidirectional_stddev_piat_ms
bidirectional_max_piat_ms
src2dst_min_piat_ms
src2dst_mean_piat_ms
src2dst_stddev_piat_ms
src2dst_max_piat_ms
dst2src_min_piat_ms
dst2src_mean_piat_ms
dst2src_stddev_piat_ms
dst2src_max_piat_ms
bidirectional_syn_packets
bidirectional_cwr_packets
bidirectional_ece_packets
bidirectional_urg_packets
bidirectional_ack_packets
bidirectional_psh_packets
bidirectional_rst_packets
bidirectional_fin_packets
src2dst_syn_packets
src2dst_cwr_packets
src2dst_ece_packets
src2dst_urg_packets
src2dst_ack_packets
src2dst_psh_packets
src2dst_rst_

In [6]:
# Remove the `flowid` column in place so the next cell does not use it as a feature.
df.drop(columns=['flowid'], inplace=True)

### Performance Evaluation with Contextual Features

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# `df` comes from the preprocessing cells: 'label' is the target and every
# other column is used as a feature.
y = df['label']
X = df.drop(columns='label')

# Hold out 30% of the rows for testing. The split is seeded but not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Seeded 100-tree random forest; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Same three output lines as separate print() calls would produce.
print(f'Accuracy: {accuracy:.2f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.80
Classification Report:
               precision    recall  f1-score   support

       BENIGN       1.00      1.00      1.00     15870
    DrDoS_DNS       0.74      0.87      0.80    268037
   DrDoS_LDAP       0.62      0.18      0.28     87011
  DrDoS_MSSQL       0.71      0.71      0.71     44834
    DrDoS_NTP       1.00      1.00      1.00    275179
DrDoS_NetBIOS       0.88      0.85      0.87     19586
   DrDoS_SNMP       0.57      0.58      0.57    124394
   DrDoS_SSDP       0.58      0.55      0.56     31906
    DrDoS_UDP       0.53      0.52      0.52     31279
         LDAP       0.50      0.57      0.54     87285
        MSSQL       0.78      0.77      0.77     57060
      NetBIOS       0.83      0.95      0.89     16604
      Portmap       0.77      0.51      0.62      1677
          Syn       1.00      1.00      1.00     12534
         TFTP       1.00      1.00      1.00    285230
          UDP       0.61      0.63      0.62     45197
      UDP-lag       0.39  

### Performance Evaluation without Contextual Features

To demonstrate the impact of the extended contextual features on classification performance, we remove all of them (the `udps.*` columns dropped in the next cell) and retrain the same classifier with the same train/test split settings. Comparing the resulting report with the one above shows how much classification accuracy degrades when contextual information is not available.

In [8]:
# Contextual feature columns that are removed for the comparison run.
contextual_features = [
    'udps.srcdst_packet_size_variation',
    'udps.srcdst_udp_packet_count',
    'udps.udp_packet_count',
    'udps.srcdst_tcp_packet_count',
    'udps.tcp_packet_count',
    'udps.srcdst_ack_packet_count',
    'udps.ack_packet_count',
    'udps.srcdst_fin_packet_count',
    'udps.fin_packet_count',
    'udps.srcdst_rst_packet_count',
    'udps.rst_packet_count',
    'udps.srcdst_psh_packet_count',
    'udps.psh_packet_count',
    'udps.srcdst_syn_packet_count',
    'udps.syn_packet_count',
    'udps.srcdst_unique_ports_count',
    'udps.srcdst_icmp_packet_count',
    'udps.icmp_packet_count',
    'udps.srcdst_http_ports_count',
    'udps.http_ports_count',
    'udps.srcdst_bidirectional_duration_avg',
    'udps.bidirectional_duration_avg',
    'udps.srcdst_dns_port_count',
    'udps.dns_port_count',
    'udps.srcdst_dns_port_src_count',
    'udps.dns_port_src_count',
    'udps.srcdst_vul_ports_count',
    'udps.src2dst_packet_count',
    'udps.bidirectional_packet_count',
    'udps.srcdst_src2dst_packet_count',
    'udps.srcdst_bidirectional_packet_count',
]

# In-place drop: from here on `df` no longer contains these columns. A missing
# column raises, so a misspelled name cannot silently stay in the feature set.
df.drop(columns=contextual_features, inplace=True)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# `df` now holds the reduced feature set: 'label' is the target and every
# remaining column is used as a feature.
y = df['label']
X = df.drop(columns='label')

# Same split settings as the previous evaluation: 30% test, seed 42, not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Seeded 100-tree random forest; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Same three output lines as separate print() calls would produce.
print(f'Accuracy: {accuracy:.2f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.71
Classification Report:
               precision    recall  f1-score   support

       BENIGN       0.99      1.00      1.00     15870
    DrDoS_DNS       0.49      0.96      0.65    268037
   DrDoS_LDAP       0.35      0.01      0.02     87011
  DrDoS_MSSQL       0.51      0.23      0.32     44834
    DrDoS_NTP       0.99      1.00      1.00    275179
DrDoS_NetBIOS       0.47      0.87      0.61     19586
   DrDoS_SNMP       0.77      0.24      0.37    124394
   DrDoS_SSDP       0.42      0.05      0.09     31906
    DrDoS_UDP       0.42      0.02      0.05     31279
         LDAP       0.40      0.00      0.01     87285
        MSSQL       0.55      0.80      0.65     57060
      NetBIOS       0.56      0.10      0.17     16604
      Portmap       0.69      0.04      0.08      1677
          Syn       0.89      0.99      0.94     12534
         TFTP       1.00      1.00      1.00    285230
          UDP       0.42      0.93      0.58     45197
      UDP-lag       0.00  