Reading labeled CSV files from the CIC_2019 dataset folder. Due to the dataset's large size, each CSV file is read individually, with columns related to ports, IPs, MAC addresses, and payloads being dropped. Additionally, the label count for each file is limited to a maximum of 5000. Unmatched rows have been removed, and the resulting DataFrame is exported. Users can adjust these settings based on their system requirements.

In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Function to process all CSV files in a folder
def process_csvs_in_folder(input_folder):
    # Initialize an empty DataFrame to store the combined results
    combined_df = pd.DataFrame()

    # Iterate over all CSV files in the input folder with progress bar
    csv_files = [f for f in os.listdir(input_folder) if f.endswith(".csv")]
    
    for csv_file in tqdm(csv_files, desc="Processing CSV files"):
        try:
            # Read the CSV file
            file_path = os.path.join(input_folder, csv_file)
            df = pd.read_csv(file_path, low_memory=False)
            
            # Drop unnecessary columns
            df.drop(['id','expiration_id','src_ip','src_mac','src_oui','src_port','dst_ip','dst_mac','dst_oui','dst_port',
                     'protocol','ip_version','vlan_id','tunnel_id','bidirectional_first_seen_ms','bidirectional_last_seen_ms',
                     'src2dst_first_seen_ms','src2dst_last_seen_ms','dst2src_first_seen_ms','dst2src_last_seen_ms',
                     'application_name', 'application_category_name', 'application_is_guessed', 'application_confidence',
                     'requested_server_name','client_fingerprint','server_fingerprint','user_agent','content_type',
                     'udps.payload_data','udps.delta_time','udps.packet_direction','udps.ip_size','udps.transport_size',
                     'udps.payload_size','udps.syn','udps.cwr','udps.ece','udps.urg','udps.ack','udps.psh','udps.rst',
                     'udps.fin','file'], axis=1, inplace=True)

            # Drop rows with missing values
            df.dropna(inplace=True)

            # Remove rows where the label is 'No Match'
            df = df[df['label'] != 'No Match']

            # Limit the number of instances per class label to 5000
            df = df.groupby('label').apply(lambda x: x.head(5000)).reset_index(drop=True)

            # Append the processed DataFrame to the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        
        except Exception as e:
            print(f"Failed to process {csv_file}: {e}")

    # Return the combined DataFrame
    return combined_df

# Example usage:
input_folder = "/scratch/user/syedwali/Datasets/CIC_2019/processed/labeled_csv"
df = process_csvs_in_folder(input_folder)




Processing CSV files: 100%|██████████| 928/928 [1:11:10<00:00,  4.60s/it]


In [3]:
print('processing completed')

processing completed


In [4]:
# If needed, save the combined DataFrame to a CSV
output_file = "/scratch/user/syedwali/Datasets/undersampled_CIC2019_dataset.csv"
df.to_csv(output_file, index=False)

In [5]:
df['label'].value_counts()

label
BENIGN           1302431
TFTP              950229
DrDoS_NTP         916545
DrDoS_DNS         868900
DrDoS_SNMP        415000
DrDoS_LDAP        240462
MSSQL             190000
LDAP              188571
DrDoS_MSSQL       150000
UDP               150000
DrDoS_SSDP        105000
DrDoS_UDP         105000
DrDoS_NetBIOS      60000
NetBIOS            55000
Syn                41288
Portmap             5000
UDPLag               185
Name: count, dtype: int64

In [6]:
for i in df.columns:
    print(i)

bidirectional_duration_ms
bidirectional_packets
bidirectional_bytes
src2dst_duration_ms
src2dst_packets
src2dst_bytes
dst2src_duration_ms
dst2src_packets
dst2src_bytes
bidirectional_min_ps
bidirectional_mean_ps
bidirectional_stddev_ps
bidirectional_max_ps
src2dst_min_ps
src2dst_mean_ps
src2dst_stddev_ps
src2dst_max_ps
dst2src_min_ps
dst2src_mean_ps
dst2src_stddev_ps
dst2src_max_ps
bidirectional_min_piat_ms
bidirectional_mean_piat_ms
bidirectional_stddev_piat_ms
bidirectional_max_piat_ms
src2dst_min_piat_ms
src2dst_mean_piat_ms
src2dst_stddev_piat_ms
src2dst_max_piat_ms
dst2src_min_piat_ms
dst2src_mean_piat_ms
dst2src_stddev_piat_ms
dst2src_max_piat_ms
bidirectional_syn_packets
bidirectional_cwr_packets
bidirectional_ece_packets
bidirectional_urg_packets
bidirectional_ack_packets
bidirectional_psh_packets
bidirectional_rst_packets
bidirectional_fin_packets
src2dst_syn_packets
src2dst_cwr_packets
src2dst_ece_packets
src2dst_urg_packets
src2dst_ack_packets
src2dst_psh_packets
src2dst_rst_

In [8]:
df.drop(['flowid'],axis=1,inplace=True)

### Performance Evaluation with contextual Features

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming df is already defined and has a 'label' column
X = df.drop('label', axis=1)  # Features (drop the label column)
y = df['label']  # Labels

# Split the data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42,n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

       BENIGN       0.58      0.72      0.64    390512
    DrDoS_DNS       0.61      0.86      0.71    261362
   DrDoS_LDAP       0.77      0.16      0.26     71965
  DrDoS_MSSQL       0.81      0.46      0.58     44958
    DrDoS_NTP       1.00      1.00      1.00    274893
DrDoS_NetBIOS       0.97      0.87      0.92     18080
   DrDoS_SNMP       0.72      0.39      0.51    124904
   DrDoS_SSDP       0.58      0.55      0.56     31366
    DrDoS_UDP       0.54      0.53      0.53     31209
         LDAP       0.65      0.16      0.26     56479
        MSSQL       0.75      0.46      0.57     57118
      NetBIOS       0.98      0.87      0.92     16421
      Portmap       0.91      0.16      0.27      1456
          Syn       0.99      0.98      0.99     12280
         TFTP       1.00      1.00      1.00    285131
          UDP       0.61      0.63      0.62     44893
       UDPLag       0.89  

### Performance Evaluation without contextual Features

To demonstrate the impact of extended contextual features on classification performance, we've removed all such features from our analysis. By doing so, we can observe the potential degradation in classification accuracy when contextual information is limited.

In [10]:
df.drop(['udps.srcdst_packet_size_variation','udps.srcdst_udp_packet_count','udps.udp_packet_count',
'udps.srcdst_tcp_packet_count','udps.tcp_packet_count','udps.srcdst_ack_packet_count',
'udps.ack_packet_count','udps.srcdst_fin_packet_count','udps.fin_packet_count',
'udps.srcdst_rst_packet_count','udps.rst_packet_count','udps.srcdst_psh_packet_count',
'udps.psh_packet_count','udps.srcdst_syn_packet_count','udps.syn_packet_count','udps.srcdst_unique_ports_count',
'udps.srcdst_icmp_packet_count','udps.icmp_packet_count','udps.srcdst_http_ports_count','udps.http_ports_count',
'udps.srcdst_bidirectional_duration_avg','udps.bidirectional_duration_avg','udps.srcdst_dns_port_count',
'udps.dns_port_count','udps.srcdst_dns_port_src_count','udps.dns_port_src_count','udps.srcdst_vul_ports_count',
'udps.src2dst_packet_count','udps.bidirectional_packet_count','udps.srcdst_src2dst_packet_count',
'udps.srcdst_bidirectional_packet_count'],axis=1,inplace=True)

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming df is already defined and has a 'label' column
X = df.drop('label', axis=1)  # Features (drop the label column)
y = df['label']  # Labels

# Split the data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42,n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.60
Classification Report:
               precision    recall  f1-score   support

       BENIGN       0.47      0.36      0.41    390512
    DrDoS_DNS       0.37      0.95      0.53    261362
   DrDoS_LDAP       0.44      0.00      0.00     71965
  DrDoS_MSSQL       0.47      0.02      0.04     44958
    DrDoS_NTP       0.99      1.00      0.99    274893
DrDoS_NetBIOS       0.93      0.02      0.03     18080
   DrDoS_SNMP       0.60      0.21      0.32    124904
   DrDoS_SSDP       0.42      0.05      0.09     31366
    DrDoS_UDP       0.42      0.03      0.05     31209
         LDAP       0.47      0.00      0.00     56479
        MSSQL       0.49      0.05      0.08     57118
      NetBIOS       0.95      0.03      0.06     16421
      Portmap       0.17      0.00      0.00      1456
          Syn       0.78      0.60      0.68     12280
         TFTP       0.91      0.99      0.95    285131
          UDP       0.42      0.94      0.58     44893
       UDPLag       0.29  

In [13]:
import pandas as pd
df2=pd.read_csv('Datasets/CIC_2019/01-12/DrDoS_DNS.csv')

  df2=pd.read_csv('Datasets/CIC_2019/01-12/DrDoS_DNS.csv')


In [14]:
df2.columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [18]:
import os
import pandas as pd

# List of folder paths
folder_paths = ['Datasets/CIC_2019/01-12/', 'Datasets/CIC_2019/03-11/']  # Add more paths as needed

# Set to store unique labels (set avoids duplicates automatically)
all_unique_labels = set()

# Loop through each folder path
for folder_path in folder_paths:
    print(f"Processing folder: {folder_path}")
    
    # Loop through all files in the current folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):  # Only process CSV files
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}")
            
            # Read the CSV file
            df = pd.read_csv(file_path)
            
            # Check if ' Label' column exists
            if ' Label' in df.columns:
                # Add unique labels from this file to the set
                all_unique_labels.update(df[' Label'].unique())
            else:
                print(f"' Label' column not found in {file_name}")

# Show all unique labels found across all files
print("All unique labels:")
print(sorted(all_unique_labels))  # 


Processing folder: Datasets/CIC_2019/01-12/
Processing file: DrDoS_LDAP.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_DNS.csv


  df = pd.read_csv(file_path)


Processing file: Syn.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_SNMP.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_NTP.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_MSSQL.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_UDP.csv


  df = pd.read_csv(file_path)


Processing file: UDPLag.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_NetBIOS.csv


  df = pd.read_csv(file_path)


Processing file: TFTP.csv


  df = pd.read_csv(file_path)


Processing file: DrDoS_SSDP.csv


  df = pd.read_csv(file_path)


Processing folder: Datasets/CIC_2019/03-11/
Processing file: MSSQL.csv


  df = pd.read_csv(file_path)


Processing file: Syn.csv


  df = pd.read_csv(file_path)


Processing file: NetBIOS.csv


  df = pd.read_csv(file_path)


Processing file: UDP.csv


  df = pd.read_csv(file_path)


Processing file: LDAP.csv


  df = pd.read_csv(file_path)


Processing file: UDPLag.csv


  df = pd.read_csv(file_path)


Processing file: Portmap.csv
All unique labels:
['BENIGN', 'DrDoS_DNS', 'DrDoS_LDAP', 'DrDoS_MSSQL', 'DrDoS_NTP', 'DrDoS_NetBIOS', 'DrDoS_SNMP', 'DrDoS_SSDP', 'DrDoS_UDP', 'LDAP', 'MSSQL', 'NetBIOS', 'Portmap', 'Syn', 'TFTP', 'UDP', 'UDP-lag', 'UDPLag', 'WebDDoS']


  df = pd.read_csv(file_path)
