### Comparing Flow vs Flow+Payload

Reading CIC-IoT Undersampled Version having both flow and payload features

In [80]:
import pandas as pd
df=pd.read_csv('Datasets/undersampled_iot23_dataset.csv')

In [81]:
# Step 1: Get the count of instances per class
target_instances = 2000

# Step 2: Undersample or oversample each class to have exactly 2500 instances
balanced_df = pd.DataFrame()  # Empty dataframe to store balanced data

for label in df['label'].unique():
    class_subset = df[df['label'] == label]
    
    # If class has more than 2500 instances, undersample
    if len(class_subset) > target_instances:
        class_subset = class_subset.sample(target_instances, random_state=42)
    
    # If class has less than 2500 instances, oversample
    #elif len(class_subset) < target_instances:
    #    class_subset = class_subset.sample(target_instances, replace=True, random_state=42)
    
    # Append the balanced class subset to the balanced dataframe
    balanced_df = pd.concat([balanced_df, class_subset])

# Reset index of the final balanced dataframe
balanced_df = balanced_df.reset_index(drop=True)

In [83]:
# Define the mapping dictionary
label_mapping = {
    'Backdoor_Malware': 'webbased',
    'BrowserHijacking': 'webbased',
    'CommandInjection': 'webbased',
    'DDoS-ACK_Fragmentation': 'ddos',
    'DDoS-HTTP_Flood-': 'ddos',
    'DDoS-ICMP_Flood': 'ddos',
    'DDoS-ICMP_Fragmentation': 'ddos',
    'DDoS-PSHACK_Flood': 'ddos',
    'DDoS-RSTFINFlood': 'ddos',
    'DDoS-SYN_Flood': 'ddos',
    'DDoS-SlowLoris': 'ddos',
    'DDoS-SynonymousIP_Flood': 'ddos',
    'DDoS-TCP_Flood': 'ddos',
    'DDoS-UDP_Flood': 'ddos',
    'DDoS-UDP_Fragmentation': 'ddos',
    'DNS_Spoofing': 'spoofing',
    'DictionaryBruteForce': 'brute',
    'DoS-HTTP_Flood': 'dos',
    'DoS-SYN_Flood': 'dos',
    'DoS-TCP_Flood': 'dos',
    'DoS-UDP_Flood': 'dos',
    'MITM-ArpSpoofing': 'spoofing',
    'Mirai-udpplain': 'mirai',
    'Recon-HostDiscovery': 'recon',
    'Recon-PortScan': 'recon',
    'SqlInjection': 'webbased',
    'XSS': 'webbased',
    'benign': 'benign'
}

# Apply the mapping to the 'label' column
balanced_df['label_mapped'] = balanced_df['label'].map(label_mapping)



In [84]:
import pandas as pd
from sklearn.utils import resample
import ast 
import string
# Function to convert hex values to ASCII string
def hex_to_ascii(hex_list):
    ascii_str = ''

    #print(len(hex_list))
    for hex_val in hex_list:

    # Split each hex value into pairs and convert them to ASCII characters
      # Split each hex value into pairs and convert them to ASCII characters
      ascii_str += ''.join([chr(int(hex_val[i:i+2], 16)) for i in range(0, len(hex_val), 2)])
      filtered_str = ''.join(filter(lambda x: x in string.printable, ascii_str))
    #print(filtered_str)

    return filtered_str

# Convert each string representation to a list
balanced_df['udps.payload_data_parsed'] = balanced_df['udps.payload_data'].apply(ast.literal_eval)
balanced_df['string'] = balanced_df['udps.payload_data_parsed'].apply(hex_to_ascii)
#s=balanced_df.dropna(subset=['string'])
empty_rows = balanced_df['string'].str.strip().eq('')

# Get the indices of the rows to be dropped
indices_to_drop = empty_rows[empty_rows].index

# Drop the rows from the DataFrame
s = balanced_df.drop(indices_to_drop)

# Reset the index after dropping rows
s.reset_index(drop=True, inplace=True)

# Check the modified DataFrame
print(len(indices_to_drop))

29344


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
f=s.dropna(subset=['label_mapped'])
# Assuming 'data' contains the 'string' and 'label' columns
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split((f), f['label_mapped'], test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf = tfidf.fit_transform(X_train['string'])
#X_test_tfidf = tfidf.transform(X_test['string'])

# Initialize the Random Forest Classifier with parallel processing (n_jobs=-1)
rf_classifier = RandomForestClassifier(n_jobs=-1, random_state=42)

# Train the classifier on the TF-IDF features and labels
rf_classifier.fit(X_train_tfidf, y_train)


In [86]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet

# Function to check if a word exists in WordNet
def is_word_in_wordnet(word):
    return wordnet.synsets(word)

# Get the feature names (vocabulary terms)
feature_names = np.array(tfidf.get_feature_names_out())

# Create an empty set to store top terms across all classes
top_terms = set()

# For each class, get the top 300 terms based on the average TF-IDF score
for class_label in y_train.unique():
    # Select the training samples for this class
    class_indices = np.where(y_train == class_label)
    
    # Calculate the mean TF-IDF score for each feature across the class samples
    mean_tfidf_scores = X_train_tfidf[class_indices].mean(axis=0).A1  # Convert sparse matrix to a dense array
    
    # Get the indices of the top 300 terms
    top_300_indices = mean_tfidf_scores.argsort()[-1000:][::-1]  # Sort and select top 300
    
    # Add the corresponding terms to the top_terms set
    top_terms.update(feature_names[top_300_indices])

# Filter feature names based on WordNet
filtered_top_terms = [term for term in top_terms if is_word_in_wordnet(term)]

# Restrict the vectorizer's vocabulary to the filtered top terms
tfidf_restricted = TfidfVectorizer(vocabulary=filtered_top_terms)



In [87]:
import re

# Function to filter terms
def is_valid_term(term):
    # Check if the term is more than 2 characters and does not consist of only digits
    return len(term) > 2 and not term.isdigit()

# Apply the filter to the list of terms
filtered_top_terms = [term for term in filtered_top_terms if is_valid_term(term)]

# Print the filtered terms
print(filtered_top_terms)


['response', 'streaming', 'sep', 'english', 'corresponding', 'philips', 'cam', 'feb', 'hint', 'trusted', 'close', 'licensed', 'signed', 'dial', 'discover', 'fail', 'password', 'top', 'news', 'azure', 'each', 'request', 'chromium', 'fbi', 'intellectual', 'accept', 'link', 'access', 'parameter', 'under', 'yahoo', 'see', 'texts', 'options', 'chunked', 'data', 'match', 'malicious', 'age', 'images', 'chrome', 'has', 'null', 'are', 'debug', 'three', 'permanently', 'image', 'spider', 'east', 'label', 'resolution', 'help', 'invalid', 'string', 'must', 'client', 'nim', 'height', 'net', 'wrapper', 'upgrade', 'transfer', 'base', 'new', 'describe', 'array', 'origin', 'void', 'lin', 'hellman', 'mode', 'gmt', 'description', 'packages', 'development', 'html', 'timely', 'script', 'hub', 'code', 'agent', 'boa', 'self', 'length', 'personal', 'url', 'below', 'source', 'while', 'update', 'licenses', 'met', 'fps', 'offers', 'gecko', 'placeholder', 'example', 'butcher', 'synthesizer', 'hook', 'letter', 'dev

In [90]:
tfidf_restricted = TfidfVectorizer(vocabulary=filtered_top_terms)
X_train_tfidf_restricted = tfidf_restricted.fit_transform(X_train['string'])
X_test_tfidf_restricted = tfidf_restricted.transform(X_test['string'])

In [93]:
flow_features = [
    'bidirectional_duration_ms','bidirectional_packets','bidirectional_bytes','src2dst_duration_ms','src2dst_packets','src2dst_bytes',
'dst2src_duration_ms','dst2src_packets','dst2src_bytes','bidirectional_min_ps','bidirectional_mean_ps',
'bidirectional_stddev_ps','bidirectional_max_ps','src2dst_min_ps','src2dst_mean_ps','src2dst_stddev_ps','src2dst_max_ps',
'dst2src_min_ps','dst2src_mean_ps','dst2src_stddev_ps','dst2src_max_ps','bidirectional_min_piat_ms','bidirectional_mean_piat_ms',
'bidirectional_stddev_piat_ms','bidirectional_max_piat_ms','src2dst_min_piat_ms','src2dst_mean_piat_ms','src2dst_stddev_piat_ms',
'src2dst_max_piat_ms','dst2src_min_piat_ms','dst2src_mean_piat_ms','dst2src_stddev_piat_ms','dst2src_max_piat_ms','bidirectional_syn_packets',
'bidirectional_cwr_packets','bidirectional_ece_packets','bidirectional_urg_packets','bidirectional_ack_packets','bidirectional_psh_packets',
'bidirectional_rst_packets','bidirectional_fin_packets','src2dst_syn_packets','src2dst_cwr_packets','src2dst_ece_packets',
'src2dst_urg_packets','src2dst_ack_packets','src2dst_psh_packets','src2dst_rst_packets','src2dst_fin_packets','dst2src_syn_packets',
'dst2src_cwr_packets','dst2src_ece_packets','dst2src_urg_packets','dst2src_ack_packets',
'dst2src_psh_packets','dst2src_rst_packets','dst2src_fin_packets'
]

In [94]:
# Convert the fixed-length payload data into a NumPy array
X_payload = np.array(X_train_tfidf_restricted).tolist()
# Extract the additional features from the DataFrame
X_additional = X_train[flow_features].to_numpy()

#### Training on both flow+payload

In [95]:
X = np.hstack([X_additional, X_payload.toarray()])

In [96]:
# Assuming the labels are in a column named 'label_column'
y = X_train['label_mapped']

# Split the data into train and test sets (80% train, 20% test)
X_train_mix, X_val_mix, y_train_mix, y_val_mix = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train_mix, y_train_mix)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_val_mix)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_val_mix, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_val_mix, y_pred))


Accuracy: 0.9405
Classification Report:
              precision    recall  f1-score   support

      benign       0.87      0.92      0.89       162
       brute       0.96      0.92      0.94       145
        ddos       0.99      0.96      0.97       717
         dos       0.92      0.96      0.94       185
       mirai       1.00      1.00      1.00       164
       recon       0.73      0.73      0.73        15
    spoofing       0.91      0.87      0.89       275
    webbased       0.89      0.94      0.91       287

    accuracy                           0.94      1950
   macro avg       0.91      0.91      0.91      1950
weighted avg       0.94      0.94      0.94      1950



Evaluating on test-set

In [97]:
# Convert the fixed-length payload data into a NumPy array
X_payload2 = np.array(X_test_tfidf_restricted).tolist()
# Extract the additional features from the DataFrame
X_additional2 = X_test[flow_features].to_numpy()

In [98]:
X_test_derived = np.hstack([X_additional2, X_payload2.toarray()])

In [99]:
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_derived)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9405
Classification Report:
              precision    recall  f1-score   support

      benign       0.85      0.91      0.88       359
       brute       0.95      0.93      0.94       380
        ddos       0.99      0.97      0.98      1830
         dos       0.91      0.95      0.93       436
       mirai       1.00      1.00      1.00       405
       recon       0.62      0.67      0.65        52
    spoofing       0.92      0.87      0.90       709
    webbased       0.89      0.94      0.91       702

    accuracy                           0.94      4873
   macro avg       0.89      0.91      0.90      4873
weighted avg       0.94      0.94      0.94      4873



#### Training on only flow

In [101]:
X = np.hstack([X_additional])

In [102]:
# Assuming the labels are in a column named 'label_column'
y = X_train['label_mapped']

# Split the data into train and test sets (80% train, 20% test)
X_train_mix, X_val_mix, y_train_mix, y_val_mix = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train_mix, y_train_mix)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_val_mix)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_val_mix, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_val_mix, y_pred))


Accuracy: 0.9405
Classification Report:
              precision    recall  f1-score   support

      benign       0.88      0.92      0.90       162
       brute       0.95      0.92      0.94       145
        ddos       0.99      0.97      0.98       717
         dos       0.93      0.96      0.94       185
       mirai       1.00      1.00      1.00       164
       recon       0.67      0.67      0.67        15
    spoofing       0.90      0.86      0.88       275
    webbased       0.89      0.93      0.91       287

    accuracy                           0.94      1950
   macro avg       0.90      0.90      0.90      1950
weighted avg       0.94      0.94      0.94      1950



Evaluating on test-set

In [103]:
X_test_derived = np.hstack([X_additional2])

In [104]:
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_derived)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9382
Classification Report:
              precision    recall  f1-score   support

      benign       0.87      0.92      0.89       359
       brute       0.95      0.93      0.94       380
        ddos       0.99      0.97      0.98      1830
         dos       0.91      0.92      0.92       436
       mirai       1.00      1.00      1.00       405
       recon       0.56      0.62      0.59        52
    spoofing       0.92      0.88      0.90       709
    webbased       0.88      0.93      0.90       702

    accuracy                           0.94      4873
   macro avg       0.88      0.90      0.89      4873
weighted avg       0.94      0.94      0.94      4873

