In [1]:
# Importing the required files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import pennylane as qml
from pennylane import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the original dataset.
file_path = "E:\\Studies\\IIT\\4 - Forth Year\\Final Year Project\\QuanNetDetct\\Datasets\\Darknet.csv"
darknet_data = pd.read_csv(file_path)

In [3]:
#  Process 1 - Filter TLS-related traffic.
tls_ports = [443, 993, 995, 465, 8443]
tls_traffic = darknet_data[
    (darknet_data['Dst Port'].isin(tls_ports)) &  # Destination port is TLS-related.
    (darknet_data['Protocol'] == 6)              # Protocol is TCP.
]

print("TLS traffic filtered!")

TLS traffic filtered!


In [5]:
#  Process 2 - Encoding the TLS traffic data.

# Creating an object of the LabelEncoder class
label_encoder = LabelEncoder()

# Encode string columns
for column in tls_traffic.select_dtypes(include=['object']).columns:
    # Assign explicitly to avoid warning
    encoded_column = label_encoder.fit_transform(tls_traffic[column])
    tls_traffic.loc[:, column] = encoded_column

print("Encoding Completed!")

Encoding Completed!


In [6]:
# Process 3 - Scaling the TLS traffic data

# Step 1: Make a copy of the DataFrame to avoid SettingWithCopyWarning
tls_traffic = tls_traffic.copy()

# Step 2: Identify Numeric Columns to Scale
# Exclude columns that should not be scaled
columns_to_exclude = ['Protocol', 'Dst Port']  # Add any additional columns to exclude if needed
numeric_columns = tls_traffic.select_dtypes(include=['float64', 'int64']).columns.difference(columns_to_exclude)

# Step 3: Check and Replace Invalid Values
# Replace infinity values with NaN
tls_traffic[numeric_columns] = tls_traffic[numeric_columns].replace([np.inf, -np.inf], np.nan)

# Fill NaN values with column means
tls_traffic[numeric_columns] = tls_traffic[numeric_columns].fillna(tls_traffic[numeric_columns].mean())

# Step 4: Scale Only the Selected Numeric Columns
scaler = MinMaxScaler()
tls_traffic[numeric_columns] = scaler.fit_transform(tls_traffic[numeric_columns])

# Print confirmation
print("Scaling completed!")

Scaling completed!


In [7]:
# Process 4 - SMOTE Class imbalance

# Step 1: Define the target variable and features
X = tls_traffic.drop('Label', axis=1)  # Features
y = tls_traffic['Label']              # Target

# Step 2: Apply SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Step 3: Combine features and target back into tls_traffic
tls_traffic = pd.DataFrame(X, columns=X.columns)
tls_traffic['Label'] = y

# Step 4: Display class distribution after SMOTE
print("Class distribution before SMOTE:", Counter(y))
print("Class distribution after SMOTE:", Counter(y))

Class distribution before SMOTE: Counter({0: 11395, 1: 11395, 2: 11395, 3: 11395})
Class distribution after SMOTE: Counter({0: 11395, 1: 11395, 2: 11395, 3: 11395})


In [11]:
# Process 5 - Feature Selection.

# Define the target variable.
target_variable = 'Label'

# Calculate the correlation matrix.
correlation_matrix = tls_traffic.corr()

# Extract correlations with the target variable.
target_correlation = correlation_matrix[target_variable].drop(target_variable)

# Set a threshold for correlation (absolute value).
threshold = 0.1  # Adjust this based on your needs (e.g., 0.1 for weak correlation).

# Select features that are strongly correlated with the target.
selected_features = target_correlation[abs(target_correlation) > threshold]

# Remove identifier columns from the selected features.
identifiers = ['Flow ID', 'Src IP']
selected_features = selected_features.drop(index=identifiers, errors='ignore')

# Print the selected features and their correlation values.
print("Selected Features Based on Correlation with 'Label':")
print(selected_features)

# Update tls_traffic to retain only selected features and the target variable.
tls_traffic = tls_traffic[selected_features.index.tolist() + [target_variable]]

# Remove 'Timestamp' after feature selection if it's in the dataset.
if 'Timestamp' in tls_traffic.columns:
    tls_traffic = tls_traffic.drop(columns=['Timestamp'])
    print("Timestamp feature removed after feature selection!")

# Save the updated dataset for review.
tls_traffic.to_csv("5. Feature_Selected_Dataset.csv", index=False)
print("Feature-selected dataset without Timestamp saved as '5. Feature_Selected_Dataset.csv'")

Selected Features Based on Correlation with 'Label':
Flow Duration             0.142499
Fwd Packet Length Max     0.144930
Fwd Packet Length Mean    0.159332
Fwd Packet Length Std     0.132663
Fwd IAT Total             0.143751
Bwd IAT Total             0.137290
Fwd PSH Flags             0.182089
Packet Length Mean        0.107494
FIN Flag Count           -0.172478
SYN Flag Count           -0.205988
Average Packet Size       0.108640
Fwd Segment Size Avg      0.159332
Subflow Fwd Bytes         0.132347
FWD Init Win Bytes        0.265631
Bwd Init Win Bytes        0.240076
Fwd Seg Size Min          0.486229
Name: Label, dtype: float64
Feature-selected dataset without Timestamp saved as '5. Feature_Selected_Dataset.csv'
