In [10]:
import pandas as pd

# Load the original dataset.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where this exact file exists. Consider a configurable data directory.
file_path = "E:\\Studies\\IIT\\4 - Forth Year\\Final Year Project\\QuanNetDetct\\Datasets\\Darknet.csv"
darknet_data = pd.read_csv(file_path)

# Filter TLS-related traffic: keep rows that are TCP (protocol number 6) AND
# whose destination port is one of the ports listed below.
tls_ports = [443, 993, 995, 465, 8443]
is_tls_port = darknet_data['Dst Port'].isin(tls_ports)
is_tcp = darknet_data['Protocol'] == 6

# .copy() detaches the result from darknet_data. Later cells assign into
# tls_traffic; without the copy those writes target a slice of the original
# frame and pandas emits SettingWithCopyWarning.
tls_traffic = darknet_data[is_tls_port & is_tcp].copy()

# Save filtered dataset for viewing purposes
tls_traffic.to_csv("Filtered_TLS_Darknet.csv", index=False)
print("Filtered TLS dataset saved as 'Filtered_TLS_Darknet.csv'")

Filtered TLS dataset saved as 'Filtered_TLS_Darknet.csv'


In [None]:
from sklearn.preprocessing import LabelEncoder

# Continue with tls_traffic from Block 1.
# Rebind to an independent copy so the column assignments below never write
# into a slice of another DataFrame (avoids SettingWithCopyWarning).
tls_traffic = tls_traffic.copy()

# Kept for backward compatibility: after the loop this name refers to the
# encoder fitted on the last string column (or an unfitted encoder when the
# frame has no object-dtype columns).
label_encoder = LabelEncoder()

# One fitted encoder per encoded column, so integer codes can be mapped back
# later with label_encoders[column].inverse_transform(...).
label_encoders = {}

# Encode string columns (every object-dtype column is replaced by integer codes)
for column in tls_traffic.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the integer
    # dtype of the codes. `.loc[:, column] = ...` sets values in place on
    # pandas >= 2.0 and would leave the column as object dtype, hiding it
    # from later numeric-dtype selection.
    tls_traffic[column] = label_encoder.fit_transform(tls_traffic[column])
    label_encoders[column] = label_encoder

# Save encoded dataset for viewing purposes
tls_traffic.to_csv("Encoded_TLS_Darknet.csv", index=False)
print("Encoded TLS dataset saved as 'Encoded_TLS_Darknet.csv'")


Encoded TLS dataset saved as 'Encoded_TLS_Darknet.csv'


In [21]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Continue with tls_traffic from the encoding step.
# Rebind to an independent copy so the assignment in Step 2 never writes into a
# slice of another DataFrame (avoids SettingWithCopyWarning).
tls_traffic = tls_traffic.copy()

# Step 1: Check and Replace Invalid Values
# NOTE(review): only float64/int64 columns are selected; columns stored with
# other numeric widths (e.g. int32) are left untouched. Confirm that is intended.
numeric_columns = tls_traffic.select_dtypes(include=['float64', 'int64']).columns

# Work on a float64 view of the numeric columns. Writing NaN or scaled
# fractions back into int64 columns in place is an incompatible-dtype
# assignment, which pandas has deprecated.
numeric_values = tls_traffic[numeric_columns].astype('float64')

# Replace infinity values with NaN
numeric_values = numeric_values.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with column means (means are computed after the infinities
# were turned into NaN, so they are finite whenever a column has any finite value)
numeric_values = numeric_values.fillna(numeric_values.mean())

# Step 2: Scale Numeric Columns
scaler = MinMaxScaler()
# Plain column assignment replaces the columns outright, so they become float64.
tls_traffic[numeric_columns] = scaler.fit_transform(numeric_values)

# Step 3: Save the Scaled Dataset for Viewing
tls_traffic.to_csv("Scaled_TLS_Darknet.csv", index=False)
print("Scaled TLS dataset saved as 'Scaled_TLS_Darknet.csv'")

Scaled TLS dataset saved as 'Scaled_TLS_Darknet.csv'


In [22]:
# Define the target variable
target_variable = 'Label'

# Calculate the correlation matrix over all columns of tls_traffic
correlation_matrix = tls_traffic.corr()

# Extract each feature's correlation with the target (the target's own entry is removed)
target_correlation = correlation_matrix[target_variable].drop(target_variable)

# Set a threshold for correlation (absolute value)
threshold = 0.1  # Adjust this based on your needs (e.g., 0.1 for weak correlation)

# Keep features whose absolute correlation with the target exceeds the threshold.
# With threshold = 0.1 this admits weakly correlated features as well.
# Features whose correlation is NaN fail the comparison and are dropped.
selected_features = target_correlation[target_correlation.abs() > threshold]

# Print the selected features and their correlation values.
# The header is built from target_variable so it always names the column
# actually used (it previously hard-coded 'Label1' while using 'Label').
print(f"Selected Features Based on Correlation with '{target_variable}':")
print(selected_features)

# NOTE(review): the recorded output lists 'Flow ID', 'Src IP' and 'Timestamp'
# among the selected features. These look like identifier/time columns, so their
# correlation with the label may not generalise. Confirm they should be kept.

# Create a new dataset with selected features and the target variable
selected_data = tls_traffic[selected_features.index.tolist() + [target_variable]]

# Save the feature-selected dataset for review
selected_data.to_csv("Feature_Selected_Dataset.csv", index=False)
print("Feature-selected dataset saved as 'Feature_Selected_Dataset.csv'")

Selected Features Based on Correlation with 'Label1':
Flow ID                   0.461061
Src IP                    0.428061
Timestamp                -0.300921
Fwd Packet Length Mean    0.111124
Fwd PSH Flags             0.116855
SYN Flag Count           -0.177119
Fwd Segment Size Avg      0.111124
Subflow Fwd Packets       0.100899
FWD Init Win Bytes        0.295231
Bwd Init Win Bytes        0.269703
Fwd Seg Size Min          0.754610
Name: Label, dtype: float64
Feature-selected dataset saved as 'Feature_Selected_Dataset.csv'
