In [None]:
import pandas as pd

# Read the full Darknet flow table from its CSV on disk
file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Datasets\\Darknet.csv"
data = pd.read_csv(file_path)

# A flow is kept when either of its endpoints uses one of these ports
ports_of_interest = [443, 995, 993, 587]
dst_port_hit = data['Dst Port'].isin(ports_of_interest)
src_port_hit = data['Src Port'].isin(ports_of_interest)

# Columns to remove from the filtered table (chosen in an earlier analysis)
features_to_drop = [
    'Flow ID', 'Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Protocol', 'Timestamp',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Fwd URG Flags', 'Bwd URG Flags', 'Bwd PSH Flags',
    'URG Flag Count', 'ECE Flag Count', 'CWE Flag Count',
    'Subflow Bwd Packets',
    'Fwd Bytes/Bulk Avg', 'Bwd Bytes/Bulk Avg',
    'Fwd Bulk Rate Avg', 'Fwd Packet/Bulk Avg',
    'Bwd Packet Length Min', 'Fwd Packet Length Min', 'Packet Length Min',
    'SYN Flag Count', 'RST Flag Count',
]

# Apply the row filter first, then discard the unwanted columns in one pass.
# errors='ignore' means any listed column that is absent from the CSV is skipped silently.
filtered_data = (
    data
    .loc[dst_port_hit | src_port_hit]
    .drop(columns=features_to_drop, errors='ignore')
)

# Persist the reduced table (without the DataFrame index) for later steps
filtered_data_file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Filtered_Dataset_4.csv"
filtered_data.to_csv(filtered_data_file_path, index=False)

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the pre-filtered dataset (assumed to be already filtered with unnecessary columns removed).
# NOTE(review): the first cell writes Filtered_Dataset_4.csv, whereas this cell reads Dataset.csv —
# confirm this is the intended input file.
file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Dataset.csv"
filtered_data = pd.read_csv(file_path)

# Label columns that are actually present in the file. Selecting only the existing ones avoids a
# KeyError when 'Label' is present but 'Label.1' is not.
# (presumably 'Label.1' is a second 'Label' column renamed by read_csv — verify against the raw CSV)
label_columns = [col for col in ('Label', 'Label.1') if col in filtered_data.columns]

# Untouched copy of the labels, kept for inspection; None when the file has no label column.
labels = filtered_data[label_columns].copy() if label_columns else None

# Columns to standardize: every numeric column except the labels. Labels are excluded explicitly
# so that numerically encoded labels can never be scaled by accident.
# Note that numeric flag/count columns ARE included here; nothing filters them out.
numeric_columns = filtered_data.select_dtypes(include=[np.number]).columns
continuous_features = numeric_columns[~numeric_columns.isin(label_columns)]

# Remaining non-numeric, non-label columns; these are left as they are.
# (name kept from the original cell; the columns are not necessarily binary)
binary_features = [
    col for col in filtered_data.columns
    if col not in continuous_features and col not in label_columns
]

# Turn +/-inf into NaN first so that the column means used for imputation are finite,
# then fill every NaN with the mean of its own column.
filtered_data[continuous_features] = filtered_data[continuous_features].replace([np.inf, -np.inf], np.nan)
filtered_data[continuous_features] = filtered_data[continuous_features].fillna(filtered_data[continuous_features].mean())

# Standardize the selected columns in place
scaler = StandardScaler()
filtered_data[continuous_features] = scaler.fit_transform(filtered_data[continuous_features])

# The label columns were never removed from filtered_data, so there is nothing to re-attach:
# the frame already holds the scaled features next to the unmodified labels.
standardized_data = filtered_data.copy()

# Save the standardized dataset to a new CSV file
standardized_data_file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Filtered_Standardized_Dataset.csv"
standardized_data.to_csv(standardized_data_file_path, index=False)

# Display first few rows of the final standardized data (optional, for checking in Jupyter Notebook)
standardized_data.head()

Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,...,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label.1
0,-0.80208,-0.122979,-0.141549,-0.093384,-0.095111,-0.543622,-0.470417,-0.692019,-0.448772,-0.559008,...,-0.552631,-0.258036,-0.09867,-0.406186,-1.325148,-0.206721,-1.328194,-1.191678,Non-Tor,AUDIO-STREAMING
1,-0.802077,-0.122979,-0.141549,-0.093384,-0.095111,-0.543622,-0.470417,-0.692019,-0.448772,-0.559008,...,-0.546603,-0.258036,-0.09867,-0.406186,-1.325148,-0.206721,-1.328194,-1.191678,Non-Tor,AUDIO-STREAMING
2,-0.802076,-0.122979,-0.141549,-0.093384,-0.095111,-0.543622,-0.470417,-0.692019,-0.448772,-0.559008,...,-0.542669,-0.258036,-0.09867,-0.406186,-1.325148,-0.206721,-1.328194,-1.191678,Non-Tor,AUDIO-STREAMING
3,-0.802078,-0.122979,-0.141549,-0.093384,-0.095111,-0.543622,-0.470417,-0.692019,-0.448772,-0.559008,...,-0.54527,-0.258036,-0.09867,-0.406186,-1.325148,-0.206721,-1.328194,-1.191678,Non-Tor,AUDIO-STREAMING
4,-0.793071,-0.121831,-0.140793,-0.093367,-0.095081,-0.44906,-0.404276,-0.520436,-0.386331,-0.485644,...,0.253743,-0.266906,-0.098356,-0.406186,0.758826,-0.206721,0.74472,0.830641,Non-Tor,AUDIO-STREAMING


In [11]:
from sklearn.utils import resample
import pandas as pd

# Load the standardized dataset produced by the previous step
file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Filtered_Standardized_Dataset.csv"
data = pd.read_csv(file_path)

# Split the rows by their 'Label' value.
# Rows whose label is none of these four values are not carried into the balanced output.
non_tor = data[data['Label'] == 'Non-Tor']
vpn = data[data['Label'] == 'VPN']
nonvpn = data[data['Label'] == 'NonVPN']
tor = data[data['Label'] == 'Tor']

# Target size for Non-Tor: the size of the largest of the other three classes
target_samples = max(len(vpn), len(nonvpn), len(tor))

# Downsample Non-Tor to the target size. Sampling without replacement cannot return more rows
# than exist (resample raises ValueError in that case), so when Non-Tor is already smaller than
# the target it is kept unchanged instead of aborting the cell.
if len(non_tor) >= target_samples:
    non_tor_downsampled = resample(non_tor, replace=False, n_samples=target_samples, random_state=42)
else:
    non_tor_downsampled = non_tor

# Combine the (possibly downsampled) Non-Tor rows with the other classes.
# The other classes are kept at their original sizes, so they may still differ from each other.
balanced_data = pd.concat([non_tor_downsampled, vpn, nonvpn, tor])

# Shuffle the rows with a fixed seed so the classes are mixed reproducibly
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a new CSV file
balanced_data_file_path = "C:\\Users\\Nirusan03\\PycharmProjects\\FYP_POC\\Balanced_Filtered_Standardized_Dataset.csv"
balanced_data.to_csv(balanced_data_file_path, index=False)

# Display the class distribution in the balanced dataset
print(balanced_data['Label'].value_counts())

Label
VPN        3281
Non-Tor    3281
NonVPN     2907
Tor        1101
Name: count, dtype: int64
