In [7]:
# Cell 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pickle
import os

# Define paths: every input/output location hangs off one dataset directory
DATASET_ROOT = '/root/autodl-tmp/projects/USL_NSL/dataset'
train_dataset_path = f'{DATASET_ROOT}/transformed/KDDTrain+.csv'  # training split CSV
test_dataset_path = f'{DATASET_ROOT}/transformed/KDDTest+.csv'    # test split CSV
output_directory = f'{DATASET_ROOT}/processed/bin'                # where processed files are written

In [8]:
# Cell 2: Load and display initial data
# Reads the training CSV from the path configured in Cell 1 into df_train,
# which every later cell builds on.
print("Loading training dataset...")
df_train = pd.read_csv(train_dataset_path)

# Display initial information: frame shape, the first rows, and per-column dtypes
print("Training dataset shape:", df_train.shape)
print("\nSample of training data:")
display(df_train.head())
print("\nData types:")
display(df_train.dtypes)

# Class balance of the raw 'label' column: absolute counts first, then the
# same breakdown expressed as percentages of all rows.
print("\nOriginal class distribution in training dataset:")
print(df_train['label'].value_counts())
print("Percentage:")
print(df_train['label'].value_counts(normalize=True) * 100)

Loading training dataset...
Training dataset shape: (125973, 42)

Sample of training data:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal



Data types:


duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          


Original class distribution in training dataset:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64
Percentage:
normal             53.458281
neptune            32.716534
satan               2.883951
ipsweep             2.856961
portsweep           2.326689
smurf               2.100450
nmap                1.185175
back                0.758893
teardrop            0.708088
warezclient         0.706501
pod                 0.159558
guess_passwd      

In [9]:
# Cell 3: Feature Scaling
def feature_scaling(df, scaler=None):
    """
    Standardise the numeric feature columns of ``df``.

    Columns of dtype ``float64``/``int64`` are scaled; a numeric column named
    ``label`` is left untouched. Non-numeric columns pass through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a scaled copy is returned.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform`` (for example a
        ``StandardScaler``). If omitted, a new ``StandardScaler`` is created.
        A scaler that has not been fitted yet is fitted on ``df``. A scaler
        that is already fitted is only *applied*, so the statistics learned on
        the training split are reused as-is on any other split.

    Returns
    -------
    tuple
        ``(scaled_df, scaler)`` - the scaled copy of ``df`` and the scaler
        that was used.
    """
    if scaler is None:
        scaler = StandardScaler()

    # scikit-learn estimators keep learned state in instance attributes whose
    # names end with "_" (mean_, scale_, ...). Their presence tells us the
    # scaler was fitted before and must not be refitted on this frame.
    already_fitted = any(
        name.endswith('_') and not name.startswith('__')
        for name in getattr(scaler, '__dict__', {})
    )

    # Numeric feature columns only; the target is never scaled.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if 'label' in numeric_cols:
        numeric_cols = numeric_cols.drop('label')

    # Work on a copy so the caller's frame keeps its raw values.
    scaled_df = df.copy()
    if already_fitted:
        scaled_df[numeric_cols] = scaler.transform(df[numeric_cols])
    else:
        scaled_df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return scaled_df, scaler

# Apply scaling to training data.
# No scaler is passed, so feature_scaling creates a StandardScaler itself; the
# returned scaler is kept because later cells pickle it (Cell 6) and pass it
# back in for the test split (Cell 7).
# NOTE: df_train is rebound to the scaled frame, so the raw values are no
# longer available after this cell; re-run Cell 2 to get them back.
df_train, scaler = feature_scaling(df_train)
print("Sample of scaled training data:")
display(df_train.head())

Sample of scaled training data:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.81889,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-1.035688,-1.16103,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.809857,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,neptune
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal


In [10]:
# Cell 4: Get Dummies Transformation
def get_dummies_transform(df, reference_df=None, categorical_columns=None):
    """
    One-hot encode the categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns to encode.
    reference_df : pandas.DataFrame, optional
        Already-encoded frame whose column layout must be reproduced. When
        given, the result has exactly ``reference_df.columns`` in that order:
        columns missing from ``df`` are added and filled with 0, and columns
        that do not exist in the reference (e.g. categories unseen in the
        reference data) are dropped.
    categorical_columns : list of str, optional
        Names of the columns to encode. Defaults to
        ``['protocol_type', 'service', 'flag']``.

    Returns
    -------
    pandas.DataFrame
        New frame with integer indicator columns replacing the categorical
        columns.
    """
    if categorical_columns is None:
        categorical_columns = ['protocol_type', 'service', 'flag']

    # Create dummy variables
    df_dummy = pd.get_dummies(df, columns=categorical_columns, dtype=int)

    if reference_df is not None:
        # A single reindex adds the missing columns (filled with 0), discards
        # the extra ones and enforces the reference column order in one step.
        df_dummy = df_dummy.reindex(columns=reference_df.columns, fill_value=0)

    return df_dummy


# Apply get_dummies transformation.
# Called without reference_df, so the training split itself defines which dummy
# columns exist and in what order; Cell 7 passes this encoded frame back as
# reference_df when processing the test split.
# NOTE: df_train is rebound to the encoded frame, which no longer contains the
# protocol_type/service/flag columns, so re-running this cell on its own is
# expected to fail until Cells 2-3 have been re-run.
df_train = get_dummies_transform(df_train)
print("Sample of data after dummy transformation:")
display(df_train.head())
print("New shape after creating dummy variables:", df_train.shape)

Sample of data after dummy transformation:


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.110249,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,0,0,0,0,1,0
1,-0.110249,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,0,0,0,0,1,0
2,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,1,0,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0,0,0,0,0,0,0,0,1,0
4,-0.110249,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0,0,0,0,0,0,0,0,1,0


New shape after creating dummy variables: (125973, 123)


In [11]:
# Cell 5: Unsupervised Feature Selection (Variance Threshold)
def unsupervised_feature_selection(df, threshold=0.1):
    """
    Select features with a ``VarianceThreshold`` filter.

    The ``label`` column, if present, is excluded before fitting, so the
    function works on labelled and unlabelled frames alike.

    Note that variance depends on the scale of a column: features that were
    standardised beforehand have variance close to 1 (or 0 when constant), so
    for such columns the threshold effectively only removes constants.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric features, optionally with a ``label`` column.
    threshold : float, default 0.1
        Passed to ``VarianceThreshold(threshold=...)``.

    Returns
    -------
    tuple
        ``(X_new, selected_features, selector)`` - the output of
        ``selector.fit_transform``, the list of retained column names, and the
        fitted selector.
    """
    # Drop the target once and reuse the result for both fitting and the
    # column-name lookup (avoids copying the frame twice).
    features = df.drop(columns='label', errors='ignore')

    selector = VarianceThreshold(threshold=threshold)
    X_new = selector.fit_transform(features)
    selected_features = features.columns[selector.get_support()].tolist()
    return X_new, selected_features, selector

# Apply unsupervised feature selection with the function's default threshold (0.1).
# Three results are kept for later cells: the reduced feature matrix, the names
# of the retained columns (reused to select test-set columns in Cell 7), and
# the fitted selector (pickled in Cell 6).
X_train_selected, selected_features, selector = unsupervised_feature_selection(df_train)
print("Selected features:", selected_features)
print("Shape after feature selection:", X_train_selected.shape)

Selected features: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_tcp', 'protocol_type_udp', 'service_http', 'service_private', 'flag_S0', 'flag_SF']
Shape after feature selection: (125973, 43)


In [12]:
# Cell 6: Save processed data and preprocessing objects
# Create the output directory if needed; otherwise the writes below fail with
# FileNotFoundError on a machine where it has not been created yet.
os.makedirs(output_directory, exist_ok=True)

# Convert labels to binary (0 for normal, 1 for anomaly, i.e. any other label)
train_labels_binary = (df_train['label'] != 'normal').astype(int)

# Save processed datasets (features only, one column per selected feature)
train_processed_path = os.path.join(output_directory, 'KDDTrain_processed.csv')
pd.DataFrame(X_train_selected, columns=selected_features).to_csv(train_processed_path, index=False)

# Save binary labels (written without the index, so rows line up with the
# feature file by position)
train_labels_path = os.path.join(output_directory, 'KDDTrain_labels.csv')
train_labels_binary.to_csv(train_labels_path, index=False)

# Save preprocessing objects so the same scaler / selector / column list can be
# reloaded elsewhere. Only unpickle this file from a trusted location: loading
# a pickle can execute arbitrary code.
preprocessing_objects = {
    'scaler': scaler,
    'selector': selector,
    'selected_features': selected_features
}

encoders_path = os.path.join(output_directory, 'preprocessing_objects.pkl')
with open(encoders_path, 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("\nPreprocessing complete!")
print(f"Processed training data saved to: {train_processed_path}")
print(f"Training labels saved to: {train_labels_path}")
print(f"Preprocessing objects saved to: {encoders_path}")


Preprocessing complete!
Processed training data saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_processed.csv
Training labels saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_labels.csv
Preprocessing objects saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/preprocessing_objects.pkl


In [13]:
# Cell 7: Process Test Dataset
def _print_distribution(heading, labels):
    """Print ``heading``, then the value counts of ``labels`` and their percentages."""
    print(heading)
    print(labels.value_counts())
    print("Percentage:")
    print(labels.value_counts(normalize=True) * 100)


print("Loading testing dataset...")
df_test = pd.read_csv(test_dataset_path)

# Step 1: scale, handing in the scaler object returned for the training split
print("Applying feature scaling...")
df_test, _ = feature_scaling(df_test, scaler=scaler)

# Step 2: one-hot encode and align the columns with the encoded training frame
print("Applying dummy transformation...")
df_test = get_dummies_transform(df_test, reference_df=df_train)

# Step 3: keep only the columns that were selected on the training split
print("Applying feature selection...")
X_test_selected = df_test[selected_features].values

# Binary target: 0 for 'normal', 1 for every other label
test_labels_binary = (df_test['label'] != 'normal').astype(int)

# Write features and labels to separate CSV files, both without an index column
test_processed_path = os.path.join(output_directory, 'KDDTest_processed.csv')
test_labels_path = os.path.join(output_directory, 'KDDTest_labels.csv')
pd.DataFrame(X_test_selected, columns=selected_features).to_csv(test_processed_path, index=False)
test_labels_binary.to_csv(test_labels_path, index=False)

print("\nTest set preprocessing complete!")
print(f"Processed test data saved to: {test_processed_path}")
print(f"Test labels saved to: {test_labels_path}")

# Final shapes of both feature matrices
print("\nFinal dataset shapes:")
print(f"Training set: {X_train_selected.shape}")
print(f"Testing set: {X_test_selected.shape}")

# Binary class balance, followed by the original multi-class labels for reference
_print_distribution("\nBinary class distribution in training dataset:", train_labels_binary)
_print_distribution("\nBinary class distribution in test dataset:", test_labels_binary)
_print_distribution("\nOriginal class distribution in training dataset:", df_train['label'])
_print_distribution("\nOriginal class distribution in test dataset:", df_test['label'])

Loading testing dataset...
Applying feature scaling...
Applying dummy transformation...
Applying feature selection...

Test set preprocessing complete!
Processed test data saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_processed.csv
Test labels saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_labels.csv

Final dataset shapes:
Training set: (125973, 43)
Testing set: (22544, 43)

Binary class distribution in training dataset:
0    67343
1    58630
Name: label, dtype: int64
Percentage:
0    53.458281
1    46.541719
Name: label, dtype: float64

Binary class distribution in test dataset:
1    12833
0     9711
Name: label, dtype: int64
Percentage:
1    56.924237
0    43.075763
Name: label, dtype: float64

Original class distribution in training dataset:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back           