In [None]:
# Cell 1 - Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pickle
import os

# Define paths
train_dataset_path = '/root/autodl-tmp/projects/SL_NSL/dataset/transformed/KDDTrain+.csv'
test_dataset_path = '/root/autodl-tmp/projects/SL_NSL/dataset/transformed/KDDTest+.csv'
output_directory = '/root/autodl-tmp/projects/SL_NSL/dataset/processed'

In [None]:
# Cell 2 - Load and display initial data
# Load Training Dataset
print("Loading training dataset...")
df_train = pd.read_csv(train_dataset_path)

# Display initial information
print("Training dataset shape:", df_train.shape)
print("\nSample of training data:")
display(df_train.head())
print("\nData types:")
display(df_train.dtypes)

Loading training dataset...
Training dataset shape: (125973, 42)

Sample of training data:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal



Data types:


duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [None]:
# Cell 3 - Feature Scaling
def feature_scaling(df, scaler=None):
    """
    Scale numerical features using StandardScaler
    Returns scaled dataframe and scaler object
    """
    if scaler is None:
        scaler = StandardScaler()
        
    # Get numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    
    # Fit and transform
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    
    return df, scaler

# Apply scaling to training data
df_train, scaler = feature_scaling(df_train)
print("Sample of scaled training data:")
display(df_train.head())

Sample of scaled training data:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.81889,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-1.035688,-1.16103,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.809857,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,neptune
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal


In [11]:
# Cell 4 - Get Dummies Transformation
def get_dummies_transform(df):
    """
    Convert categorical variables to dummy/indicator variables
    """
    categorical_columns = ['protocol_type', 'service', 'flag']
    df_dummy = pd.get_dummies(df, columns=categorical_columns, dtype=int)
    return df_dummy

# Apply get_dummies transformation
df_train = get_dummies_transform(df_train)
print("Sample of data after dummy transformation:")
display(df_train.head())
print("New shape after creating dummy variables:", df_train.shape)

Sample of data after dummy transformation:


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.110249,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,0,0,0,0,1,0
1,-0.110249,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,0,0,0,0,1,0
2,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0,0,0,0,1,0,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0,0,0,0,0,0,0,0,1,0
4,-0.110249,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0,0,0,0,0,0,0,0,1,0


New shape after creating dummy variables: (125973, 123)


In [12]:
# Cell 5 - SelectKBest Feature Selection
def select_best_features(X, y, k=20):
    """
    Select K best features using mutual information classification
    """
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    
    return X_new, selected_features, selector

# Create binary labels for selection
df_train['binary_label'] = df_train['label'].apply(
    lambda x: 'normal' if x == 'normal' else 'anomaly'
)
df_train['binary_label'] = df_train['binary_label'].map(
    {'normal': 0, 'anomaly': 1}
)

# Apply SelectKBest
X_train = df_train.drop(['label', 'binary_label'], axis=1)
y_train = df_train['binary_label']
X_train_selected, selected_features, selector = select_best_features(X_train, y_train)

print("Selected features:", selected_features)
print("Shape after SelectKBest:", X_train_selected.shape)

Selected features: ['src_bytes', 'dst_bytes', 'logged_in', 'count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'service_http', 'service_private', 'flag_S0', 'flag_SF']
Shape after SelectKBest: (125973, 20)


In [13]:
# Cell 6 - Correlation Feature Selection (CFS)
def correlation_feature_selection(df, target_col, threshold=0.8):
    """
    Select features based on correlation with target and between features
    """
    # Make a copy of dataframe to avoid modifying original
    df_copy = df.copy()
    
    # Ensure target column is numeric
    if df_copy[target_col].dtype == 'object':
        print("Converting target column to numeric...")
        df_copy[target_col] = pd.to_numeric(df_copy[target_col])
    
    # Remove constant columns
    print("Removing constant columns...")
    constant_columns = [col for col in df_copy.columns if df_copy[col].nunique() == 1]
    df_copy = df_copy.drop(columns=constant_columns)
    print(f"Removed {len(constant_columns)} constant columns")
    
    # Calculate correlations
    print("Calculating correlation matrix...")
    try:
        corr_matrix = df_copy.corr().abs()
    except Exception as e:
        print(f"Error in correlation calculation: {str(e)}")
        print("Attempting to convert all columns to numeric...")
        for col in df_copy.columns:
            if df_copy[col].dtype == 'object':
                df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
        corr_matrix = df_copy.corr().abs()
    
    # Get correlations with target
    print("Getting target correlations...")
    target_corr = corr_matrix[target_col].sort_values(ascending=False)
    
    # Remove highly correlated features
    print("Selecting features...")
    selected_features = []
    for feature in target_corr.index:
        if feature == target_col:
            continue
            
        include = True
        for selected in selected_features:
            if corr_matrix.loc[feature, selected] > threshold:
                include = False
                break
                
        if include:
            selected_features.append(feature)
    
    print(f"Selected {len(selected_features)} features")
    
    return df[selected_features + [target_col]], selected_features

# Apply CFS
print("\nStarting Correlation Feature Selection...")
print(f"Initial shape: {df_train.shape}")
print(f"Initial columns: {len(df_train.columns)}")

try:
    df_train_cfs, cfs_features = correlation_feature_selection(df_train, 'binary_label')
    print("\nFeatures selected by CFS:", cfs_features)
    print("Shape after CFS:", df_train_cfs.shape)
except Exception as e:
    print(f"\nError during CFS: {str(e)}")
    print("Column types:", df_train.dtypes)


Starting Correlation Feature Selection...
Initial shape: (125973, 124)
Initial columns: 124
Removing constant columns...
Removed 1 constant columns
Calculating correlation matrix...
Error in correlation calculation: could not convert string to float: 'normal'
Attempting to convert all columns to numeric...
Getting target correlations...
Selecting features...
Selected 109 features

Features selected by CFS: ['flag_SF', 'dst_host_srv_count', 'logged_in', 'dst_host_srv_serror_rate', 'count', 'service_http', 'service_private', 'dst_host_count', 'service_domain_u', 'srv_rerror_rate', 'dst_host_diff_srv_rate', 'protocol_type_udp', 'service_smtp', 'diff_srv_rate', 'protocol_type_icmp', 'service_eco_i', 'service_ecr_i', 'flag_RSTR', 'srv_diff_host_rate', 'wrong_fragment', 'dst_host_same_src_port_rate', 'service_ftp_data', 'service_Z39_50', 'flag_RSTO', 'service_uucp', 'service_courier', 'service_bgp', 'service_whois', 'service_uucp_path', 'service_iso_tsap', 'service_imap4', 'service_nnsp', '

In [14]:
# Cell 7 - Save processed data and preprocessing objects
# Save processed datasets
train_processed_path = os.path.join(output_directory, 'KDDTrain_processed.csv')
df_train_cfs.to_csv(train_processed_path, index=False)

# Save preprocessing objects
preprocessing_objects = {
    'scaler': scaler,
    'selector': selector,
    'selected_features': selected_features,
    'cfs_features': cfs_features
}

encoders_path = os.path.join(output_directory, 'preprocessing_objects.pkl')
with open(encoders_path, 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("\nPreprocessing complete!")
print(f"Processed training data saved to: {train_processed_path}")
print(f"Preprocessing objects saved to: {encoders_path}")


Preprocessing complete!
Processed training data saved to: D:/TUoS/Year_3/After Week_0/EEE380/2. SL_NSL/dataset/processed\KDDTrain_processed.csv
Preprocessing objects saved to: D:/TUoS/Year_3/After Week_0/EEE380/2. SL_NSL/dataset/processed\preprocessing_objects.pkl


In [15]:
# Cell 8 - Process Test Dataset
print("Loading testing dataset...")
df_test = pd.read_csv(test_dataset_path)

# 1. Apply feature scaling using training scaler
print("Applying feature scaling...")
df_test, _ = feature_scaling(df_test, scaler=scaler)

# 2. Apply get_dummies transformation
print("Applying dummy transformation...")
df_test = get_dummies_transform(df_test)

# 3. Create binary labels
print("Creating binary labels...")
df_test['binary_label'] = df_test['label'].apply(
    lambda x: 'normal' if x == 'normal' else 'anomaly'
)
df_test['binary_label'] = df_test['binary_label'].map(
    {'normal': 0, 'anomaly': 1}
)

# 4. Ensure all features from training set exist in test set
print("Aligning features with training set...")
for col in df_train.columns:
    if col not in df_test.columns and col != 'binary_label':
        df_test[col] = 0

# 5. Apply CFS with same features as training set
print("Applying feature selection...")
df_test_processed = df_test[cfs_features + ['binary_label']]

# Save processed test dataset
test_processed_path = os.path.join(output_directory, 'KDDTest_processed.csv')
df_test_processed.to_csv(test_processed_path, index=False)

print("\nTest set preprocessing complete!")
print(f"Processed test data saved to: {test_processed_path}")

# Display final shapes
print("\nFinal dataset shapes:")
print(f"Training set: {df_train_cfs.shape}")
print(f"Testing set: {df_test_processed.shape}")

Loading testing dataset...
Applying feature scaling...
Applying dummy transformation...
Creating binary labels...
Aligning features with training set...
Applying feature selection...

Test set preprocessing complete!
Processed test data saved to: D:/TUoS/Year_3/After Week_0/EEE380/2. SL_NSL/dataset/processed\KDDTest_processed.csv

Final dataset shapes:
Training set: (125973, 110)
Testing set: (22544, 110)
