In [3]:
# Cell 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pickle
import os

# Define paths
train_dataset_path = '/root/autodl-tmp/projects/SL_NSL/dataset/transformed/KDDTrain+.csv'
test_dataset_path = '/root/autodl-tmp/projects/SL_NSL/dataset/transformed/KDDTest+.csv'
output_directory = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi'

In [4]:
# Cell 2: Define attck type
# Define attack type mappings
attack_mapping = {
    # Normal (0)
    'normal': 0,
    
    # DOS attacks (1)
    'back': 1,
    'land': 1,
    'neptune': 1,
    'pod': 1,
    'smurf': 1,
    'teardrop': 1,
    
    # Probe attacks (2)
    'ipsweep': 2,
    'nmap': 2,
    'portsweep': 2,
    'satan': 2,
    'mscan': 2,
    'saint': 2,
    
    # R2L attacks (3)
    'ftp_write': 3,
    'guess_passwd': 3,
    'imap': 3,
    'multihop': 3,
    'phf': 3,
    'spy': 3,
    'warezclient': 3,
    'warezmaster': 3,
    
    # U2R attacks (4)
    'buffer_overflow': 4,
    'loadmodule': 4,
    'perl': 4,
    'rootkit': 4,
    'sqlattack': 4,
    'xterm': 4
}

# Define class names for printing
class_names = {
    0: "Normal Traffic",
    1: "DOS (Denial of Service)",
    2: "Probe (Surveillance/Scanning)",
    3: "R2L (Remote to Local)",
    4: "U2R (User to Root)"
}

# Print class mappings
print("Class mappings for multiclass classification:")
for class_id, class_name in class_names.items():
    print(f"Class {class_id}: {class_name}")
print("\nDetailed attack mappings:")
for attack, class_id in sorted(attack_mapping.items()):
    print(f"{attack}: Class {class_id} ({class_names[class_id]})")

Class mappings for multiclass classification:
Class 0: Normal Traffic
Class 1: DOS (Denial of Service)
Class 2: Probe (Surveillance/Scanning)
Class 3: R2L (Remote to Local)
Class 4: U2R (User to Root)

Detailed attack mappings:
back: Class 1 (DOS (Denial of Service))
buffer_overflow: Class 4 (U2R (User to Root))
ftp_write: Class 3 (R2L (Remote to Local))
guess_passwd: Class 3 (R2L (Remote to Local))
imap: Class 3 (R2L (Remote to Local))
ipsweep: Class 2 (Probe (Surveillance/Scanning))
land: Class 1 (DOS (Denial of Service))
loadmodule: Class 4 (U2R (User to Root))
mscan: Class 2 (Probe (Surveillance/Scanning))
multihop: Class 3 (R2L (Remote to Local))
neptune: Class 1 (DOS (Denial of Service))
nmap: Class 2 (Probe (Surveillance/Scanning))
normal: Class 0 (Normal Traffic)
perl: Class 4 (U2R (User to Root))
phf: Class 3 (R2L (Remote to Local))
pod: Class 1 (DOS (Denial of Service))
portsweep: Class 2 (Probe (Surveillance/Scanning))
rootkit: Class 4 (U2R (User to Root))
saint: Class 2 (Pr

In [5]:
# Cell 3: Load and display initial data
print("\nLoading training dataset...")
df_train = pd.read_csv(train_dataset_path)

# Display initial information
print("Training dataset shape:", df_train.shape)
print("\nSample of training data:")
print(df_train.head())
print("\nData types:")
print(df_train.dtypes)


Loading training dataset...
Training dataset shape: (125973, 42)

Sample of training data:
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1             

In [6]:
# Cell 4: Feature Scaling
def feature_scaling(df, scaler=None):
    """
    Scale numerical features using StandardScaler
    Returns scaled dataframe and scaler object
    """
    if scaler is None:
        scaler = StandardScaler()
        
    # Get numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    
    # Fit and transform
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    
    return df, scaler

# Apply scaling to training data
df_train, scaler = feature_scaling(df_train)
print("\nSample of scaled training data:")
print(df_train.head())


Sample of scaled training data:
   duration protocol_type   service flag  src_bytes  dst_bytes      land  \
0 -0.110249           tcp  ftp_data   SF  -0.007679  -0.004919 -0.014089   
1 -0.110249           udp     other   SF  -0.007737  -0.004919 -0.014089   
2 -0.110249           tcp   private   S0  -0.007762  -0.004919 -0.014089   
3 -0.110249           tcp      http   SF  -0.007723  -0.002891 -0.014089   
4 -0.110249           tcp      http   SF  -0.007728  -0.004814 -0.014089   

   wrong_fragment    urgent       hot  ...  dst_host_srv_count  \
0       -0.089486 -0.007736 -0.095076  ...           -0.818890   
1       -0.089486 -0.007736 -0.095076  ...           -1.035688   
2       -0.089486 -0.007736 -0.095076  ...           -0.809857   
3       -0.089486 -0.007736 -0.095076  ...            1.258754   
4       -0.089486 -0.007736 -0.095076  ...            1.258754   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0               -0.782367               -0.280282   
1      

In [7]:
# Cell 5: Get Dummies Transformation
def get_dummies_transform(df):
    """
    Convert categorical variables to dummy/indicator variables
    """
    categorical_columns = ['protocol_type', 'service', 'flag']
    df_dummy = pd.get_dummies(df, columns=categorical_columns, dtype=int)
    return df_dummy

# Apply get_dummies transformation
df_train = get_dummies_transform(df_train)
print("\nSample of data after dummy transformation:")
print(df_train.head())
print("New shape after creating dummy variables:", df_train.shape)



Sample of data after dummy transformation:
   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.110249  -0.007679  -0.004919 -0.014089       -0.089486 -0.007736   
1 -0.110249  -0.007737  -0.004919 -0.014089       -0.089486 -0.007736   
2 -0.110249  -0.007762  -0.004919 -0.014089       -0.089486 -0.007736   
3 -0.110249  -0.007723  -0.002891 -0.014089       -0.089486 -0.007736   
4 -0.110249  -0.007728  -0.004814 -0.014089       -0.089486 -0.007736   

        hot  num_failed_logins  logged_in  num_compromised  ...  flag_REJ  \
0 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
1 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
2 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
3 -0.095076          -0.027023   1.235694        -0.011664  ...         0   
4 -0.095076          -0.027023   1.235694        -0.011664  ...         0   

   flag_RSTO  flag_RSTOS0  flag_RSTR  flag_S0  flag_S1

In [8]:
# Cell 6: Create multiclass labels and apply SelectKBest
def select_best_features(X, y, k=20):
    """
    Select K best features using mutual information classification
    """
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    
    return X_new, selected_features, selector

# Create multiclass labels
df_train['multiclass_label'] = df_train['label'].map(attack_mapping)

# Print class distribution
print("\nClass distribution in training data:")
class_dist = df_train['multiclass_label'].value_counts().sort_index()
for class_id, count in class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(df_train)*100:.2f}%)")

# Apply SelectKBest
X_train = df_train.drop(['label', 'multiclass_label'], axis=1)
y_train = df_train['multiclass_label']
X_train_selected, selected_features, selector = select_best_features(X_train, y_train)

print("\nSelected features:", selected_features)
print("Shape after SelectKBest:", X_train_selected.shape)


Class distribution in training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Selected features: ['src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'service_http', 'flag_S0', 'flag_SF']
Shape after SelectKBest: (125973, 20)


In [9]:
# Cell 7: Correlation Feature Selection (CFS)
def correlation_feature_selection(df, target_col, threshold=0.8):
    """
    Select features based on correlation with target and between features
    """
    # Make a copy of dataframe to avoid modifying original
    df_copy = df.copy()
    
    # Remove constant columns
    print("Removing constant columns...")
    constant_columns = [col for col in df_copy.columns if df_copy[col].nunique() == 1]
    df_copy = df_copy.drop(columns=constant_columns)
    print(f"Removed {len(constant_columns)} constant columns")
    
    # Calculate correlations
    print("Calculating correlation matrix...")
    corr_matrix = df_copy.corr(numeric_only=True).abs()
    
    print("Getting target correlations...")
    target_corr = corr_matrix[target_col].sort_values(ascending=False)
    
    print("Selecting features...")
    selected_features = []
    for feature in target_corr.index:
        if feature == target_col:
            continue
            
        include = True
        for selected in selected_features:
            if corr_matrix.loc[feature, selected] > threshold:
                include = False
                break
                
        if include:
            selected_features.append(feature)
    
    print(f"Selected {len(selected_features)} features")
    
    return df[selected_features + [target_col]], selected_features

# Apply CFS
print("\nStarting Correlation Feature Selection...")
print(f"Initial shape: {df_train.shape}")
df_train_cfs, cfs_features = correlation_feature_selection(df_train, 'multiclass_label')
print("\nFeatures selected by CFS:", cfs_features)
print("Shape after CFS:", df_train_cfs.shape)

# Cell 7: Save processed data and preprocessing objects
# Save processed datasets
train_processed_path = os.path.join(output_directory, 'KDDTrain_processed.csv')
df_train_cfs.to_csv(train_processed_path, index=False)

# Save preprocessing objects
preprocessing_objects = {
    'scaler': scaler,
    'selector': selector,
    'selected_features': selected_features,
    'cfs_features': cfs_features,
    'attack_mapping': attack_mapping,
    'class_names': class_names
}

encoders_path = os.path.join(output_directory, 'preprocessing_objects.pkl')
with open(encoders_path, 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("\nPreprocessing complete!")
print(f"Processed training data saved to: {train_processed_path}")
print(f"Preprocessing objects saved to: {encoders_path}")


Starting Correlation Feature Selection...
Initial shape: (125973, 124)
Removing constant columns...
Removed 1 constant columns
Calculating correlation matrix...
Getting target correlations...
Selecting features...
Selected 109 features

Features selected by CFS: ['dst_host_srv_count', 'logged_in', 'flag_SF', 'service_http', 'service_private', 'dst_host_diff_srv_rate', 'count', 'dst_host_srv_serror_rate', 'service_eco_i', 'dst_host_same_src_port_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp', 'diff_srv_rate', 'flag_RSTR', 'dst_host_srv_diff_host_rate', 'service_domain_u', 'dst_host_count', 'service_smtp', 'protocol_type_udp', 'duration', 'flag_SH', 'service_ecr_i', 'hot', 'flag_RSTO', 'service_other', 'flag_RSTOS0', 'protocol_type_tcp', 'service_urp_i', 'wrong_fragment', 'service_Z39_50', 'service_uucp', 'service_whois', 'service_imap4', 'service_courier', 'service_ftp', 'service_bgp', 'service_uucp_path', 'service_iso_tsap', 'service_ctf', 'service_gopher', 'service_vmnet', '

In [10]:
# Cell 8: Save processed data and preprocessing objects
# Save processed datasets
train_processed_path = os.path.join(output_directory, 'KDDTrain_processed.csv')
df_train_cfs.to_csv(train_processed_path, index=False)

# Save preprocessing objects
preprocessing_objects = {
    'scaler': scaler,
    'selector': selector,
    'selected_features': selected_features,
    'cfs_features': cfs_features,
    'attack_mapping': attack_mapping,
    'class_names': class_names
}

encoders_path = os.path.join(output_directory, 'preprocessing_objects.pkl')
with open(encoders_path, 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("\nPreprocessing complete!")
print(f"Processed training data saved to: {train_processed_path}")
print(f"Preprocessing objects saved to: {encoders_path}")


Preprocessing complete!
Processed training data saved to: /root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTrain_processed.csv
Preprocessing objects saved to: /root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/preprocessing_objects.pkl


In [11]:
# Cell 9: Process Test Dataset
print("\nLoading testing dataset...")
df_test = pd.read_csv(test_dataset_path)

# 1. Apply feature scaling using training scaler
print("Applying feature scaling...")
df_test, _ = feature_scaling(df_test, scaler=scaler)

# 2. Apply get_dummies transformation
print("Applying dummy transformation...")
df_test = get_dummies_transform(df_test)

# 3. Create multiclass labels
print("Creating multiclass labels...")
df_test['multiclass_label'] = df_test['label'].map(attack_mapping)

# Print test set class distribution
print("\nClass distribution in test data:")
test_dist = df_test['multiclass_label'].value_counts().sort_index()
for class_id, count in test_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(df_test)*100:.2f}%)")

# 4. Ensure all features from training set exist in test set
print("\nAligning features with training set...")
for col in df_train.columns:
    if col not in df_test.columns and col != 'multiclass_label':
        df_test[col] = 0

# 5. Apply CFS with same features as training set
print("Applying feature selection...")
df_test_processed = df_test[cfs_features + ['multiclass_label']]

# Save processed test dataset
test_processed_path = os.path.join(output_directory, 'KDDTest_processed.csv')
df_test_processed.to_csv(test_processed_path, index=False)

print("\nTest set preprocessing complete!")
print(f"Processed test data saved to: {test_processed_path}")

# Display final shapes
print("\nFinal dataset shapes:")
print(f"Training set: {df_train_cfs.shape}")
print(f"Testing set: {df_test_processed.shape}")


Loading testing dataset...
Applying feature scaling...
Applying dummy transformation...
Creating multiclass labels...

Class distribution in test data:
Class 0.0 (Normal Traffic): 9711 samples (43.08%)
Class 1.0 (DOS (Denial of Service)): 5741 samples (25.47%)
Class 2.0 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3.0 (R2L (Remote to Local)): 2199 samples (9.75%)
Class 4.0 (U2R (User to Root)): 52 samples (0.23%)

Aligning features with training set...
Applying feature selection...

Test set preprocessing complete!
Processed test data saved to: /root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTest_processed.csv

Final dataset shapes:
Training set: (125973, 110)
Testing set: (22544, 110)
