In [1]:
# Cell 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pickle
import os

# Define paths
# The project root can be overridden with the USL_NSL_ROOT environment variable so the
# notebook is runnable on machines other than the one it was written on. When the
# variable is unset, the original absolute location is used.
project_root = os.environ.get('USL_NSL_ROOT', '/root/autodl-tmp/projects/USL_NSL')
dataset_root = os.path.join(project_root, 'dataset')
train_dataset_path = os.path.join(dataset_root, 'transformed', 'KDDTrain+.csv')
test_dataset_path = os.path.join(dataset_root, 'transformed', 'KDDTest+.csv')
output_directory = os.path.join(dataset_root, 'processed', 'multi')

# Ensure output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

In [2]:
# Cell 2: Define attack mappings for multiclass classification
# Attack names are listed per target class and then flattened into the
# label -> class-id lookup used by the rest of the notebook.
_attacks_by_class = {
    # Normal (0)
    0: ['normal'],
    # DOS attacks (1)
    1: ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
        'apache2', 'udpstorm', 'processtable', 'mailbomb'],
    # Probe attacks (2)
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    # R2L attacks (3)
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'snmpguess', 'worm', 'snmpgetattack',
        'httptunnel', 'sendmail', 'named', 'xlock', 'xsnoop'],
    # U2R attacks (4)
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'sqlattack',
        'xterm', 'ps'],
}

attack_mapping = {
    attack_name: group_id
    for group_id, group_members in _attacks_by_class.items()
    for attack_name in group_members
}

# Human-readable name for each class id
class_names = dict(enumerate([
    "Normal Traffic",
    "DOS (Denial of Service)",
    "Probe (Surveillance/Scanning)",
    "R2L (Remote to Local)",
    "U2R (User to Root)",
]))

# Show the class ids first, then every attack label in alphabetical order
print("Class mappings for multiclass classification:")
for class_id, class_name in class_names.items():
    print(f"Class {class_id}: {class_name}")
print("\nDetailed attack mappings:")
for attack in sorted(attack_mapping):
    class_id = attack_mapping[attack]
    print(f"{attack}: Class {class_id} ({class_names[class_id]})")

Class mappings for multiclass classification:
Class 0: Normal Traffic
Class 1: DOS (Denial of Service)
Class 2: Probe (Surveillance/Scanning)
Class 3: R2L (Remote to Local)
Class 4: U2R (User to Root)

Detailed attack mappings:
apache2: Class 1 (DOS (Denial of Service))
back: Class 1 (DOS (Denial of Service))
buffer_overflow: Class 4 (U2R (User to Root))
ftp_write: Class 3 (R2L (Remote to Local))
guess_passwd: Class 3 (R2L (Remote to Local))
httptunnel: Class 3 (R2L (Remote to Local))
imap: Class 3 (R2L (Remote to Local))
ipsweep: Class 2 (Probe (Surveillance/Scanning))
land: Class 1 (DOS (Denial of Service))
loadmodule: Class 4 (U2R (User to Root))
mailbomb: Class 1 (DOS (Denial of Service))
mscan: Class 2 (Probe (Surveillance/Scanning))
multihop: Class 3 (R2L (Remote to Local))
named: Class 3 (R2L (Remote to Local))
neptune: Class 1 (DOS (Denial of Service))
nmap: Class 2 (Probe (Surveillance/Scanning))
normal: Class 0 (Normal Traffic)
perl: Class 4 (U2R (User to Root))
phf: Class 3 

In [3]:
# Cell 3: Load and explore training dataset
print("\nLoading training dataset...")
df_train = pd.read_csv(train_dataset_path)

# Shape, first rows and column dtypes of the raw frame
print("Training dataset shape:", df_train.shape)
print("\nSample of training data:", df_train.head(), sep="\n")
print("\nData types:", df_train.dtypes, sep="\n")

# Frequency of each original attack label, as counts and as percentages
print("\nOriginal class distribution in training dataset:",
      df_train['label'].value_counts(), sep="\n")
print("Percentage:", 100 * df_train['label'].value_counts(normalize=True), sep="\n")


Loading training dataset...
Training dataset shape: (125973, 42)

Sample of training data:
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1             

In [4]:
# Cell 4: Feature scaling
def feature_scaling(df, scaler=None):
    """
    Scale numerical features using StandardScaler.

    When ``scaler`` is None a new StandardScaler is created and fitted on
    ``df``. When a scaler is supplied it is treated as already fitted and is
    only applied with ``transform``; it is never re-fitted. This keeps held-out
    data on the statistics learned from the training data instead of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; a scaled copy is returned.
    scaler : object or None
        A fitted scaler exposing ``transform``. If it has a
        ``feature_names_in_`` attribute, exactly those columns are scaled;
        otherwise the float64/int64 columns of ``df`` are used.

    Returns
    -------
    tuple
        ``(scaled_df, scaler)`` - the scaled copy of ``df`` and the scaler that
        was fitted or applied.
    """
    # Work on a copy so re-running the calling cell starts from unscaled input
    scaled_df = df.copy()

    if scaler is None:
        # Non-numeric columns (e.g. string labels/categoricals) are left untouched
        numeric_cols = scaled_df.select_dtypes(include=['float64', 'int64']).columns
        scaler = StandardScaler()
        scaled_df[numeric_cols] = scaler.fit_transform(scaled_df[numeric_cols])
        return scaled_df, scaler

    # Prefer the columns the scaler was fitted on so column set and order match
    fitted_cols = getattr(scaler, 'feature_names_in_', None)
    if fitted_cols is not None:
        numeric_cols = list(fitted_cols)
    else:
        numeric_cols = scaled_df.select_dtypes(include=['float64', 'int64']).columns

    # Apply only: fitting here would replace the training mean/std
    scaled_df[numeric_cols] = scaler.transform(scaled_df[numeric_cols])

    return scaled_df, scaler

# Fit a scaler on the training frame and preview the standardized result
print("\nApplying feature scaling...")
df_train, scaler = feature_scaling(df_train, scaler=None)
print("Sample of scaled training data:", df_train.head(), sep="\n")


Applying feature scaling...
Sample of scaled training data:
   duration protocol_type   service flag  src_bytes  dst_bytes      land  \
0 -0.110249           tcp  ftp_data   SF  -0.007679  -0.004919 -0.014089   
1 -0.110249           udp     other   SF  -0.007737  -0.004919 -0.014089   
2 -0.110249           tcp   private   S0  -0.007762  -0.004919 -0.014089   
3 -0.110249           tcp      http   SF  -0.007723  -0.002891 -0.014089   
4 -0.110249           tcp      http   SF  -0.007728  -0.004814 -0.014089   

   wrong_fragment    urgent       hot  ...  dst_host_srv_count  \
0       -0.089486 -0.007736 -0.095076  ...           -0.818890   
1       -0.089486 -0.007736 -0.095076  ...           -1.035688   
2       -0.089486 -0.007736 -0.095076  ...           -0.809857   
3       -0.089486 -0.007736 -0.095076  ...            1.258754   
4       -0.089486 -0.007736 -0.095076  ...            1.258754   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0               -0.782367       

In [5]:
# Cell 5: Categorical variables to dummy variables
def get_dummies_transform(df, reference_df=None):
    """
    One-hot encode the 'protocol_type', 'service' and 'flag' columns.

    Without ``reference_df`` the encoded frame is returned as produced by
    ``pd.get_dummies``. With ``reference_df`` the result is aligned to that
    frame's columns: columns missing here are added filled with 0, columns not
    present in the reference are dropped, and the reference column order is used.
    """
    encoded = pd.get_dummies(
        df,
        columns=['protocol_type', 'service', 'flag'],
        dtype=int,
    )

    if reference_df is None:
        return encoded

    # Single alignment step: zero-fill absent columns, adopt the reference layout
    return encoded.reindex(columns=reference_df.columns, fill_value=0)

# One-hot encode the categorical columns of the training frame
print("\nApplying dummy transformation...")
df_train = get_dummies_transform(df_train, reference_df=None)
print("Sample of data after dummy transformation:", df_train.head(), sep="\n")
print("New shape after creating dummy variables:", df_train.shape)


Applying dummy transformation...
Sample of data after dummy transformation:
   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.110249  -0.007679  -0.004919 -0.014089       -0.089486 -0.007736   
1 -0.110249  -0.007737  -0.004919 -0.014089       -0.089486 -0.007736   
2 -0.110249  -0.007762  -0.004919 -0.014089       -0.089486 -0.007736   
3 -0.110249  -0.007723  -0.002891 -0.014089       -0.089486 -0.007736   
4 -0.110249  -0.007728  -0.004814 -0.014089       -0.089486 -0.007736   

        hot  num_failed_logins  logged_in  num_compromised  ...  flag_REJ  \
0 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
1 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
2 -0.095076          -0.027023  -0.809262        -0.011664  ...         0   
3 -0.095076          -0.027023   1.235694        -0.011664  ...         0   
4 -0.095076          -0.027023   1.235694        -0.011664  ...         0   

   flag_RSTO  flag_RS

In [6]:
# Cell 6: Create multiclass labels
print("\nCreating multiclass labels...")
df_train['multiclass_label'] = df_train['label'].map(attack_mapping)

# Fail fast if a training label is absent from attack_mapping: .map() would
# otherwise leave NaN targets that end up in the saved training files.
unmapped_train_labels = df_train.loc[df_train['multiclass_label'].isna(), 'label'].unique()
if len(unmapped_train_labels) > 0:
    raise ValueError(
        f"Training labels missing from attack_mapping: {sorted(map(str, unmapped_train_labels))}"
    )

# Print class distribution (count and share of rows per class id)
print("\nMulticlass distribution in training data:")
class_dist = df_train['multiclass_label'].value_counts().sort_index()
for class_id, count in class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(df_train)*100:.2f}%)")


Creating multiclass labels...

Multiclass distribution in training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)


In [7]:
# Cell 7: Feature selection using unsupervised method
def unsupervised_feature_selection(df, threshold=0.1):
    """
    Drop low-variance features with sklearn's VarianceThreshold.

    The 'label' and 'multiclass_label' columns are excluded from selection; all
    other columns of ``df`` are candidates, taken in the order returned by
    ``Index.difference``.

    Returns a tuple of the reduced feature array, the list of kept column
    names, and the fitted selector.

    NOTE(review): in this notebook the numeric columns are standardized before
    this function is called, so a variance cut-off may mostly act on the
    one-hot columns and on constant columns - confirm this is intended.
    """
    # Everything except the target columns is a candidate feature
    candidate_cols = df.columns.difference(['label', 'multiclass_label'])

    selector = VarianceThreshold(threshold=threshold)
    X_new = selector.fit_transform(df[candidate_cols])

    # Boolean mask over candidate_cols marking the retained features
    keep_mask = selector.get_support()
    selected_features = [name for name, keep in zip(candidate_cols, keep_mask) if keep]

    return X_new, selected_features, selector

# Run variance-based selection on the training frame (threshold spelled out explicitly)
print("\nApplying unsupervised feature selection...")
X_train_selected, selected_features, selector = unsupervised_feature_selection(
    df_train, threshold=0.1
)
print("Selected features:", selected_features)
print("Shape after feature selection:", X_train_selected.shape)


Applying unsupervised feature selection...
Selected features: ['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count', 'dst_host_diff_srv_rate', 'dst_host_rerror_rate', 'dst_host_same_src_port_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate', 'duration', 'flag_S0', 'flag_SF', 'hot', 'is_guest_login', 'is_host_login', 'land', 'logged_in', 'num_access_files', 'num_compromised', 'num_failed_logins', 'num_file_creations', 'num_root', 'num_shells', 'protocol_type_tcp', 'protocol_type_udp', 'rerror_rate', 'root_shell', 'same_srv_rate', 'serror_rate', 'service_http', 'service_private', 'src_bytes', 'srv_count', 'srv_diff_host_rate', 'srv_rerror_rate', 'srv_serror_rate', 'su_attempted', 'urgent', 'wrong_fragment']
Shape after feature selection: (125973, 43)


In [8]:
# Cell 8: Create final training dataset and save
# Selected feature matrix plus the class-id target as the last column
df_train_selected = pd.DataFrame(X_train_selected, columns=selected_features).assign(
    multiclass_label=df_train['multiclass_label'].values
)

print("\nSaving processed data and preprocessing objects...")

# Processed features + target
train_processed_path = os.path.join(output_directory, 'KDDTrain_processed.csv')
df_train_selected.to_csv(train_processed_path, index=False)

# Original attack names next to their class ids, kept for reference
train_labels_path = os.path.join(output_directory, 'KDDTrain_labels.csv')
df_train.loc[:, ['label', 'multiclass_label']].to_csv(train_labels_path, index=False)

# Everything needed to repeat the same preprocessing later
preprocessing_objects = dict(
    scaler=scaler,
    selector=selector,
    selected_features=selected_features,
    attack_mapping=attack_mapping,
    class_names=class_names,
)

encoders_path = os.path.join(output_directory, 'preprocessing_objects.pkl')
with open(encoders_path, 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("\nTraining set preprocessing complete!")
for saved_message in (
    f"Processed training data saved to: {train_processed_path}",
    f"Training labels saved to: {train_labels_path}",
    f"Preprocessing objects saved to: {encoders_path}",
):
    print(saved_message)


Saving processed data and preprocessing objects...

Training set preprocessing complete!
Processed training data saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_processed.csv
Training labels saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_labels.csv
Preprocessing objects saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/preprocessing_objects.pkl


In [9]:
# Cell 9: Process test dataset
print("\nLoading testing dataset...")
df_test = pd.read_csv(test_dataset_path)

# Scale with the scaler object obtained from the training step
print("Applying feature scaling...")
df_test, _ = feature_scaling(df_test, scaler)

# One-hot encode and align the columns to the training frame's layout
print("Applying dummy transformation...")
df_test = get_dummies_transform(df_test, df_train)

# Map attack names to class ids; names absent from attack_mapping become NaN
print("Creating multiclass labels...")
df_test['multiclass_label'] = df_test['label'].map(attack_mapping)



Loading testing dataset...
Applying feature scaling...
Applying dummy transformation...
Creating multiclass labels...


In [10]:
# Cell 10: Handle unknown attacks in test data
# Rows whose label was not found in attack_mapping carry NaN as class id.
unmapped_rows = df_test['multiclass_label'].isna()
# dropna(): a missing label has no name to match keywords against
unknown_attacks = df_test.loc[unmapped_rows, 'label'].dropna().unique()
if len(unknown_attacks) > 0:
    print(f"Warning: Found {len(unknown_attacks)} unknown attack types in test data:")
    print(unknown_attacks)

    # Assign most similar category based on naming convention
    # This is a simple heuristic and might need domain expertise refinement.
    # Order matters: the first class with a keyword contained in the name wins.
    keyword_classes = [
        (1, ['dos', 'flood', 'storm', 'smurf']),         # DOS
        (2, ['scan', 'sweep', 'probe']),                 # Probe
        (3, ['ftp', 'guess', 'imap', 'pass', 'http']),   # R2L
        (4, ['root', 'overflow', 'perl', 'sql']),        # U2R
    ]
    # Fallback class, computed at most once and only if some name matches nothing
    most_common_attack_class = None

    for attack in unknown_attacks:
        attack_name = str(attack).lower()
        assigned_class = next(
            (cid for cid, keywords in keyword_classes
             if any(keyword in attack_name for keyword in keywords)),
            None,
        )
        if assigned_class is None:
            if most_common_attack_class is None:
                # Default to most common attack class in training data (after normal)
                most_common_attack_class = int(
                    df_train.loc[df_train['multiclass_label'] != 0, 'multiclass_label'].mode()[0]
                )
            assigned_class = most_common_attack_class
            print(f"  Assigned {attack} to class {most_common_attack_class} ({class_names[most_common_attack_class]}) by default")
        df_test.loc[df_test['label'] == attack, 'multiclass_label'] = assigned_class

# The column is float while it holds NaN; once every row has a class, store
# integer ids so the printed and saved labels match the training set's format.
if df_test['multiclass_label'].notna().all():
    df_test['multiclass_label'] = df_test['multiclass_label'].astype('int64')

# Print test set class distribution
print("\nMulticlass distribution in test data:")
test_dist = df_test['multiclass_label'].value_counts().sort_index()
for class_id, count in test_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(df_test)*100:.2f}%)")


Multiclass distribution in test data:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)


In [11]:
# Cell 11: Create final test dataset and save
# Keep the columns named in selected_features (the names chosen on the training set)
print("\nApplying feature selection to test data...")
df_test_selected = pd.DataFrame(
    df_test.loc[:, selected_features].to_numpy(),
    columns=selected_features,
).assign(multiclass_label=df_test['multiclass_label'].values)

# Processed features + target
test_processed_path = os.path.join(output_directory, 'KDDTest_processed.csv')
df_test_selected.to_csv(test_processed_path, index=False)

# Original attack names next to their class ids, kept for reference
test_labels_path = os.path.join(output_directory, 'KDDTest_labels.csv')
df_test.loc[:, ['label', 'multiclass_label']].to_csv(test_labels_path, index=False)

print("\nTest set preprocessing complete!")
for saved_message in (
    f"Processed test data saved to: {test_processed_path}",
    f"Test labels saved to: {test_labels_path}",
):
    print(saved_message)


Applying feature selection to test data...

Test set preprocessing complete!
Processed test data saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_processed.csv
Test labels saved to: /root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_labels.csv


In [12]:
# Cell 12: Generate summary statistics and analysis
# Rows x columns of both processed sets (features + target column)
print("\nFinal dataset shapes:")
print(f"Training set: {df_train_selected.shape}")
print(f"Testing set: {df_test_selected.shape}")

# Per-class sample counts, ordered by class id
train_class_dist = df_train_selected['multiclass_label'].value_counts().sort_index()
test_class_dist = df_test_selected['multiclass_label'].value_counts().sort_index()

for heading, distribution, n_rows in (
    ("\nMulticlass distribution in training dataset:", train_class_dist, len(df_train_selected)),
    ("\nMulticlass distribution in test dataset:", test_class_dist, len(df_test_selected)),
):
    print(heading)
    for class_id, count in distribution.items():
        print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/n_rows*100:.2f}%)")

# Largest class size divided by smallest class size, per split
train_ratio = train_class_dist.max() / train_class_dist.min()
test_ratio = test_class_dist.max() / test_class_dist.min()
print(f"\nClass imbalance ratio (max/min) in training set: {train_ratio:.2f}")
print(f"Class imbalance ratio (max/min) in test set: {test_ratio:.2f}")

# Mean/std per class for the first five entries of selected_features
print("\nSummary statistics for a few key features by class in the training set:")
key_features = selected_features[:5]
for feature in key_features:
    print(f"\nFeature: {feature}")
    for class_id in range(5):
        in_class = df_train_selected['multiclass_label'] == class_id
        class_values = df_train_selected.loc[in_class, feature]
        print(f"  Class {class_id} ({class_names[class_id]}): mean={class_values.mean():.4f}, std={class_values.std():.4f}")


Final dataset shapes:
Training set: (125973, 44)
Testing set: (22544, 44)

Multiclass distribution in training dataset:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Multiclass distribution in test dataset:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)

Class imbalance ratio (max/min) in training set: 1295.06
Class imbalance ratio (max/min) in test set: 144.94

Summary statistics for a few key features by class in the training set:

Feature: count
  Class 0 (Normal Traffic): mean=-0.5379, std=0.4718
  Class 1 (DOS (Denial of Service)): mean