# ==========================================
# 1. Imports & Configuration
# ==========================================

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle # To save the scalers/encoders for later use

# Display settings
pd.set_option('display.max_columns', None)

# ==========================================
# 2. Load Cleaned Data
# ==========================================

In [2]:
input_dir = '../data/processed'
output_dir = '../data/processed' # We will save final arrays here

print("Loading cleaned data...")
train_df = pd.read_csv(os.path.join(input_dir, 'train_cleaned.csv'))
test_df = pd.read_csv(os.path.join(input_dir, 'test_cleaned.csv'))

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Loading cleaned data...
Train shape: (125973, 42)
Test shape: (22544, 42)


# ==========================================
# 3. Label Mapping (Attack Classes)
# ==========================================

In [3]:
# NSL-KDD has specific attack types (e.g., 'neptune', 'satan') that belong to 4 broad categories.
# We map them to: Normal, DoS, Probe, R2L, U2R.

# Mapping dictionary based on NSL-KDD documentation
attack_mapping = {
    'normal': 'Normal',
    
    # DoS (Denial of Service)
    'back': 'DoS', 'land': 'DoS', 'neptune': 'DoS', 'pod': 'DoS', 'smurf': 'DoS',
    'teardrop': 'DoS', 'mailbomb': 'DoS', 'apache2': 'DoS', 'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    # Probe (Surveillance and other probing)
    'satan': 'Probe', 'ipsweep': 'Probe', 'nmap': 'Probe', 'portsweep': 'Probe',
    'mscan': 'Probe', 'saint': 'Probe',
    
    # R2L (Remote to Local)
    'guess_passwd': 'R2L', 'ftp_write': 'R2L', 'imap': 'R2L', 'phf': 'R2L',
    'multihop': 'R2L', 'warezmaster': 'R2L', 'warezclient': 'R2L', 'spy': 'R2L',
    'xlock': 'R2L', 'xsnoop': 'R2L', 'snmpguess': 'R2L', 'snmpgetattack': 'R2L',
    'httptunnel': 'R2L', 'sendmail': 'R2L', 'named': 'R2L',
    
    # U2R (User to Root)
    'buffer_overflow': 'U2R', 'loadmodule': 'U2R', 'perl': 'U2R', 'rootkit': 'U2R',
    'ps': 'U2R', 'sqlattack': 'U2R', 'xterm': 'U2R'
}

# Map labels to high-level attack categories
train_df['attack_class'] = train_df['label'].map(attack_mapping).fillna('Unknown')
test_df['attack_class']  = test_df['label'].map(attack_mapping).fillna('Unknown')

print("Train attack class distribution:")
print(train_df['attack_class'].value_counts())

print("\nTest attack class distribution (including Unknown):")
print(test_df['attack_class'].value_counts())

# Binary target for anomaly detection
# Normal = 0, Attack / Unknown = 1
train_df['binary_target'] = (train_df['attack_class'] != 'Normal').astype(int)
test_df['binary_target']  = (test_df['attack_class'] != 'Normal').astype(int)

Train attack class distribution:
attack_class
Normal    67343
DoS       45927
Probe     11656
R2L         995
U2R          52
Name: count, dtype: int64

Test attack class distribution (including Unknown):
attack_class
Normal     9711
DoS        7458
R2L        2885
Probe      2421
U2R          67
Unknown       2
Name: count, dtype: int64


# ==========================================
# 4. Feature Selection: Numeric vs Categorical
# ==========================================

In [4]:
# We separate features to treat them differently.
# 'label', 'attack_class', 'binary_target' are targets, not features.

target_cols = ['label', 'attack_class', 'binary_target']
features = [c for c in train_df.columns if c not in target_cols]

# Identify categorical columns (strings) and numeric columns
categorical_cols = ['protocol_type', 'service', 'flag']
numeric_cols = [c for c in features if c not in categorical_cols]

print(f"\n[2] Features defined.")
print(f"Numerical features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")


[2] Features defined.
Numerical features: 38
Categorical features: 3


# ==========================================
# 5. Preprocessing Pipeline (Encoding & Scaling)
# ==========================================

In [5]:
print("\n[3] Building Preprocessing Pipeline...")

# Define the transformer
# 1. StandardScaler for numeric features (mean=0, std=1)
# 2. OneHotEncoder for categorical features (handle_unknown='ignore' is CRITICAL for test set)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# Split X (features) and y (targets)
X_train_raw = train_df[features]
y_train_multi = train_df['attack_class']
y_train_binary = train_df['binary_target']

X_test_raw = test_df[features]
y_test_multi = test_df['attack_class']
y_test_binary = test_df['binary_target']

# Fit on TRAIN, Transform on BOTH
# This ensures we don't "peek" at the test data statistics (Data Leakage prevention)
print("Fitting preprocessor on Train set and transforming...")
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test_processed = preprocessor.transform(X_test_raw)

print(f"New X_train shape: {X_train_processed.shape}")
print(f"New X_test shape: {X_test_processed.shape}")

# Get feature names after One-Hot Encoding (for interpretability later)
# Note: This syntax depends on sklearn version. If error, wrap in try-except.
try:
    num_names = numeric_cols
    cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    feature_names = np.r_[num_names, cat_names]
    print(f"Total features names extracted: {len(feature_names)}")
except:
    print("Could not extract feature names directly (sklearn version difference).")
    feature_names = None


[3] Building Preprocessing Pipeline...
Fitting preprocessor on Train set and transforming...
New X_train shape: (125973, 122)
New X_test shape: (22544, 122)
Total features names extracted: 122


# ==========================================
# 6. Save Processed Data for Modeling
# ==========================================

In [6]:
print("\n[4] Saving processed arrays and objects...")

# Save numpy arrays (efficient for ML models)
np.save(os.path.join(output_dir, 'X_train.npy'), X_train_processed)
np.save(os.path.join(output_dir, 'X_test.npy'), X_test_processed)
np.save(os.path.join(output_dir, 'y_train_binary.npy'), y_train_binary)
np.save(os.path.join(output_dir, 'y_test_binary.npy'), y_test_binary)
np.save(os.path.join(output_dir, 'y_train_multi.npy'), y_train_multi)
np.save(os.path.join(output_dir, 'y_test_multi.npy'), y_test_multi)

# Save feature names and the preprocessor object (optional, good for inference)
if feature_names is not None:
    pd.DataFrame({'feature': feature_names}).to_csv(os.path.join(output_dir, 'feature_names.csv'), index=False)

# Saving the preprocessor allows us to inverse_transform or process single inputs later
with open(os.path.join(output_dir, 'preprocessor.pkl'), 'wb') as f:
    pickle.dump(preprocessor, f)

print("Done! Preprocessing complete. Ready for Model Training (03_model_training).")


[4] Saving processed arrays and objects...
Done! Preprocessing complete. Ready for Model Training (03_model_training).
