In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing: load the raw KDD Cup file, derive a binary attack label,
# fit a StandardScaler on the normal rows only, scale every row with it, and
# persist the scaled matrices, the labels and the fitted scaler.

# Locations, relative to the notebook's working directory.
RAW_DATA_PATH = Path("../data/raw/kddcup.data_10_percent_corrected")
PROCESSED_DIR = Path("../data/processed")
MODELS_DIR = Path("../models")

# Value of the `label` column that marks non-attack rows (note the trailing dot).
NORMAL_LABEL = 'normal.'

# Column names are supplied explicitly; because `names=` is passed to
# read_csv below, every line of the file is read as data (no header row).
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

# Load dataset
df = pd.read_csv(RAW_DATA_PATH, names=column_names)

# Create binary label (0 = normal, 1 = attack).
# A single vectorized mask replaces the row-wise apply and is reused for the
# training filter below, so both always agree on what "normal" means.
is_normal = df['label'] == NORMAL_LABEL
df['binary_label'] = (~is_normal).astype('int64')

# The 10 features used downstream (how they were ranked is not shown here).
selected_features = [
    'logged_in', 'count', 'dst_host_count', 'srv_count',
    'dst_host_same_src_port_rate', 'srv_diff_host_rate',
    'same_srv_rate', 'dst_host_srv_serror_rate',
    'serror_rate', 'dst_host_serror_rate'
]

# Training set: normal rows only.
df_normal = df[is_normal]
if df_normal.empty:
    # Fail with a clear message instead of an opaque scaler error further down.
    raise ValueError(
        f"No rows with label {NORMAL_LABEL!r} found in {RAW_DATA_PATH}; "
        "cannot build the training set."
    )
X_train = df_normal[selected_features].copy()

# Evaluation set: every row, with its binary label.
X_all = df[selected_features].copy()
y_all = df['binary_label'].copy()

# Scale features. The scaler's statistics come from the normal rows only;
# the same fitted transform is then applied to the full dataset.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_all_scaled = scaler.transform(X_all)

# Save outputs. Create the target directories first: to_csv / joblib.dump do
# not create missing parent directories and would otherwise fail at the very end.
for out_dir in (PROCESSED_DIR, MODELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

pd.DataFrame(X_train_scaled, columns=selected_features).to_csv(
    PROCESSED_DIR / "X_train_scaled.csv", index=False
)
pd.DataFrame(X_all_scaled, columns=selected_features).to_csv(
    PROCESSED_DIR / "X_all_scaled.csv", index=False
)
y_all.to_csv(PROCESSED_DIR / "y_all.csv", index=False)
joblib.dump(scaler, MODELS_DIR / "scaler.pkl")

print("✅ Preprocessing complete. Scaled data saved.")


✅ Preprocessing complete. Scaled data saved.
