In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Mount Google Drive (if needed)
from google.colab import drive
drive.mount('/content/drive')

# File paths
file_paths = [
    "/content/drive/MyDrive/Datasets/NSLKDD.csv",
    "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv",
    "/content/drive/MyDrive/Datasets/kddcup.csv",
    "/content/drive/MyDrive/Datasets/CICIDS2017.csv"
]
dataset_names = ["NSLKDD", "UNSW_NB15", "KDDCup", "CICIDS2017"]

# ✅ Correct target column names
target_columns = {
    "NSLKDD": "anomaly",
    "UNSW_NB15": "label",
    "KDDCup": "Label",  # Verify correct column name
    "CICIDS2017": " Label"
}

# ✅ Tunable anomaly detection threshold
THRESHOLD_PERCENTILE = 95  # Try 90%, 95%, 99%

print("\n==== Training Autoencoder Models for Each Dataset ====\n")

for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    try:
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # ✅ Print column names to verify target column
    print(f"📝 Columns in {name}: {df.columns.tolist()}")

    # ✅ Apply row limit
    row_limit = 100000
    if len(df) > row_limit:
        df = df.sample(n=row_limit, random_state=42)
        print(f"✅ {name} dataset limited to {row_limit} rows.")

    target_column = target_columns.get(name)

    # ✅ Ensure target column exists
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    X = df.drop(columns=[target_column])
    y = df[target_column].values  # True labels

    # ✅ Print label distribution
    print("📊 Label Distribution:")
    print(df[target_column].value_counts())

    # ✅ Encode categorical features in X
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # ✅ Encode labels for consistency
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)

    # ✅ Normalize labels for binary classification
    y_binary = (y > 0).astype(int)  # If 0 = normal, make all others 1

    # ✅ Convert X to float32 & handle NaN/Inf
    X = X.astype(np.float32)
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.mean(), inplace=True)

    # Normalize data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data (70% train, 30% test)
    split = int(0.7 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y_binary[:split], y_binary[split:]

    # Define Autoencoder Model
    input_layer = Input(shape=(X.shape[1],))
    encoded = Dense(128, activation='relu')(input_layer)
    encoded = Dense(64, activation='relu')(encoded)
    decoded = Dense(128, activation='relu')(encoded)
    decoded = Dense(X.shape[1], activation='sigmoid')(decoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    # Train Autoencoder
    history = autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, verbose=1)

    # Compute reconstruction error on test set
    X_test_pred = autoencoder.predict(X_test)
    reconstruction_error = np.mean(np.square(X_test - X_test_pred), axis=1)

    # ✅ Adjust threshold dynamically
    threshold = np.percentile(reconstruction_error, THRESHOLD_PERCENTILE)

    # ✅ Convert errors to binary classification (0 = normal, 1 = anomaly)
    y_pred = (reconstruction_error > threshold).astype(int)

    # ✅ Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 Accuracy for {name} (Threshold {THRESHOLD_PERCENTILE}%): {accuracy:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

==== Training Autoencoder Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

📝 Columns in NSLKDD: ['0', 'tcp', 'private', 'REJ', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '229', '10', '0.0', '0.0.1', '1.0', '1.0.1', '0.04', '0.06', '0.0.2', '255', '10.1', '0.04.1', '0.06.1', '0.0.3', '0.0.4', '0.0.5', '0.0.6', '1.0.2', '1.0.3', 'anomaly']
✅ NSLKDD dataset limited to 100000 rows.
📊 Label Distribution:
anomaly
anomaly    50001
normal     49999
Name: count, dtype: int64
Epoch 1/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - loss: 0.6609
Epoch 2/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.6415
Epoch 3/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/st