In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive

# ✅ Mount Google Drive
drive.mount('/content/drive')

# ✅ File paths
file_paths = [
    "/content/drive/MyDrive/Datasets/NSLKDD.csv",
    "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv",
    "/content/drive/MyDrive/Datasets/kddcup.csv",
    "/content/drive/MyDrive/Datasets/CICIDS2017.csv"
]
dataset_names = ["NSLKDD", "UNSW_NB15", "KDDCup", "CICIDS2017"]

# ✅ Correct target column names
target_columns = {
    "NSLKDD": "anomaly",
    "UNSW_NB15": "label",
    "KDDCup": "Label",
    "CICIDS2017": " Label"
}

print("\n==== Training LSTM Models for Each Dataset ====\n")

for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    try:
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # ✅ Limit dataset to 100,000 rows
    row_limit = 100000
    if len(df) > row_limit:
        df = df.sample(n=row_limit, random_state=42)
        print(f"✅ {name} dataset limited to {row_limit} rows.")

    target_column = target_columns.get(name)
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # ✅ Convert categorical columns to numerical
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # ✅ Handle infinite and NaN values before scaling
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.median(), inplace=True)

    # ✅ Scale features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # ✅ Encode target labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    num_classes = len(np.unique(y))
    binary_classification = num_classes == 2  # Check if it's binary

    if not binary_classification:
        y = to_categorical(y, num_classes)  # One-hot encode for multi-class

    # ✅ Reshape input for LSTM (Samples, Time Steps, Features)
    X = X.reshape(X.shape[0], X.shape[1], 1)

    # ✅ Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # ✅ Define LSTM Model
    model = Sequential([
        Input(shape=(X.shape[1], 1)),  # Use Input layer for Sequential models
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1 if binary_classification else num_classes, activation='sigmoid' if binary_classification else 'softmax')
    ])

    # ✅ Compile Model
    model.compile(
        loss='binary_crossentropy' if binary_classification else 'categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    # ✅ Train LSTM Model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

    # ✅ Evaluate Model
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1) if not binary_classification else (y_pred > 0.5).astype(int).flatten()
    y_test = np.argmax(y_test, axis=1) if not binary_classification else y_test.flatten()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 LSTM Accuracy for {name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

==== Training LSTM Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

✅ NSLKDD dataset limited to 100000 rows.
Epoch 1/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 44ms/step - accuracy: 0.8923 - loss: 0.2586
Epoch 2/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 38ms/step - accuracy: 0.9514 - loss: 0.1203
Epoch 3/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 38ms/step - accuracy: 0.9605 - loss: 0.0990
Epoch 4/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 38ms/step - accuracy: 0.9636 - loss: 0.0901
Epoch 5/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 38ms/step - accuracy: 0.9683 - loss: 0.0795
Epoch 6/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 39ms/step - accuracy: 0.9723 - loss: 0.07

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ CICIDS2017 dataset limited to 100000 rows.
Epoch 1/10


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 74ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 2/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 71ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 3/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 70ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 4/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 72ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 5/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 72ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 6/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 71ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 7/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 71ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Epoch 8/10
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 70ms/step - accuracy: 1.000



[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 20ms/step
🎯 LSTM Accuracy for CICIDS2017: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30000

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000

