In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive

# ✅ Mount Google Drive
drive.mount('/content/drive')

# ✅ File paths
file_paths = [
    "/content/drive/MyDrive/Datasets/NSLKDD.csv",
    "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv",
    "/content/drive/MyDrive/Datasets/kddcup.csv",
    "/content/drive/MyDrive/Datasets/CICIDS2017.csv"
]
dataset_names = ["NSLKDD", "UNSW_NB15", "KDDCup", "CICIDS2017"]

# ✅ Correct target column names
target_columns = {
    "NSLKDD": "anomaly",
    "UNSW_NB15": "label",
    "KDDCup": "Label",
    "CICIDS2017": " Label"
}

print("\n==== Training Random Forest Models for Each Dataset ====\n")

for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    try:
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # ✅ Limit dataset to 100,000 rows
    row_limit = 100000
    if len(df) > row_limit:
        df = df.sample(n=row_limit, random_state=42)
        print(f"✅ {name} dataset limited to {row_limit} rows.")

    target_column = target_columns.get(name)
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # ✅ Convert categorical columns to numerical
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # ✅ Handle infinite and NaN values before scaling
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.median(), inplace=True)

    # ✅ Scale features
    X = StandardScaler().fit_transform(X)
    y = LabelEncoder().fit_transform(y)

    # ✅ Train-Test Split (70-30)
    split = int(0.7 * len(X))
    X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]

    # ✅ Check class distribution
    unique, counts = np.unique(y_train, return_counts=True)
    print(f"Class distribution in training set: {dict(zip(unique, counts))}")

    # ✅ Train Random Forest Model
    clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
    clf.fit(X_train, y_train)

    # ✅ Predictions
    y_pred = clf.predict(X_test)

    # ✅ Model Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 Random Forest Accuracy for {name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=1))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

==== Training Random Forest Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

✅ NSLKDD dataset limited to 100000 rows.
Class distribution in training set: {0: 35114, 1: 34886}
🎯 Random Forest Accuracy for NSLKDD: 0.9957
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14887
           1       1.00      1.00      1.00     15113

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000


🔹 Training on UNSW_NB15 dataset...

✅ UNSW_NB15 dataset limited to 100000 rows.
Class distribution in training set: {0: 25278, 1: 44722}
🎯 Random Forest Accuracy for UNSW_NB15: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10805
           1   