In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive

# ✅ Mount Google Drive
drive.mount('/content/drive')

# ✅ File paths
file_paths = [
    "/content/drive/MyDrive/Datasets/NSLKDD.csv",
    "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv",
    "/content/drive/MyDrive/Datasets/kddcup.csv",
    "/content/drive/MyDrive/Datasets/CICIDS2017.csv"
]
dataset_names = ["NSLKDD", "UNSW_NB15", "KDDCup", "CICIDS2017"]

# ✅ Correct target column names
target_columns = {
    "NSLKDD": "anomaly",
    "UNSW_NB15": "label",
    "KDDCup": "Label",  # Verify correct column name
    "CICIDS2017": " Label"
}

print("\n==== Training Decision Tree Models for Each Dataset ====\n")

for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    try:
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # ✅ Limit dataset to 100,000 rows
    row_limit = 100000
    if len(df) > row_limit:
        df = df.sample(n=row_limit, random_state=42)
        print(f"✅ {name} dataset limited to {row_limit} rows.")

    target_column = target_columns.get(name)
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # ✅ Encode categorical features
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # ✅ Convert X to float32 & handle NaN/Inf
    X = X.astype(np.float32)
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.mean(), inplace=True)

    # ✅ Apply Standard Scaling
    X = StandardScaler().fit_transform(X)
    y = LabelEncoder().fit_transform(y)

    # ✅ Split data (70% train, 30% test)
    split = int(0.7 * len(X))
    X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]

    # ✅ Train Decision Tree Model
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # ✅ Predictions
    y_pred = clf.predict(X_test)

    # ✅ Model Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 Decision Tree Accuracy for {name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=1))  # ✅ Fix undefined precision warnings


Mounted at /content/drive

==== Training Decision Tree Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

✅ NSLKDD dataset limited to 100000 rows.
🎯 Decision Tree Accuracy for NSLKDD: 0.9940
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14887
           1       0.99      0.99      0.99     15113

    accuracy                           0.99     30000
   macro avg       0.99      0.99      0.99     30000
weighted avg       0.99      0.99      0.99     30000


🔹 Training on UNSW_NB15 dataset...

✅ UNSW_NB15 dataset limited to 100000 rows.
🎯 Decision Tree Accuracy for UNSW_NB15: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10805
           1       1.00      1.00      1.00     19195

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000


🔹 Training on