In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from google.colab import drive

# Make the Drive-hosted CSV files reachable from this runtime.
drive.mount('/content/drive', force_remount=True)

# Short identifiers for the datasets, in the order they are processed.
dataset_names = ["NSLKDD", "UNSW_NB15", "KDDCup", "CICIDS2017"]

# CSV location of each dataset; positions line up with dataset_names.
file_paths = [
    "/content/drive/MyDrive/Datasets/NSLKDD.csv",
    "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv",
    "/content/drive/MyDrive/Datasets/kddcup.csv",
    "/content/drive/MyDrive/Datasets/CICIDS2017.csv",
]

# Label column to predict for each dataset, keyed by dataset name.
# NOTE(review): the CICIDS2017 value starts with a space; presumably this
# matches the raw CSV header -- confirm against the file.
target_columns = dict(zip(
    dataset_names,
    ["anomaly", "label", "Label", " Label"],
))

print("\n==== Training SVM Models for Each Dataset ====\n")

# Train and evaluate one RBF-kernel SVM per dataset.
#
# For each (CSV path, dataset name) pair the loop: loads the CSV, caps it at
# 100,000 sampled rows, encodes categorical features and the target, splits
# 70/30, imputes and scales using statistics learned from the TRAINING
# partition only, optionally oversamples the training data with SMOTE, fits
# the classifier and prints accuracy plus a classification report.
# Datasets that cannot be trained on (missing file, missing target column,
# a single class) are reported and skipped.
for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    try:
        # Columns that are entirely empty carry no information; drop them.
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # Cap the dataset size to keep SVM training time manageable.
    if len(df) > 100000:
        df = df.sample(n=100000, random_state=42)
        print(f"✅ {name} dataset limited to 100,000 rows.")

    target_column = target_columns.get(name)
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Map categorical (object) feature columns to integer codes. The
    # encoders are fitted on every row so that categories which only occur
    # in the held-out rows still receive a code.
    label_encoders = {}
    for col in X.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

    # Treat infinities as missing values; they are imputed after the split.
    X = X.replace([np.inf, -np.inf], np.nan)

    # Encode the target labels as integers.
    y = LabelEncoder().fit_transform(y)

    class_counts = Counter(y)
    print(f"📊 Class distribution before split: {class_counts}")

    # A classifier cannot be trained when only one class is present.
    if len(class_counts) == 1:
        print(f"❌ Skipping training for {name} (Only one class present: {list(class_counts.keys())[0]}).")
        continue

    # Stratified splitting needs at least two samples per class; fall back
    # to a plain random split otherwise.
    min_class_count = min(class_counts.values())
    stratify_option = y if min_class_count >= 2 else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=stratify_option
    )

    print(f"📊 Class distribution after split: {Counter(y_train)}")

    # An unstratified split can still leave a single class in training.
    if len(set(y_train)) == 1:
        print(f"❌ Skipping training for {name} (Only 1 class in training data).")
        continue

    # Impute missing values with medians computed from the training rows
    # only, so no information from the test rows leaks into preprocessing.
    # A column with no finite training values has a NaN median; use 0.0
    # there so that no NaN reaches the scaler or the classifier.
    fill_values = {}
    for col in X_train.columns:
        if X_train[col].isna().any() or X_test[col].isna().any():
            median = X_train[col].median()
            fill_values[col] = 0.0 if pd.isna(median) else median
    if fill_values:
        X_train = X_train.fillna(fill_values)
        X_test = X_test.fillna(fill_values)

    # Fit the scaler on the training partition and reuse its statistics for
    # the test partition (fitting on all rows would leak test information
    # and inflate the reported scores).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # SMOTE's default of 5 nearest neighbours needs at least 6 samples in
    # every class; skip oversampling when that does not hold.
    if all(count >= 6 for count in Counter(y_train).values()):
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("✅ Applied SMOTE to balance dataset.")
    else:
        print("⚠ Skipping SMOTE due to insufficient class samples.")

    # Class weighting still compensates for imbalance when SMOTE is skipped.
    clf = SVC(kernel="rbf", class_weight="balanced", random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Evaluate on the untouched (never resampled) test partition.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 SVM Accuracy for {name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))


Mounted at /content/drive

==== Training SVM Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

✅ NSLKDD dataset limited to 100,000 rows.
📊 Class distribution before split: Counter({0: 50001, 1: 49999})
📊 Class distribution after split: Counter({0: 35001, 1: 34999})
✅ Applied SMOTE to balance dataset.
🎯 SVM Accuracy for NSLKDD: 0.9780
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     15000
           1       0.98      0.98      0.98     15000

    accuracy                           0.98     30000
   macro avg       0.98      0.98      0.98     30000
weighted avg       0.98      0.98      0.98     30000


🔹 Training on UNSW_NB15 dataset...

✅ UNSW_NB15 dataset limited to 100,000 rows.
📊 Class distribution before split: Counter({1: 63917, 0: 36083})
📊 Class distribution after split: Counter({1: 44742, 0: 25258})
✅ Applied SMOTE to balance dataset.
🎯 SVM Accuracy for UNSW_NB15: 0.9936
              precision    recall  f1-s