In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import silhouette_score
from google.colab import drive

# ✅ Mount Google Drive so the dataset CSVs below are reachable
drive.mount('/content/drive')

# ✅ Dataset registry: one record per dataset, as
#    (short name, CSV path on the mounted Drive, target/label column).
#    Keeping the three facts side by side makes it harder for the derived
#    collections below to drift out of sync.
_dataset_specs = [
    ("NSLKDD", "/content/drive/MyDrive/Datasets/NSLKDD.csv", "anomaly"),
    ("UNSW_NB15", "/content/drive/MyDrive/Datasets/UNSW_NB15_merged.csv", "label"),
    ("KDDCup", "/content/drive/MyDrive/Datasets/kddcup.csv", "Label"),  # Verify correct column name
    ("CICIDS2017", "/content/drive/MyDrive/Datasets/CICIDS2017.csv", " Label"),  # note the leading space in the key
]

# ✅ File paths, in registry order
file_paths = [csv_path for _, csv_path, _ in _dataset_specs]

# Short dataset names, index-aligned with file_paths
dataset_names = [short_name for short_name, _, _ in _dataset_specs]

# ✅ Correct target column names, keyed by dataset name
target_columns = {short_name: label_col for short_name, _, label_col in _dataset_specs}

print("\n==== Training KMeans Models for Each Dataset ====\n")

# Per-dataset row cap. It does not change between iterations, so it is
# defined once here. It bounds the cost of both KMeans and the silhouette
# evaluation, which runs on every retained row.
row_limit = 100000

# For each dataset: load the CSV, down-sample, drop the label column, turn
# every feature into a finite standardized float, fit a 2-cluster KMeans and
# report its silhouette score. Datasets that cannot be loaded or lack the
# configured target column are skipped with a message.
for file, name in zip(file_paths, dataset_names):
    print(f"\n🔹 Training on {name} dataset...\n")

    # Load the CSV and discard columns that hold no data at all.
    try:
        df = pd.read_csv(file, low_memory=False).dropna(axis=1, how='all')
    except FileNotFoundError:
        print(f"❌ Error: {file} not found. Skipping {name}.")
        continue

    # ✅ Limit dataset to row_limit rows (seeded sample, so re-runs pick the same rows)
    if len(df) > row_limit:
        df = df.sample(n=row_limit, random_state=42)
        print(f"✅ {name} dataset limited to {row_limit} rows.")

    # .get() yields None for an unregistered dataset name, which is then
    # reported by the same "not found" branch.
    target_column = target_columns.get(name)
    if target_column not in df.columns:
        print(f"⚠ Skipping {name}, target column '{target_column}' not found!")
        continue

    # Clustering is unsupervised here: the label column is removed, not used.
    X = df.drop(columns=[target_column])

    # ✅ Encode categorical features as integer codes (missing values become
    #    the string "nan" and get their own code)
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

    # ✅ Convert X to float32 & handle NaN/Inf
    #    ±inf is treated as missing, then missing values are imputed with the
    #    column mean. A column that is entirely missing in this sample has a
    #    NaN mean, so the first fill leaves it NaN; the second fill sets such
    #    columns to 0 so that KMeans never receives NaN.
    X = X.astype(np.float32).replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.mean()).fillna(0.0)

    # ✅ Apply Standard Scaling
    X = StandardScaler().fit_transform(X)

    # ✅ Train KMeans Model
    #    n_init is pinned because the library default differs between
    #    scikit-learn releases; fixing it keeps results comparable across
    #    environments.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
    kmeans.fit(X)

    # ✅ Evaluate Clustering
    #    silhouette_score needs at least two distinct cluster labels; if the
    #    data collapsed into one cluster, report that instead of raising.
    if len(np.unique(kmeans.labels_)) < 2:
        print(f"⚠ Skipping silhouette for {name}: KMeans produced a single cluster.")
        continue

    score = silhouette_score(X, kmeans.labels_)
    print(f"✅ KMeans Silhouette Score for {name}: {score:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

==== Training KMeans Models for Each Dataset ====


🔹 Training on NSLKDD dataset...

✅ NSLKDD dataset limited to 100000 rows.
✅ KMeans Silhouette Score for NSLKDD: 0.3270

🔹 Training on UNSW_NB15 dataset...

✅ UNSW_NB15 dataset limited to 100000 rows.
✅ KMeans Silhouette Score for UNSW_NB15: 0.2636

🔹 Training on KDDCup dataset...

✅ KDDCup dataset limited to 100000 rows.
✅ KMeans Silhouette Score for KDDCup: 0.7890

🔹 Training on CICIDS2017 dataset...

✅ CICIDS2017 dataset limited to 100000 rows.
✅ KMeans Silhouette Score for CICIDS2017: 0.1920
