In [None]:
import os
# Root folder of the attached dataset; adjust only if your dataset folder name differs.
DATA_PATH = '/kaggle/input/unsw-nb15'

print("Listing files in dataset folder:")

# Walk the tree once and gather the full path of every file found under DATA_PATH,
# then print the paths one per line in the order os.walk produced them.
dataset_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(DATA_PATH)
    for filename in filenames
]
for path in dataset_files:
    print(path)


In [None]:
import os, pandas as pd

# Dataset folder and the full paths of the training and testing CSV files inside it.
DATA_PATH = "/kaggle/input/unsw-nb15"
train_fn, test_fn = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ("UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv")
)

# Load both splits into DataFrames (training file first, then testing file).
df_train, df_test = pd.read_csv(train_fn), pd.read_csv(test_fn)

# Cell output: the shape of each split plus the first three training rows.
df_train.shape, df_test.shape, df_train.head(3)


In [None]:
import numpy as np
import pandas as pd

label_col = "label"  # 0 = Normal, 1 = Attack in UNSW-NB15

# Tag each split and concatenate so the one-hot columns line up across train and test.
# assign() returns a new frame, so df_train / df_test are left exactly as loaded and
# this cell can be re-run without changing its inputs.
df_all = pd.concat(
    [
        df_train.assign(**{"__split": "train"}),
        df_test.assign(**{"__split": "test"}),
    ],
    ignore_index=True,
)

# Columns that must never reach the model: IDs / potential leakage and the label itself.
# NOTE(review): attack_cat presumably names the attack type, i.e. it encodes the label.
# When it is non-numeric the dtype filter below already excludes it; listing it here
# keeps it out even if a release stores it as a number -- confirm against the dataset.
drop_candidates = ["id","srcip","dstip","sport","dport","stime","ltime","attack_cat",label_col]
drop_cols = [c for c in drop_candidates if c in df_all.columns]

# Small set of categoricals to one-hot if present.
# The last one is already numeric in some releases; harmless if absent.
cat_candidates = ["proto","service","state","is_ftp_login"]
cat_cols = [c for c in cat_candidates if c in df_all.columns]

# Numeric features: everything that is neither dropped nor categorical, NaN -> 0.
num = (
    df_all
    .drop(columns=drop_cols + cat_cols, errors="ignore")
    .select_dtypes(include=[np.number])
    .fillna(0)
)

# One-hot features. With the default columns=None, get_dummies only encodes
# object/string/category columns and passes numeric ones through unchanged, so a
# numeric categorical would otherwise skip the NaN -> 0 fill applied to `num`.
if cat_cols:
    dmy = pd.get_dummies(df_all[cat_cols], drop_first=True).fillna(0)
else:
    dmy = pd.DataFrame(index=df_all.index)

X_all = pd.concat([num, dmy], axis=1)
y_all = df_all[label_col].astype(int)

print("X_all:", X_all.shape, "| y dist:", y_all.value_counts(normalize=True))

# Split back to train/test using the tag added above.
mask = df_all["__split"] == "train"
X_train, y_train = X_all.loc[mask], y_all.loc[mask]
X_test,  y_test  = X_all.loc[~mask], y_all.loc[~mask]

# Invariants: no rows gained or lost by the round trip, no missing values left.
assert len(X_train) == len(df_train) and len(X_test) == len(df_test), "split sizes changed"
assert not X_all.isna().any().any(), "feature matrix still contains NaN"

X_train.shape, X_test.shape


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Fit a 300-tree random forest on the training split. The seed is fixed for
# repeatable results, all CPU cores are used, and class_weight="balanced" is
# requested so both classes are weighted during training.
clf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the test split: column 1 of predict_proba (the second class) and hard labels.
y_prob  = clf.predict_proba(X_test)[:, 1]
y_pred  = clf.predict(X_test)

# Per-class precision / recall / F1, followed by the ROC AUC of the scores.
report_text = classification_report(y_test, y_pred, target_names=["Normal","Attack"])
print(report_text)

auc_value = roc_auc_score(y_test, y_prob)
print("ROC AUC:", auc_value)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of test-set predictions, drawn as an annotated heatmap.
# The class order is pinned to [0, 1] so the matrix is always 2x2 and its
# rows/columns line up with the two tick labels below, even if one class never
# appears in y_test or y_pred (without `labels` the matrix would shrink and the
# tick labels would no longer match).
class_names = ["Normal","Attack"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
# Rows are the true class, columns the predicted class.
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points computed from the test-set scores; the third return value
# (the thresholds) is not needed here. The area under the curve goes in the legend.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6,5))
ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.3f})")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Receiver Operating Characteristic",
)
ax.legend(loc="lower right")
plt.show()


In [None]:
# Rank features by the fitted forest's importance scores, highest first.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot up to 15 features. The count is capped at the number of features the model
# actually has: bar positions and bar heights must have the same length, so a
# hard-coded 15 raises a shape-mismatch error on narrower feature matrices.
top_n = min(15, len(importances))
top_idx = indices[:top_n]

fig, ax = plt.subplots(figsize=(8,6))
ax.set_title(f"Top {top_n} Feature Importances")
ax.bar(range(top_n), importances[top_idx], align="center")
ax.set_xticks(range(top_n))
ax.set_xticklabels(X_train.columns[top_idx], rotation=90)
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()


In [None]:
import joblib

# Persist the fitted classifier to the working directory.
joblib.dump(clf, "/kaggle/working/ids_random_forest.pkl")

# Persist the test-set results, one row per sample: true label, predicted label
# and the score stored in y_prob. The row index is not written to the file.
import pandas as pd
predictions = pd.DataFrame({
    "y_true": y_test,
    "y_pred": y_pred,
    "y_prob": y_prob,
})
predictions.to_csv("/kaggle/working/predictions_sample.csv", index=False)
