In [1]:
# === My new cell ===
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

# 1. Paths
# NOTE: machine-specific absolute path -- point data_dir at your local copy
# of the two dataset CSV files before running this cell.
data_dir = Path(r"C:\Users\Raghav Singla\Desktop\PBL-4")
train_fp = data_dir / "UNSW_NB15_training-set.csv"
test_fp  = data_dir / "UNSW_NB15_testing-set.csv"

# 2. Load
df_train = pd.read_csv(train_fp)
df_test  = pd.read_csv(test_fp)

# 3. Drop unwanted cols & split X/y
# "label" is the prediction target; "id" and "attack_cat" are removed from the
# feature matrices.
drop_cols = ["id", "attack_cat"]
X_train = df_train.drop(columns=drop_cols + ["label"])
y_train = df_train["label"]
X_test  = df_test.drop(columns=drop_cols + ["label"])
y_test  = df_test["label"]

# 4. Label-encode categoricals
cat_cols = ["proto", "service", "state"]
for col in cat_cols:
    le = LabelEncoder()
    # Fit on the union of train and test values so that a category present in
    # only one of the two files still receives a code and transform() cannot
    # fail on an unseen value.
    combined = pd.concat([X_train[col], X_test[col]]).astype(str)
    le.fit(combined)
    X_train[col] = le.transform(X_train[col].astype(str))
    X_test[col]  = le.transform(X_test[col].astype(str))

# 5. Numeric coercion + missing-value imputation
# Anything that cannot be parsed as a number becomes NaN and is then filled.
X_train = X_train.apply(pd.to_numeric, errors="coerce")
X_test  = X_test.apply(pd.to_numeric, errors="coerce")
# Medians are taken from the training features only and reused for the test
# features.
medians = X_train.median()
X_train = X_train.fillna(medians)
X_test  = X_test.fillna(medians)

# 6. Display dataset info
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("=== TRAIN SET INFO ===")
print(X_train.shape)
X_train.info()
print("\n=== TEST SET INFO ===")
print(X_test.shape)
X_test.info()
print("\n=== LABEL DISTRIBUTIONS ===")
print("Train:", y_train.value_counts(normalize=True).round(3).to_dict())
print("Test: ", y_test.value_counts(normalize=True).round(3).to_dict())


=== TRAIN SET INFO ===
(175341, 42)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 42 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dur                175341 non-null  float64
 1   proto              175341 non-null  int32  
 2   service            175341 non-null  int32  
 3   state              175341 non-null  int32  
 4   spkts              175341 non-null  int64  
 5   dpkts              175341 non-null  int64  
 6   sbytes             175341 non-null  int64  
 7   dbytes             175341 non-null  int64  
 8   rate               175341 non-null  float64
 9   sttl               175341 non-null  int64  
 10  dttl               175341 non-null  int64  
 11  sload              175341 non-null  float64
 12  dload              175341 non-null  float64
 13  sloss              175341 non-null  int64  
 14  dloss              175341 non-null  int64  
 15  sinpkt         

In [2]:
# === My new cell ===
# 7. Define models & metrics container
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Imported here so this cell brings everything it needs into scope.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Candidate classifiers, keyed by the display name used in tables and plots.
# KNN, the RBF SVM and logistic regression are sensitive to feature scale
# (distances, kernel widths and gradient steps are dominated by the columns
# with the largest magnitudes), so each is wrapped in a pipeline that
# standardizes the features first. The scaler is fitted inside fit(), i.e. on
# the training data only. The tree-based ensembles are left on raw features.
# Every entry still exposes fit / predict / predict_proba.
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1
    ),
    "KNN": make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    ),
    # probability=True is required so that predict_proba is available.
    "SVM": make_pipeline(
        StandardScaler(),
        SVC(kernel="rbf", C=1.0, probability=True, random_state=42),
    ),
    "LogisticRegression": make_pipeline(
        StandardScaler(),
        LogisticRegression(
            max_iter=1000, class_weight="balanced", random_state=42
        ),
    ),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
}

# prepare DataFrame to collect results
# (empty placeholder with the expected metric columns)
metrics_df = pd.DataFrame(
    columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "ROC AUC"]
)


In [None]:
# === My new cell ===
# 8. Train & evaluate each model (updated to avoid DataFrame.append)
from IPython.display import display

# Metrics computed from hard class predictions, in output-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

results = []
for name, model in models.items():
    # Fit on the training split, then score on the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba is used as the score for ROC AUC.
    y_prob = model.predict_proba(X_test)[:, 1]

    row = {"Model": name}
    for metric_name, metric_fn in label_metrics.items():
        row[metric_name] = metric_fn(y_test, y_pred)
    row["ROC AUC"] = roc_auc_score(y_test, y_prob)
    results.append(row)

# Assemble all rows in one go, index by model name, round to 4 decimals.
metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.set_index("Model")
metrics_df = metrics_df.round(4)

print("=== MODEL PERFORMANCE ON TEST SET ===")
display(metrics_df)


In [None]:
# === My new cell ===
# 9. Bar chart comparison of Accuracy & F1-Score
import matplotlib.pyplot as plt

# Create the figure and axes once and hand the axes to pandas. Without ax=,
# DataFrame.plot opens its own figure and a separately created plt.figure()
# is left behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(8, 5))
metrics_df[["Accuracy", "F1-Score"]].plot(
    kind="bar", ylim=(0, 1), rot=45, ax=ax, legend=True
)
ax.set_title("Model Comparison: Accuracy vs F1-Score")
ax.set_ylabel("Score")
fig.tight_layout()
plt.show()


In [None]:
# === My new cell ===
# 10. Plot confusion matrices for each model
import seaborn as sns
from sklearn.metrics import confusion_matrix

# One small heatmap per fitted model: rows are actual labels, columns are
# predicted labels, cells are annotated with integer counts.
for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()


In [None]:
# === My new cell ===
# 11. Plot ROC curves for all models
from sklearn.metrics import roc_curve, auc

# Overlay the ROC curve of every fitted model on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
for name, model in models.items():
    # Second column of predict_proba is used as the ranking score.
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    curve_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f"{name} (AUC = {curve_auc:.3f})")

# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], "--", color="gray")
ax.set_title("ROC Curves Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
fig.tight_layout()
plt.show()
