# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import plot_tree
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load the data
# Download from: https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15
# Use 'UNSW_NB15_training-set.csv' for this demo
df = pd.read_csv('UNSW_NB15_training-set.csv')
print(f"Original shape: {df.shape}")

# Step 2: EDA - Basic overview
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
# Class distribution of the target column 'label' (0 = normal, 1 = attack)
print(df['label'].value_counts(normalize=True))

# Visualize class distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='label', data=df)
plt.xticks([0, 1], ['Normal', 'Attack'])  # relabel the 0/1 ticks with class names
plt.title('Class Distribution of Target (label)')
plt.ylabel('Count')
plt.show()

# Step 3: Clean the data
# The dataset is expected to have no missing values; drop incomplete rows anyway
df = df.dropna()

# Handle categorical columns: keep only rows whose proto/service/state fall in
# a small set of levels, so the one-hot encoding below stays compact
keep_rows = (
    df['proto'].isin(['tcp', 'udp', 'arp'])
    & df['service'].isin(['http', 'ftp', 'ssh', '-'])
    & df['state'].isin(['INT', 'FIN', 'CON'])
)
df = df[keep_rows]

# Subset to 10 key features (plus the 'label' target column)
selected_features = [
    'dur', 'proto', 'service', 'state', 'spkts',
    'sbytes', 'rate', 'sttl', 'dload', 'sloss',
    'label',
]
df_subset = df[selected_features]
print(f"Subset shape: {df_subset.shape}")

# One-hot encode the categorical features, dropping the first level of each
df_encoded = pd.get_dummies(
    df_subset,
    columns=['proto', 'service', 'state'],
    drop_first=True,
)

# Step 4: Re-check the class balance now that rows have been filtered out
print(df_encoded['label'].value_counts(normalize=True))

# Step 5: Separate the target (y) from the predictors (X), then hold out 20%
# of the rows as a test set, stratified on the label
y = df_encoded['label']
X = df_encoded.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Step 6: Standardization (for Logistic Regression)
# Fit on the training split only, then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Build Logistic Regression model (trained on the standardized features)
log_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Test-set predictions: hard labels, plus column 1 of predict_proba as the score
y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]

# ROC AUC computed from the predicted scores
auc_log = roc_auc_score(y_test, y_prob_log)
print(f"Logistic Regression AUC: {auc_log:.2f}")

# ROC curve, with a dashed black diagonal drawn as a reference line
fpr, tpr, _ = roc_curve(y_test, y_prob_log)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'Logistic (AUC = {auc_log:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve - Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Confusion matrix of the hard predictions
cm_log = confusion_matrix(y_test, y_pred_log)
disp_log = ConfusionMatrixDisplay(cm_log, display_labels=['Normal', 'Attack'])
disp_log.plot(cmap='Blues')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

# Step 8: Build Decision Tree model
# Fitted on the unscaled features (no scaling needed for trees)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the held-out test split
y_pred_dt = dt_model.predict(X_test)

# Confusion Matrix
cm_dt = confusion_matrix(y_test, y_pred_dt)
disp_dt = ConfusionMatrixDisplay(confusion_matrix=cm_dt, display_labels=['Normal', 'Attack'])
disp_dt.plot(cmap='Blues')
plt.title('Confusion Matrix - Decision Tree')
plt.show()

# Visualize Decision Tree (max_depth=2 limits the drawing for visibility)
plt.figure(figsize=(12, 8))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as `list` and reject a pandas Index such as X.columns.
plot_tree(
    dt_model,
    max_depth=2,
    feature_names=list(X.columns),
    class_names=['Normal', 'Attack'],
    filled=True,
)
plt.title('Decision Tree Visualization (Partial)')
plt.show()

# Step 9: Decision Tree with Hyperparameters and Pruning
# Hyperparameters: max_depth, min_samples_leaf
dt_tuned = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=42)
dt_tuned.fit(X_train, y_train)
y_pred_dt_tuned = dt_tuned.predict(X_test)
print(f"Tuned Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt_tuned):.2f}")

# Pruning using cost complexity: take the candidate alphas from the pruning
# path of the unpruned tree
path = dt_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
# Use the second-largest alpha (per the scikit-learn pruning guide, the largest
# one prunes down to a single node). Fall back to the only entry when the path
# has a single alpha, instead of raising IndexError on [-2].
pruning_alpha = ccp_alphas[-2] if len(ccp_alphas) > 1 else ccp_alphas[-1]
# Clamp at zero: ccp_alpha must be non-negative, and floating-point round-off
# in the path can occasionally yield a tiny negative value.
pruning_alpha = max(float(pruning_alpha), 0.0)
dt_pruned = DecisionTreeClassifier(ccp_alpha=pruning_alpha, random_state=42)
dt_pruned.fit(X_train, y_train)
y_pred_dt_pruned = dt_pruned.predict(X_test)
print(f"Pruned Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt_pruned):.2f}")

# Visualize pruned tree
plt.figure(figsize=(12, 8))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as `list` and reject a pandas Index such as X.columns.
plot_tree(
    dt_pruned,
    max_depth=2,
    feature_names=list(X.columns),
    class_names=['Normal', 'Attack'],
    filled=True,
)
plt.title('Pruned Decision Tree Visualization (Partial)')
plt.show()

# Step 10: Build Random Forest model (100 trees, fitted on the unscaled features)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Confusion matrix of those predictions
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(cm_rf, display_labels=['Normal', 'Attack'])
disp_rf.plot(cmap='Blues')
plt.title('Confusion Matrix - Random Forest')
plt.show()

# Step 11: Model Evaluation
def evaluate_model(y_true, y_pred):
    """Score predictions against the true labels.

    Calls accuracy_score, precision_score, recall_score and f1_score, each
    with its default settings, on ``(y_true, y_pred)``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``, in that order.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y_true, y_pred) for metric in metric_fns)

# Score each model's test-set predictions with the same four metrics
# (the decision-tree entry uses the hyperparameter-tuned tree)
acc_log, prec_log, rec_log, f1_log = evaluate_model(y_test, y_pred_log)
acc_dt, prec_dt, rec_dt, f1_dt = evaluate_model(y_test, y_pred_dt_tuned)
acc_rf, prec_rf, rec_rf, f1_rf = evaluate_model(y_test, y_pred_rf)

# Report: Logistic, then tuned Decision Tree, then Random Forest
print(f"Logistic: Acc={acc_log:.2f}, Prec={prec_log:.2f}, Rec={rec_log:.2f}, F1={f1_log:.2f}")
print(f"Tuned DT: Acc={acc_dt:.2f}, Prec={prec_dt:.2f}, Rec={rec_dt:.2f}, F1={f1_dt:.2f}")
print(f"Random Forest: Acc={acc_rf:.2f}, Prec={prec_rf:.2f}, Rec={rec_rf:.2f}, F1={f1_rf:.2f}")

# Step 12: Feature Selection (using SelectKBest)
# Keep the 5 features with the highest f_classif scores; the selector is fitted
# on the training split only and then applied to the test split.
selector = SelectKBest(f_classif, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Names of the columns the selector kept
# NOTE: this rebinds `selected_features`, which Step 3 used for the column list.
selected_features = X.columns[selector.get_support()]
print(f"Selected features: {selected_features}")

# Retrain Random Forest on selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_selected, y_train)
y_pred_rf_selected = rf_selected.predict(X_test_selected)
print(f"RF with Selected Features Accuracy: {accuracy_score(y_test, y_pred_rf_selected):.2f}")

# Step 13: Logistic Regression on the selected features, standardized for comparison
# A dedicated scaler is used here: refitting the Step 6 `scaler` would overwrite
# its statistics and leave it inconsistent with X_train_scaled / X_test_scaled.
scaler_selected = StandardScaler()
X_train_selected_scaled = scaler_selected.fit_transform(X_train_selected)
X_test_selected_scaled = scaler_selected.transform(X_test_selected)
lr_selected = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_selected_scaled, y_train)
y_pred_lr_selected = lr_selected.predict(X_test_selected_scaled)
print(f"Logistic with Selected Features Accuracy: {accuracy_score(y_test, y_pred_lr_selected):.2f}")