In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import plot_tree
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Step 1: Load the data
# Source: https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15
# The Kaggle download ships 'UNSW_NB15_training-set.csv'. This cell reads a
# Parquet file of the same base name -- presumably a converted copy (TODO
# confirm); swap in pd.read_csv if only the CSV is available.
DATA_PATH = 'UNSW_NB15_training-set.parquet'
df = pd.read_parquet(DATA_PATH)
print(f"Original shape: {df.shape}")

In [None]:
# Step 2: EDA - Basic overview
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Summary statistics, one row per numeric column
df.describe().transpose()

In [None]:
# Frequency of each protocol value
df['proto'].value_counts()

In [None]:
# Handle categorical columns: keep only a few levels of each for simplicity.
# The three filters are combined into one boolean mask; the rows kept are the
# same as when the filters are applied one after another.
keep_rows = (
    df['proto'].isin(['tcp', 'udp', 'arp'])
    & df['service'].isin(['http', 'ftp', 'ssh', '-'])
    & df['state'].isin(['INT', 'FIN', 'CON'])
)
df = df[keep_rows]

# Subset to 9 key features plus the target column 'label'
selected_features = ['dur', 'proto', 'service', 'state', 'spkts', 'sbytes', 'rate','dload', 'sloss', 'label']
# .copy() makes df_subset an independent frame, so later column assignments
# on it do not act on a slice of df (SettingWithCopyWarning).
df_subset = df[selected_features].copy()
print(f"Subset shape: {df_subset.shape}")

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Number of distinct protocol values left after filtering
df['proto'].nunique()

In [None]:
# Protocol frequencies after filtering
df['proto'].value_counts()

In [None]:
# (rows, columns) of the feature subset
df_subset.shape

In [None]:
# Box plot of source bytes. The column is passed as x= explicitly: how a bare
# positional Series is interpreted differs between seaborn releases.
sns.boxplot(x=df_subset['sbytes'])
plt.title('Distribution of sbytes')
plt.show()

In [None]:
# Source bytes against source packet count, labelled so the figure stands alone
plt.scatter(df_subset['sbytes'], df_subset['spkts'])
plt.xlabel('sbytes')
plt.ylabel('spkts')
plt.title('spkts vs. sbytes')
plt.show()

In [None]:
# Protocol frequencies within the feature subset
df_subset['proto'].value_counts()

In [None]:
# Row counts per protocol, split by label
sns.countplot(data=df_subset, x='proto', hue='label')

In [None]:
# Assume a detailed exploratory analysis has already been carried out at this point.

In [None]:
# Distinct protocol values remaining in df
df['proto'].unique()

In [None]:
# Column dtypes and non-null counts of the filtered frame
df.info()

In [None]:
# Distinct service values in the feature subset
df_subset['service'].unique()

In [None]:
# Absolute class counts of the target
df_subset['label'].value_counts()

In [None]:
# Class distribution as fractions (target: label, 0=normal, 1=attack)
label_share = df['label'].value_counts(normalize=True)
print(label_share)

In [None]:
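# Baseline accuracy: the share of the majority class shown above, i.e. the accuracy of always predicting the most frequent label.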

In [None]:
# Column dtypes and non-null counts of the feature subset
df_subset.info()

In [None]:
# Cast the three categorical columns to plain object dtype.
# astype returns a new frame that is rebound to df_subset, which avoids
# assigning columns into what may be a slice of df (SettingWithCopyWarning).
# NOTE(review): presumably these load as pandas 'category' columns and the cast
# keeps get_dummies from emitting columns for filtered-out levels -- confirm.
df_subset = df_subset.astype({'proto': 'object', 'service': 'object', 'state': 'object'})

In [None]:
# One-hot encode the categorical features; drop_first removes the first level
# of each column so the remaining dummies are not perfectly collinear.
categorical_cols = ['proto', 'service', 'state']
df_encoded = pd.get_dummies(
    df_subset,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Column names after one-hot encoding
df_encoded.columns

In [None]:
# Feature matrix: every encoded column except the target.
# Deriving the columns (rather than listing each dummy by name) keeps this cell
# working when the category filters change which dummy columns exist.
# drop() returns a new frame, so x is independent of df_encoded.
x = df_encoded.drop(columns=['label'])

In [None]:
# First two rows of the feature matrix
x.head(2)

In [None]:
# Target vector (0=normal, 1=attack)
y = df_encoded['label']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio; the seed fixes which rows land in each part.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# First two rows of the encoded frame
df_encoded.head(2)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with the solver iteration cap raised to 500.
# NOTE(review): the features are not standardised before fitting -- confirm the
# solver converges (warnings are silenced at the top of the notebook).
lg = LogisticRegression(max_iter=500)

In [None]:
# Fit on the training split; fit() returns the estimator itself, so lg_model
# and lg refer to the same fitted object.
lg_model = lg.fit(xtrain,ytrain)

In [None]:
# Fitted coefficients, one per feature column of xtrain
lg_model.coef_

In [None]:
# Check ROC AUC
# Compute the positive-class probabilities here so this cell runs on a fresh
# kernel: y_prob_log was previously only assigned in a later cell.
y_prob_log = lg_model.predict_proba(xtest)[:, 1]
auc_log = roc_auc_score(ytest, y_prob_log)
print(f"Logistic Regression AUC: {auc_log:.2f}")

# ROC Curve
fpr, tpr, _ = roc_curve(ytest, y_prob_log)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'Logistic (AUC = {auc_log:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend()
plt.show()

In [None]:
# Hard class predictions for the test split
lg_pred = lg_model.predict(xtest)

In [None]:
# First five test-set predictions (reuses the array computed above)
lg_pred[:5]

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for each test row
y_prob_log=lg_model.predict_proba(xtest)[:, 1]

In [None]:
# (rows, columns) of the test features
xtest.shape

In [None]:
# Actual labels (rows) against logistic-regression predictions (columns)
pd.crosstab(ytest,lg_pred)

In [None]:
# Same counts as a raw array: rows are true labels, columns are predictions
confusion_matrix(ytest,lg_pred)

In [None]:
# Accuracy = correct predictions / all predictions. Computed from the
# predictions rather than hand-typed counts, so it stays right when the data,
# split or model changes.
acc = accuracy_score(ytest, lg_pred)

In [None]:
# Share of test rows the logistic model classified correctly
accuracy_score(ytest,lg_pred)

In [None]:
# Precision for the attack class: TP / (TP + FP), computed from the
# predictions instead of counts copied from one run's confusion matrix.
precision_score(ytest, lg_pred)

In [None]:
# Recall for the attack class: TP / (TP + FN), computed from the predictions
# instead of counts copied from one run's confusion matrix.
recall_score(ytest, lg_pred)

In [None]:
# Confusion matrix heat map for the logistic model
cm_log = confusion_matrix(ytest, lg_pred)
disp_log = ConfusionMatrixDisplay(
    confusion_matrix=cm_log,
    display_labels=['Normal', 'Attack'],
)
disp_log.plot(cmap='Blues')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
# Step 8: Build Decision Tree model
# random_state is fixed, as for the other tree models in this notebook, so the
# fitted tree and every metric derived from it are reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(xtrain, ytrain)  # No scaling needed for trees

In [None]:
# Predict class labels for the test split with the unconstrained tree
y_pred_dt = dt_model.predict(xtest)

In [None]:
# Actual labels (rows) against decision-tree predictions (columns)
pd.crosstab(ytest,y_pred_dt)

In [None]:
# Precision for the attack class: TP / (TP + FP), computed from the
# predictions instead of counts copied from one run's confusion matrix.
precision_score(ytest, y_pred_dt)

In [None]:
# Confusion matrix heat map for the decision tree
cm_dt = confusion_matrix(ytest, y_pred_dt)
disp_dt = ConfusionMatrixDisplay(
    confusion_matrix=cm_dt,
    display_labels=['Normal', 'Attack'],
)
disp_dt.plot(cmap='Blues')
plt.title('Confusion Matrix - Decision Tree')
plt.show()

In [None]:
# Decision-tree precision for the attack class, TP / (TP + FP), computed from
# the predictions rather than hand-typed counts.
precision_score(ytest, y_pred_dt)

In [None]:
# Decision-tree accuracy, (TP + TN) / all test rows, computed from the
# predictions rather than hand-typed counts.
accuracy_score(ytest, y_pred_dt)

In [None]:
# Visualize Decision Tree (limit depth for visibility)
# max_depth=3 draws only the top levels; without it plot_tree renders every
# node of the unconstrained tree, which is slow and unreadable.
plt.figure(figsize=(12, 8))
plot_tree(dt_model, max_depth=3, feature_names=x.columns, class_names=['Normal', 'Attack'], filled=True)
plt.title('Decision Tree Visualization (Partial)')
plt.show()

In [None]:
# Step 9: Decision Tree with Hyperparameters and Pruning
# Hyperparameters set here: max_depth, max_leaf_nodes, min_samples_leaf
dt_tuned = DecisionTreeClassifier(
    max_depth=6,
    max_leaf_nodes=10,
    min_samples_leaf=1,
    random_state=42,
)
dt_tuned.fit(xtrain, ytrain)

# Score the constrained tree on the held-out split
y_pred_dt_tuned = dt_tuned.predict(xtest)
print(f"Tuned Decision Tree Accuracy: {accuracy_score(ytest, y_pred_dt_tuned):.2f}")

In [None]:
# Actual labels (rows) against tuned-tree predictions (columns)
pd.crosstab(ytest,y_pred_dt_tuned)

In [None]:
# Draw the top three levels of the tuned tree
plt.figure(figsize=(12, 8))
plot_tree(
    dt_tuned,
    max_depth=3,
    feature_names=x.columns,
    class_names=['Normal', 'Attack'],
    filled=True,
)
plt.title('Decision Tree Visualization (Partial)')
plt.show()

In [None]:
pip install matplotlib==3.4.3

In [None]:
from dtreeplt import dtreeplt

In [None]:
# Wrap the tuned tree in a dtreeplt plotter, passing feature and class names
dtree_reg = dtreeplt(
    model=dt_tuned,
    feature_names=xtrain.columns,
    target_names=np.array(['Normal', 'Attack']),
)

In [None]:
# Render the dtreeplt visualisation of the tuned tree
dtree_reg.view()

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Tuned tree again, top three levels only, this time with rounded node boxes
tree_plot_options = dict(
    max_depth=3,
    feature_names=x.columns,
    class_names=['Normal', 'Attack'],
    filled=True,
    rounded=True,
)
plt.figure(figsize=(12, 8))
plot_tree(dt_tuned, **tree_plot_options)
plt.title('Decision Tree Visualization (Partial, max_depth=3)')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Builds an unfitted tree and assigns it to nothing, so this cell only displays
# the estimator's repr (an example of one hyperparameter combination).
DecisionTreeClassifier(max_depth=2,max_leaf_nodes=5,min_samples_leaf=1)

In [None]:
# Exhaustive search over 5 x 2 x 2 = 20 hyperparameter combinations, each
# scored with GridSearchCV's default cross-validation.
params = {'max_depth' : [2,4,6,8,10],
         'max_leaf_nodes' : [5,10],
         'min_samples_leaf' : [1,2]}

# Fixed seed so tie-breaking between equally good splits, and therefore the
# selected best_params_, is reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
gcv = GridSearchCV(estimator=clf, param_grid=params)
gcv.fit(xtrain,ytrain)

In [None]:
# Hyperparameter combination with the best mean cross-validated score
gcv.best_params_

In [None]:
# Pruning using cost complexity
# The pruning path is computed with the same seed as the pruned tree below, so
# the alphas describe the tree that is actually grown. The method fits its own
# tree, so no previously fitted model is needed.
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(xtrain, ytrain)
ccp_alphas = path.ccp_alphas

# Alphas are returned in increasing order and the largest prunes the tree down
# to its root, so take the second largest; fall back to the only value when the
# path has a single entry.
pruning_alpha = ccp_alphas[-2] if len(ccp_alphas) > 1 else ccp_alphas[-1]
dt_pruned = DecisionTreeClassifier(ccp_alpha=pruning_alpha, random_state=42)  # Non-zero alpha for pruning
dt_pruned.fit(xtrain, ytrain)
y_pred_dt_pruned = dt_pruned.predict(xtest)
print(f"Pruned Decision Tree Accuracy: {accuracy_score(ytest, y_pred_dt_pruned):.2f}")

In [None]:
# Step 10: Build Random Forest model (100 trees, fixed seed)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(xtrain, ytrain)

# Class predictions for the test split
y_pred_rf = rf_model.predict(xtest)

# Confusion matrix heat map
cm_rf = confusion_matrix(ytest, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(
    confusion_matrix=cm_rf,
    display_labels=['Normal', 'Attack'],
)
disp_rf.plot(cmap='Blues')
plt.title('Confusion Matrix - Random Forest')
plt.show()

In [None]:
# Random-forest precision for the attack class, TP / (TP + FP), computed from
# the predictions instead of counts copied from one run's confusion matrix.
precision_score(ytest, y_pred_rf)

In [None]:
# Importance score per feature, in the column order of xtrain
rf_model.feature_importances_

In [None]:
# Feature names, to read alongside the importance array above
xtrain.columns

In [None]:
# Data Preparation / Hyper parameter tuning / Model Evaluation

In [None]:
# Data Pre-processing --> Data Transformation [Standardisation, Normalisation]

# Sampling --> Train/Test, Cross Validation [KFold CV, Repeated K-fold CV, Stratified CV]

# Feature Selection --> P-Value,Step function, Lasso, AUC(ROC), K-Best,Chi-square,RFE

# Feature Extraction --> PCA, LDA

# Handling Class Imbalances --> Undersampling, Oversampling, SMOTE

In [None]:
# Standardisation : (xi-mean(x))/std(x) 
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Standardise the features. The scaler learns mean and std from the training
# split only and applies those same statistics to the test split, so nothing
# about the test data leaks into preprocessing.
# The split variables in this notebook are xtrain/xtest; X_train/X_test are
# never defined.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(xtrain)
X_test_scaled = scaler.transform(xtest)

In [None]:
# Normalisation (min-max scaling): (xi - min(x)) / (max(x) - min(x)) --> values fall in the range [0, 1]

In [None]:
# Kfold CV --> 3 folds (n_splits = 3)
# 10,000 Records

In [None]:
# 1st chunk --> 3500

# 2nd chunk --> 3500

# 3rd chunk --> 3000

In [None]:
# 1st model --> use 1st and 2nd chunk for training and test on 3rd chunk --> accuracy (80%)

# 2nd model --> use 1st and 3rd chunk for training and test on 2nd chunk --> accuracy (70%)

# 3rd model --> use 3rd and 2nd chunk for training and test on 1st chunk --> accuracy (75%)

In [None]:
# Report the mean of the three fold accuracies, here (80 + 70 + 75) / 3 = 75%, as the algorithm's estimated accuracy on the dataset.

In [None]:
# Feature Selection Techniques

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE

In [None]:
# Base estimator for recursive feature elimination (iteration cap raised to 1000)
lr = LogisticRegression(max_iter=1000)

In [None]:
# Recursive feature elimination down to the five strongest features, fitted on
# the training split only
rfe_lr = RFE(estimator=lr, n_features_to_select=5)
result = rfe_lr.fit(xtrain, ytrain)

In [None]:
# RFE rank per feature in xtrain column order; selected features have rank 1
result.ranking_

In [None]:
# Chi2 with Kbest

In [None]:
# Score each feature against the label with the chi-squared test and keep the
# top five. Fitted on the training split only, like the RFE cell, so the
# held-out rows play no part in feature selection.
# chi2 requires non-negative feature values.
model = SelectKBest(score_func=chi2,k=5)
result_kbest = model.fit(xtrain,ytrain)

In [None]:
# Chi-squared score per feature, in input column order
result_kbest.scores_

In [None]:
# Same scores, one per line, rounded to two decimals for readability
for feature_score in result_kbest.scores_:
    rounded_score = round(feature_score, 2)
    print(rounded_score)

In [None]:
# Feature names, to read alongside the scores printed above
xtrain.columns

In [None]:
# Boolean mask over the input columns: True for the k=5 features that were kept
result_kbest.get_support()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extra-trees ensemble used for its feature importances. The seed is fixed so
# the importances are reproducible between runs, in line with the other
# tree-based models in this notebook.
model = ExtraTreesClassifier(random_state=42)

In [None]:
# Fit on the full feature matrix x and target y (not just the training split)
model.fit(x,y)

In [None]:
# Importance score per feature, in the column order of x
model.feature_importances_

In [None]:
# Handling Class imbalance

# Under Sampling

# Over Sampling

# SMOTE --> Synthetic Minority Over Sampling

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# First six rows whose label is 1
is_attack = df_encoded['label'] == 1
df_encoded.loc[is_attack].head(6)

In [None]:
# SMOTE creates synthetic minority-class rows by interpolating between
# neighbouring minority samples; the seed makes the synthetic rows repeatable.
s = SMOTE(random_state=42)

In [None]:
# Oversample the training split only, so the test split keeps its real class
# ratio. fit_resample is the current imbalanced-learn API (fit_sample was
# removed), and the split variables in this notebook are xtrain/ytrain.
x_train_smote, y_train_smote = s.fit_resample(xtrain, ytrain)

In [None]:
# Evaluation of Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Empty list that will hold (short name, estimator) pairs to compare
models = []

In [None]:
# Candidate algorithms as (short name, unfitted estimator) pairs.
# The list is assigned in one statement rather than appended to, so re-running
# this cell does not register every model a second time.
# The tree and the forest get a fixed seed so their scores are repeatable.
models = [
    ("LR", LogisticRegression()),
    ("Tree", DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10, random_state=42)),
    ("SVM", SVC()),
    ("RF", RandomForestClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("NB", GaussianNB()),
    ("XGB", XGBClassifier()),
]

In [None]:
# Show the registered (name, estimator) pairs
models

In [None]:
# Collectors for the per-model fold scores and the matching model names
results, names = [], []

In [None]:
from sklearn.model_selection import KFold , cross_val_score, GridSearchCV

In [None]:
from sklearn.model_selection import StratifiedKFold

# Start from empty collectors so re-running this cell does not append a second
# set of scores.
results = []
names = []

# Shuffled, stratified folds. Plain KFold without shuffling cuts the rows in
# stored order, so folds can end up with very different class mixes (the
# scores recorded below vary by about 40 points between folds).
# Stratifying keeps the class ratio equal in every fold.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models:
    # One precision score per fold; the same folds are used for every model
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring="precision")
    results.append(cv_results)
    names.append(name)

    # Mean and standard deviation across folds, in percent
    outcome = "%s: %f (%f)" % (name,cv_results.mean()*100, cv_results.std()*100)
    print(outcome)

LR: 49.527607 (40.489871)
Tree: 53.478271 (43.925599)


In [None]:
# One box per model showing the spread of its cross-validation scores
fig, axis = plt.subplots()
axis.boxplot(results)
axis.set_xticklabels(names)
plt.show()