In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Locations of the two UNSW-NB15 partition files; adjust for your environment.
train_path = '/content/UNSW_NB15_training-set.csv'
test_path = '/content/UNSW_NB15_testing-set.csv'

# Read both partitions, then stack them row-wise under a fresh 0..n-1 index.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
print(df_all.shape)

(257673, 45)


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 257673 non-null  int64  
 1   dur                257673 non-null  float64
 2   proto              257673 non-null  object 
 3   service            257673 non-null  object 
 4   state              257673 non-null  object 
 5   spkts              257673 non-null  int64  
 6   dpkts              257673 non-null  int64  
 7   sbytes             257673 non-null  int64  
 8   dbytes             257673 non-null  int64  
 9   rate               257673 non-null  float64
 10  sttl               257673 non-null  int64  
 11  dttl               257673 non-null  int64  
 12  sload              257673 non-null  float64
 13  dload              257673 non-null  float64
 14  sloss              257673 non-null  int64  
 15  dloss              257673 non-null  int64  
 16  si

In [None]:
# Summary statistics (count/mean/std/quantiles) for every numeric column.
summary_stats = df_all.describe()
print(summary_stats)

                  id            dur          spkts          dpkts  \
count  257673.000000  257673.000000  257673.000000  257673.000000   
mean    72811.823858       1.246715      19.777144      18.514703   
std     48929.917641       5.974305     135.947152     111.985965   
min         1.000000       0.000000       1.000000       0.000000   
25%     32210.000000       0.000008       2.000000       0.000000   
50%     64419.000000       0.004285       4.000000       2.000000   
75%    110923.000000       0.685777      12.000000      10.000000   
max    175341.000000      59.999989   10646.000000   11018.000000   

             sbytes        dbytes          rate           sttl           dttl  \
count  2.576730e+05  2.576730e+05  2.576730e+05  257673.000000  257673.000000   
mean   8.572952e+03  1.438729e+04  9.125391e+04     180.000931      84.754957   
std    1.737739e+05  1.461993e+05  1.603446e+05     102.488268     112.762131   
min    2.400000e+01  0.000000e+00  0.000000e+00       

In [None]:
# List every column name of the combined frame (features and targets alike).
column_index = df_all.columns
print(column_index)

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')


In [None]:
# Row counts per class: first the binary target, then the attack category
# (the latter is optional and only matters for a multiclass setup).
for target_col in ('label', 'attack_cat'):
    print(df_all[target_col].value_counts())

label
1    164673
0     93000
Name: count, dtype: int64
attack_cat
Normal            93000
Generic           58871
Exploits          44525
Fuzzers           24246
DoS               16353
Reconnaissance    13987
Analysis           2677
Backdoor           2329
Shellcode          1511
Worms               174
Name: count, dtype: int64


In [None]:
# Binary target: 0/1 flag from the 'label' column.
y_all = df_all['label']

# Everything except the two target columns and 'id' is kept as a feature
# ('id' is presumably just a row identifier -- confirm against the dataset docs).
cols_to_drop = ['id', 'attack_cat', 'label']
X_all = df_all.drop(cols_to_drop, axis=1)


In [None]:
# Distinct connection-state strings across the combined frame, in order of first appearance.
df_all.loc[:, 'state'].unique()

array(['FIN', 'INT', 'CON', 'ECO', 'REQ', 'RST', 'PAR', 'URN', 'no',
       'ACC', 'CLO'], dtype=object)

In [None]:
# Distinct service strings in the training file only (not the combined frame).
service_train = df_train['service']
service_train.unique()

array(['-', 'ftp', 'smtp', 'snmp', 'http', 'ftp-data', 'dns', 'ssh',
       'radius', 'pop3', 'dhcp', 'ssl', 'irc'], dtype=object)

In [None]:
# Distinct protocol strings in the training file only (not the combined frame).
df_train.loc[:, 'proto'].unique()

array(['tcp', 'udp', 'arp', 'ospf', 'icmp', 'igmp', 'rtp', 'ddp',
       'ipv6-frag', 'cftp', 'wsn', 'pvp', 'wb-expak', 'mtp', 'pri-enc',
       'sat-mon', 'cphb', 'sun-nd', 'iso-ip', 'xtp', 'il', 'unas',
       'mfe-nsp', '3pc', 'ipv6-route', 'idrp', 'bna', 'swipe',
       'kryptolan', 'cpnx', 'rsvp', 'wb-mon', 'vmtp', 'ib', 'dgp',
       'eigrp', 'ax.25', 'gmtp', 'pnni', 'sep', 'pgm', 'idpr-cmtp',
       'zero', 'rvd', 'mobile', 'narp', 'fc', 'pipe', 'ipcomp', 'ipv6-no',
       'sat-expak', 'ipv6-opts', 'snp', 'ipcv', 'br-sat-mon', 'ttp',
       'tcf', 'nsfnet-igp', 'sprite-rpc', 'aes-sp3-d', 'sccopmce', 'sctp',
       'qnx', 'scps', 'etherip', 'aris', 'pim', 'compaq-peer', 'vrrp',
       'iatp', 'stp', 'l2tp', 'srp', 'sm', 'isis', 'smp', 'fire', 'ptp',
       'crtp', 'sps', 'merit-inp', 'idpr', 'skip', 'any', 'larp', 'ipip',
       'micp', 'encap', 'ifmp', 'tp++', 'a/n', 'ipv6', 'i-nlsp',
       'ipx-n-ip', 'sdrp', 'tlsp', 'gre', 'mhrp', 'ddx', 'ippc', 'visa',
       'secure-vmtp', 

In [None]:
# Explicit category vocabularies for the three string-valued columns.
# A value's position in its list is the integer code the OrdinalEncoder
# below assigns to it (e.g. 'tcp' -> 0, '-' -> 0, 'no' -> 0, 'FIN' -> 1).
# NOTE(review): the service and proto lists mirror the df_train unique()
# outputs above, while the encoder is later fit on the combined frame; a
# value that occurs only in the testing file would make the encoder raise
# with its default settings -- confirm the lists cover df_all.
service_categories =['-', 'ftp', 'smtp', 'snmp', 'http', 'ftp-data', 'dns', 'ssh',
       'radius', 'pop3', 'dhcp', 'ssl', 'irc']

proto_categories = ['tcp', 'udp', 'arp', 'ospf', 'icmp', 'igmp', 'rtp', 'ddp',
       'ipv6-frag', 'cftp', 'wsn', 'pvp', 'wb-expak', 'mtp', 'pri-enc',
       'sat-mon', 'cphb', 'sun-nd', 'iso-ip', 'xtp', 'il', 'unas',
       'mfe-nsp', '3pc', 'ipv6-route', 'idrp', 'bna', 'swipe',
       'kryptolan', 'cpnx', 'rsvp', 'wb-mon', 'vmtp', 'ib', 'dgp',
       'eigrp', 'ax.25', 'gmtp', 'pnni', 'sep', 'pgm', 'idpr-cmtp',
       'zero', 'rvd', 'mobile', 'narp', 'fc', 'pipe', 'ipcomp', 'ipv6-no',
       'sat-expak', 'ipv6-opts', 'snp', 'ipcv', 'br-sat-mon', 'ttp',
       'tcf', 'nsfnet-igp', 'sprite-rpc', 'aes-sp3-d', 'sccopmce', 'sctp',
       'qnx', 'scps', 'etherip', 'aris', 'pim', 'compaq-peer', 'vrrp',
       'iatp', 'stp', 'l2tp', 'srp', 'sm', 'isis', 'smp', 'fire', 'ptp',
       'crtp', 'sps', 'merit-inp', 'idpr', 'skip', 'any', 'larp', 'ipip',
       'micp', 'encap', 'ifmp', 'tp++', 'a/n', 'ipv6', 'i-nlsp',
       'ipx-n-ip', 'sdrp', 'tlsp', 'gre', 'mhrp', 'ddx', 'ippc', 'visa',
       'secure-vmtp', 'uti', 'vines', 'crudp', 'iplt', 'ggp', 'ip',
       'ipnip', 'st2', 'argus', 'bbn-rcc', 'egp', 'emcon', 'igp', 'nvp',
       'pup', 'xnet', 'chaos', 'mux', 'dcn', 'hmp', 'prm', 'trunk-1',
       'xns-idp', 'leaf-1', 'leaf-2', 'rdp', 'irtp', 'iso-tp4', 'netblt',
       'trunk-2', 'cbt']
# Same 11 values as df_all['state'].unique() above, but reordered so that
# 'no' comes first (code 0).
state_categories = ['no', 'FIN', 'INT', 'CON', 'ECO', 'REQ', 'RST', 'PAR', 'URN', 'CLO', 'ACC']
# Columns to encode; the order here must match the order of the category
# lists handed to the encoder.
categorical_cols = ['proto', 'service', 'state']

In [None]:
# One vocabulary per categorical column, emitted in categorical_cols order
# (proto, service, state) so the encoder pairs each list with its column.
vocab_by_column = {
    'proto': proto_categories,
    'service': service_categories,
    'state': state_categories,
}
all_categories = [vocab_by_column[column] for column in categorical_cols]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each categorical string with its position in the matching list of
# all_categories.
# The encoder reads the untouched string columns from df_all instead of from
# X_all: once this cell has run, X_all[categorical_cols] holds float codes,
# so fitting on X_all again would fail on "unknown" categories. Reading from
# df_all makes the cell safe to re-run and yields the same codes.
oe = OrdinalEncoder(categories=all_categories)
X_all[categorical_cols] = oe.fit_transform(df_all[categorical_cols])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean / unit variance.
# Statistics are learned from all rows of X_all, then applied to the same rows.
scaler = StandardScaler().fit(X_all)
X_all_scaled = scaler.transform(X_all)

In [None]:
# Sanity check: the scaled matrix should keep one row per record and one column per feature.
scaled_shape = X_all_scaled.shape
print("All features scaled, shape:", scaled_shape)

All features scaled, shape: (257673, 42)


In [None]:
# Repeats the shape check from the previous cell.
n_rows_scaled, n_cols_scaled = X_all_scaled.shape
print("All features scaled, shape:", (n_rows_scaled, n_cols_scaled))

All features scaled, shape: (257673, 42)


In [None]:
# Show which integer codes actually occur in each encoded column.
print("Unique values per categorical after encoding:")
for col in categorical_cols:
    encoded_codes = X_all[col].unique()
    print(col, sorted(encoded_codes))

Unique values per categorical after encoding:
proto [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.

In [None]:
# Preview the first five rows of the encoded feature frame.
X_all.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,0.121478,0.0,0.0,1.0,6,4,258,172,74.08749,252,...,1,1,1,1,0,0,0,1,1,0
1,0.649902,0.0,0.0,1.0,14,38,734,42014,78.473372,62,...,1,1,1,2,0,0,0,1,6,0
2,1.623129,0.0,0.0,1.0,8,16,364,13186,14.170161,62,...,2,1,1,3,0,0,0,2,6,0
3,1.681642,0.0,1.0,1.0,12,12,628,770,13.677108,62,...,2,1,1,3,1,1,0,2,1,0
4,0.449454,0.0,0.0,1.0,10,6,534,268,33.373826,254,...,2,2,1,40,0,0,0,2,39,0


In [None]:
# Column dtypes and non-null counts of the feature frame after encoding
# (the three categorical columns now show up as float64).
X_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 42 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dur                257673 non-null  float64
 1   proto              257673 non-null  float64
 2   service            257673 non-null  float64
 3   state              257673 non-null  float64
 4   spkts              257673 non-null  int64  
 5   dpkts              257673 non-null  int64  
 6   sbytes             257673 non-null  int64  
 7   dbytes             257673 non-null  int64  
 8   rate               257673 non-null  float64
 9   sttl               257673 non-null  int64  
 10  dttl               257673 non-null  int64  
 11  sload              257673 non-null  float64
 12  dload              257673 non-null  float64
 13  sloss              257673 non-null  int64  
 14  dloss              257673 non-null  int64  
 15  sinpkt             257673 non-null  float64
 16  di

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing (an 80/20 split).
# stratify=y_all keeps the normal/attack ratio the same in both partitions;
# it preserves the class proportions, it does not balance the classes.
# The split is performed on the unscaled feature frame so that the scaler
# below never sees test rows: fitting it on all rows before splitting leaks
# test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

# Learn mean/std from the training partition only, then apply them to both.
# This rebinds `scaler` to the train-only fit, which is the one to reuse on
# any new data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Confirm the class proportions and partition sizes:
import numpy as np

print("Train label distribution:", np.bincount(y_train))
print("Test label distribution:", np.bincount(y_test))
print("Total samples:", len(y_all))
print("Train:", len(y_train), "Test:", len(y_test))

Train label distribution: [ 74400 131738]
Test label distribution: [18600 32935]
Total samples: 257673
Train: 206138 Test: 51535


In [None]:
# Save each partition of the split as <name>.npy in the working directory.
for split_name, split_values in [
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
]:
    np.save(f'{split_name}.npy', split_values)

In [None]:
import joblib

# Save the same four partitions a second time, in joblib format.
for split_name, split_values in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
):
    joblib.dump(split_values, f'{split_name}.joblib')
# Kept as the final expression so the cell still echoes the written file name.
joblib.dump(y_test, 'y_test.joblib')

['y_test.joblib']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# Random forest: 100 trees, fixed seed, class weights set to 'balanced'.
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Persist the fitted model; the dump call stays last so the cell echoes the file name.
joblib.dump(rf, 'rf_model.joblib')

['rf_model.joblib']

In [None]:
# Hard class predictions plus the probability of class 1 (second predict_proba column).
rf_class_probs = rf.predict_proba(X_test)
y_proba = rf_class_probs[:, 1]
y_pred = rf.predict(X_test)

In [None]:
# Evaluate the random forest on the held-out partition.
print("Random Forest Results")
rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred, digits=4)
rf_auc = roc_auc_score(y_test, y_proba)
print(rf_confusion)
print(rf_report)
print("ROC AUC:", rf_auc)

Random Forest Results
[[17455  1145]
 [ 1278 31657]]
              precision    recall  f1-score   support

           0     0.9318    0.9384    0.9351     18600
           1     0.9651    0.9612    0.9631     32935

    accuracy                         0.9530     51535
   macro avg     0.9484    0.9498    0.9491     51535
weighted avg     0.9531    0.9530    0.9530     51535

ROC AUC: 0.9924736659533033


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random-forest predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[17455  1145]
 [ 1278 31657]]


In [None]:
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, roc_auc_score
)

# Full random-forest scorecard: accuracy, confusion matrix, per-class report, ROC AUC.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred, digits=4)
rf_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_confusion)
print(rf_report)
print("ROC AUC:", rf_auc)

Accuracy: 0.9529834093334627
Confusion Matrix:
 [[17455  1145]
 [ 1278 31657]]
              precision    recall  f1-score   support

           0     0.9318    0.9384    0.9351     18600
           1     0.9651    0.9612    0.9631     32935

    accuracy                         0.9530     51535
   macro avg     0.9484    0.9498    0.9491     51535
weighted avg     0.9531    0.9530    0.9530     51535

ROC AUC: 0.9924736659533033


In [None]:
from sklearn.svm import SVC

# Support Vector Machine (SVM) with default kernel settings.
# probability=True is requested because a later cell calls predict_proba.
svm = SVC(random_state=42, probability=True).fit(X_train, y_train)

# Persist the fitted model; the dump call stays last so the cell echoes the file name.
joblib.dump(svm, 'svm_model.joblib')

In [None]:
# Hard class predictions plus the probability of class 1 (second predict_proba column).
svm_class_probs = svm.predict_proba(X_test)
y_proba_svm = svm_class_probs[:, 1]
y_pred_svm = svm.predict(X_test)

In [None]:
# Evaluate the SVM on the held-out partition.
print("SVM Results")
for svm_summary in (
    confusion_matrix(y_test, y_pred_svm),
    classification_report(y_test, y_pred_svm, digits=4),
):
    print(svm_summary)
print("ROC AUC:", roc_auc_score(y_test, y_proba_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Persist the fitted model; the dump call stays last so the cell echoes the file name.
joblib.dump(knn, 'knn_model.joblib')

In [None]:
# Hard class predictions plus the probability of class 1 (second predict_proba column).
knn_class_probs = knn.predict_proba(X_test)
y_proba_knn = knn_class_probs[:, 1]
y_pred_knn = knn.predict(X_test)

In [None]:
# Evaluate the KNN classifier on the held-out partition.
print("KNN Results")
knn_confusion = confusion_matrix(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn, digits=4)
print(knn_confusion)
print(knn_report)
knn_auc = roc_auc_score(y_test, y_proba_knn)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("ROC AUC:", knn_auc)
print("Accuracy:", knn_accuracy)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# The LSTM must be trained and evaluated on the same stratified UNSW-NB15
# split as the classical models above. Do NOT regenerate data here: binding
# X_train / X_test / y_train / y_test to random placeholder arrays makes
# every LSTM metric below meaningless (the model is fit and scored on noise).
#
# The existing split is only converted to plain NumPy arrays (the reshape in
# the next cell requires ndarray inputs) and sanity-checked; nothing is re-split.
X_train, X_test = np.asarray(X_train), np.asarray(X_test)
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

assert X_train.ndim == 2 and X_test.ndim == 2, "feature matrices must be 2-D"
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"
assert len(X_train) == len(y_train), "X_train and y_train are misaligned"
assert len(X_test) == len(y_test), "X_test and y_test are misaligned"

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input
from tensorflow.keras.utils import to_categorical

# One-hot encode the binary labels to match the 2-unit softmax output layer.
y_train_lstm, y_test_lstm = to_categorical(y_train), to_categorical(y_test)

# The LSTM expects (samples, timesteps, features); each row becomes a
# sequence of length 1.
n_train_rows, n_features = X_train.shape
X_train_lstm = X_train.reshape(n_train_rows, 1, n_features)
n_test_rows, n_test_features = X_test.shape
X_test_lstm = X_test.reshape(n_test_rows, 1, n_test_features)

In [None]:
# LSTM
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable across Restart & Run All
# (bit-exact determinism on GPU is still not guaranteed).
tf.keras.utils.set_random_seed(42)

# Single LSTM layer over a length-1 sequence, followed by a small dense head.
lstm_model = Sequential([
    Input(shape=(1, X_train.shape[1])),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')  # 2 classes
])

# categorical_crossentropy pairs with the one-hot labels and softmax output.
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train; validation_split holds out the last 20% of the training rows for
# per-epoch validation.
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=40, batch_size=128, validation_split=0.2)

Epoch 1/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.4688 - loss: 0.6928 - val_accuracy: 0.6250 - val_loss: 0.6851
Epoch 2/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.5312 - loss: 0.6889 - val_accuracy: 0.6250 - val_loss: 0.6845
Epoch 3/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.5625 - loss: 0.6885 - val_accuracy: 0.6250 - val_loss: 0.6836
Epoch 4/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.5156 - loss: 0.6851 - val_accuracy: 0.6250 - val_loss: 0.6828
Epoch 5/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.5625 - loss: 0.6850 - val_accuracy: 0.6250 - val_loss: 0.6821
Epoch 6/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.5156 - loss: 0.6855 - val_accuracy: 0.6250 - val_loss: 0.6812
Epoch 7/40
[1m1/1[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79d51ec44590>

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Predict: softmax outputs, one column per class.
y_pred_lstm = lstm_model.predict(X_test_lstm)
# Probability of class 1, and the arg-max class per row.
y_proba_lstm = y_pred_lstm[:, 1]
y_pred_lstm_classes = y_pred_lstm.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test, y_pred_lstm_classes)
lstm_confusion = confusion_matrix(y_test, y_pred_lstm_classes)
lstm_report = classification_report(y_test, y_pred_lstm_classes, digits=4)
lstm_auc = roc_auc_score(y_test, y_proba_lstm)

print("\nLSTM Results")
print("Accuracy:", lstm_accuracy)
print("Confusion Matrix:\n", lstm_confusion)
print(lstm_report)
print("ROC AUC:", lstm_auc)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step

LSTM Results
Accuracy: 0.45
Confusion Matrix:
 [[2 9]
 [2 7]]
              precision    recall  f1-score   support

           0     0.5000    0.1818    0.2667        11
           1     0.4375    0.7778    0.5600         9

    accuracy                         0.4500        20
   macro avg     0.4688    0.4798    0.4133        20
weighted avg     0.4719    0.4500    0.3987        20

ROC AUC: 0.6161616161616161
