In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("programmer3/unsw-nb15-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/programmer3/unsw-nb15-dataset?dataset_version_number=2...


100%|██████████| 1.11M/1.11M [00:00<00:00, 2.16MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/programmer3/unsw-nb15-dataset/versions/2





In [2]:
import os, glob
import pandas as pd

# Find all relevant CSV files in the downloaded dataset folder
csvs = glob.glob('/root/.cache/kagglehub/datasets/programmer3/unsw-nb15-dataset/versions/2/**/*.csv', recursive=True)
print(len(csvs), 'CSV files found')
for p in csvs[:10]:
    print(p)

# Load and preview the first CSV file
df = pd.read_csv(csvs[0])
print('Shape:', df.shape)
print('Columns:', list(df.columns)[:20], '...')
print(df.head(3))


1 CSV files found
/root/.cache/kagglehub/datasets/programmer3/unsw-nb15-dataset/versions/2/UNSW_NB15.csv
Shape: (7465, 44)
Columns: ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin'] ...
          dur proto service state  spkts  dpkts  sbytes  dbytes        rate  \
0  374.540119   tcp       -   CON     70     38   45337   84540  154.791136   
1  950.714306   udp     ftp   CON     31     70   15586   63543  229.870435   
2  731.993942   tcp    http   INT     80     45   95651   47001  549.325669   

   sttl  ...  ct_ftp_cmd  ct_srv_dst  ct_dst_ltm  ct_src_ ltm  \
0   111  ...           0          71          94            7   
1   209  ...           4          81           4           86   
2   186  ...           0          48          83            4   

   ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm  is_sm_ips_ports  \
0                95              

In [3]:
# Identify possible label and category columns
label_col_candidates = [c for c in df.columns if c.lower() in ['label', 'is_attack', 'class', 'binary_label']]
cat_col_candidates = [c for c in df.columns if 'attack' in c.lower() and 'cat' in c.lower()]
print('Binary candidates:', label_col_candidates)
print('Attack category candidates:', cat_col_candidates)

BINARY_COL = label_col_candidates[0]
ATTACK_COL = cat_col_candidates[0] if cat_col_candidates else None

# Normalize binary label to 0/1
df['y_bin'] = (df[BINARY_COL].astype(int) > 0).astype(int)

# Normalize multiclass (optional)
if ATTACK_COL:
    df['y_cat'] = df[ATTACK_COL].fillna('Benign').replace({'-': 'Benign'})
print(df[['y_bin']].head())
if ATTACK_COL:
    print(df[['y_cat']].head())


Binary candidates: ['label']
Attack category candidates: ['attack_cat']
   y_bin
0      1
1      1
2      1
3      1
4      1
       y_cat
0  Shellcode
1  Shellcode
2      Worms
3   Analysis
4   Analysis


In [4]:
# Identify pure identifier, network, and timing columns
id_like = [c for c in df.columns if any(k in c.lower() for k in ['id', 'flowid'])]
net_like = [c for c in df.columns if c.lower() in ['srcip', 'dstip', 'sport', 'dsport']]
time_like = [c for c in df.columns if 'time' in c.lower() or c.lower() in ['timestamp', 'stime', 'ltime']]
print('id_like:', id_like)
print('net_like:', net_like)
print('time_like:', time_like)


id_like: []
net_like: []
time_like: []


In [5]:
import numpy as np

# Check for missing values, duplicates, and class balance
na_counts = df.isna().sum().sort_values(ascending=False)
print('Top NA columns:\n', na_counts.head(15))
dup_count = df.duplicated().sum()
print('Duplicate rows:', dup_count)

# Binary class distribution and attack categories
print('y_bin value counts:\n', df['y_bin'].value_counts(dropna=False))
if 'y_cat' in df:
    print('Top attack categories:\n', df['y_cat'].value_counts().head(10))

# Numeric columns summary
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print('Numeric columns:', len(num_cols))
print(df[num_cols].describe().T.head(12))


Top NA columns:
 dur        0
proto      0
service    0
state      0
spkts      0
dpkts      0
sbytes     0
dbytes     0
rate       0
sttl       0
dttl       0
sload      0
dload      0
sloss      0
dloss      0
dtype: int64
Duplicate rows: 0
y_bin value counts:
 y_bin
1    6748
0     717
Name: count, dtype: int64
Top attack categories:
 y_cat
Worms             788
Backdoors         777
Exploits          774
Analysis          765
Generic           763
Reconnaissance    751
DoS               722
Normal            717
Shellcode         704
Fuzzers           704
Name: count, dtype: int64
Numeric columns: 41
         count          mean           std       min           25%  \
dur     7465.0    495.760230    289.081491  0.011635    243.995302   
spkts   7465.0     50.240857     28.528471  1.000000     26.000000   
dpkts   7465.0     49.709444     28.500227  1.000000     25.000000   
sbytes  7465.0  49887.177093  28969.259568  5.000000  24809.000000   
dbytes  7465.0  49924.408707  28703.74

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Drop target columns for feature set
X_num = df.drop(columns=['label', 'attack_cat', 'y_bin', 'y_cat'], errors='ignore').select_dtypes(include=[np.number]).copy()
y = df['y_bin']

print('Numeric features:', X_num.shape[1])
print('First few feature columns:', X_num.columns.tolist()[:10])

# Train/test split and random forest model
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))


Numeric features: 39
First few feature columns: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload']
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       143
           1     0.9042    1.0000    0.9497      1350

    accuracy                         0.9042      1493
   macro avg     0.4521    0.5000    0.4749      1493
weighted avg     0.8176    0.9042    0.8587      1493



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Manually balance dataset: equal records per class
n_benign = df[df['y_bin']==0].shape[0]
n_attack = n_benign  # Same number for balance

benign_sample = df[df['y_bin'] == 0]
attack_sample = df[df['y_bin'] == 1].sample(n=n_attack, random_state=42)

sample = pd.concat([benign_sample, attack_sample]).sample(frac=1, random_state=42)  # shuffle

# Save files
sample.to_csv('unsw_stage1_sample.csv', index=False)
pd.Series(X_num.columns).to_csv('unsw_stage1_feature_columns.csv', index=False)

import os
print("Files saved:", os.listdir())


Files saved: ['.config', 'unsw_stage1_feature_columns.csv', 'unsw_stage1_sample.csv', 'sample_data']


In [10]:
from google.colab import files

# Download the dataset sample and the features file
files.download('unsw_stage1_sample.csv')
files.download('unsw_stage1_feature_columns.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>