# UNSW-NB15 Full Dataset Preparation (Local Execution)
This notebook prepares the full UNSW-NB15 dataset from the 4 split parts and GT labels. It downsamples benign traffic, filters columns, and prepares the data for model training.

## Load and Combine Data Parts, add on the missing headers

In [None]:

import pandas as pd
from pathlib import Path
COLUMNS = [
    'id','dur','proto','service','state','spkts','dpkts','sbytes','dbytes','rate',
    'sttl','dttl','sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit',
    'swin','stcpb','dtcpb','dwin','tcprtt','synack','ackdat','smean','dmean',
    'trans_depth','response_body_len','ct_srv_src','ct_state_ttl','ct_dst_ltm',
    'ct_src_dport_ltm','ct_dst_sport_ltm','ct_dst_src_ltm','is_ftp_login',
    'ct_ftp_cmd','ct_flw_http_mthd','ct_src_ltm','ct_srv_dst','is_sm_ips_ports',
    'attack_cat','label'
]               # ← Column names from the dataset documentation, used as headers

# CONFIG: Set the path to your dataset folder
DATA_DIR = Path("/Users/jasmine/Documents/CSCI7783_Information Security/Project1_Data/NUSW-NB15/data")  # ← Change this!
csv_parts = [f"NUSW-NB15_{i}.csv" for i in range(1, 5)]

# Load and combine CSV files with column names as headers
df_list = [pd.read_csv(DATA_DIR / part, names=COLUMNS, header=0) for part in csv_parts]

first_row = df_list[0].iloc[[0]]  # Keep the first row from the first part
df_combined = pd.concat(df_list, ignore_index=True)
df_combined = pd.concat([first_row, df_combined], ignore_index=True)
print("length of columns:", len(df_combined.columns))

# Nicely formatted preview
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 0)  # auto-fit
pd.set_option("display.width", 0)
pd.set_option("display.max_colwidth", 200)

print("\n[check] first 22 rows:")
display(df_combined.head(22))
len(df_rows := df_combined)
print(f"\n[info] total rows: {len(df_rows)}")



  df_list = [pd.read_csv(DATA_DIR / part, names=COLUMNS, header=0) for part in csv_parts]
  df_list = [pd.read_csv(DATA_DIR / part, names=COLUMNS, header=0) for part in csv_parts]


length of columns: 45

[check] first 22 rows:


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,udp,CON,0.036133,528,304,31,29,0,0,-,87676.09,50480.17,4,4,0,0,0,0,132,76,0,0,9.89101,10.682733,1421927414,1421927414,7.005,7.564333,0.0,0.0,0.0,0,0,0.0,0.0,0,2,4,2,3,1,1,2,,0
1,udp,CON,0.036133,528,304,31,29,0,0,-,87676.09,50480.17,4,4,0,0,0,0,132,76,0,0,9.89101,10.682733,1421927414,1421927414,7.005,7.564333,0.0,0.0,0.0,0,0,0.0,0.0,0,2,4,2,3,1,1,2,,0
2,udp,CON,0.001119,146,178,31,29,0,0,dns,521894.5,636282.4,2,2,0,0,0,0,73,89,0,0,0.0,0.0,1421927414,1421927414,0.017,0.013,0.0,0.0,0.0,0,0,0.0,0.0,0,12,8,1,2,2,1,1,,0
3,udp,CON,0.001209,132,164,31,29,0,0,dns,436724.6,542597.2,2,2,0,0,0,0,66,82,0,0,0.0,0.0,1421927414,1421927414,0.043,0.014,0.0,0.0,0.0,0,0,0.0,0.0,0,6,9,1,1,1,1,1,,0
4,udp,CON,0.001169,146,178,31,29,0,0,dns,499572.2,609067.6,2,2,0,0,0,0,73,89,0,0,0.0,0.0,1421927414,1421927414,0.005,0.003,0.0,0.0,0.0,0,0,0.0,0.0,0,7,9,1,1,1,1,1,,0
5,udp,CON,0.078339,568,312,31,29,0,0,-,43503.23,23896.14,4,4,0,0,0,0,142,78,0,0,29.682221,34.37034,1421927414,1421927414,21.003,24.315,0.0,0.0,0.0,0,0,0.0,0.0,0,2,4,2,3,1,1,2,,0
6,udp,CON,0.001134,132,164,31,29,0,0,dns,465608.5,578483.2,2,2,0,0,0,0,66,82,0,0,0.0,0.0,1421927414,1421927414,0.017,0.013,0.0,0.0,0.0,0,0,0.0,0.0,0,12,7,1,2,2,1,1,,0
7,arp,INT,0.0,46,0,0,0,0,0,-,0.0,0.0,1,0,0,0,0,0,46,0,0,0,0.0,0.0,1421927415,1421927415,0.0,0.0,0.0,0.0,0.0,1,2,0.0,0.0,0,2,2,2,2,2,2,2,,0
8,udp,CON,0.001126,146,178,31,29,0,0,dns,518650.1,632326.8,2,2,0,0,0,0,73,89,0,0,0.0,0.0,1421927415,1421927415,0.018,0.013,0.0,0.0,0.0,0,0,0.0,0.0,0,6,7,3,1,1,1,1,,0
9,udp,CON,0.001167,132,164,31,29,0,0,dns,452442.2,562125.1,2,2,0,0,0,0,66,82,0,0,0.0,0.0,1421927415,1421927415,0.018,0.013,0.0,0.0,0.0,0,0,0.0,0.0,0,6,7,2,1,1,1,1,,0



[info] total rows: 2540044


## Keep Only Required Rows

In [201]:
mask_nan = df_combined.isna().any(axis=1)
to_drop = df_combined[mask_nan].sample(frac=0.6, random_state=42).index
df_combined = df_combined.drop(index=to_drop).reset_index(drop=True)

df_combined = df_combined.fillna("Normal")

print("Remaining rows:", len(df_combined))

Remaining rows: 163213


## Save Cleaned Data to CSV. ZIP

In [198]:

OUTPUT_CSV = DATA_DIR / "UNSW_NB15_cleaned.csv"
OUTPUT_ZIP = DATA_DIR / "UNSW_NB15_cleaned.csv.zip"

df_combined.to_csv(OUTPUT_CSV, index=False)
df_combined.to_csv(OUTPUT_ZIP, index=False, compression={
    "method": "zip",
    "archive_name": OUTPUT_CSV.name
})

print("[save] CSV ->", OUTPUT_CSV, "size:", OUTPUT_CSV.stat().st_size, "bytes")
print("[save] ZIP ->", OUTPUT_ZIP, "size:", OUTPUT_ZIP.stat().st_size, "bytes")

[save] CSV -> /Users/jasmine/Documents/CSCI7783_Information Security/Project1_Data/NUSW-NB15/data/UNSW_NB15_cleaned.csv size: 75327586 bytes
[save] ZIP -> /Users/jasmine/Documents/CSCI7783_Information Security/Project1_Data/NUSW-NB15/data/UNSW_NB15_cleaned.csv.zip size: 19526281 bytes


## Define x and y, then Split into Train/Test

In [202]:

from sklearn.model_selection import train_test_split

import pandas as pd
from sklearn.model_selection import train_test_split

df = df_combined.copy()
# Drop non-predictive columns
df = df.drop(columns=['id', 'label'])
# Separate features and target
X = df.drop(columns=['attack_cat'])
y = df['attack_cat']
# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

print("X_train:", X_train.shape, "| X_test:", X_test.shape)
print("y_train distribution:", y_train.value_counts(normalize=True).round(3).to_dict())


X_train: (114249, 90) | X_test: (48964, 90)
y_train distribution: {'Normal': 0.761, 'Generic': 0.117, 'Exploits': 0.047, ' Fuzzers': 0.031, 'DoS': 0.012, 'Reconnaissance': 0.011, ' Fuzzers ': 0.006, ' Reconnaissance ': 0.004, 'Analysis': 0.004, 'Backdoors': 0.003, 'Shellcode': 0.001, 'Backdoor': 0.001, ' Shellcode ': 0.001, 'Worms': 0.0}
