# Lightweight UNSW-NB15 ML Pipeline

In [23]:
 print(df.columns)


Index(['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss',
       'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin',
       'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
       'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object')


In [25]:
# UNSW-NB15 Pipeline with Correct Column Headers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data with proper column names (standard UNSW-NB15 columns)
correct_columns = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
    'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt',
    'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
    'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
    'ct_dst_src_ltm', 'attack_cat', 'label'
]

try:
    # Load with correct column names (skip header if it exists)
    df1 = pd.read_csv('/content/drive/MyDrive/UNSW-NB15_1.csv', names=correct_columns, header=None)
    df2 = pd.read_csv('/content/drive/MyDrive/UNSW-NB15_2.csv', names=correct_columns, header=None)

    df = pd.concat([df1, df2], ignore_index=True)
    df = df.sample(n=5000, random_state=42).reset_index(drop=True)

    print("Data loaded successfully with proper column names")
    print("\nFirst 3 rows:")
    print(df.head(3))

except Exception as e:
    print(f"Error loading data: {e}")
    raise

# 2. Verify we have the label column
if 'label' not in df.columns:
    raise ValueError("Label column not found after assigning proper column names")

# 3. Data Preprocessing
# Drop network identifiers and timestamps
cols_to_drop = ['srcip', 'sport', 'dstip', 'dsport', 'Stime', 'Ltime']
df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

# Separate features and label
y = df['label']
X = df.drop(columns=['label'])

# Convert label to binary (0=normal, 1=attack)
y = y.apply(lambda x: 0 if x == 0 else 1)  # Assuming 0 is normal, others are attacks

# Handle categorical features
cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Fill missing values
X.fillna(0, inplace=True)

# 4. Feature Processing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

selector = SelectKBest(mutual_info_classif, k=20)
X_selected = selector.fit_transform(X_scaled, y)

# 5. Model Training
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

model = DecisionTreeClassifier(random_state=42, max_depth=5)
model.fit(X_train, y_train)

# 6. Evaluation
y_pred = model.predict(X_test)

print("\nModel Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

  df1 = pd.read_csv('/content/drive/MyDrive/UNSW-NB15_1.csv', names=correct_columns, header=None)
  df2 = pd.read_csv('/content/drive/MyDrive/UNSW-NB15_2.csv', names=correct_columns, header=None)


Data loaded successfully with proper column names

First 3 rows:
        srcip  sport          dstip dsport proto state       dur  sbytes  \
0  59.166.0.0   6620  149.171.126.8     21   tcp   FIN  1.455835    2934   
1  59.166.0.6  43350  149.171.126.4     21   tcp   FIN  1.686769    2934   
2  59.166.0.8  43404  149.171.126.5     25   tcp   FIN  0.025943   37428   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0    3742    31  ...           0           1           1          5   
1    3742    31  ...           1           1           1          5   
2    3172    31  ...           0          14           9          5   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm  attack_cat  \
0           9                 1                 1               3         NaN   
1           3                 1                 1               3         NaN   
2           3                 1                 1               2         NaN   

   label  
0      0 

In [26]:
# Robust UNSW-NB15 Pipeline with Column Validation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Data Validation Checks
print("\n=== Data Validation ===")
print(f"Total samples: {len(df)}")
print(f"Duplicate rows: {df.duplicated().sum()}")
print("\nCurrent columns in DataFrame:")
print(df.columns.tolist())

# Identify which columns actually exist in the DataFrame
columns_to_drop = ['label']  # We always keep this as our target
optional_columns_to_drop = ['attack_cat', 'srcip', 'dstip', 'sport', 'dsport', 'Stime', 'Ltime']
existing_columns_to_drop = [col for col in optional_columns_to_drop if col in df.columns]

print("\nColumns that will be dropped:")
print(existing_columns_to_drop)

# Enhanced Preprocessing
X = df.drop(columns=['label'] + existing_columns_to_drop)
y = np.where(df['label'] == 0, 0, 1)  # Binary classification

# Convert categoricals (more robust method)
cat_cols = X.select_dtypes(include=['object']).columns
print(f"\nCategorical columns to encode: {list(cat_cols)}")

for col in cat_cols:
    X[col] = X[col].astype(str).replace('nan', 'unknown')
    X[col] = LabelEncoder().fit_transform(X[col])

# Handle numerical features
num_cols = X.select_dtypes(include=np.number).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# Create balanced train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Build pipeline with SMOTE for class balancing
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(mutual_info_classif, k=15)),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1
    ))
])

# Cross-validation
print("\n=== Cross-Validation ===")
cv_scores = cross_val_score(pipeline, X_train, y_train,
                          cv=5, scoring='roc_auc')
print(f"Mean ROC-AUC: {np.mean(cv_scores):.3f} (±{np.std(cv_scores):.3f})")

# Final training and evaluation
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

print("\n=== Final Evaluation ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importance Analysis
if hasattr(pipeline.named_steps['classifier'], 'feature_importances_'):
    print("\n=== Top Important Features ===")
    selected_features = X.columns[pipeline.named_steps['feature_selection'].get_support()]
    feature_importance = pd.DataFrame({
        'feature': selected_features,
        'importance': pipeline.named_steps['classifier'].feature_importances_
    }).sort_values('importance', ascending=False)
    print(feature_importance.head(10))


=== Data Validation ===
Total samples: 5000
Duplicate rows: 13

Current columns in DataFrame:
['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'label']

Columns that will be dropped:
['attack_cat']

Categorical columns to encode: ['proto', 'state', 'service', 'ct_ftp_cmd']

=== Cross-Validation ===
Mean ROC-AUC: 0.999 (±0.001)

=== Final Evaluation ===
Accuracy: 0.9927
ROC-AUC: 0.9986

Confusion Matrix:
[[1408   10]
 [   1   81]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.