# Supervised Classification on Processed Flow Data

This notebook trains baseline models on the preprocessed flow features saved in `processed_data/`. It mirrors the CLI script but keeps everything in an interactive workflow.

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

DATA_DIR = Path('processed_data')
X_TRAIN_PATH = DATA_DIR / 'X_train_processed.csv'
X_TEST_PATH = DATA_DIR / 'X_test_processed.csv'
Y_TRAIN_PATH = DATA_DIR / 'y_train_processed.csv'
Y_TEST_PATH = DATA_DIR / 'y_test_processed.csv'

MAX_ITER = 60  # adjust if the logistic models need more steps to converge

TRAIN_FRAC = 1.0  # fraction of training rows to use (1.0 keeps all)
TEST_FRAC = 1.0   # fraction of test rows to use (keep full set for evaluation)
RANDOM_STATE = 42

In [6]:
def _subsample(X, y, frac, seed, split):
    if frac >= 1.0:
        return X.reset_index(drop=True), y.reset_index(drop=True)
    sampled_idx = X.sample(frac=frac, random_state=seed).index
    X_sub = X.loc[sampled_idx].reset_index(drop=True)
    y_sub = y.loc[sampled_idx].reset_index(drop=True)
    print(f'{split.title()} subset: {len(X_sub):,} rows (fraction={frac})')
    return X_sub, y_sub


def load_dataset():
    X_train = pd.read_csv(X_TRAIN_PATH)
    X_test = pd.read_csv(X_TEST_PATH)
    y_train = pd.read_csv(Y_TRAIN_PATH)['category']
    y_test = pd.read_csv(Y_TEST_PATH)['category']

    X_train, y_train = _subsample(X_train, y_train, TRAIN_FRAC, RANDOM_STATE, 'train')
    X_test, y_test = _subsample(X_test, y_test, TEST_FRAC, RANDOM_STATE, 'test')

    print('Loaded:')
    print(f'  X_train: {X_train.shape} | y_train distribution:{y_train.value_counts()}')
    print(f'  X_test:  {X_test.shape} | y_test distribution:{y_test.value_counts()}')
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = load_dataset()


Loaded:
  X_train: (429795, 240) | y_train distribution:category
Normal    85959
DoS       85959
Probe     85959
U2R       85959
R2L       85959
Name: count, dtype: int64
  X_test:  (56773, 240) | y_test distribution:category
Normal    21490
Probe     17545
DoS       16892
R2L         507
U2R         339
Name: count, dtype: int64


## Binary Classification (Normal vs Attack)

In [7]:
y_train_binary = np.where(y_train == 'Normal', 'Normal', 'Attack')
y_test_binary = np.where(y_test == 'Normal', 'Normal', 'Attack')

logreg_bin = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=MAX_ITER,
    n_jobs=-1,
    verbose=0,
)

logreg_bin.fit(X_train, y_train_binary)
logreg_bin_preds = logreg_bin.predict(X_test)
print('Binary accuracy:', accuracy_score(y_test_binary, logreg_bin_preds))
print('Binary classification report:')
print(classification_report(y_test_binary, logreg_bin_preds, target_names=['Normal', 'Attack']))
print('Confusion matrix:')
print(confusion_matrix(y_test_binary, logreg_bin_preds, labels=['Normal', 'Attack']))



Binary accuracy: 0.7281806492522854
Binary classification report:
              precision    recall  f1-score   support

      Normal       0.91      0.62      0.74     35283
      Attack       0.59      0.90      0.72     21490

    accuracy                           0.73     56773
   macro avg       0.75      0.76      0.73     56773
weighted avg       0.79      0.73      0.73     56773

Confusion matrix:
[[19406  2084]
 [13348 21935]]


## Multiclass Classification

In [8]:
logreg_multi = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=MAX_ITER,
    n_jobs=-1,
    verbose=0,
)

logreg_multi.fit(X_train, y_train)
logreg_multi_preds = logreg_multi.predict(X_test)
print('Multiclass accuracy:', accuracy_score(y_test, logreg_multi_preds))
print('Multiclass classification report:')
print(classification_report(y_test, logreg_multi_preds, target_names=sorted(y_train.unique())))
print('Confusion matrix:')
print(confusion_matrix(y_test, logreg_multi_preds, labels=sorted(y_train.unique())))



Multiclass accuracy: 0.4646574956405333
Multiclass classification report:
              precision    recall  f1-score   support

         DoS       0.99      0.76      0.86     16892
      Normal       0.83      0.38      0.52     21490
       Probe       0.58      0.27      0.37     17545
         R2L       0.05      0.78      0.09       507
         U2R       0.02      0.90      0.03       339

    accuracy                           0.46     56773
   macro avg       0.49      0.62      0.37     56773
weighted avg       0.79      0.46      0.57     56773

Confusion matrix:
[[12907   583   579  2468   355]
 [   24  8064  2752   810  9840]
 [  116  1032  4709  4514  7174]
 [    0    52    14   394    47]
 [    0     0    33     0   306]]


## Optional: XGBoost on Binary Task

In [None]:
xgb_bin = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
)

# Convert labels to numeric encoding expected by XGBoost
y_train_bin_num = (y_train_binary == 'Attack').astype(int)
y_test_bin_num = (y_test_binary == 'Attack').astype(int)

xgb_bin.fit(X_train, y_train_bin_num)
xgb_preds_num = xgb_bin.predict(X_test)

print('XGBoost binary accuracy:', accuracy_score(y_test_bin_num, xgb_preds_num))
print('XGBoost binary classification report:')
print(classification_report(y_test_bin_num, xgb_preds_num, target_names=['Normal', 'Attack']))
print('Confusion matrix:')
print(confusion_matrix(y_test_bin_num, xgb_preds_num))
