# Supervised Classification on Processed Flow Data

This notebook trains baseline models on the preprocessed flow features saved in `processed_data/`. It mirrors the CLI script but keeps everything in an interactive workflow.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Directory holding the preprocessed splits, and the four CSVs read from it.
DATA_DIR = Path('processed_data')
X_TRAIN_PATH = DATA_DIR.joinpath('X_train_processed.csv')
X_TEST_PATH = DATA_DIR.joinpath('X_test_processed.csv')
Y_TRAIN_PATH = DATA_DIR.joinpath('y_train_processed.csv')
Y_TEST_PATH = DATA_DIR.joinpath('y_test_processed.csv')

# Iteration cap handed to the logistic-regression models as `max_iter`;
# raise it if they stop before converging.
MAX_ITER = 60

# Row-sampling fractions applied after loading (1.0 keeps every row).
TRAIN_FRAC = 0.1  # train on a 10% sample
TEST_FRAC = 1.0   # evaluate on the full test set
RANDOM_STATE = 42  # seed used when subsampling rows

In [None]:
def _subsample(X, y, frac, seed, split):
    if frac >= 1.0:
        return X.reset_index(drop=True), y.reset_index(drop=True)
    sampled_idx = X.sample(frac=frac, random_state=seed).index
    X_sub = X.loc[sampled_idx].reset_index(drop=True)
    y_sub = y.loc[sampled_idx].reset_index(drop=True)
    print(f'{split.title()} subset: {len(X_sub):,} rows (fraction={frac})')
    return X_sub, y_sub


def load_dataset():
    """Load the processed feature/label CSVs and apply the configured sampling.

    Reads the files named by ``X_TRAIN_PATH``, ``X_TEST_PATH``,
    ``Y_TRAIN_PATH`` and ``Y_TEST_PATH``, takes the ``'category'`` column of
    each label file as the target, subsamples the train and test splits by
    ``TRAIN_FRAC`` / ``TEST_FRAC`` (seeded with ``RANDOM_STATE``), and prints
    the resulting shapes and label counts.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train = pd.read_csv(X_TRAIN_PATH)
    X_test = pd.read_csv(X_TEST_PATH)
    y_train = pd.read_csv(Y_TRAIN_PATH)['category']
    y_test = pd.read_csv(Y_TEST_PATH)['category']

    X_train, y_train = _subsample(X_train, y_train, TRAIN_FRAC, RANDOM_STATE, 'train')
    X_test, y_test = _subsample(X_test, y_test, TEST_FRAC, RANDOM_STATE, 'test')

    print('Loaded:')
    # The line break must be written as the escape '\n': a raw newline inside
    # a single-quoted f-string is a SyntaxError.
    print(f'  X_train: {X_train.shape} | y_train distribution:\n{y_train.value_counts()}')
    print(f'  X_test:  {X_test.shape} | y_test distribution:\n{y_test.value_counts()}')
    return X_train, X_test, y_train, y_test


# Load the splits once at notebook scope; the modelling cells below read
# these four names directly.
X_train, X_test, y_train, y_test = load_dataset()


## Binary Classification (Normal vs Attack)

In [None]:
# Collapse the multiclass labels into two classes: 'Normal' vs. everything else.
y_train_binary = np.where(y_train == 'Normal', 'Normal', 'Attack')
y_test_binary = np.where(y_test == 'Normal', 'Normal', 'Attack')

# Display order shared by the report and the confusion matrix. It has to be
# passed as `labels=` too: scikit-learn otherwise orders classes by sorted
# label ('Attack', 'Normal'), so `target_names` alone would swap the row names.
binary_labels = ['Normal', 'Attack']

logreg_bin = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=MAX_ITER,
    n_jobs=-1,
    random_state=RANDOM_STATE,  # saga shuffles the data; seed it for repeatable runs
    verbose=0,
)

logreg_bin.fit(X_train, y_train_binary)
logreg_bin_preds = logreg_bin.predict(X_test)
print('Binary accuracy:', accuracy_score(y_test_binary, logreg_bin_preds))
print('Binary classification report:')
print(classification_report(
    y_test_binary,
    logreg_bin_preds,
    labels=binary_labels,
    target_names=binary_labels,
))
print('Confusion matrix:')
print(confusion_matrix(y_test_binary, logreg_bin_preds, labels=binary_labels))

## Multiclass Classification

In [None]:
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and with the saga solver a multinomial model is already the
# default for targets with more than two classes.
logreg_multi = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    max_iter=MAX_ITER,
    n_jobs=-1,
    random_state=RANDOM_STATE,  # saga shuffles the data; seed it for repeatable runs
    verbose=0,
)

logreg_multi.fit(X_train, y_train)
logreg_multi_preds = logreg_multi.predict(X_test)

# Use every class seen in either split, in one fixed order, for both the
# report and the confusion matrix. Passing `target_names` without `labels`
# raises when the classes present in y_test/predictions differ from those in
# the (possibly subsampled) training labels.
class_labels = sorted(set(y_train.unique()) | set(y_test.unique()))

print('Multiclass accuracy:', accuracy_score(y_test, logreg_multi_preds))
print('Multiclass classification report:')
print(classification_report(y_test, logreg_multi_preds, labels=class_labels))
print('Confusion matrix:')
print(confusion_matrix(y_test, logreg_multi_preds, labels=class_labels))

## Optional: XGBoost on Binary Task

In [None]:
# Gradient-boosted trees on the same Normal-vs-Attack task as the binary
# logistic regression.
xgb_bin = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    use_label_encoder=False,
)

# Convert labels to the numeric encoding expected by XGBoost: 0 = Normal, 1 = Attack.
y_train_bin_num = (y_train_binary == 'Attack').astype(int)
y_test_bin_num = (y_test_binary == 'Attack').astype(int)

xgb_bin.fit(X_train, y_train_bin_num)
xgb_preds_num = xgb_bin.predict(X_test)

# Pin the label order so the names line up with the 0/1 encoding even if one
# class is absent from the test labels or the predictions.
xgb_labels = [0, 1]
xgb_label_names = ['Normal', 'Attack']

print('XGBoost binary accuracy:', accuracy_score(y_test_bin_num, xgb_preds_num))
# The leading line break must be the escape '\n'; a raw newline inside the
# quoted literal is a SyntaxError.
print('\nXGBoost binary classification report:')
print(classification_report(
    y_test_bin_num,
    xgb_preds_num,
    labels=xgb_labels,
    target_names=xgb_label_names,
))
print('Confusion matrix:')
print(confusion_matrix(y_test_bin_num, xgb_preds_num, labels=xgb_labels))
