In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
import joblib

def preprocess_train(train_path, target_column="delay", artifact_dir="."):
    """Load the training CSV, encode it, balance the classes by downsampling,
    scale the features, and persist the fitted transformers.

    Steps, in order:
      1. Read ``train_path`` and drop the metadata columns listed in
         ``drop_cols`` (only those actually present).
      2. Label-encode every object-dtype feature column and the target.
      3. Downsample every class (without replacement, seed 42) to the size of
         the smallest class.
      4. Fit a ``StandardScaler`` on the balanced features. Note that the
         label-encoded categorical columns are scaled along with the numeric
         ones, because the scaler is fit on the whole feature frame.
      5. Write ``encoders.pkl``, ``label_encoder.pkl``, ``scaler.pkl`` and
         ``feature_columns.npy`` into ``artifact_dir``.

    Parameters
    ----------
    train_path : str or path-like
        CSV file containing the features and the target column.
    target_column : str, default "delay"
        Name of the label column.
    artifact_dir : str or path-like, default "."
        Directory that receives the fitted preprocessing objects. It is
        created if it does not exist.

    Returns
    -------
    X_scaled : numpy.ndarray
        Scaled, class-balanced feature matrix.
    y_encoded : numpy.ndarray
        Integer class labels aligned row-for-row with ``X_scaled``.

    Raises
    ------
    KeyError
        If ``target_column`` is not present after the metadata columns have
        been dropped.
    ValueError
        If the CSV contains no rows.
    """
    import os  # local import: only needed to build the artifact paths

    df = pd.read_csv(train_path)

    # Drop metadata / non-predictive columns
    drop_cols = [
        "id", "Time", "t", "GooseTimestamp", "timestampDiff", "tDiff",
        "ethDst", "ethSrc", "goID", "datSet", "gocbRef", "TPID", "ethType"
    ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in {train_path!r}"
        )
    if df.empty:
        raise ValueError(f"No rows to preprocess in {train_path!r}")

    # Separate features and target
    y_raw = df[target_column].astype(str)
    X = df.drop(columns=[target_column])

    # Encode categorical feature columns; the fitted encoders are kept so the
    # same mapping can be applied to other data later.
    encoders = {}
    for col in X.select_dtypes(include=["object"]).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        encoders[col] = le

    # Encode target labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_raw)

    print("Before balancing:")
    print(pd.Series(y_raw).value_counts())

    # Downsample every class to the size of the rarest one.
    # Row *positions* are resampled per class instead of running
    # groupby().apply() on a frame carrying a temporary "class" column:
    # that avoids the pandas deprecation warning about applying over the
    # grouping column, cannot clobber a real feature named "class", and does
    # not copy the full feature frame.
    class_ids, class_counts = np.unique(y_encoded, return_counts=True)
    min_class_size = int(class_counts.min())
    keep = np.concatenate([
        resample(
            np.flatnonzero(y_encoded == cls),
            replace=False,
            n_samples=min_class_size,
            random_state=42,
        )
        for cls in class_ids
    ])
    y_encoded = y_encoded[keep]
    X = X.iloc[keep]

    print("\nAfter balancing:")
    print(pd.Series(y_encoded, name="class").value_counts())

    # Scale features (fit on the balanced data only)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Save preprocessing objects
    os.makedirs(artifact_dir, exist_ok=True)
    joblib.dump(encoders, os.path.join(artifact_dir, "encoders.pkl"))
    joblib.dump(label_encoder, os.path.join(artifact_dir, "label_encoder.pkl"))
    joblib.dump(scaler, os.path.join(artifact_dir, "scaler.pkl"))
    # X.columns is saved as-is; presumably an object-dtype array of column
    # names, which would require allow_pickle=True when loading - TODO confirm.
    np.save(os.path.join(artifact_dir, "feature_columns.npy"), X.columns)

    print("\nFinal shapes:")
    print("X:", X_scaled.shape)
    print("y:", y_encoded.shape)

    return X_scaled, y_encoded

if __name__ == "__main__":
    # Build the training arrays from the CSV in the working directory.
    # The call also writes the fitted encoders/scaler to disk as a side effect.
    X_train, y_train = preprocess_train(train_path="train.csv")

Before balancing:
delay
normal                    2759425
random_replay               39000
high_StNum                  39000
injection                   39000
inverse_replay              26033
poisoned_high_rate          18574
masquerade_fake_normal      17419
masquerade_fake_fault       17287
Name: count, dtype: int64


  .apply(lambda x: resample(x, replace=False, n_samples=min_class_size, random_state=42))



After balancing:
class
0    17287
1    17287
2    17287
3    17287
4    17287
5    17287
6    17287
7    17287
Name: count, dtype: int64

Final shapes:
X: (138296, 56)
y: (138296,)


In [2]:
import pandas as pd
import numpy as np
import joblib

def preprocess_test(test_path, target_column="delay", artifact_dir="."):
    """Apply previously saved preprocessing objects to a test CSV.

    The function loads ``encoders.pkl``, ``label_encoder.pkl``, ``scaler.pkl``
    and ``feature_columns.npy`` from ``artifact_dir`` and uses them to encode,
    align and scale the test features. Nothing is re-fitted on the test data.

    Parameters
    ----------
    test_path : str or path-like
        CSV file with the test rows. The target column is optional.
    target_column : str, default "delay"
        Name of the label column, if present.
    artifact_dir : str or path-like, default "."
        Directory holding the saved preprocessing objects.

    Returns
    -------
    X_test_scaled : numpy.ndarray
        Scaled feature matrix with columns in the saved feature order.
    y_test_encoded : numpy.ndarray or None
        Encoded labels, or ``None`` when the file has no target column or
        contains labels the saved label encoder does not know.
    """
    import os  # local import: only needed to build the artifact paths

    df_test = pd.read_csv(test_path)

    # Drop metadata / non-predictive columns
    drop_cols = [
        "id", "Time", "t", "GooseTimestamp", "timestampDiff", "tDiff",
        "ethDst", "ethSrc", "goID", "datSet", "gocbRef", "TPID", "ethType"
    ]
    df_test = df_test.drop(columns=[c for c in drop_cols if c in df_test.columns])

    # Separate features and target (if available)
    if target_column in df_test.columns:
        y_test_raw = df_test[target_column].astype(str)
        X_test = df_test.drop(columns=[target_column])
    else:
        y_test_raw = None
        X_test = df_test.copy()

    # Load saved encoders & scaler.
    # SECURITY NOTE: joblib.load and np.load(allow_pickle=True) unpickle the
    # files and can execute arbitrary code - only load artifacts you produced
    # yourself or otherwise trust.
    encoders = joblib.load(os.path.join(artifact_dir, "encoders.pkl"))
    label_encoder = joblib.load(os.path.join(artifact_dir, "label_encoder.pkl"))
    scaler = joblib.load(os.path.join(artifact_dir, "scaler.pkl"))
    feature_columns = np.load(
        os.path.join(artifact_dir, "feature_columns.npy"), allow_pickle=True
    )

    # Encode categorical columns using the saved encoders
    for col, le in encoders.items():
        if col not in X_test.columns:
            continue
        values = X_test[col].astype(str)
        # Categories the encoder has not seen are appended after the known
        # classes. They are sorted first: iterating a set of strings has no
        # stable order between interpreter runs, which made the integer codes
        # assigned to unseen categories non-reproducible.
        unseen = sorted(set(values) - set(le.classes_))
        if unseen:
            le.classes_ = np.append(le.classes_, unseen)
        X_test[col] = le.transform(values)

    # Ensure same feature alignment. Saved features absent from the test file
    # are filled with 0; report them instead of doing so silently.
    missing = [c for c in feature_columns if c not in X_test.columns]
    if missing:
        print(f"⚠️ Warning: {len(missing)} expected feature column(s) missing "
              f"from test data, filled with 0: {missing}")
    X_test = X_test.reindex(columns=feature_columns, fill_value=0)

    # Scale features with the saved scaler
    X_test_scaled = scaler.transform(X_test)

    # Encode target (if available); unknown labels make the whole target
    # unusable, so None is returned in that case.
    y_test_encoded = None
    if y_test_raw is not None:
        try:
            y_test_encoded = label_encoder.transform(y_test_raw)
        except ValueError:
            print("⚠️ Warning: Test set contains unseen labels!")

    print("\nFinal shapes:")
    print("X_test:", X_test_scaled.shape)
    if y_test_encoded is not None:
        print("y_test:", y_test_encoded.shape)
    else:
        print("No target column or unseen target labels in test dataset")

    return X_test_scaled, y_test_encoded

if __name__ == "__main__":
    # Transform the held-out CSV with the preprocessing objects loaded from disk.
    X_test, y_test = preprocess_test(test_path="test.csv")



Final shapes:
X_test: (2955648, 56)
y_test: (2955648,)


In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize
import numpy as np

# =====================================================
# Function for Multi-class Benchmarking
# =====================================================
def _ovr_weighted_auc(name, model, X_test, y_test, classes):
    """Return the weighted one-vs-rest ROC AUC of a fitted model, or None.

    None is returned when the model has no ``predict_proba`` or when the
    score cannot be computed; in the latter case the reason is printed.
    """
    if not hasattr(model, "predict_proba"):
        return None

    y_prob = model.predict_proba(X_test)
    try:
        if len(classes) == 2:
            # label_binarize yields a single column for two classes, which
            # does not match the two-column probability matrix; score the
            # second (greater-label) class directly instead.
            return roc_auc_score(y_test, y_prob[:, 1])
        y_test_bin = label_binarize(y_test, classes=classes)
        return roc_auc_score(y_test_bin, y_prob, average="weighted", multi_class="ovr")
    except Exception as exc:
        # AUC is a best-effort extra metric: a failure here must not discard
        # the already-trained model's other scores. Unlike a bare `except`,
        # this lets KeyboardInterrupt/SystemExit through and reports the cause.
        print(f"⚠️ Warning: AUC unavailable for {name}: {exc}")
        return None


def benchmark_models_multiclass(X_train, y_train, X_test, y_test, models=None):
    """Fit several classifiers and tabulate their test-set metrics.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.
    models : dict[str, estimator] or None, default None
        Mapping of display name to an unfitted estimator exposing ``fit`` and
        ``predict`` (and optionally ``predict_proba``). When None, Logistic
        Regression, Random Forest, SVM and KNN are benchmarked.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Accuracy``, ``Precision``,
        ``Recall``, ``F1-score`` (all support-weighted averages) and ``AUC``
        (weighted one-vs-rest, or None when unavailable).
    """
    if models is None:
        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "SVM": SVC(probability=True, random_state=42),
            "KNN": KNeighborsClassifier(),
        }

    results = {}
    # Sorted unique training labels; used to binarize y_test for the AUC so
    # its columns line up with the predict_proba output.
    classes = np.unique(y_train)

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Support-weighted averages: each class contributes in proportion to
        # its number of true instances in y_test.
        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average="weighted"),
            "Recall": recall_score(y_test, y_pred, average="weighted"),
            "F1-score": f1_score(y_test, y_pred, average="weighted"),
            "AUC": _ovr_weighted_auc(name, model, X_test, y_test, classes),
        }

    return pd.DataFrame(results).T

# =====================================================
# Example usage
# =====================================================
# Assuming you already have: X_train, y_train, X_test, y_test
# Benchmark on the arrays produced by the two preprocessing cells above.
results_df = benchmark_models_multiclass(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

print("\nBenchmarking Results (Multi-class Models):", results_df, sep="\n")


Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...

Benchmarking Results (Multi-class Models):
                     Accuracy  Precision    Recall  F1-score       AUC
Logistic Regression  0.850058   0.972749  0.850058  0.899508  0.984347
Random Forest        0.997731   0.997856  0.997731  0.997759  0.999854
SVM                  0.926962   0.979197  0.926962  0.948145  0.992178
KNN                  0.922804   0.970008  0.922804  0.941667  0.966935
