In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
import joblib

def preprocess_train(train_path, target_column="delay"):
    """Load the training CSV, balance the classes and fit the preprocessing artifacts.

    Pipeline: drop metadata columns, label-encode every object-typed feature
    column and the target, downsample each class (without replacement) to the
    size of the smallest class, then standardise the features.

    Side effects:
        Writes ``encoders.pkl``, ``label_encoder.pkl``, ``scaler.pkl`` and
        ``feature_columns.npy`` to the current working directory, and prints
        the class counts before/after balancing plus the final shapes.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        target_column: Name of the label column.

    Returns:
        ``(X_scaled, y_encoded)`` where ``X_scaled`` is the standardised feature
        matrix of the balanced sample and ``y_encoded`` holds the matching
        integer class labels.

    Raises:
        KeyError: If ``target_column`` is missing from the CSV.
        ValueError: If the CSV contains no rows.
    """
    df = pd.read_csv(train_path)

    # Drop metadata / non-predictive columns
    drop_cols = [
        "id", "Time", "t", "GooseTimestamp", "timestampDiff", "tDiff",
        "ethDst", "ethSrc", "goID", "datSet", "gocbRef", "TPID", "ethType"
    ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # Fail early with a readable message instead of an opaque lookup error.
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in {train_path!r}")
    if df.empty:
        raise ValueError(f"{train_path!r} contains no rows to preprocess")

    # Separate features and target
    y_raw = df[target_column].astype(str)
    X = df.drop(columns=[target_column])

    # Encode categorical feature columns; keep one fitted encoder per column so
    # the same mapping can be re-applied to other data later.
    encoders = {}
    for col in X.select_dtypes(include=["object"]).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        encoders[col] = le

    # Encode target labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_raw)

    print("Before balancing:")
    print(y_raw.value_counts())

    # Downsample every class to the size of the rarest one.
    # The groups are iterated explicitly instead of using
    # ``groupby(...).apply(...)``: apply() hands the grouping column to the
    # callback, which newer pandas versions deprecate. Each group is still
    # resampled with the same arguments and seed as before.
    df_balanced = X.copy()
    df_balanced["class"] = y_encoded
    min_class_size = df_balanced["class"].value_counts().min()
    df_resampled = pd.concat(
        [
            resample(group, replace=False, n_samples=min_class_size, random_state=42)
            for _, group in df_balanced.groupby("class")
        ]
    )

    print("\nAfter balancing:")
    print(df_resampled["class"].value_counts())

    # Separate back into X and y
    y_encoded = df_resampled["class"].values
    X = df_resampled.drop(columns=["class"])

    # Scale features (the scaler is fitted on the balanced sample only)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Save preprocessing objects
    joblib.dump(encoders, "encoders.pkl")
    joblib.dump(label_encoder, "label_encoder.pkl")
    joblib.dump(scaler, "scaler.pkl")
    # Column names are stored as an object array, so readers of this file need
    # ``np.load(..., allow_pickle=True)``.
    np.save("feature_columns.npy", X.columns)

    print("\nFinal shapes:")
    print("X:", X_scaled.shape)
    print("y:", y_encoded.shape)

    return X_scaled, y_encoded

# Driver: preprocess "train.csv" and bind the balanced, scaled training data
# to the notebook-level names X_train / y_train.
if __name__ == "__main__":
    X_train, y_train = preprocess_train("train.csv")


Before balancing:
delay
normal                    2759425
random_replay               39000
high_StNum                  39000
injection                   39000
inverse_replay              26033
poisoned_high_rate          18574
masquerade_fake_normal      17419
masquerade_fake_fault       17287
Name: count, dtype: int64


  .apply(lambda x: resample(x, replace=False, n_samples=min_class_size, random_state=42))



After balancing:
class
0    17287
1    17287
2    17287
3    17287
4    17287
5    17287
6    17287
7    17287
Name: count, dtype: int64

Final shapes:
X: (138296, 56)
y: (138296,)


In [None]:
import pandas as pd
import numpy as np
import joblib

def preprocess_test(test_path, target_column="delay"):
    """Apply the saved training-time preprocessing to a test CSV.

    Reads ``encoders.pkl``, ``label_encoder.pkl``, ``scaler.pkl`` and
    ``feature_columns.npy`` from the current working directory, encodes the
    categorical columns with the stored encoders, aligns the columns to the
    stored feature order (columns absent from the test file are filled with 0)
    and scales the result with the stored scaler.

    Args:
        test_path: Location of the test CSV (anything ``pd.read_csv`` accepts).
        target_column: Name of the label column; it may be absent from the file.

    Returns:
        ``(X_test_scaled, y_test_encoded)``. ``y_test_encoded`` is ``None`` when
        the file has no target column or when it contains labels the stored
        label encoder cannot transform.
    """
    df_test = pd.read_csv(test_path)

    # Drop metadata / non-predictive columns
    drop_cols = [
        "id", "Time", "t", "GooseTimestamp", "timestampDiff", "tDiff",
        "ethDst", "ethSrc", "goID", "datSet", "gocbRef", "TPID", "ethType"
    ]
    df_test = df_test.drop(columns=[c for c in drop_cols if c in df_test.columns])

    # Separate features and target (if available)
    if target_column in df_test.columns:
        y_test_raw = df_test[target_column].astype(str)
        X_test = df_test.drop(columns=[target_column])
    else:
        y_test_raw = None
        X_test = df_test.copy()

    # Load saved encoders & scaler.
    # NOTE(security): joblib.load and np.load(allow_pickle=True) unpickle their
    # input; only point them at artifact files you produced yourself.
    encoders = joblib.load("encoders.pkl")
    label_encoder = joblib.load("label_encoder.pkl")
    scaler = joblib.load("scaler.pkl")
    feature_columns = np.load("feature_columns.npy", allow_pickle=True)

    # Encode categorical columns using the stored encoders
    for col, le in encoders.items():
        if col not in X_test.columns:
            continue
        values = X_test[col].astype(str)
        # Sort the unseen values so they receive the same integer codes on
        # every run; iterating a set of strings directly has no stable order
        # across interpreter sessions.
        unseen = sorted(set(values) - set(le.classes_))
        if unseen:
            print(
                f"Warning: column {col!r} has {len(unseen)} "
                "category value(s) not present in the stored encoder"
            )
            # Extend only this in-memory copy of the encoder; the file on disk
            # is left untouched.
            le.classes_ = np.append(le.classes_, unseen)
        X_test[col] = le.transform(values)

    # Ensure same feature alignment
    X_test = X_test.reindex(columns=feature_columns, fill_value=0)

    # Scale features with the stored scaler
    X_test_scaled = scaler.transform(X_test)

    # Encode target (if available)
    y_test_encoded = None
    if y_test_raw is not None:
        try:
            y_test_encoded = label_encoder.transform(y_test_raw)
        except ValueError:
            # Best effort: features are still returned, labels are not.
            print("⚠️ Warning: Test set contains unseen labels!")

    print("\nFinal shapes:")
    print("X_test:", X_test_scaled.shape)
    if y_test_encoded is not None:
        print("y_test:", y_test_encoded.shape)
    else:
        print("No target column or unseen target labels in test dataset")

    return X_test_scaled, y_test_encoded

# Driver: preprocess "test.csv" with the saved artifacts and bind the result
# to the notebook-level names X_test / y_test.
if __name__ == "__main__":
    X_test, y_test = preprocess_test("test.csv")



Final shapes:
X_test: (2955648, 56)
y_test: (2955648,)


Model Design

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Hold out a stratified validation split.
# The split gets its own names (X_fit / X_val): rebinding X_train here would
# shrink the training data again on every re-run of this cell, and calling the
# hold-out "X_test" would shadow the real test set.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

classes = np.unique(y_fit)
n_features = X_fit.shape[1]   # number of input features
n_classes = len(classes)      # number of distinct attack/normal classes

# Per-class loss weights. With a stratified split of class-balanced data these
# come out close to 1.0; they are kept so the cell still behaves sensibly if
# the balancing step is changed.
class_weights = dict(
    zip(classes, compute_class_weight(class_weight="balanced", classes=classes, y=y_fit))
)

# Model: three fully connected ReLU layers (256 -> 128 -> 64) with dropout
# after each and batch normalisation after the first two, then a softmax head.
model = Sequential([
    tf.keras.Input(shape=(n_features,)),   # Input layer
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(n_classes, activation='softmax')  # final layer matches class count
])

# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',   # labels are integer class ids
    metrics=['accuracy']
)

# Train
history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=25,
    batch_size=256,
    class_weight=class_weights,
    verbose=1
)

model.save("smart_grid_model.h5")


Epoch 1/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.8304 - loss: 0.4811 - val_accuracy: 0.9334 - val_loss: 0.2005
Epoch 2/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9153 - loss: 0.2392 - val_accuracy: 0.9523 - val_loss: 0.1372
Epoch 3/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9342 - loss: 0.1887 - val_accuracy: 0.9681 - val_loss: 0.1063
Epoch 4/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9462 - loss: 0.1561 - val_accuracy: 0.9671 - val_loss: 0.1011
Epoch 5/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9539 - loss: 0.1361 - val_accuracy: 0.9774 - val_loss: 0.0728
Epoch 6/25
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9596 - loss: 0.1189 - val_accuracy: 0.9812 - val_loss: 0.0655
Epoch 7/25
[1m433/433



In [12]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
import joblib


# Preprocess the held-out test file with the artifacts saved during training.
X_test, y_test = preprocess_test("test.csv")  # runs preprocessing once
if y_test is None:
    # preprocess_test returns None for the labels when the target column is
    # missing or contains labels the stored encoder cannot transform.
    raise ValueError("test.csv has no usable target labels; cannot evaluate the model")

# Load trained model. The file name must match the one written by
# model.save("smart_grid_model.h5") -- no space before the extension.
model = load_model("smart_grid_model.h5")

# Label names in encoder order; position i is the name of integer class i.
label_encoder = joblib.load("label_encoder.pkl")
y_labels = label_encoder.classes_
n_classes = len(y_labels)
label_ids = np.arange(n_classes)

# Evaluate model. The loss is sparse categorical cross-entropy, so the integer
# labels are passed directly; no one-hot copy of y_test is needed.
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Loss: {loss:.4f}")
print(f"Test Accuracy: {acc:.4f}")

# Predictions
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report & confusion matrix.
# Passing `labels` explicitly keeps the rows aligned with `target_names` even
# if some class does not occur in the test labels or predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=label_ids, target_names=y_labels))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=label_ids))



Final shapes:
X_test: (2955648, 56)
y_test: (2955648,)




[1m92364/92364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 3ms/step - accuracy: 0.9695 - loss: 0.0946

Test Loss: 0.0946
Test Accuracy: 0.9695
[1m92364/92364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 2ms/step

Classification Report:
                        precision    recall  f1-score   support

            high_StNum       0.99      1.00      1.00     39000
             injection       0.99      0.97      0.98     39000
        inverse_replay       0.75      0.76      0.75     30319
 masquerade_fake_fault       0.19      0.85      0.31     17200
masquerade_fake_normal       0.93      1.00      0.97     17420
                normal       1.00      0.97      0.98   2755139
    poisoned_high_rate       0.93      0.99      0.96     18570
         random_replay       0.76      0.99      0.86     39000

              accuracy                           0.97   2955648
             macro avg       0.82      0.94      0.85   2955648
          weighted avg       0.99