In [51]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import load_model 
import os
import joblib # For loading scikit-learn objects
import json # For loading the threshold
import warnings

In [52]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used below (pandas, scikit-learn, TensorFlow) — consider narrowing
# the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore') 

In [53]:
# Directory holding the persisted artifacts read by the cells below
# (fitted_scaler.pkl, trained_autoencoder.h5, threshold.json).
saved_knowledge = "trained_models_and_params"
# Directory containing the CSV file(s) to evaluate; joined with the file name
# below to build the full input path.
file_directory = "processed/"

In [54]:
# Name of the single CSV file evaluated by this notebook run.
test_file_to_process = "Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv" # Example file name
# Path of that file inside file_directory; this is what pd.read_csv receives later.
full_file_path = os.path.join(file_directory, test_file_to_process)

In [55]:
# Load the scaler that was persisted alongside the trained model so that new
# data is transformed with the same fitted parameters.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that were produced by a trusted run of the training notebook.
try:
    loaded_scaler = joblib.load(os.path.join(saved_knowledge, 'fitted_scaler.pkl'))
    print("  StandardScaler loaded successfully for preprocessing.")
except FileNotFoundError:
    print(f"Error: fitted_scaler.pkl not found in {saved_knowledge}. Ensure saving ran successfully.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down (losing all state) rather than simply stopping the
    # run, whereas a raised exception halts "Run All" at this cell with a
    # full traceback.
    raise

  StandardScaler loaded successfully for preprocessing.


In [56]:
# Load the trained autoencoder from its HDF5 file.
try:
    loaded_autoencoder_model = load_model(
        os.path.join(saved_knowledge, 'trained_autoencoder.h5'),
        # Maps the string 'mse' found in the saved model config to a concrete
        # Keras object so deserialisation can resolve it.
        # NOTE(review): if the model is only ever used for .predict(), passing
        # compile=False would presumably make this mapping unnecessary — confirm
        # against later cells before changing.
        custom_objects={'mse': tf.keras.metrics.MeanSquaredError()},
    )
    print("  Autoencoder model loaded successfully.")
except Exception as e:
    print(f"Error loading Autoencoder model: {e}. Ensure saving ran successfully.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than stopping the run, and it discards the
    # traceback needed to diagnose the load failure.
    raise



  Autoencoder model loaded successfully.


In [57]:
# Load the reconstruction-error threshold that separates "normal" from
# "anomaly"; it is stored under the "threshold" key of threshold.json.
try:
    with open(os.path.join(saved_knowledge, 'threshold.json'), 'r') as f:
        loaded_anomaly_threshold = json.load(f)["threshold"]
    print(f"  Anomaly threshold loaded: {loaded_anomaly_threshold:.6f}")
except FileNotFoundError:
    # The message now names the file that is actually opened above.
    print(f"Error: threshold.json not found in {saved_knowledge}. Ensure saving ran successfully.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than stopping the run at this cell.
    raise
except (KeyError, json.JSONDecodeError) as e:
    # The file exists but is malformed or lacks the expected key.
    print(f"Error: threshold.json in {saved_knowledge} is invalid ({e!r}). Ensure saving ran successfully.")
    raise

  Anomaly threshold loaded: 0.001812


In [58]:
def clean_raw_dataframe(df_raw_input):
    """Return a cleaned copy of a raw flow DataFrame, ready for scaling.

    Steps, in order:
      1. Strip surrounding whitespace from the column names.
      2. Drop rows whose 'Label' value is the literal string 'Label'
         (presumably header rows repeated inside the CSV — verify against the data).
      3. Drop the 'Dst Port', 'Protocol' and 'Timestamp' columns when present.
      4. Coerce every column except 'Label' to numeric; unparseable values become NaN.
      5. Replace +/-inf with NaN, then fill NaN in each feature column with
         that column's mean.

    Parameters
    ----------
    df_raw_input : pandas.DataFrame
        Raw data; must contain a 'Label' column (after name stripping).
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Numeric feature columns plus the untouched 'Label' column.

    Raises
    ------
    KeyError
        If no 'Label' column exists after stripping column names.

    Notes
    -----
    A feature column that is entirely NaN has a NaN mean and therefore stays NaN.
    """
    df_proc = df_raw_input.copy()
    df_proc.columns = df_proc.columns.str.strip()
    df_proc = df_proc[df_proc['Label'] != 'Label']

    columns_to_drop = ['Dst Port', 'Protocol', 'Timestamp']
    df_features_and_label = df_proc.drop(columns=columns_to_drop, errors='ignore')

    feature_columns = df_features_and_label.columns.drop('Label', errors='ignore')
    for col in feature_columns:
        df_features_and_label[col] = pd.to_numeric(df_features_and_label[col], errors='coerce')

    df_features_and_label = df_features_and_label.replace([np.inf, -np.inf], np.nan)

    # Only numeric feature columns are mean-imputed: taking .mean() of the
    # string 'Label' column would raise if a label were missing.
    for column in feature_columns:
        if df_features_and_label[column].isnull().any():
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # the in-place form operates on a temporary under pandas
            # Copy-on-Write and leaves the DataFrame unchanged.
            df_features_and_label[column] = df_features_and_label[column].fillna(
                df_features_and_label[column].mean()
            )

    return df_features_and_label

In [59]:
def preprocess_for_testing(df_raw_input, scaler_obj):
    """Clean raw data and scale its features with an already-fitted scaler.

    Parameters
    ----------
    df_raw_input : pandas.DataFrame
        Raw data, passed straight to ``clean_raw_dataframe``.
    scaler_obj : object
        Fitted scaler exposing ``transform``; it is never re-fitted here.

    Returns
    -------
    tuple
        ``(X_scaled, y_true_binary)`` where ``X_scaled`` is whatever
        ``scaler_obj.transform`` returns for the feature columns and
        ``y_true_binary`` is a Series holding 0 for rows labelled exactly
        'Benign' and 1 for every other label.
    """
    cleaned = clean_raw_dataframe(df_raw_input)

    def _binary_label(label):
        # Anything that is not exactly 'Benign' counts as an anomaly.
        if label == 'Benign':
            return 0
        return 1

    labels_binary = cleaned['Label'].apply(_binary_label)
    feature_frame = cleaned.drop(columns='Label')

    # transform() only — fitting on evaluation data would change the scaling.
    return scaler_obj.transform(feature_frame), labels_binary

In [60]:
# Evaluate the loaded autoencoder on one CSV file: load, preprocess, score each
# row by reconstruction error, threshold into normal/anomaly, and report metrics.
# NOTE(review): the recorded output of this cell shows every row classified as
# an anomaly (Normal recall 0.00) and an AUC of 0.3337 — worth checking that the
# scaler and threshold were produced from data with the same feature columns
# as this file.
try:
    # Load the specified test file
    current_df = pd.read_csv(full_file_path)
    print(f"  Successfully loaded {test_file_to_process}. Total rows: {current_df.shape[0]}")

    # Preprocess the current_df using your loaded_scaler
    X_scaled_current, y_true_binary_current = preprocess_for_testing(current_df, loaded_scaler)

    if X_scaled_current.shape[0] == 0:
        print(f"  Warning: No data remaining after preprocessing for {test_file_to_process}. Skipping Autoencoder application.")
    else:
        print(f"  Processed data shape: {X_scaled_current.shape}")
        print(f"  True anomalies in file: {y_true_binary_current.sum()} / {len(y_true_binary_current)}")

        # Apply Autoencoder for anomaly detection: the per-row anomaly score is
        # the mean squared difference between the input and its reconstruction.
        reconstructions_ae = loaded_autoencoder_model.predict(X_scaled_current, verbose=0)
        mse_errors_ae = np.mean(np.square(X_scaled_current - reconstructions_ae), axis=1)

        # Classify using the *loaded anomaly threshold*
        y_pred_binary_ae = (mse_errors_ae > loaded_anomaly_threshold).astype(int)

        # Evaluate Autoencoder performance for this file
        print("\n  --- Autoencoder Results for this file ---")
        if len(np.unique(y_true_binary_current)) < 2:
            # ROC AUC is undefined when the file contains a single class
            # (e.g. an all-benign capture); report that explicitly instead of
            # letting the metric call fail and land in the generic handler.
            print("  Autoencoder AUC: undefined (only one class present in the true labels)")
        else:
            roc_auc_ae = roc_auc_score(y_true_binary_current, mse_errors_ae)
            print(f"  Autoencoder AUC: {roc_auc_ae:.4f}")
        print("  Autoencoder Classification Report:")
        # labels=[0, 1] pins both classes so the two target_names always line
        # up, even when one class is absent from both truth and predictions.
        print(classification_report(
            y_true_binary_current,
            y_pred_binary_ae,
            labels=[0, 1],
            target_names=['Normal', 'Anomaly'],
            zero_division=0,
        ))

except Exception as e:
    # This catches any errors during loading, preprocessing, or model application for the single file.
    print(f"!!! CRASH/ERROR ALERT for {test_file_to_process} !!! An error occurred during processing: {e}")


  Successfully loaded Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv. Total rows: 1048575
  Processed data shape: (1048575, 76)
  True anomalies in file: 52498 / 1048575

  --- Autoencoder Results for this file ---
  Autoencoder AUC: 0.3337
  Autoencoder Classification Report:
              precision    recall  f1-score   support

      Normal       0.00      0.00      0.00    996077
     Anomaly       0.05      1.00      0.10     52498

    accuracy                           0.05   1048575
   macro avg       0.03      0.50      0.05   1048575
weighted avg       0.00      0.05      0.00   1048575

