In [6]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import warnings

# Ignore potential version mismatch warnings for scikit-learn
warnings.filterwarnings("ignore", category=UserWarning)

class NeuralNetworkClassifier(nn.Module):
    """Deep Neural Network for binary classification"""
    def __init__(self, input_dim, hidden_dims=[1024, 1024, 512, 512, 256, 256, 128, 128], dropout_rate=0.3):
        super(NeuralNetworkClassifier, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, 2)) # Binary classification
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

def predict_new_data(new_data_path, model_path):
    """
    Loads a trained model and predicts on new, unseen data, correcting for
    feature list discrepancy in the saved model file.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    try:
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        print("Model checkpoint loaded successfully.")
    except FileNotFoundError:
        print(f"Error: Model file not found at {model_path}.")
        return None

    input_dim = checkpoint['input_dim']
    scaler = checkpoint['scaler']
    
    # FIX: Get the correct feature list directly from the scaler object
    correct_feature_names = scaler.get_feature_names_out()
    
    model = NeuralNetworkClassifier(input_dim=input_dim).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    print(f"Model was trained on {len(correct_feature_names)} features. Applying this list.")

    try:
        new_df = pd.read_csv(new_data_path)
        original_df = new_df.copy()
    except FileNotFoundError:
        print(f"Error: New data file not found at {new_data_path}")
        return None

    new_df.columns = new_df.columns.str.strip()

    if not all(feature in new_df.columns for feature in correct_feature_names):
        print("Error: The new data is missing one or more required columns.")
        missing_cols = [f for f in correct_feature_names if f not in new_df.columns]
        print(f"Missing columns: {missing_cols}")
        return None

    X_new = new_df[correct_feature_names]
    X_new.replace([np.inf, -np.inf], np.nan, inplace=True)
    if X_new.isnull().sum().sum() > 0:
        X_new.fillna(0, inplace=True)

    X_new_scaled = scaler.transform(X_new)
    X_new_tensor = torch.FloatTensor(X_new_scaled).to(device)

    print("Making predictions on the new data...")
    with torch.no_grad():
        outputs = model(X_new_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        predictions = torch.argmax(probabilities, dim=1)

    predictions_np = predictions.cpu().numpy()
    probabilities_np = probabilities.cpu().numpy()

    original_df['Predicted_Label'] = ['Malicious' if p == 1 else 'Benign' for p in predictions_np]
    original_df['Confidence'] = probabilities_np.max(axis=1)

    print("Prediction complete.")
    return original_df

if __name__ == '__main__':
    # Define file paths
    model_file = 'cicids_neural_network_enhanced.pth'
    input_csv = 'SamplesTesting.csv'
    output_csv = 'predictions_on_new_samples.csv'
    
    # Get predictions
    predicted_df = predict_new_data(new_data_path=input_csv, model_path=model_file)
    
    if predicted_df is not None:
        print("\n--- Prediction Results ---")
        print(predicted_df)
        predicted_df.to_csv(output_csv, index=False)
        print(f"\nResults saved to '{output_csv}'")

Using device: cpu
Model checkpoint loaded successfully.
Model was trained on 55 features. Applying this list.
Making predictions on the new data...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new.replace([np.inf, -np.inf], np.nan, inplace=True)


Prediction complete.

--- Prediction Results ---
    Destination Port  Flow Duration  Total Fwd Packets  \
0                  0      112641719                  3   
1                  0      112641466                  3   
2                  0      112638623                  3   
3                 22        6453966                 15   
4                 22        8804066                 14   
5                 22        6989341                 16   
6                  0      112640480                  3   
7                  0      112641244                  3   
8                 80         476513                  5   
9                 21              2                  1   
10                21              1                  1   
11                21              1                  1   
12                21             21                  1   
13                21              2                  1   
14                21              2                  1   
15                21   