In [1]:
# Cell 1: Imports and Global Constants
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import urllib.parse
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
import numpy as np
import os # For checking file paths
from transformers import AutoTokenizer, AutoModel

# --- Configuration for Inference ---
# Path where your DistilRoBERTa model and tokenizer are saved locally
TRANSFORMER_LOCAL_PATH = r"D:\Internship ka kaam\data.csv\poroject\Horizon_log\saved_distilroberta_inference"
# Path where your trained MLP weights are saved
MLP_MODEL_PATH = r"D:\Internship ka kaam\data.csv\poroject\Horizon_log\best_pytorch_mlp_model.pth"
# Path to your saved StandardScaler for 'length'
LENGTH_SCALER_PATH = r"D:\Internship ka kaam\data.csv\poroject\Horizon_log\length_scaler2.pkl"
# Path to your saved OneHotEncoder for 'Method'
METHOD_ENCODER_PATH = r"D:\Internship ka kaam\data.csv\poroject\Horizon_log\method_encoder.pkl"

# Transformer parameters (must match training)
MAX_LENGTH = 256 # Or whatever max_length you found optimal during training
BATCH_SIZE_INFERENCE = 64 # Can be larger than training batch_size if only doing forward pass

# MLP Model Architecture parameters (must match training)
INPUT_SIZE_MLP = 1540 # Total features: 1 (length) + 2 (one-hot method) + 768 (content CLS) + 768 (URL CLS)
HIDDEN_SIZES_MLP = (512, 256, 128)
OUTPUT_SIZE_MLP = 1 # Binary classification
DROPOUT_RATE_MLP = 0.2 # Dropout is disabled in eval mode, but define for model consistency

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2: Load Pre-trained Models and Encoders

# Determine device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Inference will run on: {device}")

# --- Load DistilRoBERTa Components ---
try:
    print(f"Loading DistilRoBERTa tokenizer from {TRANSFORMER_LOCAL_PATH}...")
    global_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_LOCAL_PATH)
    print(f"Loading DistilRoBERTa model from {TRANSFORMER_LOCAL_PATH}...")
    global_model = AutoModel.from_pretrained(TRANSFORMER_LOCAL_PATH)
    global_model.eval() # Set to evaluation mode
    global_model.to(device)
    print("DistilRoBERTa loaded successfully.")
except Exception as e:
    print(f"Error loading DistilRoBERTa components from local path: {e}")
    print("Attempting to load from Hugging Face Hub (requires internet)...")
    try:
        global_tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        global_model = AutoModel.from_pretrained('distilroberta-base')
        global_model.eval()
        global_model.to(device)
        print("DistilRoBERTa loaded successfully from Hugging Face Hub.")
    except Exception as e_hub:
        print(f"FATAL ERROR: Could not load DistilRoBERTa from local path or Hugging Face Hub: {e_hub}")
        exit() # Stop script if model can't be loaded

# --- Load Preprocessing Encoders/Scalers ---
try:
    print(f"Loading StandardScaler from {LENGTH_SCALER_PATH}...")
    scaler_for_length = joblib.load(LENGTH_SCALER_PATH)
    print(f"Loading OneHotEncoder from {METHOD_ENCODER_PATH}...")
    method_encoder = joblib.load(METHOD_ENCODER_PATH)
    print("Preprocessing encoders/scalers loaded successfully.")
except FileNotFoundError as e:
    print(f"FATAL ERROR: Required preprocessing file not found: {e}")
    print("Please ensure 'length_scaler.pkl' and 'method_encoder.pkl' are in the correct directory.")
    exit() # Stop script if preprocessing tools are missing
except Exception as e:
    print(f"FATAL ERROR: Error loading preprocessing tools: {e}")
    exit()

# --- Load PyTorch MLP Model ---
# 1. Re-define the SimpleMLP class (must be the exact same as during training)
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2):
        super(SimpleMLP, self).__init__()
        layers = []
        layers.append(nn.Linear(input_size, hidden_sizes[0]))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout_rate)) # Dropout layer
        for i in range(len(hidden_sizes) - 1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate)) # Dropout layer
        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        layers.append(nn.Sigmoid())
        self.network = nn.Sequential(*layers)
    def forward(self, x):
        return self.network(x)

# 2. Instantiate the model and load its state_dict
try:
    print(f"Loading PyTorch MLP model from {MLP_MODEL_PATH}...")
    mlp_model = SimpleMLP(INPUT_SIZE_MLP, HIDDEN_SIZES_MLP, OUTPUT_SIZE_MLP, DROPOUT_RATE_MLP).to(device)
    mlp_model.load_state_dict(torch.load(MLP_MODEL_PATH, map_location=device))
    mlp_model.eval() # Set to evaluation mode (crucial for inference: disables dropout, uses moving averages for BatchNorm if present)
    print("PyTorch MLP model loaded successfully.")
except FileNotFoundError as e:
    print(f"FATAL ERROR: MLP model file not found: {e}")
    print("Please ensure 'best_pytorch_mlp_model.pth' is in the correct directory.")
    exit()
except Exception as e:
    print(f"FATAL ERROR: Error loading PyTorch MLP model: {e}")
    exit()

print("\nAll models and preprocessing tools are ready for inference.")

Inference will run on: cuda
Loading DistilRoBERTa tokenizer from D:\Internship ka kaam\data.csv\poroject\Horizon_log\saved_distilroberta_inference...
Loading DistilRoBERTa model from D:\Internship ka kaam\data.csv\poroject\Horizon_log\saved_distilroberta_inference...
DistilRoBERTa loaded successfully.
Loading StandardScaler from D:\Internship ka kaam\data.csv\poroject\Horizon_log\length_scaler2.pkl...
Loading OneHotEncoder from D:\Internship ka kaam\data.csv\poroject\Horizon_log\method_encoder.pkl...
Preprocessing encoders/scalers loaded successfully.
Loading PyTorch MLP model from D:\Internship ka kaam\data.csv\poroject\Horizon_log\best_pytorch_mlp_model.pth...
PyTorch MLP model loaded successfully.

All models and preprocessing tools are ready for inference.


  mlp_model.load_state_dict(torch.load(MLP_MODEL_PATH, map_location=device))


In [3]:
# Cell 3: Preprocessing Function for New Inference Data

# Helper function to get CLS embedding (from previous training code)
def get_cls_embedding(text: str, tokenizer, model, device, max_length: int):
    """
    Helper function to get [CLS] embedding for a given text for inference.
    Handles empty strings by returning a zero vector of the correct dimension.
    """
    if not text:
        # Return a zero vector, ensuring it's on the correct device initially
        return torch.zeros(model.config.hidden_size).to(device).cpu().numpy()

    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='max_length', # Use max_length padding for consistent input shape
        max_length=max_length
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    with torch.no_grad():
        model_output = model(**encoded_input)
        cls_embedding = model_output.last_hidden_state[0, 0, :]

    return cls_embedding.cpu().numpy()


def preprocess_inference_data(df: pd.DataFrame, batch_size: int, max_length: int):
    """
    Preprocesses new inference data in a DataFrame, consistent with training.

    Args:
        df (pd.DataFrame): Input DataFrame with 'Method', 'URL', 'length', 'content' columns.
        batch_size (int): Batch size for Transformer embedding generation.
        max_length (int): Max sequence length for Transformer.

    Returns:
        pd.DataFrame: Preprocessed DataFrame ready for MLP prediction.
    """
    print("\n--- Starting Data Preprocessing for Inference ---")

    print(f"Input DataFrame columns: {df.columns.tolist()}")
    if 'length' not in df.columns:
        print("CRITICAL WARNING: 'length' column is NOT found in the input DataFrame.")
        print("Please ensure your input CSV has a column named 'length' (case-sensitive).")
        # You might want to raise an error or handle this more robustly
        raise ValueError("Missing 'length' column in input DataFrame.")

    # --- 1. Prepare Text Data for Batch Processing ---
    print("Preparing text data for batch processing...")
    # Fill NaN values with empty string and convert to list of strings
    content_texts = df['content'].fillna('').astype(str).tolist()
    # URL-decode all URLs here
    url_texts = [urllib.parse.unquote(str(url)) for url in df['URL'].fillna('')]

    all_content_cls_embeddings = []
    all_url_cls_embeddings = []

    # --- 2. Generate CLS Embeddings in Batches ---
    print(f"Generating [CLS] embeddings in batches (batch_size={batch_size}, max_length={max_length})...")
    num_samples = len(df)
    for i in range(0, num_samples, batch_size):
        batch_content = content_texts[i:i + batch_size]
        batch_urls = url_texts[i:i + batch_size]

        with torch.no_grad():
            # Process Content Batch
            encoded_content_batch = global_tokenizer(
                batch_content, return_tensors='pt', truncation=True,
                padding='longest', max_length=max_length
            ).to(device)
            content_output = global_model(**encoded_content_batch)
            batch_content_cls_embeds = content_output.last_hidden_state[:, 0, :].cpu().numpy()
            all_content_cls_embeddings.extend(batch_content_cls_embeds)
            del content_output, encoded_content_batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Process URL Batch
            encoded_url_batch = global_tokenizer(
                batch_urls, return_tensors='pt', truncation=True,
                padding='longest', max_length=max_length
            ).to(device)
            url_output = global_model(**encoded_url_batch)
            batch_url_cls_embeds = url_output.last_hidden_state[:, 0, :].cpu().numpy()
            all_url_cls_embeddings.extend(batch_url_cls_embeds)
            del url_output, encoded_url_batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        if (i // batch_size + 1) % 10 == 0 or (i + batch_size) >= num_samples:
            print(f"Processed {min(i + batch_size, num_samples)}/{num_samples} samples...")

    print("Embedding generation complete.")

    content_embed_df = pd.DataFrame(all_content_cls_embeddings, index=df.index).add_prefix('content_embed_')
    url_embed_df = pd.DataFrame(all_url_cls_embeddings, index=df.index).add_prefix('url_embed_')

    # --- 3. One-Hot Encode 'Method' Column ---
    print("One-hot encoding 'Method' column...")
    # Use the LOADED method_encoder (IMPORTANT: DO NOT .fit_transform() here, only .transform())
    method_encoded = method_encoder.transform(df[['Method']])
    method_encoded_df = pd.DataFrame(
        method_encoded,
        columns=method_encoder.get_feature_names_out(['Method']),
        index=df.index
    )
    print("One-hot encoding complete.")

    # --- 4. Scale 'length' Column ---
    print("Scaling 'length' column...")
    # Use the LOADED scaler_for_length (IMPORTANT: DO NOT .fit_transform() here, only .transform())
    df_temp = df.copy() # Work on a copy to avoid modifying original df
    df_temp['length'] = scaler_for_length.transform(df_temp[['length']])
    print("Length scaling complete.")

    print(f"df_temp columns BEFORE scaling: {df_temp.columns.tolist()}")
    if 'length' in df_temp.columns:
        df_temp['length'] = scaler_for_length.transform(df_temp[['length']])
        print(f"First 5 scaled lengths: {df_temp['length'].head().tolist()}")
    else:
        print("CRITICAL WARNING: 'length' column is MISSING in df_temp before scaling.")

    # --- 5. Combine All Features into Final DataFrame ---
    print("Combining all features...")
    # Ensure columns are in the same order as trained data
    # Create an empty DataFrame to build features consistently
    preprocessed_X = pd.concat([
        df_temp[['length']], # Scaled length
        method_encoded_df,   # One-hot encoded methods
        content_embed_df,    # Content embeddings
        url_embed_df         # URL embeddings
    ], axis=1)

    # Ensure column order matches training data's X.columns if specific order is critical
    # If the combined data forms columns that consistently match the training X.columns order, this is fine.
    # Otherwise, you would need to store the exact X.columns from training and reindex preprocessed_X
    # For now, assuming consistent column generation.
    print(f"Final preprocessed_X columns: {preprocessed_X.columns.tolist()}")
    print(f"Final preprocessed_X shape: {preprocessed_X.shape}")

    print("Feature combination complete.")
    print(f"Preprocessed features shape: {preprocessed_X.shape}")
    print (preprocessed_X.head())
    return preprocessed_X

In [4]:
# Cell 4: Prediction Function

def predict_network_payload(input_csv_path: str):
    """
    Receives a CSV file, preprocesses it, and predicts if payloads are benign or malicious.

    Args:
        input_csv_path (str): Path to the input CSV file with
                              'Method', 'URL', 'length', 'content' columns.

    Returns:
        pd.DataFrame: Original DataFrame with an added 'Predicted_Label' column (0 or 1).
    """
    print(f"\n--- Starting Prediction for: {input_csv_path} ---")
    
    # 1. Load the new data
    try:
        new_data_df = pd.read_csv(input_csv_path)
        print(f"Loaded new data. Shape: {new_data_df.shape}")
        # Make a copy to work with, keeping original index
        original_df_for_output = new_data_df.copy()
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_csv_path}")
        return None
    except Exception as e:
        print(f"Error loading input CSV: {e}")
        return None

    # 2. Preprocess the data
    # Pass the inference batch size and max length
    processed_features_for_pred = preprocess_inference_data(
        new_data_df, BATCH_SIZE_INFERENCE, MAX_LENGTH
    )
    
    # Convert preprocessed features to PyTorch tensor and move to device
    X_tensor_for_pred = torch.tensor(processed_features_for_pred.values, dtype=torch.float32).to(device)

    # 3. Make predictions
    print("Making predictions with the MLP model...")
    mlp_model.eval() # Ensure model is in evaluation mode
    with torch.no_grad():
        raw_outputs = mlp_model(X_tensor_for_pred) # Get probabilities
        predictions_proba = raw_outputs.cpu().numpy().flatten() # Move to CPU and flatten
        # Apply threshold (0.5 for binary classification) to get binary labels
        predicted_labels = (predictions_proba > 0.5).astype(int)
    print("Predictions complete.")

    # 4. Add predictions to the original DataFrame and return
    original_df_for_output['Predicted_Probability'] = predictions_proba
    original_df_for_output['Predicted_Label'] = predicted_labels
    
    print("\n--- Prediction Process Finished ---")
    print("Prediction counts:")
    print(original_df_for_output['Predicted_Label'].value_counts())
    
    return original_df_for_output

In [8]:
# Cell 5: Example Usage of the Prediction Function
if __name__ == '__main__':
    # --- Specify the path to your actual inference CSV file ---
    # IMPORTANT: Replace 'path/to/your/actual_inference_data.csv' with the real path
    # Example: 'data/unseen_network_logs.csv'
    actual_inference_csv_path = r"D:\Internship ka kaam\data.csv\poroject\Horizon_log\abc.csv"

    # Check if the specified file exists before proceeding
    if not os.path.exists(actual_inference_csv_path):
        print(f"Error: The specified inference CSV file does not exist at '{actual_inference_csv_path}'")
        print("Please update 'actual_inference_csv_path' with the correct path to your data.")
    else:
        # --- Run Prediction ---
        predictions_df = predict_network_payload(actual_inference_csv_path)

        if predictions_df is not None:
            print("\nPredictions complete. Resulting DataFrame head:")
            # Print more rows to get a better overview of predictions
            print(predictions_df.tail(20)) # Display 20 rows
            
            # Optional: Save the predictions to a new CSV file
            output_csv_path = 'predictions_output.csv'
            predictions_df.to_csv(output_csv_path, index=False)
            print(f"\nPredictions saved to: {output_csv_path}")

        else:
            print("\nPrediction failed. Check error messages above.")


--- Starting Prediction for: D:\Internship ka kaam\data.csv\poroject\Horizon_log\abc.csv ---
Loaded new data. Shape: (916, 4)

--- Starting Data Preprocessing for Inference ---
Input DataFrame columns: ['Method', 'length', 'content', 'URL']
Preparing text data for batch processing...
Generating [CLS] embeddings in batches (batch_size=64, max_length=256)...
Processed 640/916 samples...
Processed 916/916 samples...
Embedding generation complete.
One-hot encoding 'Method' column...
One-hot encoding complete.
Scaling 'length' column...
Length scaling complete.
df_temp columns BEFORE scaling: ['Method', 'length', 'content', 'URL']
First 5 scaled lengths: [-0.42809160376909583, -0.42809160376909583, -0.42809160376909583, -0.42809160376909583, -0.42809160376909583]
Combining all features...
Final preprocessed_X columns: ['length', 'Method_GET', 'Method_POST', 'Method_PUT', 'content_embed_0', 'content_embed_1', 'content_embed_2', 'content_embed_3', 'content_embed_4', 'content_embed_5', 'conte