In [2]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import os
from sklearn.metrics import confusion_matrix
from src.model import MalConv  # Assuming MalConv is defined in /home/pk_02/Desktop/CAPSTONE/src/model.py
import numpy as np

# Set the number of bytes to read from each file
first_n_byte = 900000
window_size = 50  # This is specific to your MalConv model, usually 50 for MalConv

# Custom Dataset class to load PNG images and labels as raw bytes
class MalConvImageDataset(Dataset):
    def __init__(self, image_dir, first_n_byte, transform=None):
        self.image_dir = image_dir
        self.image_paths = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir) if fname.endswith('.png')]
        self.first_n_byte = first_n_byte
        self.transform = transform

        # Create labels based on filenames (malicious=1, benign=0)
        self.labels = [1 if 'malicious' in fname else 0 for fname in os.listdir(image_dir) if fname.endswith('.png')]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        
        # Read image as binary data (first_n_byte bytes)
        with open(image_path, 'rb') as f:
            byte_data = f.read(self.first_n_byte)

        # Pad or truncate to the first_n_byte length if necessary
        byte_data = bytearray(byte_data)
        if len(byte_data) < self.first_n_byte:
            byte_data.extend([0] * (self.first_n_byte - len(byte_data)))  # Pad with 0 bytes
        else:
            byte_data = byte_data[:self.first_n_byte]  # Truncate if too large

        # Convert byte data to a tensor (each byte as an integer)
        byte_tensor = torch.tensor(byte_data, dtype=torch.long)

        return byte_tensor, torch.tensor(label, dtype=torch.long)

# Preprocessing transformations (resize to a consistent size and normalize)
# No need for image resizing or normalization since we're using raw byte data.
# The transform parameter is included but will not be used.
transform = None  # No transformations required for byte data

# Load the dataset
image_dir = '/home/pk_02/Desktop/CAPSTONE/output_exe'
dataset = MalConvImageDataset(image_dir, first_n_byte, transform=transform)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False)

# Define device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = MalConv(input_length=first_n_byte, window_size=window_size).to(device)  # Create an instance of the MalConv model
model.load_state_dict(torch.load('/home/pk_02/Desktop/CAPSTONE/malconv_model_10 steps.pth'))  # Load the trained weights
model.eval()  # Set the model to evaluation mode

# Make predictions and evaluate the model
y_pred = []
y_true = []

with torch.no_grad():  # Disable gradient calculation during inference
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass through the model
        outputs = model(inputs)  # Get model outputs
        outputs = outputs.squeeze()  # Remove unnecessary dimensions
        predicted = (outputs >= 0.5).long()  # Apply threshold to get binary predictions

        y_pred.extend(predicted.cpu().numpy())  # Move to CPU and convert to numpy array
        y_true.extend(labels.cpu().numpy())  # Same for true labels

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Print confusion matrix components
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

# Calculate True Positive Rate (TPR) and False Positive Rate (FPR)
TPR = tp / (tp + fn) if (tp + fn) != 0 else 0
FPR = fp / (fp + tn) if (fp + tn) != 0 else 0

# Print TPR and FPR
print(f"True Positive Rate (TPR): {TPR:.6f}")
print(f"False Positive Rate (FPR): {FPR:.6f}")
print(f"1 - False Positive Rate (1 - FPR): {1 - FPR:.6f}")


  model.load_state_dict(torch.load('/home/pk_02/Desktop/CAPSTONE/malconv_model_10 steps.pth'))  # Load the trained weights


KeyboardInterrupt: 