In [14]:
import os
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.autograd import Variable
from src.util import ExeDataset, write_pred
from src.model import MalConv



use_gpu = True  # Set to True if you want to use GPU
learning_rate = 0.001
max_step = 100
test_step = 10
batch_size = 32
first_n_byte = 1000000
window_size = 50
display_step = 10
num_epochs = 10

In [3]:
data_path_benign = ""
data_path_malicious = ""

# Create dataset and dataloaders
dataset = ExeDataset(data_path_benign, data_path_malicious, first_n_byte)
dataloader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=True, num_workers=10)

# Initialize model, loss function, and optimizer
malconv = MalConv(input_length=first_n_byte, window_size=window_size)
bce_loss = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(malconv.parameters(), lr=learning_rate)

device = torch.device("cuda" if use_gpu else "cpu")
malconv.to(device)
bce_loss.to(device)

if use_gpu and torch.cuda.is_available():
    malconv = malconv.cuda()
    bce_loss = bce_loss.cuda()

# Training loop
step_msg = 'step-{}-loss-{:.6f}'
log_file_path = 'training_log.txt'
log = open(log_file_path, 'w')
log.write('step,tr_loss\n')

total_step = 0

for i in range(num_epochs):
    for step, (exe_input, label) in enumerate(dataloader):
        start_time = time.time()

        optimizer.zero_grad()

        # Move data to the appropriate device (GPU or CPU)
        exe_input = exe_input.to(device)
        label = label.to(device)
        # Forward pass
        pred = malconv(Variable(exe_input.long()))
        loss = bce_loss(pred, Variable(label.float()))

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

    
    print(step_msg.format(total_step, loss.item()), end='\r', flush=True)
    log.write(f"{total_step},{loss.item()}\n")

print("Training complete. Log saved to")

FileNotFoundError: [Errno 2] No such file or directory: ''

In [None]:
model_save_path = 'malconv_model_mypc.pth'
torch.save(malconv.state_dict(), model_save_path)

In [15]:
model_save_path = 'malconv_model_10 steps.pth'  # Path where the model was saved.
malconv = MalConv(input_length=first_n_byte, window_size=window_size)
malconv.load_state_dict(torch.load(model_save_path))
malconv.eval()  # Set the model to evaluation mode.

# Move model to CPU/GPU based on availability.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
malconv.to(device)

  malconv.load_state_dict(torch.load(model_save_path))


MalConv(
  (embed): Embedding(257, 8, padding_idx=0)
  (conv_1): Conv1d(4, 128, kernel_size=(50,), stride=(50,))
  (conv_2): Conv1d(4, 128, kernel_size=(50,), stride=(50,))
  (pooling): MaxPool1d(kernel_size=20000, stride=20000, padding=0, dilation=1, ceil_mode=False)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [16]:
def load_executable(file_path):
    try:
        with open(file_path, 'rb') as f:
            tmp = [i + 1 for i in f.read()[:first_n_byte]]
            tmp += [0] * (first_n_byte - len(tmp))  # Pad with zeros if needed

        return np.array(tmp)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

In [17]:
def predict(file_name):
    exe_input = load_executable(file_name)

    if exe_input is not None:
        # Convert input to tensor and move to device
        exe_input_tensor = torch.tensor(exe_input).unsqueeze(
            0).to(device)  # Add batch dimension

        # Forward pass to get prediction
        with torch.no_grad():  # Disable gradient calculation for testing
            pred = malconv(exe_input_tensor.long())

        prediction = torch.sigmoid(pred).cpu().numpy()
        print(f"Prediction for {i}: {prediction[0][0]:.6f}")
    else:
        print("Failed to load the executable file.")

In [14]:
bens = os.listdir("malconv/test/ben")
print(len(bens))
for i in range(len(bens)):
    bens[i] = "malconv/test/ben/"+bens[i]
print(bens)

3000
['malconv/test/ben/207412', 'malconv/test/ben/209319', 'malconv/test/ben/195360', 'malconv/test/ben/233666', 'malconv/test/ben/241238', 'malconv/test/ben/194269', 'malconv/test/ben/246156', 'malconv/test/ben/233503', 'malconv/test/ben/194141', 'malconv/test/ben/211026', 'malconv/test/ben/244548', 'malconv/test/ben/191824', 'malconv/test/ben/183229', 'malconv/test/ben/200800', 'malconv/test/ben/201169', 'malconv/test/ben/214224', 'malconv/test/ben/225326', 'malconv/test/ben/210131', 'malconv/test/ben/253922', 'malconv/test/ben/212686', 'malconv/test/ben/214097', 'malconv/test/ben/245829', 'malconv/test/ben/233181', 'malconv/test/ben/255811', 'malconv/test/ben/195345', 'malconv/test/ben/251561', 'malconv/test/ben/218070', 'malconv/test/ben/251421', 'malconv/test/ben/217708', 'malconv/test/ben/257094', 'malconv/test/ben/190210', 'malconv/test/ben/247203', 'malconv/test/ben/233457', 'malconv/test/ben/238307', 'malconv/test/ben/212910', 'malconv/test/ben/250804', 'malconv/test/ben/2351

In [22]:
mals = os.listdir("Playground/outputexe")
print(len(mals))
for i in range(len(mals)):
    mals[i] = "Playground/outputexe/"+mals[i]
print(mals)

1805
['Playground/outputexe/558', 'Playground/outputexe/1438', 'Playground/outputexe/922', 'Playground/outputexe/253', 'Playground/outputexe/1047', 'Playground/outputexe/976', 'Playground/outputexe/518', 'Playground/outputexe/937', 'Playground/outputexe/186', 'Playground/outputexe/267', 'Playground/outputexe/1140', 'Playground/outputexe/1504', 'Playground/outputexe/1126', 'Playground/outputexe/975', 'Playground/outputexe/925', 'Playground/outputexe/1243', 'Playground/outputexe/546', 'Playground/outputexe/1043', 'Playground/outputexe/956', 'Playground/outputexe/1558', 'Playground/outputexe/446', 'Playground/outputexe/1153', 'Playground/outputexe/830', 'Playground/outputexe/731', 'Playground/outputexe/827', 'Playground/outputexe/1416', 'Playground/outputexe/584', 'Playground/outputexe/1620', 'Playground/outputexe/1470', 'Playground/outputexe/883', 'Playground/outputexe/61', 'Playground/outputexe/317', 'Playground/outputexe/1605', 'Playground/outputexe/98', 'Playground/outputexe/1154', 'P

In [23]:
lst = mals

In [None]:
import torch
import numpy as np

count = 0
b = 0
mal = 0

# Initialize confusion matrix counters
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0

# Load the single executable file
for i in lst:
    exe_input = load_executable(i)

    if exe_input is not None:
        # Convert input to tensor and move to device
        exe_input_tensor = torch.tensor(exe_input).unsqueeze(
            0).to(device)  # Add batch dimension

        # Forward pass to get prediction
        with torch.no_grad():  # Disable gradient calculation for testing
            pred = malconv(exe_input_tensor.long())

        # Process prediction (sigmoid activation)
        # Move prediction back to CPU and convert to numpy
        prediction = torch.sigmoid(pred).cpu().numpy()

        # Count benign and malicious predictions as per original logic
        if count <= 3000 and prediction <= 0.5:
            b += 1  # Count benign predictions
            true_negative += 1  # Correctly predicted as benign
        elif count >= 3000 and prediction >= 0.5:
            mal += 1  # Count malicious predictions
            true_positive += 1  # Correctly predicted as malicious

        # Confusion matrix logic
        if count < 3000:  # Ground truth benign
            if prediction <= 0.5:
                true_negative += 1  # Correctly predicted as benign
            else:
                false_positive += 1  # Incorrectly predicted as malicious
        else:  # Ground truth malicious
            if prediction >= 0.5:
                true_positive += 1  # Correctly predicted as malicious
            else:
                false_negative += 1  # Incorrectly predicted as benign

        # Print prediction details
        print(f"{count} Prediction for {i}: {prediction[0][0]:.6f}")
    else:
        print("Failed to load the executable file.")

    count += 1

# Print confusion matrix
print(f"True Positives (TP): {true_positive}")
print(f"True Negatives (TN): {true_negative}")
print(f"False Positives (FP): {false_positive}")
print(f"False Negatives (FN): {false_negative}")

# Calculate TPR and FPR
TPR = true_positive / \
    (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
FPR = false_positive / \
    (false_positive + true_negative) if (false_positive + true_negative) != 0 else 0

# Print TPR and FPR
print(f"True Positive Rate (TPR): {TPR:.6f}")
print(f"False Positive Rate (FPR): {FPR:.6f}")
print(f"1 - False Positive Rate (1 - FPR): {1 - FPR:.6f}")

0 Prediction for malconv/test/ben/207412: 0.123393
1 Prediction for malconv/test/ben/209319: 0.030569
2 Prediction for malconv/test/ben/195360: 0.006675
3 Prediction for malconv/test/ben/233666: 0.039899
4 Prediction for malconv/test/ben/241238: 0.049630
5 Prediction for malconv/test/ben/194269: 0.047351
6 Prediction for malconv/test/ben/246156: 0.047783
7 Prediction for malconv/test/ben/233503: 0.292423
8 Prediction for malconv/test/ben/194141: 0.005682
9 Prediction for malconv/test/ben/211026: 0.050769
10 Prediction for malconv/test/ben/244548: 0.035248
11 Prediction for malconv/test/ben/191824: 0.009485
12 Prediction for malconv/test/ben/183229: 0.377636
13 Prediction for malconv/test/ben/200800: 0.143986
14 Prediction for malconv/test/ben/201169: 0.171652
15 Prediction for malconv/test/ben/214224: 0.053356
16 Prediction for malconv/test/ben/225326: 0.008275
17 Prediction for malconv/test/ben/210131: 0.050018
18 Prediction for malconv/test/ben/253922: 0.251395
19 Prediction for malc

In [24]:
import torch
import numpy as np

count = 0
b = 0
mal = 0

# Initialize confusion matrix counters


# Load the single executable file
for i in lst:

    exe_input = load_executable(i)

    if exe_input is not None:
        # Convert input to tensor and move to device
        exe_input_tensor = torch.tensor(exe_input).unsqueeze(
            0).to(device)  # Add batch dimension

        # Forward pass to get prediction
        with torch.no_grad():  # Disable gradient calculation for testing
            pred = malconv(exe_input_tensor.long())

        # Process prediction (sigmoid activation)
        # Move prediction back to CPU and convert to numpy
        prediction = torch.sigmoid(pred).cpu().numpy()
        print(prediction)

        # Count benign and malicious predictions as per original logic
    #     if count <= 3000 and prediction <= 0.5:Positive
    #         b += 1  # Count benign predictions
    #         true_negative += 1  # Correctly predicted as benign
    #     elif count >= 3000 and prediction >= 0.5:
    #         mal += 1  # Count malicious predictions
    #         true_positive += 1  # Correctly predicted as malicious

    #     # Confusion matrix logic
    #     if count < 3000:  # Ground truth benign
    #         if prediction <= 0.5:
    #             true_negative += 1  # Correctly predicted as benign
    #         else:
    #             false_positive += 1  # Incorrectly predicted as malicious
    #     else:  # Ground truth malicious
    #         if prediction >= 0.5:
    #             true_positive += 1  # Correctly predicted as malicious
    #         else:
    #             false_negative += 1  # Incorrectly predicted as benign

    #     # Print prediction details
    #     print(f"{count} Prediction for {i}: {prediction[0][0]:.6f}")
    # else:
    #     print("Failed to load the executable file.")
        if(prediction<0.5):
            b+=1
        else:
            mal+=1

    count += 1

# Print confusion matrix
print("Total Benign ", b)

# Calculate TPR and FPR

# print(f"False Positive Rate (FPR): {FPR:.6f}")
# print(f"1 - False Positive Rate (1 - FPR): {1 - FPR:.6f}")

[[0.07067262]]
[[0.11917271]]
[[0.03931455]]
[[0.03865615]]
[[0.04277464]]
[[0.11158679]]
[[0.05052357]]
[[0.04745558]]
[[0.02717225]]
[[0.02606956]]
[[0.13179532]]
[[0.07931324]]
[[0.02416562]]
[[0.03789941]]
[[0.03949426]]
[[0.06008001]]
[[0.11920851]]
[[0.09570425]]
[[0.06171061]]
[[0.16529973]]
[[0.04015337]]
[[0.02533852]]
[[0.06128513]]
[[0.07660232]]
[[0.03455812]]
[[0.04225782]]
[[0.09318747]]
[[0.08232105]]
[[0.08612438]]
[[0.05536716]]
[[0.08612438]]
[[0.3256923]]
[[0.04271715]]
[[0.14295134]]
[[0.05342907]]
[[0.04711187]]
[[0.02978072]]
[[0.20025584]]
[[0.03039944]]
[[0.03819184]]
[[0.03335254]]
[[0.04489379]]
[[0.03529176]]
[[0.15235426]]
[[0.13179532]]
[[0.03909788]]
[[0.07848537]]
[[0.04985438]]
[[0.10087495]]
[[0.04450111]]
[[0.14925046]]
[[0.10285695]]
[[0.03605258]]
[[0.0998895]]
[[0.03691187]]
[[0.04437952]]
[[0.02874276]]
[[0.13781223]]
[[0.01627918]]
[[0.02750805]]
[[0.04861925]]
[[0.03954871]]
[[0.0111934]]
[[0.05877495]]
[[0.02230384]]
[[0.06086444]]
[[0.02557371]

## Random move/copy

In [29]:
TBR = b / \
    (count) 
# FPR = false_positive / \
#     (false_positive + true_negative) if (false_positive + true_negative) != 0 else 0

# Print TPR and FPRPositive
print(f"True Benign Rate (TBR): {TBR:.6f}")

True Benign Rate (TBR): 0.996676


In [4]:
import os
import random
import shutil


def move_random_files(source_folder, destination_folder, num_files=75):
    # Ensure both folders exist
    if not os.path.isdir(source_folder):
        print(f"Source folder '{source_folder}' does not exist.")
        return
    if not os.path.isdir(destination_folder):
        os.makedirs(destination_folder)
        print(f"Destination folder '{destination_folder}' created.")

    # List all files in the source folder
    files = [f for f in os.listdir(source_folder) if os.path.isfile(
        os.path.join(source_folder, f))]

    # Check if there are enough files to move
    if len(files) < num_files:
        print(f"Not enough files to move. Only found {len(files)} files.")
        num_files = len(files)  # Adjust to available file count

    # Randomly select files
    files_to_move = random.sample(files, num_files)

    # Move each file to the destination folder
    for file_name in files_to_move:
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        shutil.move(source_path, destination_path)
        print(f"copied: {file_name}")

    print(
        f"Successfully moved {num_files} files from '{source_folder}' to '{destination_folder}'.")


# Example usage
source_folder = r"malconv/train/ben"
destination_folder = r"malconv/test/ben"
move_random_files(source_folder, destination_folder,3000)

Destination folder 'malconv/test/ben' created.
copied: 197179
copied: 238534
copied: 240418
copied: 189256
copied: 237496
copied: 213816
copied: 231707
copied: 190517
copied: 232923
copied: 232641
copied: 190735
copied: 209134
copied: 188570
copied: 200049
copied: 200880
copied: 215133
copied: 252645
copied: 186909
copied: 239624
copied: 220781
copied: 201975
copied: 234187
copied: 208366
copied: 239925
copied: 183775
copied: 247925
copied: 193067
copied: 211433
copied: 237574
copied: 224654
copied: 202818
copied: 233457
copied: 217877
copied: 229335
copied: 215894
copied: 185221
copied: 239630
copied: 252648
copied: 229489
copied: 211026
copied: 234153
copied: 229067
copied: 220564
copied: 213072
copied: 220030
copied: 216339
copied: 243225
copied: 250084
copied: 203022
copied: 229114
copied: 183832
copied: 241238
copied: 201689
copied: 233558
copied: 245324
copied: 240551
copied: 207910
copied: 187040
copied: 214889
copied: 186702
copied: 255708
copied: 220427
copied: 205137
copied: 

In [2]:
!mkdir malconv/train

In [None]:
!mk