In [1]:
import glob
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision.models import resnet18, ResNet18_Weights
from tqdm import tqdm

In [2]:
goodware_files = glob.glob("/Users/giuseppe/PycharmProjects/urbanSecurityGDGV/resources/goodware_dataset/*.exe")
malware_files = glob.glob("/Users/giuseppe/PycharmProjects/urbanSecurityGDGV/resources/malware_dataset/*")

file_paths = malware_files + goodware_files
labels = [1] * len(malware_files) + [0] * len(goodware_files)

In [3]:
def exe_to_image(file_path, width=256):
    with open(file_path, "rb") as f:
        byte_array = np.frombuffer(f.read(), dtype=np.uint8)

    length = len(byte_array)
    height = int(np.ceil(length / width))

    padded = np.pad(byte_array, (0, height * width - length), 'constant', constant_values=0)
    image = padded.reshape((height, width))

    return Image.fromarray(image)

In [4]:
class EXEDataset(Dataset):
    def __init__(self, file_paths, labels, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        img = exe_to_image(self.file_paths[idx])
        if self.transform:
            img = self.transform(img)
        label = self.labels[idx]
        return img, label


In [5]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # ResNet richiede 3 canali
    transforms.ToTensor()
])

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [6]:
def create_model():
    model = resnet18(weights=ResNet18_Weights.DEFAULT)
    model.fc = nn.Linear(model.fc.in_features, 2)  # Binary classification
    return model.to(device)

In [7]:
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, f1

In [8]:
dataset = EXEDataset(file_paths, labels, transform)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(file_paths, labels)):
    print(f"--- Fold {fold+1} ---")

    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32)

    model = create_model()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    # Training loop
    model.train()
    for epoch in range(10):  # puoi aumentare il numero di epoche
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            scheduler.step()

    acc, f1 = evaluate(model, val_loader)
    print(f"Accuracy: {acc:.4f} | F1-score: {f1:.4f}")
    accuracies.append(acc)
    f1_scores.append(f1)

--- Fold 1 ---


Epoch 1: 100%|██████████| 239/239 [01:06<00:00,  3.62it/s]
Epoch 2: 100%|██████████| 239/239 [01:02<00:00,  3.83it/s]
Epoch 3: 100%|██████████| 239/239 [01:03<00:00,  3.78it/s]
Epoch 4: 100%|██████████| 239/239 [01:09<00:00,  3.45it/s]
Epoch 5: 100%|██████████| 239/239 [01:21<00:00,  2.93it/s]


Accuracy: 0.8939 | F1-score: 0.8929
--- Fold 2 ---


Epoch 1: 100%|██████████| 239/239 [01:11<00:00,  3.34it/s]
Epoch 2: 100%|██████████| 239/239 [01:18<00:00,  3.04it/s]
Epoch 3: 100%|██████████| 239/239 [01:20<00:00,  2.98it/s]
Epoch 4: 100%|██████████| 239/239 [01:19<00:00,  3.02it/s]
Epoch 5: 100%|██████████| 239/239 [01:21<00:00,  2.92it/s]


Accuracy: 0.9127 | F1-score: 0.9127
--- Fold 3 ---


Epoch 1: 100%|██████████| 239/239 [01:25<00:00,  2.81it/s]
Epoch 2: 100%|██████████| 239/239 [01:23<00:00,  2.86it/s]
Epoch 3: 100%|██████████| 239/239 [01:26<00:00,  2.77it/s]
Epoch 4: 100%|██████████| 239/239 [01:20<00:00,  2.99it/s]
Epoch 5: 100%|██████████| 239/239 [01:50<00:00,  2.17it/s]


Accuracy: 0.8278 | F1-score: 0.8237
--- Fold 4 ---


Epoch 1: 100%|██████████| 239/239 [01:44<00:00,  2.29it/s]
Epoch 2: 100%|██████████| 239/239 [01:44<00:00,  2.29it/s]
Epoch 3: 100%|██████████| 239/239 [01:40<00:00,  2.37it/s]
Epoch 4: 100%|██████████| 239/239 [01:46<00:00,  2.24it/s]
Epoch 5: 100%|██████████| 239/239 [01:45<00:00,  2.27it/s]


Accuracy: 0.9045 | F1-score: 0.9045
--- Fold 5 ---


Epoch 1: 100%|██████████| 239/239 [01:44<00:00,  2.29it/s]
Epoch 2: 100%|██████████| 239/239 [01:44<00:00,  2.28it/s]
Epoch 3: 100%|██████████| 239/239 [01:45<00:00,  2.27it/s]
Epoch 4: 100%|██████████| 239/239 [01:47<00:00,  2.22it/s]
Epoch 5: 100%|██████████| 239/239 [01:46<00:00,  2.25it/s]


Accuracy: 0.8986 | F1-score: 0.8981
--- Fold 6 ---


Epoch 1: 100%|██████████| 239/239 [01:46<00:00,  2.24it/s]
Epoch 2: 100%|██████████| 239/239 [01:46<00:00,  2.24it/s]
Epoch 3: 100%|██████████| 239/239 [01:30<00:00,  2.64it/s]
Epoch 4: 100%|██████████| 239/239 [01:28<00:00,  2.70it/s]
Epoch 5: 100%|██████████| 239/239 [01:21<00:00,  2.92it/s]


Accuracy: 0.8939 | F1-score: 0.8936
--- Fold 7 ---


Epoch 1: 100%|██████████| 239/239 [01:21<00:00,  2.94it/s]
Epoch 2: 100%|██████████| 239/239 [01:22<00:00,  2.90it/s]
Epoch 3: 100%|██████████| 239/239 [01:16<00:00,  3.12it/s]
Epoch 4: 100%|██████████| 239/239 [01:14<00:00,  3.23it/s]
Epoch 5: 100%|██████████| 239/239 [01:13<00:00,  3.24it/s]


Accuracy: 0.8689 | F1-score: 0.8685
--- Fold 8 ---


Epoch 1: 100%|██████████| 239/239 [01:13<00:00,  3.26it/s]
Epoch 2: 100%|██████████| 239/239 [01:12<00:00,  3.30it/s]
Epoch 3: 100%|██████████| 239/239 [01:12<00:00,  3.28it/s]
Epoch 4: 100%|██████████| 239/239 [01:12<00:00,  3.28it/s]
Epoch 5: 100%|██████████| 239/239 [01:12<00:00,  3.29it/s]


Accuracy: 0.8937 | F1-score: 0.8937
--- Fold 9 ---


Epoch 1: 100%|██████████| 239/239 [01:13<00:00,  3.25it/s]
Epoch 2: 100%|██████████| 239/239 [01:13<00:00,  3.24it/s]
Epoch 3: 100%|██████████| 239/239 [01:12<00:00,  3.28it/s]
Epoch 4: 100%|██████████| 239/239 [01:12<00:00,  3.28it/s]
Epoch 5: 100%|██████████| 239/239 [01:12<00:00,  3.29it/s]


Accuracy: 0.9067 | F1-score: 0.9061
--- Fold 10 ---


Epoch 1: 100%|██████████| 239/239 [01:16<00:00,  3.11it/s]
Epoch 2: 100%|██████████| 239/239 [01:29<00:00,  2.66it/s]
Epoch 3: 100%|██████████| 239/239 [01:34<00:00,  2.53it/s]
Epoch 4: 100%|██████████| 239/239 [01:36<00:00,  2.48it/s]
Epoch 5: 100%|██████████| 239/239 [01:35<00:00,  2.49it/s]


Accuracy: 0.9020 | F1-score: 0.9013


In [9]:
print("\n--- Risultati finali (10-fold CV) ---")
print(f"Accuracy media: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"F1-score medio: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")


--- Risultati finali (10-fold CV) ---
Accuracy media: 0.8903 ± 0.0236
F1-score medio: 0.8895 ± 0.0246
