In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the benign and malware feature tables from CSV.
benign_dataset = pd.read_csv("../data/benign.csv")
malware_dataset = pd.read_csv("../data/malware.csv")

print(benign_dataset.shape, malware_dataset.shape)

# Attach the label column: 0 = benign, 1 = malware.
benign_dataset["ground_truth"] = 0
malware_dataset["ground_truth"] = 1

# Stack both tables and shuffle the rows. The shuffle is seeded so that a
# fresh "Restart & Run All" produces the same row order; otherwise the seeded
# train/test split further down would still select different samples each run.
combined_dataset = (
    pd.concat([benign_dataset, malware_dataset], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Drop the first column by position.
# NOTE(review): presumably the row-index column saved into the CSVs — confirm.
combined_dataset = combined_dataset.drop(columns=combined_dataset.columns[0])
print(combined_dataset.shape)

(15166, 2382) (8970, 2382)
(24136, 2382)


In [3]:
# Preview the first five shuffled rows.
combined_dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2372,2373,2374,2375,2376,2377,2378,2379,2380,ground_truth
0,0.540527,0.018148,0.014811,0.006185,0.014947,0.003852,0.012831,0.006158,0.005669,0.004584,...,0.0,0.0,0.0,8.0,8192.0,0.0,0.0,72.0,8200.0,0
1,0.041336,0.006592,0.003821,0.004275,0.006518,0.003952,0.003922,0.00521,0.007232,0.004194,...,422288.0,0.0,0.0,108.0,61440.0,0.0,0.0,0.0,0.0,1
2,0.046235,0.006822,0.003634,0.003757,0.00393,0.003552,0.003513,0.003523,0.003689,0.003563,...,0.0,0.0,0.0,176.0,446464.0,176.0,12288.0,0.0,0.0,1
3,0.117287,0.163838,0.00572,0.010307,0.016535,0.006009,0.003274,0.003774,0.005486,0.002673,...,0.0,0.0,0.0,148.0,120368.0,0.0,0.0,0.0,0.0,1
4,0.235805,0.01612,0.008132,0.008151,0.009494,0.004992,0.012816,0.003794,0.004175,0.004865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,8192.0,0


In [4]:
# Separate the label vector from the feature matrix.
label_column = "ground_truth"
y = combined_dataset[label_column]
X = combined_dataset.drop(columns=[label_column])
print(X.shape, y.shape)

(24136, 2381) (24136,)


In [5]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; the scaler is fitted on the full matrix X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Fit a PCA that keeps every component so the whole variance spectrum is available.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Share of variance per component, and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio_cumulative = explained_variance_ratio.cumsum()

In [7]:
# Index of the first component at which the cumulative variance reaches 95%,
# converted to a count (+1).
reaches_target = explained_variance_ratio_cumulative >= 0.95
n_components = np.argmax(reaches_target) + 1
print(f"Number of components to retain for 95% variance: {n_components}")

# Refit PCA restricted to that many components.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

Number of components to retain for 95% variance: 679


In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Supervised projection of the scaled features onto a single discriminant axis.
lda = LDA(n_components=1)
lda.fit(X_scaled, y)
X_lda = lda.transform(X_scaled)

In [9]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a default XGBoost classifier on the training split only and read off
# its per-feature importances.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
feature_importances = xgb_model.feature_importances_

# Keep the N features with the highest importance (descending order).
N = 14
selected_feature_indices = np.argsort(feature_importances)[::-1][:N]
selected_features = X.columns[selected_feature_indices]

X_train_selected, X_test_selected = (
    split.iloc[:, selected_feature_indices] for split in (X_train, X_test)
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)
print(X_train_scaled.shape, X_test_scaled.shape)

(19308, 14) (4828, 14)


In [None]:
import torch
# Convert the scaled feature arrays and the label Series to float32 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_scaled, X_test_scaled)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.float32)
    for labels in (y_train, y_test)
)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np

In [11]:
class MalwareDetector(nn.Module):
    """Single-layer RNN followed by a linear head.

    Each input row is fed to the RNN as a sequence of length one, and the
    RNN output at the last (only) step is mapped to `output_size` values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the recurrent layer and the linear output layer.

        Args:
            input_size: number of features per sample.
            hidden_size: width of the RNN hidden state.
            output_size: number of values produced per sample.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear head's output for each row of `x`.

        `x` gets a new axis inserted at dim 1, which the batch-first RNN
        treats as a sequence axis of length one.
        """
        sequence = x.unsqueeze(1)
        rnn_out, _hidden = self.rnn(sequence)
        # Take the output at the final time step of every sequence.
        return self.fc(rnn_out[:, -1, :])

In [12]:
# Network dimensions: one input per selected feature column, 64 hidden
# units, and two outputs (one per class).
input_size = X_train_scaled.shape[-1]
hidden_size = 64
output_size = 2
model = MalwareDetector(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

In [13]:
# Cross-entropy loss over the model outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [14]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

MalwareDetector(
  (rnn): RNN(14, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [16]:
# Pair features with labels for each split.
train_dataset = data.TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = data.TensorDataset(X_test_tensor, y_test_tensor)

# Batches of 32; only the training loader shuffles.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Per-epoch metric history for the training and held-out sets.
train_losses = []
train_accuracies = []
train_auc_scores = []
test_losses = []
test_accuracies = []
test_auc_scores = []

epochs = 10
for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_loss_sum = 0.0
    num_correct_train = 0
    num_samples_train = 0
    y_true_train = []
    y_pred_train = []
    y_score_train = []

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels are stored as float32 tensors; CrossEntropyLoss needs class indices.
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean, so weight it by the batch size;
        # dividing by the sample count below then gives a per-sample mean.
        train_loss_sum += loss.item() * labels.size(0)

        _, predictions = torch.max(outputs, 1)
        num_correct_train += (predictions == labels).sum().item()
        num_samples_train += labels.size(0)
        y_true_train.extend(labels.cpu().numpy())
        y_pred_train.extend(predictions.cpu().numpy())
        # Class-1 probability: ROC AUC needs a continuous score, not hard labels.
        y_score_train.extend(torch.softmax(outputs.detach(), dim=1)[:, 1].cpu().numpy())

    train_accuracy = num_correct_train / num_samples_train * 100
    train_losses.append(train_loss_sum / num_samples_train)
    train_accuracies.append(train_accuracy)

    train_auc_score = roc_auc_score(y_true_train, y_score_train)
    train_auc_scores.append(train_auc_score)

    # ---- Evaluation pass (no gradient tracking) ----
    model.eval()
    test_loss_sum = 0.0
    num_correct_test = 0
    num_samples_test = 0
    y_true_test = []
    y_pred_test = []
    y_score_test = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            test_loss_sum += loss.item() * labels.size(0)

            _, predictions = torch.max(outputs, 1)
            num_correct_test += (predictions == labels).sum().item()
            num_samples_test += labels.size(0)
            y_true_test.extend(labels.cpu().numpy())
            y_pred_test.extend(predictions.cpu().numpy())
            y_score_test.extend(torch.softmax(outputs, dim=1)[:, 1].cpu().numpy())

    test_accuracy = num_correct_test / num_samples_test * 100
    test_losses.append(test_loss_sum / num_samples_test)
    test_accuracies.append(test_accuracy)

    test_auc_score = roc_auc_score(y_true_test, y_score_test)
    test_auc_scores.append(test_auc_score)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.2f}%, Train AUC: {train_auc_scores[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracies[-1]:.2f}%, Test AUC: {test_auc_scores[-1]:.4f}")


