## Import Data

In [None]:
import os
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset, Subset
from torch_geometric.loader import DataLoader
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

class MalwareGraphDataset(Dataset):
    """Dataset of labeled graphs stored one per file as ``.pt`` objects.

    Only the file names are held in memory; each graph is read from disk
    when it is indexed.
    """

    def __init__(self, root_dir):
        """
        Index the .pt graph files located directly inside ``root_dir``.

        Parameters:
        - root_dir: str, path to the directory containing the .pt graph files.
        """
        self.root_dir = root_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and machines, so that
        # seeded sampling/splitting by index is actually reproducible.
        self.graph_files = sorted(
            f for f in os.listdir(root_dir) if f.endswith('.pt')
        )

    def __len__(self):
        """Return the number of indexed graph files."""
        return len(self.graph_files)

    def __getitem__(self, idx):
        """Load and return the object stored in the ``idx``-th graph file."""
        file_path = os.path.join(self.root_dir, self.graph_files[idx])
        # NOTE(review): torch.load unpickles the file, so only point this at
        # trusted data. Newer PyTorch releases may default to
        # weights_only=True, which can reject pickled graph objects --
        # confirm against the installed version.
        return torch.load(file_path)

# Path to the directory with processed graphs
processed_data_dir = '/home/belief/Desktop/MalwareDetection/Dynamic 2/ProcessedGraphs'
dataset = MalwareGraphDataset(processed_data_dir)

# Read every graph's label exactly once. The labels are reused below for both
# the per-class grouping and the stratified split, so no file is loaded twice.
graph_labels = [dataset[i].y.item() for i in range(len(dataset))]

# Collect indices for each class
class_indices = defaultdict(list)
for i, label in enumerate(graph_labels):
    class_indices[label].append(i)

if not class_indices:
    # Fail with an actionable message instead of min()'s "empty sequence".
    raise ValueError(f"No .pt graph files found in {processed_data_dir!r}")

# Find the minimum class size
min_class_size = min(len(indices) for indices in class_indices.values())

# Undersample each class to match the minimum class size.
# A dedicated seeded generator makes the undersampling reproducible, in line
# with the fixed random_state used for the train/test split below.
rng = random.Random(42)
balanced_indices = []
for label in sorted(class_indices):
    balanced_indices.extend(rng.sample(class_indices[label], min_class_size))

# 20 - 80 split for testing and training, stratified by class label
train_indices, test_indices = train_test_split(
    balanced_indices,
    test_size=0.2,
    random_state=42,
    stratify=[graph_labels[i] for i in balanced_indices],
)

train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)

# Report how many samples of each class a dataset contains
def count_class_samples(dataset, name):
    """
    Print the number of samples per class label found in ``dataset``.

    Parameters:
    - dataset: iterable of objects whose ``y`` attribute supports ``.item()``.
    - name: str, label for the set, inserted into the printed heading.
    """
    tally = defaultdict(int)
    for sample in dataset:
        tally[sample.y.item()] += 1

    # Classes are listed in the order they were first encountered.
    lines = [f"\nNumber of samples per class in {name} set:"]
    lines.extend(f"Class {cls}: {total} samples" for cls, total in tally.items())
    print("\n".join(lines))

# Print class counts for train and test sets
count_class_samples(train_dataset, "training")
count_class_samples(test_dataset, "testing")

# Create DataLoaders: train_dataset holds 80% of the balanced data and
# test_dataset the held-out 20% (see the train_test_split call above).
# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print("\nDataLoaders are ready. Training on 80% of balanced data, testing on the remaining 20%.")


## Number of Input Features

In [None]:
# Pull one batch out of the training loader to inspect it
data_batch = next(iter(train_loader))

# Show the size of the batched node-feature tensor
print("Dimension of input features (data.x):", data_batch.x.shape)


## Model!

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GraphLSTMClassifier(nn.Module):
    """
    Graph-level classifier: two GCN layers, mean pooling, an LSTM and a
    linear head that emits per-class log-probabilities.

    Parameters:
    - input_dim: size of each node feature vector.
    - hidden_dim: output width of both GCN layers (and the LSTM input size).
    - lstm_hidden_dim: hidden size of the LSTM.
    - output_dim: number of classes.
    - dropout_rate: dropout probability shared by every dropout call.
    """

    def __init__(self, input_dim, hidden_dim, lstm_hidden_dim, output_dim, dropout_rate=0.3):
        super().__init__()
        # Graph convolutions
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

        # One dropout module, reused after each stage
        self.dropout = nn.Dropout(dropout_rate)

        # Recurrent layer applied to the pooled graph embedding
        self.lstm = nn.LSTM(hidden_dim, lstm_hidden_dim, batch_first=True)

        # Classification head
        self.fc1 = nn.Linear(lstm_hidden_dim, output_dim)

    def forward(self, data):
        """
        Classify every graph in a batch.

        ``data`` must expose ``x``, ``edge_index`` and ``batch``. Returns
        log-probabilities of shape (batch_size, output_dim).
        """
        node_feats = data.x
        edges = data.edge_index

        # conv -> ReLU -> dropout, once per GCN layer
        for conv in (self.conv1, self.conv2):
            node_feats = self.dropout(F.relu(conv(node_feats, edges)))

        # Average node embeddings per graph: (batch_size, hidden_dim)
        graph_emb = global_mean_pool(node_feats, data.batch)

        # The LSTM sees each graph as a length-1 sequence:
        # (batch_size, seq_len=1, hidden_dim). Only the final hidden state is kept.
        _, (last_hidden, _) = self.lstm(graph_emb.unsqueeze(1))
        pooled = last_hidden[-1]  # (batch_size, lstm_hidden_dim)

        # Dropout before the linear head, then log-softmax over classes
        logits = self.fc1(self.dropout(pooled))
        return F.log_softmax(logits, dim=1)

# Hyperparameters for the classifier
input_dim = 1          # node feature width; change if the graphs carry more features
hidden_dim = 128       # width of both GCN layers
lstm_hidden_dim = 256  # LSTM hidden size
output_dim = 2         # number of output classes

# Build the model (dropout keeps its default rate) and show its layers
model = GraphLSTMClassifier(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    output_dim=output_dim,
)
print(model)


In [None]:
import torch
import torch.nn.functional as F
from torch.optim import Adam

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)


In [None]:

# Define optimizer and loss function
optimizer = Adam(model.parameters(), lr=0.0005)
# GraphLSTMClassifier.forward() already returns log_softmax outputs, so the
# matching criterion is NLLLoss. CrossEntropyLoss would apply log-softmax a
# second time; that is numerically a no-op on log-probabilities, but it is
# redundant work and hides the fact that the model emits log-probabilities.
criterion = torch.nn.NLLLoss()


In [None]:

# One pass over the training data
def train(model, loader, optimizer, criterion, device):
    """
    Train ``model`` for a single epoch.

    Each batch is moved to ``device``, run through the model, scored with
    ``criterion`` against ``batch.y`` and used for one optimizer step.

    Returns the sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch), batch.y)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)


In [None]:

# Accuracy over a loader
def evaluate(model, loader, device):
    """
    Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is put in eval mode and gradients are disabled. The number of
    correct predictions is divided by ``len(loader.dataset)``.
    """
    model.eval()
    hits = 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            predicted = model(batch).argmax(dim=1)
            hits += int((predicted == batch.y).sum())
    return hits / len(loader.dataset)


In [None]:

# Train for a fixed number of epochs, reporting loss and test accuracy after each one
epochs = 20
for epoch in range(1, epochs + 1):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_acc = evaluate(model, test_loader, device)
    summary = f"Epoch {epoch:02d}, Loss: {train_loss:.4f}, Test Accuracy: {test_acc:.4f}"
    print(summary)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate_with_metrics(model, loader, device):
    """
    Evaluate ``model`` on ``loader`` and print accuracy plus a per-class report.

    Predictions are the argmax of the model output over dim 1. The report
    labels the two classes 'Benign' and 'Malware'.

    Returns a tuple ``(accuracy, report)`` where ``report`` is the text
    produced by ``classification_report``.
    """
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            predicted = model(batch).argmax(dim=1)
            # Move results to the CPU before handing them to scikit-learn
            predictions.extend(predicted.cpu().numpy())
            targets.extend(batch.y.cpu().numpy())

    # Overall accuracy, then precision / recall / F1 per class
    accuracy = accuracy_score(targets, predictions)
    report = classification_report(targets, predictions, target_names=['Benign', 'Malware'])

    print(f"Accuracy: {accuracy:.4f}\nClassification Report:\n{report}")

    return accuracy, report

# Run evaluation on the test set.
# The report text is bound to `report`, not `classification_report`: reusing
# that name would overwrite the imported sklearn function with a string and
# break any later call to evaluate_with_metrics.
accuracy, report = evaluate_with_metrics(model, test_loader, device)


In [None]:
import torch

# Destination file for the trained weights (adjust the path as needed)
model_path = '/workspace/graph_lstm_classifier.pth'

# Persist only the state dictionary (parameters and buffers), not the module object
state = model.state_dict()
torch.save(state, model_path)
print(f"Model saved to {model_path}")


In [None]:
# Rebuild the architecture with the SAME hyperparameters the checkpoint was
# trained with (input_dim, hidden_dim, lstm_hidden_dim, output_dim defined
# above). Any other sizes make load_state_dict fail with shape mismatches.
model1 = GraphLSTMClassifier(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    output_dim=output_dim,
)

# Load the saved state dictionary into the model.
# map_location='cpu' lets a checkpoint written on a GPU machine load on a
# CPU-only one; the model is moved to the target device afterwards.
model1.load_state_dict(torch.load(model_path, map_location='cpu'))
model1.eval()  # Set the model to evaluation mode

print("Model loaded successfully and set to evaluation mode.")


In [None]:
# Choose the device: GPU when CUDA is available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Place the reloaded model on that device
model1.to(device)
# Import required metrics
from sklearn.metrics import classification_report, accuracy_score

def evaluate_with_metrics(model, loader, device, target_names=None):
    """
    Evaluate ``model`` on ``loader`` and print accuracy plus a per-class report.

    Parameters:
    - model: module called as ``model(batch)``; its output is argmax-ed over dim 1.
    - loader: iterable of batches exposing ``.to(device)`` and ``.y``.
    - device: device each batch is moved to before the forward pass.
    - target_names: optional list of class display names, indexed by class id.
      When omitted, names are chosen to match the model's output width:
      ['Benign', 'Malware'] for 2 outputs, the five family names for 5
      outputs, and generic "Class i" names otherwise.

    Returns:
    - (accuracy, report): accuracy as returned by ``accuracy_score`` and the
      text produced by ``classification_report``.

    Raises:
    - ValueError: if ``loader`` yields no batches.
    """
    model.eval()
    all_preds = []
    all_labels = []
    num_classes = None

    with torch.no_grad():
        for data in loader:
            data = data.to(device)  # Move each batch to the correct device
            output = model(data)
            num_classes = output.size(1)
            pred = output.argmax(dim=1)
            all_preds.extend(pred.cpu().numpy())
            all_labels.extend(data.y.cpu().numpy())

    if num_classes is None:
        raise ValueError("Cannot compute metrics: loader yielded no batches")

    if target_names is None:
        # A fixed five-name list makes classification_report raise when the
        # model is binary, so pick the name set that fits the output width.
        known_names = {
            2: ['Benign', 'Malware'],
            5: ['Benign', 'Adware', 'Banking', 'Riskware', 'Smsware'],
        }
        target_names = known_names.get(
            num_classes, [f"Class {i}" for i in range(num_classes)]
        )

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)

    # Generate classification report for precision, recall, and F1-score.
    # Passing explicit labels keeps the rows aligned with target_names even
    # when some class never occurs in this loader; zero_division=0 reports
    # such rows as 0 instead of emitting warnings.
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(target_names))),
        target_names=target_names,
        zero_division=0,
    )

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    return accuracy, report

# Run evaluation on the test set with the reloaded checkpoint (model1).
# Evaluating the in-memory `model` here would not verify the save/load round trip.
accuracy, report = evaluate_with_metrics(model1, test_loader, device)

