In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [2]:
if torch.cuda.is_available():
    print("CUDA is available")
else:
    print("CUDA is not available")

CUDA is available


In [3]:
import pandas as pd
import numpy as np

In [4]:
malware_calls = pd.read_csv("../datasets/CatakPreprocessed.csv")

In [5]:
malware_calls.iloc[0]['class']

'Trojan'

In [6]:
malware_calls.head()

Unnamed: 0,api,class
0,ldrloaddll ldrgetprocedureaddress regopenkeyex...,Trojan
1,getsystemtimeasfiletime ntallocatevirtualmemor...,Trojan
2,ldrgetdllhandle ldrgetprocedureaddress getsyst...,Backdoor
3,ldrloaddll ldrgetprocedureaddress regopenkeyex...,Backdoor
4,ldrloaddll ldrgetprocedureaddress wsastartup n...,Trojan


In [7]:
malware_calls['class'].value_counts()

class
Trojan        1001
Backdoor      1001
Downloader    1001
Worms         1001
Virus         1001
Dropper        891
Spyware        832
Adware         379
Name: count, dtype: int64

In [8]:
num_classes = len(malware_calls["class"].value_counts())
class_weights = (1 - (malware_calls['class'].value_counts().sort_index() / len(malware_calls))).values
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights

tensor([0.9467, 0.8592, 0.8592, 0.8746, 0.8829, 0.8592, 0.8592, 0.8592],
       device='cuda:0')

### Class Mapping

In [9]:
CAT2IDX = {
    'Virus': 0,
    'Trojan': 1,
    'Worms': 2,
    'Downloader': 3,
    'Backdoor': 4,
    'Dropper': 5,
    'Spyware': 6,
    'Adware': 7,
}

IDX2CAT = {
    0:'Virus',
    1:'Trojan',
    2:'Worms',
    3:'Downloader',
    4:'Backdoor',
    5:'Dropper',
    6:'Spyware',
    7:'Adware',
}

In [10]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load LlaMa 2 7B Model Checkpoint from Hugging Face (need to be logged in)

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "meta-llama/Llama-2-7b-hf"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes, label2id=CAT2IDX, id2label=IDX2CAT)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.model_max_length = 512

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Split Data for Training and Validation

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(malware_calls.api, malware_calls['class'],
test_size=0.2, random_state=75, stratify = malware_calls['class'])

In [13]:
from datasets import load_dataset, Dataset
import datasets

train = Dataset.from_pandas(pd.concat([Y_train, X_train], axis=1)).remove_columns('__index_level_0__')
validation = Dataset.from_pandas(pd.concat([Y_test, X_test], axis=1)).remove_columns('__index_level_0__')

dataset = datasets.DatasetDict({"train": train, "validation": validation})
dataset

DatasetDict({
    train: Dataset({
        features: ['class', 'api'],
        num_rows: 5685
    })
    validation: Dataset({
        features: ['class', 'api'],
        num_rows: 1422
    })
})

## Load or Create Tokenized Dataset

In [14]:
from datasets import load_from_disk
import os

def tokenize_function(examples):
    #extract text
    text = examples['api']
    
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors='np',
        truncation=True,
        max_length=512
    )
    
    return tokenized_inputs

# Path to the directory where the tokenized dataset will be saved or loaded
directory_path = 'tokenized_datasets'
file_name = 'catak_tokenized'
full_path = os.path.join(directory_path, file_name)

# Check if the directory and file already exist
if os.path.exists(full_path):
    # Load the existing dataset
    tokenized_dataset = load_from_disk(full_path)
    print("Loaded the tokenized dataset.")
else:
    # Ensure the directory exists
    os.makedirs(directory_path, exist_ok=True)
    
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Save the new tokenized dataset
    tokenized_dataset.save_to_disk(full_path)
    print("Saved new tokenized dataset.")
tokenized_dataset

Loaded the tokenized dataset.


DatasetDict({
    train: Dataset({
        features: ['class', 'api', 'input_ids', 'attention_mask'],
        num_rows: 5685
    })
    validation: Dataset({
        features: ['class', 'api', 'input_ids', 'attention_mask'],
        num_rows: 1422
    })
})

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import numpy as np
from datasets import load_metric
from sklearn.metrics import roc_auc_score
from scipy.special import softmax

def compute_metrics(eval_pred):
    precision = load_metric("precision")
    recall = load_metric("recall")
    f1 = load_metric("f1")
    acc = load_metric("accuracy")
    mcc = load_metric("matthews_correlation")
    #auc = load_metric("auc")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = precision.compute(predictions=predictions, average = "macro", references=labels)["precision"]
    recall = recall.compute(predictions=predictions, average = "macro", references=labels)["recall"]
    f1 = f1.compute(predictions=predictions, average = "macro", references=labels)["f1"]
    acc = acc.compute(predictions=predictions, references=labels)["accuracy"]
    mcc = mcc.compute(predictions=predictions, references=labels)["matthews_correlation"]
    auc = roc_auc_score(labels, softmax(logits, axis=1), multi_class='ovo', average='macro')
    return {"precision": precision, "recall": recall, "acc": acc, "mcc": mcc, "f1": f1, "auc":auc}

### Prediction Using Base Model Performance is not Ideal

In [28]:
import torch
torch.cuda.empty_cache()
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs available
    n_gpu = torch.cuda.device_count()
    for i in range(n_gpu):
        print(f"GPU {i}:")
        print(f"  Total memory: {torch.cuda.get_device_properties(i).total_memory / 1e9} GB")
        print(f"  Allocated memory: {torch.cuda.memory_allocated(i) / 1e9} GB")
        print(f"  Cached memory: {torch.cuda.memory_reserved(i) / 1e9} GB")
else:
    print("CUDA is not available.")

GPU 0:
  Total memory: 25.393692672 GB
  Allocated memory: 24.771476992 GB
  Cached memory: 24.771559424 GB


In [20]:
torch.cuda.empty_cache()  # Clear cache first

try:
    # Your model loading code here
    model = model.to('cuda')
except RuntimeError as e:
    if "out of memory" in str(e):
        print("Not enough GPU memory to load the model. Trying to clear cache.")
        torch.cuda.empty_cache()
        # Consider further steps here like reducing model/batch size or moving to CPU
    else:
        raise e

Not enough GPU memory to load the model. Trying to clear cache.


In [39]:
next(model.parameters()).device

device(type='cpu')

In [19]:
try:
    input = tokenizer.encode(malware_calls.iloc[0]['api'], return_tensors="pt")
    input = input.to(torch.device("cpu"))
    logits = model(input).logits
    prediction = torch.argmax(logits)
    print(malware_calls.iloc[0]['class'] + " - " + IDX2CAT[prediction.tolist()])
except RuntimeError as e:
    print(f"An error occurred: {e}")
    torch.cuda.empty_cache()  # Clearing the CUDA cache

Trojan - Backdoor


In [20]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj'] # Only apply LORA to the query and value projections, the paper on LORA suggests that this provides the best results
)

In [21]:
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNor

### PEFT Will Only Tune %0.06 of the Parameters

In [22]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 4,227,072 || all params: 6,611,603,456 || trainable%: 0.06393414287670189


In [23]:
# hyperparameters
lr = 1e-3
batch_size = 2
num_epochs = 10

In [24]:
from transformers import TrainingArguments

# define training arguments
training_args = TrainingArguments(
    output_dir= model_name + "-Malware-Classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [27]:
from transformers import Trainer

try:
    # creater trainer object
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
except RuntimeError as e:
    print(f"An error occurred: {e}")
    torch.cuda.empty_cache()  # Clearing the CUDA cache



An error occurred: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 23.65 GiB of which 137.19 MiB is free. Process 3282954 has 23.51 GiB memory in use. Of the allocated memory 23.07 GiB is allocated by PyTorch, and 80.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
# Save the trained model
trainer.save_model("./")

# If you also want to save the tokenizer associated with the model
tokenizer.save_pretrained("./")