In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [2]:
if torch.cuda.is_available():
    print("CUDA is available")
else:
    print("CUDA is not available")

CUDA is available


In [3]:
import pandas as pd
import numpy as np

In [4]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [5]:
malware_calls = pd.read_csv("../datasets/CatakPreprocessed.csv")

In [6]:
malware_calls.head()

Unnamed: 0,api,class
0,ldrloaddll ldrgetprocedureaddress regopenkeyex...,Trojan
1,getsystemtimeasfiletime ntallocatevirtualmemor...,Trojan
2,ldrgetdllhandle ldrgetprocedureaddress getsyst...,Backdoor
3,ldrloaddll ldrgetprocedureaddress regopenkeyex...,Backdoor
4,ldrloaddll ldrgetprocedureaddress wsastartup n...,Trojan


In [7]:
malware_calls['class'].value_counts()

Trojan        1001
Backdoor      1001
Downloader    1001
Worms         1001
Virus         1001
Dropper        891
Spyware        832
Adware         379
Name: class, dtype: int64

In [8]:
num_classes = len(malware_calls["class"].value_counts())
class_weights = (1 - (malware_calls['class'].value_counts().sort_index() / len(malware_calls))).values
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights

tensor([0.9467, 0.8592, 0.8592, 0.8746, 0.8829, 0.8592, 0.8592, 0.8592],
       device='cuda:0')

In [10]:
from transformers import Trainer
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [9]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.model_max_length = 512

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
CAT2IDX = {
    'Virus': 0,
    'Trojan': 1,
    'Worms': 2,
    'Downloader': 3,
    'Backdoor': 4,
    'Dropper': 5,
    'Spyware': 6,
    'Adware': 7,
}

IDX2CAT = {
    0:'Virus',
    1:'Trojan',
    2:'Worms',
    3:'Downloader',
    4:'Backdoor',
    5:'Dropper',
    6:'Spyware',
    7:'Adware',
}

In [12]:
malware_calls['class'] = malware_calls['class'].map(lambda x: CAT2IDX[x])

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(malware_calls.api, malware_calls['class'],
test_size=0.2, random_state=75, stratify = malware_calls['class'])
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
test_size=0.2, random_state=75, stratify = Y_train)

In [40]:
def get_list_strs(df):
    lst_str = []
    for i in range(len(df)):
        str_ = df.values[i]
        lst_str.append(str_)
    return lst_str

In [61]:
from datasets import load_dataset, Dataset

test = Dataset.from_pandas(malware_calls)
test

Dataset({
    features: ['api', 'class'],
    num_rows: 7107
})

In [62]:
from datasets import load_dataset

temp = load_dataset("shawhin/imdb-truncated")
temp

Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 1078, in _pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch
  File "_pydevd_bundle/pydevd_cython.pyx", line 297, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "c:\Users\regis\anaconda3\lib\site-packages\debugpy\_vendored\pydevd\pydevd.py", line 1976, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
  File "c:\Users\regis\anaconda3\lib\site-packages\debugpy\_vendored\pydevd\pydevd.py", line 2011, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


KeyboardInterrupt: 

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(malware_calls.api, malware_calls['class'],
test_size=0.2, random_state=75, stratify = malware_calls['class'])

In [73]:
import datasets

train = Dataset.from_pandas(pd.concat([X_train, Y_train], axis=1)).remove_columns('__index_level_0__')
validation = Dataset.from_pandas(pd.concat([X_test, Y_test], axis=1)).remove_columns('__index_level_0__')

dataset = datasets.DatasetDict({"train": train, "validation": validation})
dataset

DatasetDict({
    train: Dataset({
        features: ['api', 'class'],
        num_rows: 5685
    })
    validation: Dataset({
        features: ['api', 'class'],
        num_rows: 1422
    })
})

In [75]:
def tokenize_function(examples):
    #extract text
    text = examples['api']
    
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors='np',
        truncation=True,
        max_length=512
    )
    
    return tokenized_inputs

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/5685 [00:00<?, ? examples/s]

In [30]:
train_encodings = tokenizer(get_list_strs(X_train), padding="max_length", truncation=True, return_tensors="pt")
val_encodings = tokenizer(get_list_strs(X_val), padding="max_length", truncation=True, return_tensors="pt")
test_encodings = tokenizer(get_list_strs(X_test), padding="max_length", truncation=True, return_tensors="pt")

In [None]:
class MalwareDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)