In [1]:
import json
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, DataCollatorWithPadding, TrainingArguments, Trainer
import sys
sys.path.append('../')
import config
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

2024-07-27 20:43:26.782418: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


In [2]:
data_dir = config.DATA_DIR
print("data_dir: ", data_dir)
output_dir = config.OUTPUT_DIR
print("output_dir: ", output_dir)
llm_dir = output_dir+"llm_finetuned_models/"
print("llm_dir: ", llm_dir)
models = ["jackaduma/SecRoBERTa", "ehsanaghaei/SecureBERT", "gpt2-xl"]
model_names = ["SecRoBERTa", "SecureBERT", "gpt2-xl"]
n_epoch = 10
model_id = 0

data_dir:  /home/afarhan/post-doc/AWEB_GCL/datasets/enterprise_attack/
output_dir:  /home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/
llm_dir:  /home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/


In [3]:

if not os.path.exists(llm_dir+model_names[model_id]):
    os.makedirs(llm_dir+model_names[model_id])

# load attack and weakness description with ID
with open(config.DESCRIPTION_FILE) as f:
    doc_id_to_desc = json.load(f)
print("Number of Nodes with Description: ",len(doc_id_to_desc))
# text_data is a list containing your text data
text_data = []  # Your text items
for doc_id in doc_id_to_desc:
    text_data.append(doc_id_to_desc[doc_id])

Number of Nodes with Description:  2986


In [4]:
# Initialize the tokenizer and model based on model_name
model_name = models[model_id]
if "gpt2" in model_name:
    tokenizer = GPT2Tokenizer.from_pretrained(models[model_id])
    model = GPT2LMHeadModel.from_pretrained(models[model_id])
elif("SecRoBERTa" in model_name):
    tokenizer = AutoTokenizer.from_pretrained(models[model_id])
    model = AutoModelForMaskedLM.from_pretrained(models[model_id])
elif("SecureBERT" in model_name):
    tokenizer = AutoTokenizer.from_pretrained(models[model_id])
    model = AutoModelForMaskedLM.from_pretrained(models[model_id])
model.to(device)
model.to(device)
# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
# Calculate max_length based on the longest text in your dataset
#max_length = max([len(tokenizer.encode(text)) for text in text_data])
max_length = min(max([len(tokenizer.encode(text)) for text in text_data]), 512)  # Limit to 512 tokens
print("Max # token in the longest text :",max_length)

Max # token in the longest text : 512


In [6]:
# Custom dataset
# Custom dataset with chunking
class CustomDataset(Dataset):
    def __init__(self, tokenizer, texts, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        for text in texts:
            tokenized_text = tokenizer.encode(text)
            for i in range(0, len(tokenized_text), max_length):
                chunk = tokenized_text[i:i + max_length]
                self.examples.append(chunk)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        chunk = self.examples[idx]
        tokenized_inputs = self.tokenizer.prepare_for_model(
            chunk,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
        for key in tokenized_inputs:
            tokenized_inputs[key] = tokenized_inputs[key].squeeze(0).to(device)
        return tokenized_inputs


In [7]:
max_length = 512  # Define your max length
dataset = CustomDataset(tokenizer, text_data, max_length)

if "gpt2" in model_name:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)


In [9]:
print(data_collator)

DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='jackaduma/SecRoBERTa', vocab_size=52000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental

In [8]:
# Training arguments and Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=n_epoch,
    learning_rate=1e-4,
    output_dir=os.path.join(llm_dir, model_names[model_id], 'results'),
    logging_dir=os.path.join(llm_dir , model_names[model_id] , 'logs'),
    logging_steps=100,
    load_best_model_at_end=False,
    evaluation_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
    save_strategy="no",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

torch.cuda.empty_cache()
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.7349
200,4.2898
300,4.1361
400,3.7626
500,3.7251
600,3.5155
700,3.5439
800,3.3545
900,3.423
1000,3.2526


TrainOutput(global_step=3810, training_loss=2.8570995360847533, metrics={'train_runtime': 461.7631, 'train_samples_per_second': 66.073, 'train_steps_per_second': 8.251, 'total_flos': 4041281476657152.0, 'train_loss': 2.8570995360847533, 'epoch': 9.99})

In [10]:
trainer.save_model()
model.save_pretrained(llm_dir+model_names[model_id]+'/epoch_{}'.format(n_epoch))
tokenizer.save_pretrained(llm_dir+model_names[model_id]+'/epoch_{}'.format(n_epoch))

('/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/tokenizer_config.json',
 '/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/special_tokens_map.json',
 '/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/vocab.json',
 '/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/merges.txt',
 '/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/added_tokens.json',
 '/home/afarhan/post-doc/AWEB_GCLmodel_outputs/enterprise_attack/llm_finetuned_models/SecRoBERTa/epoch_10/tokenizer.json')

In [11]:
torch.cuda.empty_cache()

In [12]:
data_dir

'/home/afarhan/post-doc/AWEB_GCL/datasets/enterprise_attack/'

In [13]:
os.path.join(data_dir, 'doc_id_to_desc.json')

'/home/afarhan/post-doc/AWEB_GCL/datasets/enterprise_attack/doc_id_to_desc.json'