In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorWithPadding, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

2024-07-24 12:19:20.582121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


In [2]:
import json
import numpy as np
data_dir = "../datasets/ics_attack/"
output_dir = "../model_outputs/ics_attack/llm_finetuned_models/"
model_name = "gpt2-xl"
n_epoch = 10
if not os.path.exists(output_dir+model_name):
    os.makedirs(output_dir+model_name)

In [3]:
# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)
# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Using pad_token, but it is not set yet.


In [4]:
with open(data_dir+"doc_id_to_desc.json") as f:
    doc_id_to_desc = json.load(f)
print("Number of Nodes with Description: ",len(doc_id_to_desc))

Number of Nodes with Description:  1136


In [5]:
# text_data is a list containing your text data
text_data = []  # Your text items
for doc_id in doc_id_to_desc:
    text_data.append(doc_id_to_desc[doc_id])

In [6]:
# Calculate max_length based on the longest text in your dataset
#max_length = max([len(tokenizer.encode(text)) for text in text_data])
max_length = min(max([len(tokenizer.encode(text)) for text in text_data]), 512)  # Limit to 512 tokens

print("Max # token in the longest text :",max_length)

Max # token in the longest text : 512


In [7]:
# Custom dataset
# Custom dataset with chunking
class CustomDataset(Dataset):
    def __init__(self, tokenizer, texts, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        for text in texts:
            tokenized_text = tokenizer.encode(text)
            for i in range(0, len(tokenized_text), max_length):
                chunk = tokenized_text[i:i + max_length]
                self.examples.append(chunk)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        chunk = self.examples[idx]
        tokenized_inputs = self.tokenizer.prepare_for_model(
            chunk,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
        # Move tensors to the device
        for key in tokenized_inputs:
            tokenized_inputs[key] = tokenized_inputs[key].squeeze(0).to(device)
        return tokenized_inputs


In [8]:
# Create dataset
dataset = CustomDataset(tokenizer, text_data, max_length)

# Create a data collator that will dynamically pad the sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Training arguments and Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,  # Reduce batch size
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    num_train_epochs=n_epoch,
    learning_rate=1e-4,
    output_dir=output_dir+model_name+'/results',
    logging_dir=output_dir+model_name+'/logs',
    logging_steps=100,
    load_best_model_at_end=False,
    evaluation_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
    save_strategy="no",  # Disable checkpoint saving
    fp16=True,  # Use mixed precision training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None,  # You can specify an evaluation dataset here
    data_collator=data_collator,  # Add the data collator here
)
# Clear cache before training
torch.cuda.empty_cache()
trainer.train()



Step,Training Loss
100,0.9615
200,0.4492
300,0.3243
400,0.1521
500,0.1004
600,0.0586
700,0.0365
800,0.0268
900,0.0224
1000,0.0178


TrainOutput(global_step=1430, training_loss=0.15408067611547616, metrics={'train_runtime': 2837.5854, 'train_samples_per_second': 4.035, 'train_steps_per_second': 0.504, 'total_flos': 5.18158683635712e+16, 'train_loss': 0.15408067611547616, 'epoch': 9.98})

In [10]:
trainer.save_model()
model.save_pretrained(output_dir+model_name+'/epoch_{}'.format(n_epoch))
tokenizer.save_pretrained(output_dir+model_name+'/epoch_{}'.format(n_epoch))

('../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/tokenizer_config.json',
 '../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/special_tokens_map.json',
 '../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/vocab.json',
 '../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/merges.txt',
 '../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/added_tokens.json')

In [11]:
torch.cuda.empty_cache()