Here is where we do the fine tuning of the model. We will use the `transformers` library to load the pre-trained model and tokenizer. 

This is configured to run on a loacl machine with a GPU. In this case a single NVIDIA RTX 3090. 

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, LlamaForCausalLM
from peft import get_peft_model, LoraConfig

In [2]:
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3090


In [3]:
max_input_length = 1024
# Load the dataset
dataset = load_dataset("patrickjmcbride/math-instruct-binned")

# Split the dataset into training and testing sets
dataset = dataset['small'].train_test_split(test_size=0.5)

train_dataset = dataset['train'].select(range(8192))
#test_dataset = dataset['test'].select(range(1000))

# Load the tokenizer and add a special pad token
tokenizer = AutoTokenizer.from_pretrained("failspy/Meta-Llama-3-8B-Instruct-abliterated-v3")
tokenizer.pad_token = tokenizer.eos_token
 
def tokenize(prompt=None, max_length=None):
    result = tokenizer(
        prompt + tokenizer.eos_token, # Add an eos token that will be inclueded in attention
        truncation=True,
        max_length=max_length,
        padding='max_length',  # Add padding
        return_tensors=None,
    )
    result["labels"] = result["input_ids"].copy()

    assert result["input_ids"][-1] == tokenizer.eos_token_id # Ensure the eos token is at the end
    return result
 
def preprocess_function(entry):
    return tokenize(prompt=entry['text'], max_length=max_input_length)

# Apply the preprocessing function and filter the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function)
#tokenized_test_dataset = test_dataset.map(preprocess_function)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/8192 [00:00<?, ? examples/s]

In [4]:
# Load the model and resize embeddings for the new special token
model = LlamaForCausalLM.from_pretrained("failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", load_in_8bit=True, torch_dtype=torch.float16)
model.resize_token_embeddings(len(tokenizer))

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8 model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results-v0.1",
    #eval_strategy="epoch",
    learning_rate=2e-6,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    log_level='warning',
    per_device_train_batch_size=4,
    bf16=True,  # Enable mixed precision training with brain floating point 16
    #gradient_checkpointing=False,  # Enable gradient checkpointing
    gradient_accumulation_steps=16,  # Accumulate gradients over 8 steps
    dataloader_num_workers=8,
    dataloader_persistent_workers=True,
    save_steps=0.2,
)

#model.compile()

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    #eval_dataset=tokenized_test_dataset,
)
print(trainer.args)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=8,
dataloader_persistent_workers=True,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,


In [5]:

# Train the model
trainer.train()

  0%|          | 0/384 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


{'train_runtime': 33650.6588, 'train_samples_per_second': 0.73, 'train_steps_per_second': 0.011, 'train_loss': 1.8656837145487468, 'epoch': 3.0}


TrainOutput(global_step=384, training_loss=1.8656837145487468, metrics={'train_runtime': 33650.6588, 'train_samples_per_second': 0.73, 'train_steps_per_second': 0.011, 'total_flos': 1.1337202520146575e+18, 'train_loss': 1.8656837145487468, 'epoch': 3.0})

In [6]:

# Save the fine-tuned model
model.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0.1")
tokenizer.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0.1")



('./Meta-Llama-3-8B-Instruct-abliterated-math-v0.1/tokenizer_config.json',
 './Meta-Llama-3-8B-Instruct-abliterated-math-v0.1/special_tokens_map.json',
 './Meta-Llama-3-8B-Instruct-abliterated-math-v0.1/tokenizer.json')