Here is where we do the fine tuning of the model. We will use the `transformers` library to load the pre-trained model and tokenizer. 

This is configured to run on a loacl machine with a GPU. In this case a single NVIDIA RTX 3090. 

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import bitsandbytes
import accelerate
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, AutoModelForCausalLM, LlamaForCausalLM
from peft import get_peft_model, LoraConfig



In [2]:
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

0
<torch.cuda.device object at 0x7d04e4ecf0d0>
1
NVIDIA GeForce RTX 3090


In [3]:
max_input_length = 1024
# Load the dataset
dataset = load_from_disk("MathInstructSmall")

# Split the dataset into training and testing sets
dataset = dataset.train_test_split(test_size=0.5)

train_dataset = dataset['train'].select(range(5000))
test_dataset = dataset['test'].select(range(500))

# Load the tokenizer and add a special pad token
tokenizer = AutoTokenizer.from_pretrained("failspy/Meta-Llama-3-8B-Instruct-abliterated-v3")
tokenizer.pad_token = tokenizer.eos_token
 
def tokenize(prompt=None, add_eos_token=True, max_length=None):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding='max_length',  # Add padding
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < max_length
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
 
    result["labels"] = result["input_ids"].copy()
 
    return result
 
def preprocess_function(entry):
    full_prompt = entry['text']
    tokenized_full_prompt = tokenize(prompt=full_prompt, max_length=max_input_length, add_eos_token=True)
    return tokenized_full_prompt

# Apply the preprocessing function and filter the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function)
tokenized_test_dataset = test_dataset.map(preprocess_function)


# Load the model and resize embeddings for the new special token
model = LlamaForCausalLM.from_pretrained("failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", load_in_8bit=True, torch_dtype=torch.float16)
model.resize_token_embeddings(len(tokenizer))

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8 model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    per_device_train_batch_size=1,
    fp16=True,  # Enable mixed precision training
    gradient_checkpointing=True,  # Enable gradient checkpointing
    gradient_accumulation_steps=16,  # Accumulate gradients over 16 steps
    save_total_limit=3,
    save_steps=100,
    eval_steps=100,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Print the shapes of the input IDs and attention mask
for i, data in enumerate(tokenized_train_dataset):
    # make sure they are all padded to max_input_length
    assert len(data["input_ids"]) == max_input_length
    assert len(data["attention_mask"]) == max_input_length

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0")
tokenizer.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/936 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


{'loss': 3.027, 'grad_norm': 3.953536033630371, 'learning_rate': 1.980769230769231e-05, 'epoch': 0.03}
{'loss': 2.737, 'grad_norm': 3.9414432048797607, 'learning_rate': 1.9594017094017095e-05, 'epoch': 0.06}
{'loss': 2.415, 'grad_norm': 4.599000453948975, 'learning_rate': 1.9401709401709403e-05, 'epoch': 0.1}
{'loss': 1.9706, 'grad_norm': 3.3462464809417725, 'learning_rate': 1.918803418803419e-05, 'epoch': 0.13}
{'loss': 1.7272, 'grad_norm': 0.7417623996734619, 'learning_rate': 1.8974358974358975e-05, 'epoch': 0.16}
{'loss': 1.5878, 'grad_norm': 0.6823728084564209, 'learning_rate': 1.876068376068376e-05, 'epoch': 0.19}
{'loss': 1.6482, 'grad_norm': 0.6858626008033752, 'learning_rate': 1.854700854700855e-05, 'epoch': 0.22}
{'loss': 1.5098, 'grad_norm': 0.7660446166992188, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.26}
{'loss': 1.4725, 'grad_norm': 0.7725253701210022, 'learning_rate': 1.8119658119658122e-05, 'epoch': 0.29}
{'loss': 1.3605, 'grad_norm': 0.9488421082496643, 'learn



{'loss': 1.2947, 'grad_norm': 1.265254020690918, 'learning_rate': 1.7692307692307694e-05, 'epoch': 0.35}
{'loss': 1.2277, 'grad_norm': 1.0456111431121826, 'learning_rate': 1.747863247863248e-05, 'epoch': 0.38}
{'loss': 1.1622, 'grad_norm': 1.005322813987732, 'learning_rate': 1.7264957264957267e-05, 'epoch': 0.42}
{'loss': 1.1078, 'grad_norm': 0.6953596472740173, 'learning_rate': 1.7051282051282053e-05, 'epoch': 0.45}
{'loss': 1.0917, 'grad_norm': 0.6518557667732239, 'learning_rate': 1.683760683760684e-05, 'epoch': 0.48}
{'loss': 1.1187, 'grad_norm': 0.5502466559410095, 'learning_rate': 1.6623931623931625e-05, 'epoch': 0.51}
{'loss': 1.11, 'grad_norm': 0.5621817111968994, 'learning_rate': 1.641025641025641e-05, 'epoch': 0.54}
{'loss': 1.0717, 'grad_norm': 0.5439116358757019, 'learning_rate': 1.6196581196581197e-05, 'epoch': 0.58}
{'loss': 1.1147, 'grad_norm': 0.6543027758598328, 'learning_rate': 1.5982905982905986e-05, 'epoch': 0.61}
{'loss': 1.0732, 'grad_norm': 0.5117805004119873, 'le



{'loss': 1.0416, 'grad_norm': 0.5284964442253113, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.67}
{'loss': 1.0654, 'grad_norm': 0.5814707279205322, 'learning_rate': 1.5341880341880344e-05, 'epoch': 0.7}
{'loss': 1.0411, 'grad_norm': 0.545748233795166, 'learning_rate': 1.5128205128205129e-05, 'epoch': 0.74}
{'loss': 1.0542, 'grad_norm': 0.5380589365959167, 'learning_rate': 1.4914529914529916e-05, 'epoch': 0.77}
{'loss': 1.0764, 'grad_norm': 0.5269405841827393, 'learning_rate': 1.4700854700854703e-05, 'epoch': 0.8}
{'loss': 1.085, 'grad_norm': 0.6070964932441711, 'learning_rate': 1.4487179487179489e-05, 'epoch': 0.83}
{'loss': 1.0224, 'grad_norm': 0.49708130955696106, 'learning_rate': 1.4273504273504275e-05, 'epoch': 0.86}
{'loss': 1.0554, 'grad_norm': 0.5486913919448853, 'learning_rate': 1.405982905982906e-05, 'epoch': 0.9}
{'loss': 1.0302, 'grad_norm': 0.5685379505157471, 'learning_rate': 1.3846153846153847e-05, 'epoch': 0.93}
{'loss': 1.0268, 'grad_norm': 0.5069224834442139, 'l



{'loss': 1.0594, 'grad_norm': 0.5892452001571655, 'learning_rate': 1.3418803418803419e-05, 'epoch': 0.99}


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 