In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, LlamaForCausalLM
from peft import get_peft_model, LoraConfig
from unsloth import FastLanguageModel

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3090


In [3]:
max_input_length = 1024
# Load the dataset
dataset = load_dataset("patrickjmcbride/math-instruct-binned")

# Split the dataset into training and testing sets
dataset = dataset['small'].train_test_split(test_size=0.5)

train_dataset = dataset['train'].select(range(8192))
#test_dataset = dataset['test'].select(range(1000))

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", # Supports Llama, Mistral - replace this!
    max_seq_length = 2048, # Supports RoPE Scaling internally, so choose any!
    load_in_4bit = True
)
 
def tokenize(prompt=None, max_length=None):
    result = tokenizer(
        prompt + tokenizer.eos_token, # Add an eos token that will be inclueded in attention
        truncation=True,
        max_length=max_length,
        padding='max_length',  # Add padding
        return_tensors=None,
    )
    result["labels"] = result["input_ids"].copy()

    assert result["input_ids"][-1] == tokenizer.eos_token_id # Ensure the eos token is at the end
    return result
 
def preprocess_function(entry):
    return tokenize(prompt=entry['text'], max_length=max_input_length)

# Apply the preprocessing function and filter the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function)
#tokenized_test_dataset = test_dataset.map(preprocess_function)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.48it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
failspy/Meta-Llama-3-8B-Instruct-abliterated-v3 does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.
Map: 100%|██████████| 8192/8192 [00:10<00:00, 748.71 examples/s]


In [4]:
# Load the model and resize embeddings for the new special token
model.resize_token_embeddings(len(tokenizer))

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    #use_gradient_checkpointing = True,
)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results-v0.2",
    #eval_strategy="epoch",
    learning_rate=2e-6,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    log_level='warning',
    per_device_train_batch_size=8,
    bf16=True,  # Enable mixed precision training with brain floating point 16
    #gradient_checkpointing=False,  # Enable gradient checkpointing
    gradient_accumulation_steps=16,  # Accumulate gradients over 8 steps
    dataloader_num_workers=8,
    dataloader_persistent_workers=True,
    save_steps=0.2,
)

#model.compile()

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    #eval_dataset=tokenized_test_dataset,
)
print(trainer.args)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=8,
dataloader_persistent_workers=True,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_con

In [5]:
# Train the model
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,192 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 192
 "-____-"     Number of trainable parameters = 41,943,040
100%|██████████| 192/192 [5:11:32<00:00, 97.35s/it]  

{'train_runtime': 18692.1575, 'train_samples_per_second': 1.315, 'train_steps_per_second': 0.01, 'train_loss': 5.229942639668782, 'epoch': 3.0}





TrainOutput(global_step=192, training_loss=5.229942639668782, metrics={'train_runtime': 18692.1575, 'train_samples_per_second': 1.315, 'train_steps_per_second': 0.01, 'total_flos': 1.1395388675488481e+18, 'train_loss': 5.229942639668782, 'epoch': 3.0})

In [6]:
# Save the fine-tuned model
model.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0.2")
tokenizer.save_pretrained("./Meta-Llama-3-8B-Instruct-abliterated-math-v0.2")

('./Meta-Llama-3-8B-Instruct-abliterated-math-v0.2/tokenizer_config.json',
 './Meta-Llama-3-8B-Instruct-abliterated-math-v0.2/special_tokens_map.json',
 './Meta-Llama-3-8B-Instruct-abliterated-math-v0.2/tokenizer.json')