In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig, BitsAndBytesConfig, GenerationConfig, TrainingArguments
from peft import LoraConfig, PeftModel
from datasets import Dataset, load_dataset
from trl import SFTTrainer



In [2]:
# import dataset from the hugging face hub from my own account
dataset = load_dataset('TC14/050724_random200gs_llama3_instruct')

In [3]:
# model name
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

# tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

# add special token padding for that
tokenizer.add_special_tokens({"pad_token" : "<pad>"})
tokenizer.padding_side = 'left'


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# define compute module
compute_dtype = getattr(torch, 'float16')

# Quantization parameter
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = True,
    
)

# load in the base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto"
)

# uniform the input text length
model.resize_token_embeddings(len(tokenizer))
# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = 'right'
model.config.use_cache = False

#model = prepare_model_for_kbit_training(model)

# configure lora 
peft_config = LoraConfig(
    lora_alpha = 32,
    lora_dropout = 0.1,
    r = 8,
    bias = 'none',
    task_type = 'CAUSAL_LM',
    target_modules = ['q_proj' , 'v_proj']
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
training_arg = TrainingArguments(
    output_dir = './results/run_1',
    #evaluation_strategy = "steps",
    #do_eval = False,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 1,
    #per_device_eval_batch_size = 4,
    log_level = "debug",
    optim = 'paged_adamw_32bit',
    save_steps = 100,
    logging_steps = 10,
    learning_rate = 1e-4,
    #eval_steps = 5,
    fp16 = True,
    num_train_epochs = 3,
    max_grad_norm = 0.3,
    #max_steps = 10,
    warmup_ratio = 0.03,
    lr_scheduler_type = 'constant',
    report_to = "tensorboard"
)

In [6]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        #eval_dataset=dataset['test'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arg,
)

trainer.train()

Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 1,015
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 381
  Number of trainable parameters = 3,407,872


Step,Training Loss
10,6.8123
20,3.8193
30,2.1003
40,1.7538
50,1.5376
60,1.5473
70,1.4518
80,1.4016
90,1.4669
100,1.4336


Saving model checkpoint to ./results/run_1/checkpoint-100
loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results/run_1/checkpoint-100/tokenize

TrainOutput(global_step=381, training_loss=1.3453347642277795, metrics={'train_runtime': 183.004, 'train_samples_per_second': 16.639, 'train_steps_per_second': 2.082, 'total_flos': 8538915910533120.0, 'train_loss': 1.3453347642277795, 'epoch': 3.0})

In [7]:
model = PeftModel.from_pretrained(model, "./results/run_1/checkpoint-200")


In [8]:
messages = [{'role': 'system', 'content': 'You are an radio transcript message transcript assisant, please classifer the following message'},
         {'role': 'user', 'content': 'Overall balance is really good'}]

prompt = tokenizer.apply_chat_template(
    messages,
    return_tensors = "pt"
)

#text_input = tokenizer(prompt , return_tensors = "pt")

#print(text_input)

model.to("cuda")
model_input = prompt.cuda()

generation_output = model.generate(
    model_input,
    max_new_tokens = 20,
    do_sample = True
)

decoded = tokenizer.batch_decode(generation_output)
print(decoded)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an radio transcript message transcript assisant, please classifer the following message<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nOverall balance is really good<|eot_id|><|start_header_id|>assiant<|end_header_id|>\n\nVehicle handling<|eot_id|>']


In [14]:
del model

torch.cuda.empty_cache()
import gc
gc.collect()

NameError: name 'model' is not defined

In [19]:
torch.cuda.empty_cache()
gc.collect()

0