In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig, BitsAndBytesConfig, GenerationConfig, TrainingArguments
from peft import LoraConfig, PeftModel
from datasets import Dataset, load_dataset
from trl import SFTTrainer



In [2]:
# model name
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

# tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

# add special token padding for that
tokenizer.add_special_tokens({"pad_token" : "<pad>"})
tokenizer.padding_side = 'right'


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# define compute module
compute_dtype = getattr(torch, 'float16')

# Quantization parameter
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = True,
    
)

# load in the base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config = bnb_config,
    device_map = "auto"
)

# uniform the input text length
model.resize_token_embeddings(len(tokenizer))
# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = 'right'
model.config.use_cache = False

model = PeftModel.from_pretrained(model, "./results/run_1/checkpoint-200")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
messages = [{'role': 'system', 'content': 'You are an radio transcript message transcript assisant, please classifer the following message'},
         {'role': 'user', 'content': 'Overall balance is really good'}]

prompt = tokenizer.apply_chat_template(
    messages,
    return_tensors = "pt"
)

#text_input = tokenizer(prompt , return_tensors = "pt")

#print(text_input)

model.to("cuda")
model_input = prompt.cuda()

generation_output = model.generate(
    model_input,
    max_new_tokens = 20,
    do_sample = True
)

decoded = tokenizer.batch_decode(generation_output)
print(decoded)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an radio transcript message transcript assisant, please classifer the following message<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nOverall balance is really good<|eot_id|><|start_header_id|>assiant<|end_header_id|>\n\nVehicle handling<|eot_id|>']


In [6]:
model = model.merge_and_unload()

In [7]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

In [8]:
model.save_pretrained('./llama3_lora_run1')
tokenizer.save_pretrained('./llama3_lora_run1')

('./llama3_lora_run1/tokenizer_config.json',
 './llama3_lora_run1/special_tokens_map.json',
 './llama3_lora_run1/tokenizer.json')

In [None]:
model.push_to_hub('llama3_lora_run1')
tokenizer.push_to_hub('llama3_lora_run1')