<a href="https://colab.research.google.com/github/SongZhou-Meg/transformers/blob/main/LLm_Lora_merged_with__base__model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops
!pip install -q wandb

from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer

from huggingface_hub import login

import wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Initialize the variables

In [2]:
 #model_name = "meta-llama/Llama-2-7b-hf"
#model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
model_name = "Salesforce/codegen-350M-mono"
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"
split = "train[:10%]"
finetunes_model_name = "llama-2-7b-finetuned-int4-python-18k-alpaca"
device_map = {"": 0}

In [3]:
peft_config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.1,
      r=64,
      bias="none",
      task_type="CAUSAL_LM",
)

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

Load the Model & DataSet

In [5]:
from huggingface_hub import notebook_login
# Log in to HF Hub
notebook_login()

wandb.login()
%env WANDB_PROJECT=python-fine-tuning

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[34m[1mwandb[0m: Currently logged in as: [33mzhousong178[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=python-fine-tuning


In [6]:
def prompt_instruction_format(sample):
  return f"""### Instruction:
    Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

    ### Task:
    {sample['instruction']}

    ### Input:
    {sample['input']}

    ### Response:
    {sample['output']}
    """

In [7]:
dataset = load_dataset(dataset_name,split=split)

In [8]:
base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_cache = False, device_map=device_map)
base_model.config.pretraining_tp = 1

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Train the model

In [10]:
trainingArgs = TrainingArguments(
    output_dir=finetunes_model_name,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    report_to="wandb",
    push_to_hub = True,
    seed=42
)

In [11]:
# Create the trainer
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)



In [12]:
print("Start the supervised Fine tuning")
trainer.train()
print("Done Training")

#stop reporting to wandb
wandb.finish()

# save model
trainer.save_model()
print("Model saved")

Start the supervised Fine tuning


Token indices sequence length is longer than the specified maximum sequence length for this model (2905 > 2048). Running this sequence through the model will result in indexing errors
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.1542, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.02}
{'loss': 1.0952, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.04}
{'loss': 1.0681, 'learning_rate': 0.00014285714285714287, 'epoch': 0.06}
{'loss': 1.0197, 'learning_rate': 0.00019047619047619048, 'epoch': 0.09}
{'loss': 0.9184, 'learning_rate': 0.00019998282416292055, 'epoch': 0.11}
{'loss': 0.8431, 'learning_rate': 0.00019991305743680013, 'epoch': 1.01}
{'loss': 0.8771, 'learning_rate': 0.00019978966374934254, 'epoch': 1.03}
{'loss': 0.8533, 'learning_rate': 0.00019961270933041477, 'epoch': 1.06}
{'loss': 0.7989, 'learning_rate': 0.0001993822891578708, 'epoch': 1.08}
{'loss': 0.7799, 'learning_rate': 0.00019909852690657359, 'epoch': 1.1}
{'loss': 0.7668, 'learning_rate': 0.00019876157488201424, 'epoch': 2.0}
{'loss': 0.7937, 'learning_rate': 0.0001983716139385641, 'epoch': 2.03}
{'loss': 0.7219, 'learning_rate': 0.00019792885338240374, 'epoch': 2.05}
{'loss': 0.7328, 'learning_rate': 0.0001974335308591806, 

VBox(children=(Label(value='0.001 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.055483…

0,1
train/epoch,▁▁▁▁▁▄▄▄▅▅███████
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇██
train/learning_rate,▁▃▅█████████████
train/loss,█▇▇▆▅▄▄▄▃▃▂▃▂▂▁▂
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,2.12
train/global_step,81.0
train/learning_rate,0.0002
train/loss,0.7119
train/total_flos,2453232974561280.0
train/train_loss,0.8589
train/train_runtime,1386.6806
train/train_samples_per_second,4.026
train/train_steps_per_second,0.504


Model saved


Merge the LORA to the main model

In [13]:
# load the trained model from the output directory
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    trainingArgs.output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)


In [14]:
# Merge LoRA with the base model and save the merged model
lora_merged_model = trained_model.merge_and_unload()
lora_merged_model.save_pretrained("merged",safe_serialization=True)
tokenizer.save_pretrained("merged")

#push merged model to the hub
lora_merged_model.push_to_hub("codegen-350M-mono-python-18k-alpaca")
tokenizer.push_to_hub("codegen-350M-mono-python-18k-alpaca")

pytorch_model.bin:   0%|          | 0.00/713M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/megzhou123/codegen-350M-mono-python-18k-alpaca/commit/51736185d0553176b374d016789f85926c3fdb6f', commit_message='Upload tokenizer', commit_description='', oid='51736185d0553176b374d016789f85926c3fdb6f', pr_url=None, pr_revision=None, pr_num=None)

Test the model

In [15]:
instruction="Write a Python program to generate a Markov chain given a text input."
input="Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'"

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.
### Task:
{instruction}
### Input:
{input}
### Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():

print(f"-------------------------\n")
print(f"Prompt:\n{prompt}\n")
print(f"-------------------------\n")

print(f"Base Model Response :\n")
output_base = base_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(output_base.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")

print(f"Trained Model Response :\n")
trained_model = lora_merged_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(trained_model.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")

print(f"LORA Model Response :\n")
output_trained_lora = lora_merged_model.generate(input_ids=input_ids, max_new_tokens=500, do_sample=True, top_p=0.9,temperature=0.5)
print(f"{tokenizer.batch_decode(output_trained_lora.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"-------------------------\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-------------------------

Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.
### Task:
Write a Python program to generate a Markov chain given a text input.
### Input:
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'
### Response:


-------------------------

Base Model Response :



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Import the necessary modules
import random

# Create a Markov chain
def make_chain(text):
    # Create a list of all the words in the text
    words = text.split()
    # Create a list of the words in the Markov chain
    words_in_chain = []
    for word in words:
        words_in_chain.append(word)
    # Create a random number for the next word
    next_word = words_in_chain[random.randint(0, len(words_in_chain)-1)]
    # Create a Markov chain
    markov_chain = [next_word]
    # Iterate through the words in the Markov chain
    for word in words:
        # If the word is not the current word, add it to the Markov chain
        if word!= next_word:
            markov_chain.append(word)
    # Return the Markov chain
    return markov_chain

# Generate a Markov chain
text = "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


import random

# generate a Markov chain
def generate_markov_chain(text):
    # create an empty list to store the words
    words = []
    # iterate through the text
    for i in text:
        # append the word to the list
        words.append(i)
    # create a Markov chain
    chain = []
    # create a list of the words
    for i in range(len(words)):
        # get the current word
        word = words[i]
        # get the previous word
        prev_word = words[i-1]
        # get the next word
        next_word = words[i+1]
        # create a list of the next words
        next_words = [next_word, prev_word]
        # add the next words to the chain
        chain.append(next_words)
    # return the chain
    return chain

# test the function
text = "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the us