If you are running this notebook in VS Code, first check that PyTorch can see the GPU:

In [1]:
# Check if Torch supports CUDA: print the installed PyTorch version, then
# display whether a CUDA device is available (the last expression is the cell output).
import torch
print(torch.__version__)
torch.cuda.is_available()


2.1.0+cu121


True

Hyperparameters

In [1]:
import os

# Switch off parallelism in the tokenizers library through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hugging Face hub identifiers:
#   model_name   - base checkpoint that gets fine-tuned
#   dataset_name - instruction dataset used for training
#   new_model    - name of the fine-tuned model (output directory and hub repository)
model_name = "codellama/CodeLlama-7b-Instruct-hf"
dataset_name = "StarkWizard/cairo-instruct"
new_model = "StarkWizard/llama-2-7b-cairo-trained-PEFT"

# Training step budget handed to the trainer; tweak it to get the best out of the model.
max_steps = 700

Load Base Model and Tokenizer

In [2]:

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

# Quantization settings: 4-bit NF4 weights with double quantization and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# LoRA adapter configuration for a causal language-modelling task.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Tokenizer that ships with the base checkpoint; the end-of-sequence token is
# reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model loaded with the quantization settings above, with the whole
# model mapped to device 0.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
)

# Training does not use the cache.
model.config.use_cache = False
model.config.pretraining_tp = 1



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Import Dataset from Hugging Face

In [3]:
from datasets import load_dataset

# Load the dataset once, bypassing the local cache and skipping checksum /
# split-size verification, then pick out the two splits used for training.
# - A single load_dataset() call avoids downloading and regenerating the
#   whole dataset twice (once per requested split).
# - verification_mode="no_checks" is the replacement for the deprecated
#   ignore_verifications=True argument.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]




Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/245 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/245 [00:00<?, ? examples/s]

Check to see if everything is fine before launching training

Log in to the Hugging Face Hub and set up the trainer

In [6]:
# Authenticate with the Hugging Face Hub. The trainer in the next cell is
# created with push_to_hub=True and a later cell calls push_to_hub(), so the
# login has to happen before those cells run.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from trl import SFTTrainer
import transformers



# Build the supervised fine-tuning trainer: the quantized base model is
# trained through the LoRA adapter described by `peft_config`, reading the
# "text" column of the dataset, without packing, at up to 512 tokens per sample.
#
# NOTE(review): `eval_dataset` is supplied but no evaluation strategy is set,
# so with the Transformers default ("no") it is never scored during training.
# Confirm whether periodic evaluation is wanted.
supervised_finetuning_trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512,
    neftune_noise_alpha=5,
    args=transformers.TrainingArguments(
        output_dir=new_model,
        # Training length is controlled by max_steps alone. A positive
        # max_steps overrides num_train_epochs, so the former
        # num_train_epochs=1 was dead (the run covered ~8.6 epochs) and has
        # been dropped to avoid the contradiction.
        max_steps=max_steps,
        # One sample per forward pass, gradients accumulated over 3 passes
        # before each optimizer step.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=3,
        # Optimizer and learning-rate schedule.
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        fp16=True,
        # Logging, checkpointing and publishing.
        logging_steps=10,
        save_strategy="epoch",
        push_to_hub=True,
    ),
)


Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [5]:
# Run the fine-tuning loop; the value returned by train() is shown as the cell output.
supervised_finetuning_trainer.train()


  0%|          | 0/700 [00:00<?, ?it/s]

You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.5287, 'learning_rate': 0.00019995065603657316, 'epoch': 0.12}
{'loss': 2.8018, 'learning_rate': 0.00019970908853907026, 'epoch': 0.24}
{'loss': 2.4956, 'learning_rate': 0.00019926672020679736, 'epoch': 0.37}
{'loss': 1.9681, 'learning_rate': 0.00019862444191070408, 'epoch': 0.49}
{'loss': 1.8496, 'learning_rate': 0.0001977835471138027, 'epoch': 0.61}
{'loss': 1.5734, 'learning_rate': 0.00019674572926630567, 'epoch': 0.73}
{'loss': 1.5447, 'learning_rate': 0.0001955130783952423, 'epoch': 0.86}
{'loss': 1.4797, 'learning_rate': 0.00019408807689542257, 'epoch': 0.98}
{'loss': 1.4469, 'learning_rate': 0.00019247359453022407, 'epoch': 1.1}
{'loss': 1.1731, 'learning_rate': 0.00019067288265227082, 'epoch': 1.22}
{'loss': 1.1972, 'learning_rate': 0.0001886895676556415, 'epoch': 1.35}
{'loss': 1.2246, 'learning_rate': 0.00018652764367279461, 'epoch': 1.47}
{'loss': 1.2272, 'learning_rate': 0.00018419146453091701, 'epoch': 1.59}
{'loss': 1.0996, 'learning_rate': 0.0001816857349838956

TrainOutput(global_step=700, training_loss=0.8824873300961086, metrics={'train_runtime': 2017.6403, 'train_samples_per_second': 1.041, 'train_steps_per_second': 0.347, 'train_loss': 0.8824873300961086, 'epoch': 8.57})

If you are on Colab, push the model to the Hub or save it to Drive

In [8]:
# Upload the trainer's model to the Hub repository named by `new_model`.
supervised_finetuning_trainer.model.push_to_hub(new_model)

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StarkWizard/llama-2-7b-cairo-trained/commit/25f029151b1a42870258fcbb251d62be7e0e42ad', commit_message='Upload model', commit_description='', oid='25f029151b1a42870258fcbb251d62be7e0e42ad', pr_url=None, pr_revision=None, pr_num=None)