# Install Required Libraries

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproje

# Log in to Weights & Biases (WandB) to track fine-tuning metrics
# Log in to Hugging Face

In [None]:
!pip install wandb
!wandb login

In [3]:
from huggingface_hub import notebook_login

# Render the interactive Hugging Face login widget and store the access token.
# Needed before the later cells: the training cell sets push_to_hub=True, and
# presumably the Llama 2 checkpoint is gated — confirm access for your account.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load & Quantize the model

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the base checkpoint that gets fine-tuned.
model_id = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization with double quantization enabled; bfloat16 is the
# dtype requested for compute.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# The "" key of device_map refers to the root module, i.e. the whole model is
# placed on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Prepare the quantized model for training

In [5]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the quantized base model, then hand it to
# PEFT's k-bit preparation helper and keep the returned model.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing itself — the explicit call is kept as in the original; confirm
# against the installed peft version before removing it.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Configure LoRA and wrap the model

In [6]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-64 adapters on the attention query/value
# projections, alpha 16, 10% adapter dropout, no bias terms trained.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Rebind `model` to the PEFT-wrapped version; re-running this cell on its own
# wraps the already-wrapped model again.
model = get_peft_model(model, config)

# Load and preprocess the dataset

In [7]:
from datasets import load_dataset

# The Hub dataset already ships with separate splits; only the train and
# validation splits are loaded here, by name.
dataset_repo = "Hieu-Pham/cooking_squad_splitted"
train_data, val_data = (
    load_dataset(dataset_repo, split=split_name)
    for split_name in ('train', 'validation')
)

def preprocess_dataset(dataset):
    """Build one prompt-plus-answer training string per dataset row.

    Each row must provide ``question``, ``context`` and ``answers`` (a mapping
    with a ``text`` entry). The resulting string has the form::

        [INST] Question: <question> Context: <context> [/INST]\\nAnswer: <answer>

    Args:
        dataset: Iterable of row mappings (e.g. a ``datasets.Dataset``).

    Returns:
        list[str]: One formatted string per row, in input order.
    """

    def _answer_text(answers):
        # `answers['text']` is used as a plain string by the original code.
        # SQuAD-style datasets store a list of answer strings instead, which
        # would make the string concatenation raise TypeError; in that case
        # use the first answer (empty string if the list is empty).
        text = answers['text']
        if isinstance(text, (list, tuple)):
            return text[0] if text else ''
        return text

    # A comprehension replaces the manual append loop and avoids the local
    # name `input`, which shadowed the builtin.
    return [
        '[INST] Question: ' + row['question']
        + ' Context: ' + row['context']
        + ' [/INST]\nAnswer: ' + _answer_text(row['answers'])
        for row in dataset
    ]

# Build the prompt strings and attach them as a temporary 'Inputs' column.
input_column = preprocess_dataset(train_data)
train_data = train_data.add_column('Inputs', input_column)

# Tokenize in batches and drop every pre-existing column (including 'Inputs'),
# leaving only the tokenizer's outputs.
# `truncation=True` alone is a no-op here: the tokenizer reports no predefined
# maximum length and falls back to "no truncation". Pass an explicit limit;
# 4096 is the Llama 2 context window, so only sequences the model could not
# process anyway are shortened.
train_data = train_data.map(
    lambda row: tokenizer(row["Inputs"], truncation=True, max_length=4096),
    batched=True,
    remove_columns=train_data.column_names,
)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# Preprocess the validation split the same way as the training split: build
# the prompt strings, attach them as a temporary 'Outputs' column, tokenize.
output_column = preprocess_dataset(val_data)
val_data = val_data.add_column('Outputs', output_column)

# `truncation=True` has no effect without an explicit `max_length` for this
# tokenizer (it reports no predefined maximum). 4096 is the Llama 2 context
# window, so only sequences the model could not process anyway are shortened.
val_data = val_data.map(
    lambda row: tokenizer(row["Outputs"], truncation=True, max_length=4096),
    batched=True,
    remove_columns=val_data.column_names,
)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

# Training Configurations and Start Training

In [9]:
import transformers

# tokenizer.pad_token = tokenizer.eos_token

# Hyper-parameters for the run, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; since transformers is installed from git main, confirm the
# argument name against the installed version.
training_args = transformers.TrainingArguments(
    # Where checkpoints go; they are also pushed to the Hub.
    output_dir="Llama2-7B-QLoRA-cooking-text-gen-prompting",
    push_to_hub=True,
    # Batch of 4 per device, accumulated over 4 steps before each update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimizer and schedule.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    max_steps=300,
    # Mixed precision.
    fp16=True,
    # Evaluation, logging and checkpoint cadence (in optimizer steps).
    evaluation_strategy="steps",
    logging_steps=20,
    save_steps=10,
)

# mlm=False selects the causal-LM (not masked-LM) collation mode.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=causal_lm_collator,
)

# Turn off the generation KV cache during training to silence the warnings.
model.config.use_cache = False
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhieupham[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
20,1.4104,1.046584
40,0.9569,0.936805
60,0.9286,0.916666
80,0.9028,0.905166
100,0.8772,0.897664
120,0.8639,0.893782
140,0.8478,0.891183
160,0.8611,0.888349
180,0.8547,0.88461
200,0.8423,0.88143




TrainOutput(global_step=300, training_loss=0.9007607587178548, metrics={'train_runtime': 1057.2106, 'train_samples_per_second': 4.54, 'train_steps_per_second': 0.284, 'total_flos': 5.546633446703923e+16, 'train_loss': 0.9007607587178548, 'epoch': 2.67})