# Install Required Libraries

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets einops

# Log in to WandB to track finetuning metrics
# Log in to Hugging Face

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install wandb
!wandb login

# Load & Quantize the model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"":0})
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Import IA3 Config

In [None]:
from peft import IA3Config, get_peft_model

config = IA3Config(
    peft_type="IA3",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    feedforward_modules=["v_proj"],
)

model = get_peft_model(model, config)

In [None]:
print(model)

PeftModelForCausalLM(
  (base_model): IA3Model(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 4096x1])
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 1x4096])
              )
              (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear(in

# Load and preprocessing the dataset

In [None]:
from datasets import load_dataset

# Split the data into 10% test, 10% validation, 80% training
train_data = load_dataset("Hieu-Pham/cooking_squad_splitted", split='train')
val_data = load_dataset("Hieu-Pham/cooking_squad_splitted", split='validation')

def group_train_data(dataset):

    grouped = []

    for row in dataset:
      answers = row['answers']
      input = 'Question: ' + row['question'] + ' Context: ' + row['context'] + ' Answer: ' + answers['text']
      grouped.append(input)

    return grouped

input_column = group_train_data(train_data)
train_data = train_data.add_column('Inputs', input_column)

train_data = train_data.map(
    lambda row: tokenizer(row["Inputs"], truncation=True),
    batched=True,
    remove_columns=train_data.column_names,
)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Preprocessing test and validation dataset

def group_test_data(dataset):

    grouped = []

    for row in dataset:
      answers = row['answers']
      input = 'Question: ' + row['question'] + ' Context: ' + row['context']
      grouped.append(input)

    return grouped

# Validation dataset
output_column = group_test_data(val_data)
val_data = val_data.add_column('Outputs', output_column)

val_data = val_data.map(
    lambda row: tokenizer(row["Outputs"], truncation=True),
    batched=True,
    remove_columns=val_data.column_names,
)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

# Training Configurations and Start Training

In [None]:
import transformers

training_args = transformers.TrainingArguments(
    output_dir="Llama2-7B-IA3-cooking-text-gen",
    evaluation_strategy="steps",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_steps=10,
    logging_steps=20,
    max_grad_norm=0.3,
    max_steps=200,
    warmup_ratio=0.03,
    fp16=True,
    optim="paged_adamw_8bit",
    push_to_hub=True,
)


In [None]:
# tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhieupham[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
20,1.4873,1.500468
40,1.441,1.470042
60,1.4368,1.452831
80,1.4118,1.43866
100,1.3861,1.42674
120,1.3774,1.415866
140,1.3626,1.407074
160,1.359,1.401021
180,1.3461,1.397552
200,1.3475,1.396376


TrainOutput(global_step=200, training_loss=1.3955546092987061, metrics={'train_runtime': 279.7295, 'train_samples_per_second': 11.44, 'train_steps_per_second': 0.715, 'total_flos': 3.559575369842688e+16, 'train_loss': 1.3955546092987061, 'epoch': 1.78})