In [None]:
%%capture
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install trl==0.7.7
!pip install tqdm==4.66.1

In [None]:
import os
from huggingface_hub import login
from google.colab import userdata

# Read credentials from the Colab secret store so nothing is hard-coded in the notebook.
hf_token = userdata.get('HF_TOKEN')
wandb_api_key = userdata.get('wandb')

# Authenticate this session with the Hugging Face Hub.
login(token=hf_token)

# Weights & Biases settings are exported as environment variables; later cells
# read WANDB_NAME back via os.getenv for run names and output paths.
os.environ["WANDB_API_KEY"] = wandb_api_key
os.environ["WANDB_PROJECT"] = "Fine-tuning Llama2-with-stack-exchange"
# The note must describe the model actually trained in this notebook (Llama-2-7b).
os.environ["WANDB_NOTES"] = "Fine-tune meta-llama/Llama-2-7b-hf on lvwerra/stack-exchange-paired"
os.environ["WANDB_NAME"] = "ft-Llama2-with-stack-exchange-paired"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Base checkpoint used for the tokenizer here and for the model load further down.
model_name = "meta-llama/Llama-2-7b-hf"

# `trust_remote_code` is left at its default (False), matching the model load below,
# which passes trust_remote_code=False explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

  _torch_pytree._register_pytree_node(


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
import tqdm

def prepare_sample_text(example):
    """Format one dataset record as a single "Question:...\\n\\nAnswer:..." string.

    Args:
        example: Mapping that provides the keys ``'question'`` and ``'response_j'``.

    Returns:
        The question and the ``response_j`` answer joined into one prompt string.
    """
    template = "Question:{}\n\nAnswer:{}"
    return template.format(example['question'], example['response_j'])

def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """Estimate the average number of characters per token in the dataset.

    Args:
        dataset: Iterable of records accepted by ``prepare_sample_text``.
        tokenizer: Tokenizer exposing ``is_fast`` and either ``__call__(...).tokens()``
            (fast tokenizers) or ``tokenize(...)`` (slow tokenizers).
        nb_examples: Maximum number of records to sample from the start of ``dataset``.

    Returns:
        Total characters divided by total tokens over the sampled records.
    """
    char_count = 0
    token_count = 0

    # Pairing with range() caps the number of records pulled from the dataset.
    sampled = zip(range(nb_examples), iter(dataset))
    for _, record in tqdm.tqdm(sampled, total=nb_examples):
        sample_text = prepare_sample_text(record)
        char_count += len(sample_text)
        tokens = (
            tokenizer(sample_text).tokens()
            if tokenizer.is_fast
            else tokenizer.tokenize(sample_text)
        )
        token_count += len(tokens)

    return char_count / token_count


# When True the dataset is loaded in streaming mode; the split cell branches on this flag.
streaming = True

# Training split of the "data/finetune" directory of the stack-exchange-paired dataset.
dataset = load_dataset(
    "lvwerra/stack-exchange-paired",
    split="train",
    data_dir="data/finetune",
    streaming=streaming,
)

dataset

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

<datasets.iterable_dataset.IterableDataset at 0x784f0c28c8e0>

In [None]:
import tqdm

# Split parameters, named so they can be tuned in one place.
VALID_SIZE = 4000            # streaming mode: first N records are held out for validation
SHUFFLE_BUFFER_SIZE = 5000   # streaming mode: size of the shuffle buffer
TEST_FRACTION = 0.005        # non-streaming mode: fraction held out for validation
SPLIT_SEED = 42              # fixed seed so the shuffle/split is reproducible across runs

if streaming:
    # Hold out the first VALID_SIZE records, train on the (shuffled) remainder.
    valid_data = dataset.take(VALID_SIZE)
    train_data = dataset.skip(VALID_SIZE).shuffle(
        buffer_size=SHUFFLE_BUFFER_SIZE, seed=SPLIT_SEED
    )
else:
    # Keep `dataset` untouched so this cell can be re-run without failing.
    dataset_splits = dataset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
    train_data = dataset_splits["train"]
    valid_data = dataset_splits["test"]

# Characters-per-token estimate, passed to ConstantLengthDataset in the next cell.
chars_per_token = chars_token_ratio(train_data, tokenizer)
chars_per_token

100%|██████████| 400/400 [00:05<00:00, 68.47it/s] 


3.2580971659919027

In [None]:
from trl.trainer import ConstantLengthDataset

# Settings shared by the training and validation ConstantLengthDataset wrappers.
_packed_dataset_kwargs = dict(
    formatting_func=prepare_sample_text,
    seq_length=1024,
    chars_per_token=chars_per_token,
)

# Training stream is created with infinite=True.
train_dataset = ConstantLengthDataset(
    tokenizer, train_data, infinite=True, **_packed_dataset_kwargs
)

# Validation stream is created with infinite=False.
valid_dataset = ConstantLengthDataset(
    tokenizer, valid_data, infinite=False, **_packed_dataset_kwargs
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import torch
from peft import AutoPeftModelForCausalLM


# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
_quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_quantization_settings)

# Load the base checkpoint quantized according to `bnb_config`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=False,
)

# Turn off the generation cache flag on the model config before training.
base_model.config.use_cache = False
print(base_model)



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [None]:
from peft import LoraConfig, TaskType

# LoRA settings for the SFT stage: rank-8 adapters on the query and value projections.
_sft_lora_settings = {
    "task_type": TaskType.CAUSAL_LM,
    "target_modules": ["q_proj", "v_proj"],
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
}
peft_config = LoraConfig(**_sft_lora_settings)

peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
from trl import SFTTrainer

from transformers import TrainingArguments

# Hyper-parameters for the supervised fine-tuning (SFT) stage.
training_args = TrainingArguments(
    output_dir="./sft",
    max_steps=100,
    logging_steps=10,
    save_steps=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    group_by_length=False,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    weight_decay=0.05,
    optim="paged_adamw_32bit",
    # bf16 matches the bfloat16 compute/storage dtype the base model was loaded with
    # and the precision used by the DPO stage later in this notebook.
    # NOTE(review): requires a GPU with bfloat16 support — confirm for your runtime.
    bf16=True,
    # The datasets are already tokenized/packed, so the Trainer must not drop columns.
    remove_unused_columns=False,
    run_name=os.getenv("WANDB_NAME"),
    report_to="wandb",
)

# The datasets are pre-built ConstantLengthDataset objects, hence packing=True and
# no max_seq_length here (sequence length is fixed when the datasets are created).
sft_trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    peft_config=peft_config,
    packing=True,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
)

sft_trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msosamaali[0m ([33mmidconstruct[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.752
20,1.771
30,1.7458
40,1.7113
50,1.5952
60,1.5663
70,1.6101
80,1.5504
90,1.5711
100,1.5446


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=100, training_loss=1.6417860984802246, metrics={'train_runtime': 1568.1429, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.064, 'total_flos': 3.2497031184384e+16, 'train_loss': 1.6417860984802246, 'epoch': 1.0})

In [None]:
# Metadata forwarded to sft_trainer.push_to_hub.
# Optional fields 'tasks' and 'dataset_tags' are intentionally left unset.
kwargs = dict(
    model_name=str(os.getenv("WANDB_NAME")),
    finetuned_from='meta-llama/Llama-2-7b-hf',
    dataset='lvwerra/stack-exchange-paired',
)

# Upload the tokenizer under the run name, then push the trainer's output.
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
sft_trainer.push_to_hub(**kwargs)

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/OsamaAliMid/sft/commit/89bc653a97bee42f2727847140318e8d25fba3bf', commit_message='End of training', commit_description='', oid='89bc653a97bee42f2727847140318e8d25fba3bf', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import gc

# Drop the references to the SFT trainer and the quantized base model, then run the
# garbage collector and release cached CUDA memory before the next model is loaded.
del sft_trainer
del base_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from peft import AutoPeftModelForCausalLM

# Load the adapter from the SFT output directory. The relative path matches the
# `output_dir="./sft"` used for training and the save path below, instead of a
# hard-coded absolute Colab path.
model = AutoPeftModelForCausalLM.from_pretrained(
    "./sft", device_map="auto", torch_dtype=torch.bfloat16
)

# Fold the LoRA weights into the base model so it can be saved as a plain checkpoint.
model = model.merge_and_unload()

model.save_pretrained("./sft/final_merged_checkpoint", safe_serialization=True)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from typing import Dict

def return_prompt_and_responses(samples) -> Dict[str, list]:
    """Convert a batch of raw records into the prompt/chosen/rejected layout.

    Args:
        samples: Batch mapping with the keys ``"question"``, ``"response_j"`` and
            ``"response_k"``; ``"question"`` must be an iterable of strings.

    Returns:
        A dict with three entries:
        ``"prompt"`` — one ``"Question:<q>\\n\\nAnswer:"`` string per question,
        ``"chosen"`` — ``samples["response_j"]`` unchanged,
        ``"rejected"`` — ``samples["response_k"]`` unchanged.
    """
    return {
        "prompt": [
            "Question:" + question + "\n\nAnswer:" for question in samples["question"]
        ],
        "chosen": samples["response_j"],
        "rejected": samples["response_k"],
    }


def get_stack_exchange_paired(data_dir="data/rl", sanity_check=False, cache_dir=None, num_proc=24):
    """Load a split directory of stack-exchange-paired in prompt/chosen/rejected form.

    Args:
        data_dir: Sub-directory of the dataset repository to load (e.g. ``"data/rl"``
            or ``"data/evaluation"``).
        sanity_check: When True, keep only the first ``min(len(dataset), 1000)`` rows.
        cache_dir: Forwarded to ``load_dataset`` as its ``cache_dir``.
        num_proc: Number of processes used by ``dataset.map``.

    Returns:
        The dataset mapped through ``return_prompt_and_responses`` with all original
        columns removed, leaving ``prompt``, ``chosen`` and ``rejected``.
    """
    dataset = load_dataset(
        "lvwerra/stack-exchange-paired",
        split="train",
        # Honor the caller's choice; a hard-coded "data/rl" here would make every
        # call (including the evaluation one) load the RL split.
        data_dir=data_dir,
        cache_dir=cache_dir,
    )
    original_columns = dataset.column_names

    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 1000)))

    return dataset.map(
        return_prompt_and_responses,
        batched=True,
        num_proc=num_proc,
        remove_columns=original_columns,
    )

In [None]:
def _load_merged_checkpoint_4bit():
    """Load the merged SFT checkpoint in 4-bit with float16 as the torch dtype."""
    return AutoModelForCausalLM.from_pretrained(
        "./sft/final_merged_checkpoint",
        torch_dtype=torch.float16,
        load_in_4bit=True,
        low_cpu_mem_usage=True,
    )


# Policy model that the DPO stage will train; cache flag is switched off on its config.
model = _load_merged_checkpoint_4bit()
model.config.use_cache = False

# Second, independent copy of the same checkpoint used as the reference model.
model_ref = _load_merged_checkpoint_4bit()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Tokenizer for the DPO stage. The pad token comes from this tokenizer's own EOS token,
# so the cell does not depend on the SFT `tokenizer` object still being in memory.
tokenizer_dpo = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer_dpo.pad_token = tokenizer_dpo.eos_token



In [None]:
def _within_char_budget(row, limit=1024):
    """Return True when prompt+chosen and prompt+rejected are each at most `limit` characters."""
    prompt_chars = len(row["prompt"])
    return (
        prompt_chars + len(row["chosen"]) <= limit
        and prompt_chars + len(row["rejected"]) <= limit
    )


# Preference pairs for training, restricted to rows that fit the character budget.
train_dataset = get_stack_exchange_paired(
    data_dir="data/rl", sanity_check=False
).filter(_within_char_budget)

# Small evaluation set (sanity_check=True keeps at most 1000 rows before filtering).
eval_dataset = get_stack_exchange_paired(
    data_dir="data/evaluation", sanity_check=True
).filter(_within_char_budget)

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=24):   0%|          | 0/7435908 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7435908 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Map (num_proc=24):   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from trl import DPOTrainer

# Hyper-parameters for the DPO stage; checkpoints and the final model are written to a
# directory named after the W&B run.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    max_steps=1000,
    logging_steps=10,
    save_steps=100,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-4,
    evaluation_strategy="steps",
    eval_steps=100,
    output_dir=os.getenv("WANDB_NAME"),
    report_to="wandb",
    lr_scheduler_type="cosine",
    warmup_steps=100,
    optim="paged_adamw_32bit",
    bf16=True,
    # The DPO trainer needs the raw prompt/chosen/rejected columns.
    remove_unused_columns=False,
    run_name=os.getenv("WANDB_NAME"),
)

# LoRA adapters for the DPO stage. Only module names that exist in the Llama
# architecture loaded above are listed (its attention block exposes q_proj, k_proj,
# v_proj and o_proj). NOTE(review): names such as out_proj / fc_in / fc_out / wte
# belong to other architectures and presumably matched nothing here — confirm that
# the set of adapted layers is unchanged.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Use the tokenizer prepared for this stage rather than the SFT-stage one.
    tokenizer=tokenizer_dpo,
    peft_config=peft_config,
    max_prompt_length=512,
    max_length=1024,
)

dpo_trainer.train()
dpo_trainer.save_model(os.getenv("WANDB_NAME"))

Map:   0%|          | 0/1659503 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Could not estimate the number of tokens of the input, floating-point operations will not be computed
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
100,0.6692,0.661575,-0.588043,-0.782953,0.5875,0.19491,-162.974548,-165.013153,-0.304814,-0.306855
200,0.6671,0.648494,-1.368391,-1.684834,0.595833,0.316443,-171.993347,-172.816635,-0.328192,-0.330097
300,0.6695,0.645608,-0.668569,-0.919908,0.6375,0.25134,-164.344086,-165.81842,-0.574364,-0.57315
400,0.743,0.635801,-0.664716,-0.971068,0.65,0.306352,-164.855682,-165.779877,-0.535195,-0.544784
500,0.7019,0.613986,-0.879764,-1.2835,0.625,0.403736,-167.980011,-167.930374,-0.590016,-0.590142
600,0.6882,0.618361,-1.306417,-1.784432,0.633333,0.478015,-172.989319,-172.196884,-0.407729,-0.413643
700,0.6714,0.598996,-1.089747,-1.586793,0.670833,0.497047,-171.012939,-170.030182,-0.413906,-0.42031
800,0.648,0.617786,-0.885202,-1.255131,0.6625,0.36993,-167.69632,-167.984726,-0.433447,-0.435952
900,0.6515,0.61154,-0.765391,-1.152965,0.654167,0.387574,-166.674637,-166.786636,-0.436778,-0.440582
1000,0.6494,0.609192,-0.785381,-1.185849,0.666667,0.400468,-167.003494,-166.986526,-0.432479,-0.436465


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

In [None]:
# Persist the trained DPO model under "<run name>/final_checkpoint".
final_checkpoint_dir = os.getenv("WANDB_NAME") + "/final_checkpoint"
dpo_trainer.model.save_pretrained(final_checkpoint_dir)