In [1]:
%%capture
# Install the fine-tuning stack at pinned versions so the environment is
# reproducible. The %%capture magic on the first line hides pip's output.
!pip install -U -q transformers==4.39.3
!pip install -U -q accelerate==0.28.0
!pip install -U -q datasets==2.18.0
!pip install -U -q peft==0.10.0
!pip install -U -q bitsandbytes==0.43.1
!pip install -U -q trl==0.8.6

In [2]:
import torch

if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m103.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [3]:
import os
from huggingface_hub import login
from google.colab import userdata

# Secrets are read from the Colab user-data store instead of being hardcoded.
hf_token = userdata.get('HF_TOKEN')
wandb_api_key = userdata.get('wandb')

# Authenticate with the Hugging Face Hub.
login(token=hf_token)

# Run-wide settings are kept in environment variables; later cells read
# MODEL_NAME, DATASET and WANDB_NAME back through os.getenv.
os.environ.update({
    "WANDB_API_KEY": wandb_api_key,
    "WANDB_PROJECT": "Fine-tuning Llama 3 8B",
    "WANDB_NAME": "ft-Llama3-8b-orpo",
    "MODEL_NAME": "meta-llama/Meta-Llama-3-8B",
    "DATASET": "mlabonne/orpo-dpo-mix-40k",
})

# Backend switches: deterministic cuDNN kernels on, memory-efficient SDP off.
torch.backends.cudnn.deterministic = True
torch.backends.cuda.enable_mem_efficient_sdp(False)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Print a per-dtype memory estimate (largest layer, total size, training with
# Adam) for the model named by the MODEL_NAME environment variable, which an
# earlier cell sets through os.environ.
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `meta-llama/Meta-Llama-3-8B` from `transformers`...
config.json: 100% 654/654 [00:00<00:00, 4.33MB/s]
┌──────────────────────────────────────────────────────┐
│Memory Usage for loading `meta-llama/Meta-Llama-3-8B` │
├───────┬─────────────┬──────────┬─────────────────────┤
│ dtype │Largest Layer│Total Size│ Training using Adam │
├───────┼─────────────┼──────────┼─────────────────────┤
│float32│   1.96 GB   │ 28.21 GB │      112.83 GB      │
│float16│  1002.0 MB  │ 14.1 GB  │       56.42 GB      │
│  int8 │   501.0 MB  │ 7.05 GB  │       28.21 GB      │
│  int4 │   250.5 MB  │ 3.53 GB  │       14.1 GB       │
└───────┴─────────────┴──────────┴─────────────────────┘


In [5]:
from transformers import BitsAndBytesConfig
from peft import LoraConfig

# Quantization settings: 4-bit NF4 with double quantization. The compute
# dtype is the torch_dtype chosen earlier from the GPU capability.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)

# LoRA settings: rank 16, alpha 32, applied to the projection modules below.
lora_target_modules = [
    "up_proj",
    "down_proj",
    "gate_proj",
    "k_proj",
    "q_proj",
    "v_proj",
    "o_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer for the base model named in the MODEL_NAME environment variable.
tokenizer=AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
from transformers import AutoModelForCausalLM

# Load the base model with the 4-bit quantization config, placing every
# module on GPU 0. The attention backend selected from the GPU capability
# (flash_attention_2 or eager) is passed explicitly so that selection, and
# the conditional flash-attn install behind it, actually takes effect.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
)

# Display the device the model ended up on.
model.device

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

device(type='cuda', index=0)

In [8]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, summing ``numel()`` over all
    parameters and over those with ``requires_grad`` set, then prints both
    counts and the trainable percentage with two decimals.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    A model without parameters reports ``0.00`` percent instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio so an empty model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {trainable_pct:.2f}")

# Parameter counts for the freshly loaded quantized model, before any k-bit
# training preparation.
print_trainable_parameters(model)

trainable params: 1050939392 || all params: 4540600320 || trainable%: 23.15


In [9]:
from trl import setup_chat_format
from peft import prepare_model_for_kbit_training

# Both names are rebound to the objects returned by setup_chat_format, so
# this cell is not idempotent. In the recorded run the total parameter count
# was larger after this cell than before it.
model, tokenizer=setup_chat_format(model, tokenizer)

# In the recorded run no parameter had requires_grad set after this call
# (the next cell printed "trainable params: 0").
model=prepare_model_for_kbit_training(model)

In [10]:
# Re-check the counts now that the model has gone through setup_chat_format
# and prepare_model_for_kbit_training.
print_trainable_parameters(model)

trainable params: 0 || all params: 4540616704 || trainable%: 0.00


In [11]:
from datasets import load_dataset

# Load the dataset named in the DATASET environment variable with
# split='all', then keep a seeded random sample of 1000 rows.
ds = (
    load_dataset(os.getenv('DATASET'), split='all')
    .shuffle(seed=42)
    .select(range(1000))
)

ds

Downloading readme:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 127M/127M [00:08<00:00, 15.8MB/s]


Generating train split:   0%|          | 0/44245 [00:00<?, ? examples/s]

Dataset({
    features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
    num_rows: 1000
})

In [12]:
# NOTE(review): ds was already shuffled with seed=42 when it was subsampled
# at load time, so this second shuffle looks redundant — confirm before removing.
ds=ds.shuffle(seed=42)

def format_chat_template(row):
    """Replace the "chosen" and "rejected" fields of ``row`` with chat-template text.

    Each field is passed through the module-level tokenizer's
    ``apply_chat_template`` with ``tokenize=False``. The row is modified in
    place and returned; other fields are left untouched.
    """
    for column in ("chosen", "rejected"):
        row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
    return row

# Apply the chat template to every row, using one worker process per CPU
# core reported by os.cpu_count().
ds=ds.map(format_chat_template, num_proc=os.cpu_count())

  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
# Hold out 1% of the rows for evaluation. The seed pins the split so the
# same rows land in train and test on every run, which keeps evaluation
# metrics comparable between runs.
ds = ds.train_test_split(test_size=0.01, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
        num_rows: 990
    })
    test: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt', 'question'],
        num_rows: 10
    })
})

In [16]:
# Uninstall scipy and install it again.
# NOTE(review): the reason for this reinstall is not recorded; the logged run
# ended up with the same scipy version it removed and a gensim dependency
# conflict warning — confirm whether this cell is still needed.
# NOTE(review): execution counts jump from 13 to 16 here, so this cell was
# presumably added or re-run out of order.
%shell
!pip uninstall scipy -y
!pip install scipy

Found existing installation: scipy 1.14.1
Uninstalling scipy-1.14.1:
  Successfully uninstalled scipy-1.14.1
Collecting scipy
  Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
Installing collected packages: scipy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.14.1 which is incompatible.[0m[31m
[0mSuccessfully installed scipy-1.14.1


In [18]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [19]:
from trl import ORPOConfig, ORPOTrainer

# ORPO training configuration: one epoch, linear LR schedule with 10 warmup
# steps, 8-bit paged AdamW, metrics reported to Weights & Biases.
orpo_args=ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Batch size 2 with 4 accumulation steps gives 8 samples per optimizer
    # update per device; the logged run took 123 steps over the 990 training rows.
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    # A float below 1: presumably read as a fraction of total training steps
    # (TODO confirm). The logged run evaluated at steps 25, 50, 75 and 100.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    # WANDB_NAME doubles as the W&B run name and the local output directory.
    run_name=os.getenv('WANDB_NAME'),
    output_dir=os.getenv('WANDB_NAME')
)

# The model passed here had no trainable parameters when last inspected;
# presumably the trainer attaches the LoRA adapters described by peft_config
# itself (TODO confirm).
trainer=ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    peft_config=peft_config,
    tokenizer=tokenizer
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msosamaali[0m ([33mmidconstruct[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
25,1.6579,1.246896,27.244,0.367,0.184,-0.155951,-0.231801,0.5,0.075849,-2.318006,-1.559511,-1.230027,-1.019903,1.177551,-0.693453,0.744012
50,1.1014,1.029664,27.3259,0.366,0.183,-0.126213,-0.199421,0.5,0.073207,-1.994208,-1.262135,-1.400586,-1.374306,0.958702,-0.70962,0.713689
75,0.9391,0.946264,27.2896,0.366,0.183,-0.110619,-0.184396,0.5,0.073777,-1.843962,-1.106188,-1.597035,-1.55041,0.875438,-0.708254,0.718451
100,0.676,0.898318,27.089,0.369,0.185,-0.09993,-0.17478,0.4,0.07485,-1.747798,-0.999296,-1.546594,-1.531461,0.82806,-0.702578,0.731441


TrainOutput(global_step=123, training_loss=1.4661583125106687, metrics={'train_runtime': 8922.4391, 'train_samples_per_second': 0.111, 'train_steps_per_second': 0.014, 'total_flos': 0.0, 'train_loss': 1.4661583125106687, 'epoch': 0.99})

In [20]:
# The Hub repository shares its name with the W&B run (WANDB_NAME).
hub_repo = os.getenv("WANDB_NAME")

# Metadata forwarded to trainer.push_to_hub.
kwargs = dict(
    model_name=hub_repo,
    finetuned_from=os.getenv('MODEL_NAME'),
    dataset=os.getenv("DATASET"),
)

# Upload the tokenizer first, then the trainer's artifacts; the second call
# is the cell's last expression, so its return value is displayed.
tokenizer.push_to_hub(hub_repo)
trainer.push_to_hub(**kwargs)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/OsamaAliMid/ft-Llama3-8b-orpo/commit/12f5476bcf398a3b9ef463bbbfb371715f13926b', commit_message='End of training', commit_description='', oid='12f5476bcf398a3b9ef463bbbfb371715f13926b', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
import gc

# Drop the references to the trainer and model, run the garbage collector,
# then ask torch to empty its CUDA cache. Re-running this cell without
# re-creating both names raises NameError on the del statement.
del trainer, model
gc.collect()

torch.cuda.empty_cache()

In [27]:
# Reload the tokenizer from the base model named in MODEL_NAME. This rebinds
# `tokenizer`, replacing the object returned earlier by setup_chat_format.
# NOTE(review): presumably the chat format is re-applied in a later cell — confirm.
tokenizer=AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
