In [1]:
# Install the requirements in Google Colab
!pip install transformers datasets trl huggingface_hub peft evaluation

# Authenticate to Hugging Face

from huggingface_hub import login
login()

# for convenience you can create an environment variable containing your hub token as HF_TOKEN



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
model_id = 'HuggingfaceTB/SmolLM-360M'
dataset_id = 'Intel/orca_dpo_pairs'

In [3]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer, setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype='auto').to(device)
model, tokenizer = setup_chat_format(model, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb): LlamaRotaryEm

In [5]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    modules_to_save=["lm_head"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [6]:
# PEFT 적용
lora_model = get_peft_model(model, peft_config)

# 전체 모델 파라미터 개수
total_params = sum(p.numel() for p in model.parameters())

# LoRA 파라미터 개수 (학습 가능한 파라미터만 포함)
lora_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)

# LoRA 파라미터 비율 계산
lora_ratio = (lora_params / total_params) * 100
print(lora_ratio)

12.931811828954734


In [7]:
finetune_name = "SmolLM2-DPO-example"
finetune_tags = ["smol-course", "module_1"]

In [8]:
dataset = load_dataset(dataset_id, split='train')

In [9]:
train_size = 1000
eval_size = 100

dataset = dataset.shuffle(seed=123)

train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, train_size+eval_size))

In [10]:
def chatml_format(example):
    system_message = [{"role": "system", "content": example['system']}] if len(example['system']) > 0 else []

    user_message = [{'role':"user", "content": example['question']}]

    chosen_message = [{"role": "assistant", "content": example['chosen'].strip()}]
    rejected_message = [{"role": "assistant", "content": example['rejected'].strip()}]

    prompt = tokenizer.apply_chat_template(system_message+user_message, tokenize=False, add_generation_prompt=True)

    return {
        'prompt': prompt,
        'chosen': tokenizer.apply_chat_template(chosen_message, tokenize=False),
        'rejected': tokenizer.apply_chat_template(rejected_message, tokenize=False)
    }

In [11]:
original_columns = dataset.column_names

train_dataset = train_dataset.map(
    chatml_format,
    remove_columns=original_columns
)

eval_dataset = eval_dataset.map(
    chatml_format,
    remove_columns=original_columns
)

In [12]:
example = eval_dataset[0]['prompt']
print(example)
print('\n')
print('Chosen Generation')
print(eval_dataset[0]['chosen'])
print('\n')
print('Rejected Generation')
print(eval_dataset[0]['rejected'])

<|im_start|>system
You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.<|im_end|>
<|im_start|>user
Given the question: Lee was a top lawyer and after dealing with a very high profile murder case, he brought the trial to a conclusion with such damning evidence to convict the murderer.  Given the context: What will happen to Lee?  Possible answers: be happy the murderer was convicted, not commended, praised for their hard work
The answer is:<|im_end|>
<|im_start|>assistant



Chosen Generation
<|im_start|>assistant
To determine the best answer, let's analyze each of the possible answers step by step:

1. Be happy the murderer was convicted: This is a possible emotional outcome for Lee, as he successfully concluded the high-profile murder case. It is a natural reaction for a professional to feel content with achieving a positive result in their work. However,

In [13]:
# Generate response
inputs = tokenizer(example, return_tensors="pt", padding=True).to(device)
outputs = model.generate(**inputs,
                         max_new_tokens=200,
                         do_sample=True,
                         temperature=0.6
                        )
print("Before training:")
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# user 메시지를 제외하고 assistant 응답만 추출
assistant_response = full_response[len(example):].strip()
print(assistant_response)

Before training:
en to Lee? 3) what is the question? 4) what is the answer to the question? 5) what will happen to Lee? 6) what is the question? 7) what is the answer to the question? 8) what will happen to Lee? 9) what is the question? 10) what is the answer to the question? 11) what will happen to Lee? 12) what is the question? 13) what is the answer to the question? 14) what will happen to Lee? 15) what is the question? 16) what is the answer to the question? 17) what will happen to Lee? 18) what is the question? 19) what is the answer to the question? 20) what will happen to Lee? 2


In [14]:
steps = 50

training_args = DPOConfig(
    output_dir='DPO_Lora',
    per_device_train_batch_size=8,
    eval_strategy='steps',
    eval_steps=steps,
    num_train_epochs=5,
    logging_steps=steps,
    save_steps=steps,
    bf16=True,
    dataloader_pin_memory=2,
    gradient_accumulation_steps=2,
    learning_rate=float(1e-5),
)

In [15]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
)

Extracting prompt in train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkyu5787[0m ([33mkyu5787-pohang-university-of-science-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
50,0.6895,0.68324,-0.010304,-0.032369,0.557692,0.022064,-327.619965,-363.512634,5.056277,4.482193
100,0.6709,0.6599,-0.016892,-0.085266,0.798077,0.068374,-327.685883,-364.041565,5.066031,4.495579
150,0.6392,0.640328,-0.051117,-0.165676,0.846154,0.114559,-328.028107,-364.845673,5.078936,4.510261
200,0.6175,0.607417,-0.069086,-0.256648,0.894231,0.187562,-328.207794,-365.755371,5.091789,4.530377
250,0.5999,0.592201,-0.08708,-0.310413,0.875,0.223334,-328.387695,-366.29303,5.097951,4.536731
300,0.5747,0.588188,-0.099348,-0.336452,0.884615,0.237105,-328.510437,-366.553436,5.105952,4.546926


TrainOutput(global_step=310, training_loss=0.6304628510628977, metrics={'train_runtime': 438.4937, 'train_samples_per_second': 11.403, 'train_steps_per_second': 0.707, 'total_flos': 0.0, 'train_loss': 0.6304628510628977, 'epoch': 4.928})

In [17]:
example = eval_dataset[0]['prompt']
inputs = tokenizer(example, return_tensors="pt", padding=True).to(device)
outputs = model.generate(**inputs,
                         max_new_tokens=200,
                         do_sample=True,
                         temperature=0.6
                        )
print("After training:")
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# user 메시지를 제외하고 assistant 응답만 추출
assistant_response = full_response[len(example):].strip()
print(assistant_response)

After training:
matrix, which means that he was trapped in it for 13 years. How long he would have been trapped in the matrix? Given the context: What will happen to Neo?  Possible answers: be happy, be in prison, be in a coma, be in a hospital, be in a jail, be in a car, be in a jail, be in a hospital, be in a car, be in a jail, be in a hospital, be in a car, be in a jail Given the context: What will happen to Neo?  Possible answers: be happy, be in prison, be in a coma, be in a hospital, be in a hospital, be in a car, be in a hospital, be in a jail, be in a car, be in a jail Given the context: What will happen to Neo?  Possible answers: be happy, be in prison, be in


In [18]:
trainer.push_to_hub(tags=finetune_tags)

adapter_model.safetensors:   0%|          | 0.00/121M [00:00<?, ?B/s]

events.out.tfevents.1739954633.6b9272e9afa4.621.0:   0%|          | 0.00/8.84k [00:00<?, ?B/s]

events.out.tfevents.1739954783.6b9272e9afa4.2772.0:   0%|          | 0.00/19.1k [00:00<?, ?B/s]

events.out.tfevents.1739955380.6b9272e9afa4.5202.0:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.20k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/kyu5787/DPO_Lora/commit/76292a8e051a4620692c9272ef856e0df7941a75', commit_message='End of training', commit_description='', oid='76292a8e051a4620692c9272ef856e0df7941a75', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kyu5787/DPO_Lora', endpoint='https://huggingface.co', repo_type='model', repo_id='kyu5787/DPO_Lora'), pr_revision=None, pr_num=None)