In [1]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl

Import all the necessary packages.

In [2]:
import torch, multiprocessing
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig

  from .autonotebook import tqdm as notebook_tqdm


# SmolLM 135M
# Distilled Supervised Fine-tuning

First, activate the use of bfloat16 and FlashAttention if they are compatible with your GPU.
Then, load the tokenizer and configure padding.

In [3]:
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention.")
else:
  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention.")

model_name = "HuggingFaceTB/SmolLM-135M"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 2
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Your GPU is compatible with FlashAttention.


Load the version of ultrachat prepared by Hugging Face. I only load 5% of the test split to speed up validation.

In [4]:
# Full SFT training split plus the first 5% of the SFT test split,
# both taken from the same UltraChat repository.
dataset_train_sft, dataset_test_sft = (
    load_dataset("HuggingFaceH4/ultrachat_200k", split=split_name)
    for split_name in ("train_sft", "test_sft[:5%]")
)

Load the model that we will train with SFT and activate gradient checkpointing to save memory.

In [5]:
# Base model for SFT, placed entirely on GPU 0 with the attention backend
# selected earlier. No torch_dtype is passed here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    device_map={"": 0},
)

# Gradient checkpointing (reentrant variant) to save memory during training.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs=dict(use_reentrant=True)
)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


For this demonstration, I trained for only 4000 steps. One epoch would be ideal.

In [7]:
# SFT hyperparameters, grouped by concern (keyword order has no effect).
training_arguments = SFTConfig(
    # Checkpointing
    output_dir="./sft_smollm_135M/",
    save_steps=500,
    # Optimizer and schedule: 4000 steps, linear schedule with 30 warmup steps
    optim="adamw_torch",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=30,
    max_steps=4000,
    # Batching: 8 sequences per device, accumulated over 2 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=8,
    max_seq_length=2048,
    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Evaluate and log every 50 steps
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    log_level="debug",
)

Start training:

In [8]:
# Assemble the SFT trainer from the model, tokenizer, datasets and config
# prepared in the previous cells.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train_sft,
    eval_dataset=dataset_test_sft,
)

# Kept as the final expression so the returned training summary is displayed.
trainer.train()

Map:   0%|          | 0/207865 [00:00<?, ? examples/s]


ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

# Distilled DPO

Load the model that will be trained with DPO.

In [9]:
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention.")
else:

  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention.")

model_name = "HuggingFaceTB/SmolLM-135M"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 2
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

model = AutoModelForCausalLM.from_pretrained(
          model_name, attn_implementation=attn_implementation, device_map={"": 0}
)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


Your GPU is compatible with FlashAttention.


We will use as a reference model our checkpoint trained with SFT.

In [None]:
# Reference model for DPO: the step-4000 checkpoint written by the SFT run,
# loaded on GPU 0 with the attention backend selected earlier.
ref_model = AutoModelForCausalLM.from_pretrained(
    "./sft_smollm_135M/checkpoint-4000",
    attn_implementation=attn_implementation,
    device_map={"": 0},
)

Format UltraFeedback with a default chat template for DPO training.

In [None]:
# Preference pairs: `dataset` is a two-element list, [train_prefs, test_prefs].
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs","test_prefs"])

# ChatML-style template: every message is wrapped in <|im_start|>role ... <|im_end|>,
# optionally followed by an opening assistant turn.
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

def process(row):
    """Render the `chosen` and `rejected` entries of one row to plain text.

    Both columns are passed through the tokenizer's chat template without
    tokenizing; the row is updated in place and returned.
    """
    # NOTE(review): `prompt` is left untouched while `chosen`/`rejected` look like
    # full conversations -- confirm the trainer does not see the prompt twice.
    for column in ("chosen", "rejected"):
        row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
    return row

# Apply the formatting to both splits, one worker per CPU core, always
# recomputing instead of reading the datasets cache.
dataset[:] = [
    split.map(
        process,
        num_proc=multiprocessing.cpu_count(),
        load_from_cache_file=False,
    )
    for split in dataset
]

print(dataset)

Downloading readme:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/61135 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

Map (num_proc=12):   0%|          | 0/2000 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
No chat template is set for this tokenizer, falling back to a default clas

[Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 61135
}), Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 2000
})]


For this demonstration, I trained for only 4000 steps. DPO learns very slowly so one epoch would be ideal. I didn't search for a better learning rate. A higher learning rate may yield better results.

In [None]:
# DPO hyperparameters.
training_arguments = DPOConfig(
        output_dir="./dpo_smollm_135M/",
        # `eval_strategy` is the current name of this option; the former
        # `evaluation_strategy` spelling is deprecated, and the SFT config in
        # this notebook already uses the new name.
        eval_strategy="steps",
        do_eval=True,
        optim="adamw_torch",
        # 8 sequences per device, accumulated over 2 steps.
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=8,
        log_level="debug",
        save_steps=500,
        # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
        fp16= not torch.cuda.is_bf16_supported(),
        bf16= torch.cuda.is_bf16_supported(),
        logging_steps=50,
        learning_rate=1e-7,
        eval_steps=50,
        max_steps=4000,
        warmup_steps=30,
        lr_scheduler_type="linear",
        beta=0.1,
)



Start DPO training

In [None]:
# Assemble the DPO trainer: policy model, SFT reference model, and the
# formatted preference splits (index 0 = train, index 1 = eval).
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    train_dataset=dataset[0],
    eval_dataset=dataset[1],
    args=training_arguments,
)

# Kept as the final expression so the returned training summary is displayed.
trainer.train()



Map:   0%|          | 0/61135 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 61,135
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 4,000
  Number of trainable parameters = 134,515,008
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
50,1.3637,1.47255,-4.453824,-4.213479,0.415,-0.240345,-466.539001,-492.149139,4.155413,4.032761
100,1.5111,1.472799,-4.451575,-4.212327,0.4185,-0.239247,-466.527496,-492.126617,4.101677,3.984634
150,1.4436,1.470805,-4.44961,-4.212429,0.4195,-0.237181,-466.528503,-492.106995,4.153218,4.030635
200,1.4195,1.470412,-4.44685,-4.208982,0.4175,-0.237867,-466.494019,-492.079376,4.155098,4.032518
250,1.4724,1.469971,-4.4467,-4.209635,0.4185,-0.237064,-466.500549,-492.077881,4.12049,4.001235
300,1.4158,1.469701,-4.443916,-4.206506,0.419,-0.23741,-466.469269,-492.050049,4.091438,3.97551
350,1.5797,1.470456,-4.443616,-4.205503,0.4175,-0.238113,-466.459259,-492.047028,4.143157,4.021668
400,1.4158,1.468347,-4.439083,-4.202416,0.4175,-0.236666,-466.428406,-492.001709,4.14228,4.020862
450,1.357,1.469454,-4.439311,-4.200433,0.4185,-0.238877,-466.408569,-492.003998,4.158603,4.035491
500,1.4389,1.468502,-4.436774,-4.198261,0.417,-0.238513,-466.386841,-491.978638,4.134239,4.013571



***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/dpo_smollm_135M/checkpoint-500
Configuration saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-500/config.json
Configuration saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-500/generation_config.json
Model weights saved in ./drive/MyDrive/dp

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
50,1.3637,1.47255,-4.453824,-4.213479,0.415,-0.240345,-466.539001,-492.149139,4.155413,4.032761
100,1.5111,1.472799,-4.451575,-4.212327,0.4185,-0.239247,-466.527496,-492.126617,4.101677,3.984634
150,1.4436,1.470805,-4.44961,-4.212429,0.4195,-0.237181,-466.528503,-492.106995,4.153218,4.030635
200,1.4195,1.470412,-4.44685,-4.208982,0.4175,-0.237867,-466.494019,-492.079376,4.155098,4.032518
250,1.4724,1.469971,-4.4467,-4.209635,0.4185,-0.237064,-466.500549,-492.077881,4.12049,4.001235
300,1.4158,1.469701,-4.443916,-4.206506,0.419,-0.23741,-466.469269,-492.050049,4.091438,3.97551
350,1.5797,1.470456,-4.443616,-4.205503,0.4175,-0.238113,-466.459259,-492.047028,4.143157,4.021668
400,1.4158,1.468347,-4.439083,-4.202416,0.4175,-0.236666,-466.428406,-492.001709,4.14228,4.020862
450,1.357,1.469454,-4.439311,-4.200433,0.4185,-0.238877,-466.408569,-492.003998,4.158603,4.035491
500,1.4389,1.468502,-4.436774,-4.198261,0.417,-0.238513,-466.386841,-491.978638,4.134239,4.013571



***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/dpo_smollm_135M/checkpoint-2000
Configuration saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-2000/config.json
Configuration saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-2000/generation_config.json
Model weights saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-2000/model.safetensors
tokenizer config file saved in ./drive/MyDrive/dpo_smollm_135M/checkpoint-2000/tokenizer_co

TrainOutput(global_step=4000, training_loss=1.4520859870910645, metrics={'train_runtime': 14629.2239, 'train_samples_per_second': 4.375, 'train_steps_per_second': 0.273, 'total_flos': 0.0, 'train_loss': 1.4520859870910645, 'epoch': 1.0468463752944255})

# SmolLM 360M
The same but for the 360M version.

In [None]:
import torch, multiprocessing
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig

In [None]:
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention.")
else:
  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention.")

model_name = "HuggingFaceTB/SmolLM-360M"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 2
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

# Full SFT training split plus the first 5% of the SFT test split.
dataset_train_sft, dataset_test_sft = (
    load_dataset("HuggingFaceH4/ultrachat_200k", split=split_name)
    for split_name in ("train_sft", "test_sft[:5%]")
)

# Base model for SFT, placed entirely on GPU 0 with the attention backend
# selected above. No torch_dtype is passed here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    device_map={"": 0},
)

# Gradient checkpointing (reentrant variant) to save memory during training.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs=dict(use_reentrant=True)
)

# SFT hyperparameters, grouped by concern (keyword order has no effect).
training_arguments = SFTConfig(
    # Checkpointing
    output_dir="./sft_smollm_360M/",
    save_steps=500,
    # Optimizer and schedule: 4000 steps, linear schedule with 30 warmup steps
    optim="adamw_torch",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=30,
    max_steps=4000,
    # Batching: 8 sequences per device, accumulated over 2 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=8,
    max_seq_length=2048,
    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Evaluate and log every 50 steps
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    log_level="debug",
)

# Assemble the SFT trainer from the objects prepared above.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train_sft,
    eval_dataset=dataset_test_sft,
)

# Kept as the final expression so the returned training summary is displayed.
trainer.train()

Your GPU is compatible with FlashAttention.


tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attentio

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



Map:   0%|          | 0/207865 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.


Map:   0%|          | 0/1156 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 207,865
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 4,000
  Number of trainable parameters = 361,821,120
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Detected flash_attn version: 2.6.3


Step,Training Loss,Validation Loss
50,1.8196,1.706369
100,1.6092,1.579805
150,1.5273,1.549133
200,1.5498,1.533507
250,1.5261,1.52407
300,1.5073,1.517131
350,1.4875,1.511837
400,1.4945,1.507245
450,1.5003,1.503666
500,1.4981,1.500105



***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8
Saving model checkpoint to ./drive/M

Step,Training Loss,Validation Loss
50,1.8196,1.706369
100,1.6092,1.579805
150,1.5273,1.549133
200,1.5498,1.533507
250,1.5261,1.52407
300,1.5073,1.517131
350,1.4875,1.511837
400,1.4945,1.507245
450,1.5003,1.503666
500,1.4981,1.500105



***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8

***** Running Evaluation *****
  Num examples = 1156
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/sft_smollm_360M/checkpoint-4000
Configuration saved in ./drive/MyDrive/sft_smollm_360M/checkpoint-4000/config.json
Configuration saved in ./drive/MyDrive/sft_smollm_360M/checkpoint-4000/generation_config.json
Model weights saved in ./drive/MyDrive

TrainOutput(global_step=4000, training_loss=1.4588746814727782, metrics={'train_runtime': 29672.85, 'train_samples_per_second': 2.157, 'train_steps_per_second': 0.135, 'total_flos': 2.279362120992e+17, 'train_loss': 1.4588746814727782, 'epoch': 0.3078817733990148})

In [None]:
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
  !pip install flash-attn
  attn_implementation='flash_attention_2'
  print("Your GPU is compatible with FlashAttention.")
else:

  attn_implementation='eager'
  print("Your GPU is not compatible with FlashAttention.")

model_name = "HuggingFaceTB/SmolLM-360M"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 2
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

model = AutoModelForCausalLM.from_pretrained(
          model_name, attn_implementation=attn_implementation, device_map={"": 0}
)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

ref_model = AutoModelForCausalLM.from_pretrained(
          "./sft_smollm_360M/checkpoint-4000", attn_implementation=attn_implementation, device_map={"": 0}
)

# Preference pairs: `dataset` is a two-element list, [train_prefs, test_prefs].
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=["train_prefs","test_prefs"])

# ChatML-style template: every message is wrapped in <|im_start|>role ... <|im_end|>,
# optionally followed by an opening assistant turn.
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"


def process(row):
    """Render the `chosen` and `rejected` entries of one row to plain text.

    Both columns are passed through the tokenizer's chat template without
    tokenizing; the row is updated in place and returned.
    """
    # NOTE(review): `prompt` is left untouched while `chosen`/`rejected` look like
    # full conversations -- confirm the trainer does not see the prompt twice.
    for column in ("chosen", "rejected"):
        row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
    return row

# Apply the formatting to both splits, one worker per CPU core, always
# recomputing instead of reading the datasets cache.
dataset[:] = [
    split.map(
        process,
        num_proc=multiprocessing.cpu_count(),
        load_from_cache_file=False,
    )
    for split in dataset
]

print(dataset)

# DPO hyperparameters.
training_arguments = DPOConfig(
        output_dir="./dpo_smollm_360M/",
        # `eval_strategy` is the current name of this option; the former
        # `evaluation_strategy` spelling is deprecated, and the SFT config in
        # this notebook already uses the new name.
        eval_strategy="steps",
        do_eval=True,
        optim="adamw_torch",
        # 8 sequences per device, accumulated over 2 steps.
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=8,
        log_level="debug",
        save_steps=500,
        # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
        fp16= not torch.cuda.is_bf16_supported(),
        bf16= torch.cuda.is_bf16_supported(),
        logging_steps=50,
        learning_rate=1e-7,
        eval_steps=50,
        max_steps=4000,
        warmup_steps=30,
        lr_scheduler_type="linear",
        beta=0.1,
)

# Assemble the DPO trainer: policy model, SFT reference model, and the
# formatted preference splits (index 0 = train, index 1 = eval).
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    train_dataset=dataset[0],
    eval_dataset=dataset[1],
    args=training_arguments,
)

# Kept as the final expression so the returned training summary is displayed.
trainer.train()