In [1]:
# !pip install trl

In [2]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM
from transformers import TrainerCallback
# import matplotlib.pyplot as plt
# from IPython.display import display, clear_output
from transformers import EarlyStoppingCallback

from datasets import load_dataset
import os

# Expose only GPU index 7 to CUDA and switch the PyTorch CUDA allocator to
# expandable segments. Both are set in one call, before any model is loaded.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "7",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Run configuration -------------------------------------------------------
# Base checkpoint to fine-tune (alternative tried earlier: "Qwen3-0.6B").
# IN_MODEL_NAME = "Qwen3-0.6B"
IN_MODEL_NAME = "Llama-3.2-1B-Instruct"

# Version tag of this SFT run; it becomes part of the output model name.
VERSION = "0.3"
OUT_MODEL_NAME = f"{IN_MODEL_NAME}_SFT_{VERSION}"

# Output location for checkpoints / the trained model, and the directory the
# pre-trained weights are read from.
OUTPUT_MODEL_PATH = f"/home/odedh/SML-For-Debug/models/trained/{OUT_MODEL_NAME}"
INPUT_MODEL_PATH = f"/home/odedh/SML-For-Debug/models/pre_trained/{IN_MODEL_NAME}"

In [4]:
# Load the dataset from the project's local dataset directory.
dataset = load_dataset(
    path="/home/odedh/SML-For-Debug/data/dataset/",
)

In [5]:
# Display the loaded dataset; the recorded output shows `train` and `test`
# splits, each with `prompt` and `completion` columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 9010
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1002
    })
})

In [6]:
# Load the base causal-LM checkpoint from the local pre-trained model directory.
#
# `trust_remote_code` is intentionally left at its default (False). That flag
# allows Python code shipped alongside a checkpoint to be executed at load
# time. The checkpoint configured in IN_MODEL_NAME (Llama-3.2-1B-Instruct) is an
# architecture transformers implements itself, so the permission is not needed.
# Re-enable it only for a checkpoint that requires custom modelling code AND
# whose code you have reviewed.
model = AutoModelForCausalLM.from_pretrained(INPUT_MODEL_PATH)

In [7]:
# def freeze_bottom_layers(model, freeze_ratio: float = 0.5):
#     """
#     Freeze the lower `freeze_ratio` fraction of transformer layers.
#     For Llama family (`model.model.layers`).
#     """
#     # 1. Get the list of block modules
#     blocks = model.model.layers      # Llama‑3 style
#     total_layers = len(blocks)
    
#     # 2. Decide how many to freeze
#     n_freeze = int(total_layers * freeze_ratio)
#     print(f"Freezing {n_freeze} / {total_layers} layers "
#           f"({freeze_ratio:.0%} of the network).")
    
#     # 3. Disable grads for selected layers
#     for idx, layer in enumerate(blocks):
#         if idx < n_freeze:
#             for param in layer.parameters():
#                 param.requires_grad = False
    
#     # 4. lm_head & final norm are never touched above, so they stay trainable
#     return model

# model = freeze_bottom_layers(model, freeze_ratio=0.5)  # lower half frozen

In [8]:
# Hard cap on optimizer steps; the LR warmup length below is derived from it.
max_steps = 5_000

training_args = SFTConfig(
    output_dir=OUTPUT_MODEL_PATH,
    report_to="none",
    data_seed=42,

    # --- batching / memory / precision ---
    # 8 sequences per device x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    bf16=True,

    # --- optimizer and LR schedule ---
    optim="adamw_torch",
    learning_rate=1e-05,
    weight_decay=0.04,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    warmup_steps=max_steps // 10,  # 10% of max_steps
    max_steps=max_steps,
    # num_train_epochs = 10,

    # --- logging / evaluation / checkpoint cadence (all every 50 steps) ---
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,

    # --- best-checkpoint selection ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)


# Build the SFT trainer: fine-tune on the `train` split and evaluate on the
# `test` split using the configuration assembled above.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

# Stop early once 10 consecutive evaluations bring no improvement in the
# tracked loss (any improvement counts, since the threshold is 0.0).
trainer.add_callback(
    EarlyStoppingCallback(
        early_stopping_threshold=0.0,
        early_stopping_patience=10,
    )
)

In [9]:
class LRSchedulerLogger(TrainerCallback):
    """Print the current learning rate at a fixed step interval.

    The optimizer is read from the keyword arguments the Trainer passes to
    callback events instead of from a notebook-level ``trainer`` variable, so
    the callback does not depend on cell execution order and reports the
    optimizer of whichever trainer it is attached to.

    Args:
        every_n_steps: Print once every this many global steps. Defaults to
            100.

    Raises:
        ValueError: If ``every_n_steps`` is not a positive integer.
    """

    def __init__(self, every_n_steps: int = 100):
        super().__init__()
        if every_n_steps <= 0:
            raise ValueError("every_n_steps must be a positive integer")
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, optimizer=None, **kwargs):
        """Print the first parameter group's learning rate on logging steps.

        Does nothing on steps that are not a multiple of ``every_n_steps`` or
        when no optimizer (or no parameter group) is available.
        """
        if state.global_step % self.every_n_steps != 0:
            return
        if optimizer is None or not optimizer.param_groups:
            return
        current_lr = optimizer.param_groups[0]["lr"]
        print(f"Step {state.global_step}: LR = {current_lr:.6e}")


trainer.add_callback(LRSchedulerLogger())

In [10]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,0.3526,0.181785
100,0.1557,0.1429
150,0.1483,0.1303
200,0.1407,0.126588
250,0.135,0.125657
300,0.1214,0.121831
350,0.1313,0.123198
400,0.1246,0.121852
450,0.1229,0.121838
500,0.128,0.121921


Step 100: LR = 2.000000e-06


Step 200: LR = 4.000000e-06


Step 300: LR = 6.000000e-06


Step 400: LR = 8.000000e-06


Step 500: LR = 1.000000e-05


Step 600: LR = 9.777778e-06


Step 700: LR = 9.555556e-06


Step 800: LR = 9.333333e-06


Step 900: LR = 9.111111e-06


Step 1000: LR = 8.888889e-06


Step 1100: LR = 8.666667e-06


Step 1200: LR = 8.444444e-06


Step 1300: LR = 8.222222e-06


Step 1400: LR = 8.000000e-06


Step 1500: LR = 7.777778e-06


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1550, training_loss=0.0960299619551628, metrics={'train_runtime': 4452.2265, 'train_samples_per_second': 17.969, 'train_steps_per_second': 1.123, 'total_flos': 1.458017952892846e+17, 'train_loss': 0.0960299619551628})