In [1]:
# !pip install trl

In [2]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM
from transformers import TrainerCallback
from transformers import AutoTokenizer

# import matplotlib.pyplot as plt
# from IPython.display import display, clear_output
from transformers import EarlyStoppingCallback

from datasets import load_dataset
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# IN_MODEL_NAME = "Qwen3-0.6B"
VERSION = "0.4"
IN_MODEL_NAME = "Llama-3.2-1B-Instruct"
OUT_MODEL_NAME = f"{IN_MODEL_NAME}_SFT_{VERSION}"

INPUT_MODEL_PATH = f"/home/odedh/SML-For-Debug/models/pre_trained/{IN_MODEL_NAME}"
OUTPUT_MODEL_PATH = f"/home/odedh/SML-For-Debug/models/trained/{OUT_MODEL_NAME}"

In [4]:
dataset = load_dataset("/home/odedh/SML-For-Debug/data/dataset/")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 9010
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1002
    })
})

In [6]:
# load the model
model = AutoModelForCausalLM.from_pretrained(
    INPUT_MODEL_PATH,
    trust_remote_code=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(INPUT_MODEL_PATH)

---
Adding new classification tokens   


---

In [7]:
specials = ["<BUG_TRUE>", "<BUG_FALSE>"]       # pick any unique strings
tokenizer.add_special_tokens({"additional_special_tokens": specials})
model.resize_token_embeddings(len(tokenizer))  # must be done after adding tokens
tokenizer.save_pretrained(f"{OUTPUT_MODEL_PATH}/tokenizer")  # save tokenizer

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


('/home/odedh/SML-For-Debug/models/trained/Llama-3.2-1B-Instruct_SFT_0.4/tokenizer/tokenizer_config.json',
 '/home/odedh/SML-For-Debug/models/trained/Llama-3.2-1B-Instruct_SFT_0.4/tokenizer/special_tokens_map.json',
 '/home/odedh/SML-For-Debug/models/trained/Llama-3.2-1B-Instruct_SFT_0.4/tokenizer/chat_template.jinja',
 '/home/odedh/SML-For-Debug/models/trained/Llama-3.2-1B-Instruct_SFT_0.4/tokenizer/tokenizer.json')

In [8]:
for token in specials:
    print(f"Token: {token}, ID: {tokenizer.convert_tokens_to_ids(token)}")

Token: <BUG_TRUE>, ID: 128256
Token: <BUG_FALSE>, ID: 128257


In [9]:
def freeze_bottom_layers(model, freeze_ratio: float = 0.5):
    """
    Freezes the bottom fraction of transformer layers.
    For LLaMA-like models (model.model.layers structure).
    
    Args:
        model: Huggingface AutoModelForCausalLM
        freeze_ratio: Fraction of layers to freeze from the bottom (0.5 = freeze lower 50%)
    """
    print("Total trainable parameters before freezing:",
          sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6, "M")
    # Get all transformer blocks
    layers = model.model.layers
    total_layers = len(layers)
    num_to_freeze = int(total_layers * freeze_ratio)

    print(f"Freezing bottom {num_to_freeze}/{total_layers} layers ({freeze_ratio*100:.0f}%).")

    # Freeze selected layers
    for idx, layer in enumerate(layers):
        if idx < num_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False

    # Freeze embeddings too (recommended)
    for param in model.model.embed_tokens.parameters():
        param.requires_grad = False

    # Always keep lm_head trainable
    print(f"Finished freezing. Total trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M")

freeze_bottom_layers(model, freeze_ratio=0.5)


Total trainable parameters before freezing: 1235.818496 M
Freezing bottom 8/16 layers (50%).
Finished freezing. Total trainable params: 486.6M


In [10]:
max_steps = 10_000

training_args = SFTConfig(
    output_dir = OUTPUT_MODEL_PATH,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    learning_rate = 1e-05,
    gradient_accumulation_steps = 2,
    max_steps = max_steps,
    warmup_steps=max_steps // 10, # 10% of max_steps
    # num_train_epochs = 10,
    
    logging_strategy = "steps",
    logging_steps = 50,
    save_strategy = "steps",
    save_steps = 50,
    eval_strategy = "steps",
    eval_steps = 50,
    bf16 = True,
    data_seed=42,
    gradient_checkpointing=False,
    report_to = "none",
    load_best_model_at_end= True,
    metric_for_best_model= "eval_loss",
    max_grad_norm=1.0,
    save_total_limit=3,
    optim="adamw_torch",
    lr_scheduler_type="cosine_with_restarts",
    
    weight_decay=0.05,
)


trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

trainer.add_callback(EarlyStoppingCallback(
    early_stopping_patience=12,     # stop after 10 evals with no loss improvement
    early_stopping_threshold=0.0)
)

In [11]:
class LRSchedulerLogger(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0:
            current_lr = trainer.optimizer.param_groups[0]['lr']
            print(f"Step {state.global_step}: LR = {current_lr:.6e}")


trainer.add_callback(LRSchedulerLogger())

In [12]:
trainer.train()

Step,Training Loss,Validation Loss
50,0.6007,0.41433
100,0.2578,0.2088
150,0.2006,0.190598
200,0.1901,0.179042
250,0.1795,0.173582
300,0.1702,0.166501
350,0.1694,0.16478
400,0.1705,0.161659
450,0.156,0.158298
500,0.1649,0.156466


Step 100: LR = 1.000000e-06


Step 200: LR = 2.000000e-06


Step 300: LR = 3.000000e-06


Step 400: LR = 4.000000e-06


Step 500: LR = 5.000000e-06


Step 600: LR = 6.000000e-06


Step 700: LR = 7.000000e-06


Step 800: LR = 8.000000e-06


Step 900: LR = 9.000000e-06


Step 1000: LR = 1.000000e-05


Step 1100: LR = 9.996954e-06


Step 1200: LR = 9.987820e-06


Step 1300: LR = 9.972609e-06


Step 1400: LR = 9.951340e-06


Step 1500: LR = 9.924039e-06


Step 1600: LR = 9.890738e-06


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1650, training_loss=0.13693040515437271, metrics={'train_runtime': 3228.8879, 'train_samples_per_second': 49.553, 'train_steps_per_second': 3.097, 'total_flos': 1.57626688325419e+17, 'train_loss': 0.13693040515437271})