In [1]:
# %pip install trl  # uncomment on a fresh environment; %pip installs into the active kernel

In [2]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM
from transformers import TrainerCallback
from transformers import AutoTokenizer

# import matplotlib.pyplot as plt
# from IPython.display import display, clear_output
from transformers import EarlyStoppingCallback

from datasets import load_dataset
import os

# Expose only CUDA device index 1 to this process and set the PyTorch
# CUDA allocator option used for this run.
# NOTE(review): these are assigned after the transformers/trl imports above;
# presumably that is still early enough because no CUDA work has run yet — confirm.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run configuration: which base checkpoint is fine-tuned and where the result is written.
# IN_MODEL_NAME = "Qwen3-0.6B"
VERSION = "0.9"
IN_MODEL_NAME = "Llama-3.2-1B-Instruct"
OUT_MODEL_NAME = f"{IN_MODEL_NAME}_SFT_{VERSION}"

# One project root instead of repeating a user-specific absolute path.
# Set the SML_PROJECT_ROOT environment variable to run on another machine;
# the default reproduces the original locations exactly.
PROJECT_ROOT = os.environ.get("SML_PROJECT_ROOT", "/home/odedh/SML-For-Debug").rstrip("/")

INPUT_MODEL_PATH = f"{PROJECT_ROOT}/models/pre_trained/{IN_MODEL_NAME}"
OUTPUT_MODEL_PATH = f"{PROJECT_ROOT}/models/trained/{OUT_MODEL_NAME}"

In [4]:
# Read the dataset from its local directory, then echo the object so the
# cell output lists the available splits and their columns.
dataset_dir = "/home/odedh/SML-For-Debug/data/dataset2.0/"
dataset = load_dataset(dataset_dir)
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2861
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 317
    })
})

In [5]:
# load the model
model = AutoModelForCausalLM.from_pretrained(
    INPUT_MODEL_PATH,
    trust_remote_code=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(INPUT_MODEL_PATH)

---
Adding new classification tokens (currently disabled: the cells in this section are commented out)


---

In [6]:
# specials = ["<BUG_TRUE>", "<BUG_FALSE>"]       # pick any unique strings
# tokenizer.add_special_tokens({"additional_special_tokens": specials})
# model.resize_token_embeddings(len(tokenizer))  # must be done after adding tokens
# tokenizer.save_pretrained(f"{OUTPUT_MODEL_PATH}/tokenizer")  # save tokenizer

In [7]:
# for token in specials:
#     print(f"Token: {token}, ID: {tokenizer.convert_tokens_to_ids(token)}")

In [8]:
# def freeze_bottom_layers(model, freeze_ratio: float = 0.5):
#     """
#     Freezes the bottom fraction of transformer layers.
#     For LLaMA-like models (model.model.layers structure).
    
#     Args:
#         model: Huggingface AutoModelForCausalLM
#         freeze_ratio: Fraction of layers to freeze from the bottom (0.5 = freeze lower 50%)
#     """
#     print("Total trainable parameters before freezing:",
#           sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6, "M")
#     # Get all transformer blocks
#     layers = model.model.layers
#     total_layers = len(layers)
#     num_to_freeze = int(total_layers * freeze_ratio)

#     print(f"Freezing bottom {num_to_freeze}/{total_layers} layers ({freeze_ratio*100:.0f}%).")

#     # Freeze selected layers
#     for idx, layer in enumerate(layers):
#         if idx < num_to_freeze:
#             for param in layer.parameters():
#                 param.requires_grad = False

#     # Freeze embeddings too (recommended)
#     for param in model.model.embed_tokens.parameters():
#         param.requires_grad = False

#     # Always keep lm_head trainable
#     print(f"Finished freezing. Total trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M")

# freeze_bottom_layers(model, freeze_ratio=0.1)


In [9]:
# Total number of optimizer steps for the run.
max_steps = 3000

# Logging, checkpointing and evaluation share one interval so that every
# evaluation has a matching checkpoint for load_best_model_at_end.
eval_interval = 500

# Early-stopping patience is counted in evaluation rounds, not optimizer steps.
# The run performs max_steps // eval_interval = 6 evaluations in total, so the
# previous value of 500 could never trigger and early stopping was effectively
# disabled. Two rounds means: stop after 1000 steps without eval_loss improving.
early_stopping_patience = 2

training_args = SFTConfig(
    output_dir=OUTPUT_MODEL_PATH,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    learning_rate=1e-05,
    gradient_accumulation_steps=2,
    max_steps=max_steps,
    warmup_steps=max_steps // 10,  # 10% of max_steps
    # num_train_epochs = 10,

    logging_strategy="steps",
    logging_steps=eval_interval,
    save_strategy="steps",
    save_steps=eval_interval,
    eval_strategy="steps",
    eval_steps=eval_interval,
    bf16=True,
    data_seed=42,
    gradient_checkpointing=False,
    report_to="none",
    # Best checkpoint is chosen by validation loss and restored after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    max_grad_norm=1.0,
    save_total_limit=10,
    optim="adamw_torch",
    # NOTE(review): no cycle count is configured here; the logged LR curve from
    # the last run shows a single decay to 0 after warmup — confirm whether
    # restarts were intended.
    lr_scheduler_type="cosine_with_restarts",
    weight_decay=0.04,
)


trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Stop once eval_loss has failed to improve for `early_stopping_patience`
# consecutive evaluations; a threshold of 0.0 counts any improvement.
trainer.add_callback(
    EarlyStoppingCallback(
        early_stopping_patience=early_stopping_patience,
        early_stopping_threshold=0.0,
    )
)

In [10]:
class LRSchedulerLogger(TrainerCallback):
    """Print the optimizer's current learning rate at a fixed step interval.

    Args:
        every_n_steps: Print once every this many global steps. Defaults to
            100, the interval that was previously hard-coded.

    Raises:
        ValueError: If ``every_n_steps`` is not a positive integer.
    """

    def __init__(self, every_n_steps: int = 100):
        if every_n_steps <= 0:
            raise ValueError("every_n_steps must be a positive integer")
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Print the first param group's LR when the global step hits the interval."""
        if state.global_step % self.every_n_steps != 0:
            return
        # The Trainer hands the live optimizer to callback events through kwargs,
        # so the callback no longer depends on the notebook-global `trainer`.
        optimizer = kwargs.get("optimizer")
        if optimizer is None:
            # Nothing to report if the caller did not supply an optimizer.
            return
        current_lr = optimizer.param_groups[0]["lr"]
        print(f"Step {state.global_step}: LR = {current_lr:.6e}")


trainer.add_callback(LRSchedulerLogger())

In [11]:
# Run fine-tuning and echo the returned summary as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
500,0.2909,
1000,0.1117,
1500,0.0449,
2000,0.0131,
2500,0.0018,
3000,0.0002,


Step 100: LR = 3.333333e-06
Step 200: LR = 6.666667e-06
Step 300: LR = 1.000000e-05
Step 400: LR = 9.966192e-06
Step 500: LR = 9.865224e-06
Step 600: LR = 9.698463e-06
Step 700: LR = 9.468163e-06
Step 800: LR = 9.177439e-06
Step 900: LR = 8.830222e-06
Step 1000: LR = 8.431208e-06
Step 1100: LR = 7.985793e-06
Step 1200: LR = 7.500000e-06
Step 1300: LR = 6.980399e-06
Step 1400: LR = 6.434016e-06
Step 1500: LR = 5.868241e-06
Step 1600: LR = 5.290724e-06
Step 1700: LR = 4.709276e-06
Step 1800: LR = 4.131759e-06
Step 1900: LR = 3.565984e-06
Step 2000: LR = 3.019601e-06
Step 2100: LR = 2.500000e-06
Step 2200: LR = 2.014207e-06
Step 2300: LR = 1.568792e-06
Step 2400: LR = 1.169778e-06
Step 2500: LR = 8.225609e-07
Step 2600: LR = 5.318368e-07
Step 2700: LR = 3.015369e-07
Step 2800: LR = 1.347756e-07
Step 2900: LR = 3.380821e-08
Step 3000: LR = 0.000000e+00


TrainOutput(global_step=3000, training_loss=0.07709696399917206, metrics={'train_runtime': 3556.4978, 'train_samples_per_second': 10.122, 'train_steps_per_second': 0.844, 'total_flos': 2.1474211282078925e+17, 'train_loss': 0.07709696399917206})