In [None]:
# !pip install trl

In [None]:
import os

# GPU selection and allocator settings are applied BEFORE importing trl /
# transformers. Those packages import torch, and these variables only take
# effect if they are set before CUDA is first probed or initialised; setting
# them after the imports can be silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from trl import SFTTrainer, SFTConfig  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    EarlyStoppingCallback,
    TrainerCallback,
)

# import matplotlib.pyplot as plt
# from IPython.display import display, clear_output

from datasets import load_dataset  # noqa: E402

In [None]:
# ---- Run configuration ---------------------------------------------------
# IN_MODEL_NAME = "Qwen3-0.6B"
VERSION = "0.5"
IN_MODEL_NAME = "Llama-3.2-1B-Instruct"
# Name of the fine-tuned artefact: "<base model>_SFT_<version>".
OUT_MODEL_NAME = f"{IN_MODEL_NAME}_SFT_{VERSION}"

# Root of the project checkout. Defaults to the original absolute path so the
# notebook behaves exactly as before, but can be redirected on another machine
# by exporting SML_PROJECT_ROOT before starting the kernel.
# A trailing "/" is stripped so the joined paths never contain "//".
PROJECT_ROOT = os.environ.get("SML_PROJECT_ROOT", "/home/odedh/SML-For-Debug").rstrip("/")

# Directory the base checkpoint is read from / directory training writes to.
INPUT_MODEL_PATH = f"{PROJECT_ROOT}/models/pre_trained/{IN_MODEL_NAME}"
OUTPUT_MODEL_PATH = f"{PROJECT_ROOT}/models/trained/{OUT_MODEL_NAME}"

In [None]:
# Location of the on-disk dataset. The default is the original absolute path;
# exporting SML_PROJECT_ROOT before starting the kernel relocates it without
# editing the notebook.
DATASET_DIR = (
    os.environ.get("SML_PROJECT_ROOT", "/home/odedh/SML-For-Debug").rstrip("/")
    + "/data/dataset/"
)

# Load every split found under DATASET_DIR; the 'train' and 'test' splits are
# the ones handed to the trainer later in the notebook.
dataset = load_dataset(DATASET_DIR)

In [None]:
# Show the loaded dataset object as this cell's output (a quick sanity check
# before its 'train' and 'test' entries are passed to the trainer).
dataset

In [None]:
# The tokenizer and the model weights are both read from the same local
# checkpoint directory (INPUT_MODEL_PATH).
tokenizer = AutoTokenizer.from_pretrained(INPUT_MODEL_PATH)

# NOTE(review): trust_remote_code=True permits custom Python shipped with the
# checkpoint to run at load time - confirm this checkpoint actually needs it.
model = AutoModelForCausalLM.from_pretrained(INPUT_MODEL_PATH, trust_remote_code=True)

---
Adding new classification tokens   


---

In [None]:
# specials = ["<BUG_TRUE>", "<BUG_FALSE>"]       # pick any unique strings
# tokenizer.add_special_tokens({"additional_special_tokens": specials})
# model.resize_token_embeddings(len(tokenizer))  # must be done after adding tokens
# tokenizer.save_pretrained(f"{OUTPUT_MODEL_PATH}/tokenizer")  # save tokenizer

In [None]:
# for token in specials:
#     print(f"Token: {token}, ID: {tokenizer.convert_tokens_to_ids(token)}")

In [None]:
# def freeze_bottom_layers(model, freeze_ratio: float = 0.5):
#     """
#     Freezes the bottom fraction of transformer layers.
#     For LLaMA-like models (model.model.layers structure).
    
#     Args:
#         model: Huggingface AutoModelForCausalLM
#         freeze_ratio: Fraction of layers to freeze from the bottom (0.5 = freeze lower 50%)
#     """
#     print("Total trainable parameters before freezing:",
#           sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6, "M")
#     # Get all transformer blocks
#     layers = model.model.layers
#     total_layers = len(layers)
#     num_to_freeze = int(total_layers * freeze_ratio)

#     print(f"Freezing bottom {num_to_freeze}/{total_layers} layers ({freeze_ratio*100:.0f}%).")

#     # Freeze selected layers
#     for idx, layer in enumerate(layers):
#         if idx < num_to_freeze:
#             for param in layer.parameters():
#                 param.requires_grad = False

#     # Freeze embeddings too (recommended)
#     for param in model.model.embed_tokens.parameters():
#         param.requires_grad = False

#     # Always keep lm_head trainable
#     print(f"Finished freezing. Total trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M")

# freeze_bottom_layers(model, freeze_ratio=0.1)


In [None]:
# Hard cap on optimizer steps; early stopping (below) may end the run sooner.
max_steps = 10_000
# Single cadence shared by logging, checkpointing and evaluation.
STEP_INTERVAL = 50

training_args = SFTConfig(
    # --- output / reporting ---
    output_dir=OUTPUT_MODEL_PATH,
    report_to="none",
    logging_strategy="steps",
    logging_steps=STEP_INTERVAL,
    # --- checkpointing & evaluation (kept on the same step cadence) ---
    save_strategy="steps",
    save_steps=STEP_INTERVAL,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=STEP_INTERVAL,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- batching / precision ---
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,
    bf16=True,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=1e-05,
    weight_decay=0.05,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=max_steps // 10,  # 10% of max_steps
    max_steps=max_steps,
    # num_train_epochs = 10,
    data_seed=42,
)

# Supervised fine-tuning on the 'train' split, evaluated on the 'test' split.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Stop after 8 consecutive evaluations with no improvement in the tracked
# metric (eval_loss); a threshold of 0.0 means any improvement counts.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=8,
    early_stopping_threshold=0.0,
)
trainer.add_callback(early_stopping)

In [None]:
class LRSchedulerLogger(TrainerCallback):
    """Print the optimizer's current learning rate at a fixed step interval.

    The learning rate is read from the ``optimizer`` object that the Trainer
    passes to callback events as a keyword argument, rather than from a
    module-level ``trainer`` variable. The callback therefore works with
    whichever trainer it is attached to and does not depend on notebook
    global state.

    Args:
        every_n_steps: Log when ``state.global_step`` is a multiple of this
            value. Defaults to 100, the previously hard-coded interval.

    Raises:
        ValueError: If ``every_n_steps`` is smaller than 1.
    """

    def __init__(self, every_n_steps=100):
        if every_n_steps < 1:
            raise ValueError("every_n_steps must be >= 1")
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Log the first param group's learning rate on matching steps."""
        if state.global_step % self.every_n_steps != 0:
            return

        optimizer = kwargs.get("optimizer")
        # Nothing to report if no optimizer was supplied or it has no
        # parameter groups; skip quietly instead of crashing the training run.
        if optimizer is None or not optimizer.param_groups:
            return

        current_lr = optimizer.param_groups[0]['lr']
        print(f"Step {state.global_step}: LR = {current_lr:.6e}")


trainer.add_callback(LRSchedulerLogger())

In [None]:
# Start fine-tuning with the SFTTrainer configured above. The run is capped at
# `max_steps`, and the attached EarlyStoppingCallback may end it sooner.
trainer.train()