
# Standalone Transformers SFT Notebook (Chat-format, Assistant-only Loss)

This notebook trains a causal language model using **Hugging Face Transformers** (no TRL).  
It expects a dataset with a `messages` column: a list of chat turns like
```json
{"messages": [
  {"role": "system", "content": "You are helpful."},
  {"role": "user", "content": "Say hi."},
  {"role": "assistant", "content": "Hello!"}
]}
```
**Features**
- Uses the model's `chat_template` (`tokenizer.apply_chat_template`) to format messages.
- Masks the loss to **only** the last assistant turn (assistant-only loss) without TRL.
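- Optional **LoRA** (PEFT) and optional **4-bit** quantization (bitsandbytes). Note: the cells below currently always apply both; the `USE_LORA` / `USE_4BIT` flags are defined in the config cell but not consulted.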
- Works with Qwen/TinyLlama/phi-3-mini etc. (set `MODEL_ID`).
- Includes a **toy dataset** if you don't have one yet.

> Tip: If your `content` pieces are structured (e.g., list of parts), the normalizer flattens text parts.


In [None]:
%pip install -U "huggingface-hub>=0.34.0,<1.0"
%pip install -U datasets bitsandbytes peft #flash-attn
from google.colab import auth
auth.authenticate_user()
from google.colab import userdata
from google.colab import runtime
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import os
import gc
import math
import json, random
import shutil
import torch
from datetime import datetime
from huggingface_hub import login, whoami
import wandb
# Run/dataset version tag; later cells interpolate it into OUTPUT_DIR and JSONL_PATH.
VERSION = "v1.1st"
# Set environment variables for W&B and PyTorch memory management
os.environ["WANDB_DISABLED"] = "false"  # or "true" to mute
os.environ["WANDB_PROJECT"]   = "qwen3coder-finetune-fp16"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Git repository that provides the `src` package and the `data/` files used below.
REPO_URL = "https://github.com/UH-Insure/Colab-Training.git"
REPO = "Colab-Training"

# Clone or update the repository
# Start from /content when it exists (Colab); otherwise stay in the current directory.
os.chdir('/content' if os.path.exists('/content') else os.getcwd())
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    # WARNING: `git reset --hard` discards uncommitted edits to tracked files in the clone.
    !git reset --hard HEAD
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    !git clone "$REPO_URL" "$REPO"
    # The rest of the notebook uses paths relative to the repository root.
    os.chdir(REPO)

Repo 'Colab-Training' exists, pulling latest changes...
HEAD is now at ce8a16b Added filetype = text to data
Already up to date.


In [None]:
# NOTE(review): huggingface_hub is already imported by the first cell and `hf_transfer`
# is not among the packages installed there — confirm this flag has any effect here.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Secrets come from the Colab secret store, so no credential is hard-coded in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
WANDB_TOKEN = userdata.get('WANDB_KEY')
# Export both tokens so libraries that read the environment pick them up as well.
os.environ["WANDB_API_KEY"] = WANDB_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

# Sanity check: show which Hub account the token belongs to.
print("Logged in as:", whoami(token=HF_TOKEN)["name"])

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoshuareedenterprises[0m ([33mjoshuareedenterprises-university-of-houston[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in as: j05hr3d


In [None]:

import os, json, random
from dataclasses import dataclass
from typing import List, Dict, Any

# Disable Torch Dynamo / torch.compile to avoid environment-specific issues.
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments, BitsAndBytesConfig
)

# Report interpreter, PyTorch and GPU details so a run can be reproduced later.
import sys  # local import: `os.sys` only works through an undocumented attribute of `os`

print("Python:", sys.version)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Torch: 2.9.0+cu126
CUDA available: True
Device: NVIDIA A100-SXM4-80GB


In [None]:

# ==== USER CONFIG ====
# Hub id of the base checkpoint to fine-tune.
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
#MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"  # e.g., "Qwen/Qwen2.5-0.5B-Instruct", "microsoft/phi-3-mini-4k-instruct"
TRUST_REMOTE_CODE = True  # set True for models (e.g., Qwen) that need remote code
# Used as the Trainer output directory and as the Hub repo id the run is pushed to.
OUTPUT_DIR = f"j05hr3d/SFT-Qwen2.5-Coder-7B_{VERSION}"
#OUTPUT_DIR = "j05hr3d/SFT-Qwen3-Coder-30B"

# Data: provide a path to a JSONL/JSON with a `messages` field per record
JSONL_PATH = f"data/nocomments_message_format_{VERSION}.jsonl"   # e.g., "/content/my_data.jsonl" (leave blank to use toy dataset)

# Training hyperparameters
MAX_SEQ_LEN = 4096            # adjust to your context length + memory
BATCH_SIZE = 2                # per-device train batch size
GRAD_ACC = 4                  # gradient accumulation steps
EPOCHS = 3
LR = 1e-4
EVAL_STEPS = 75               # evaluate every N optimizer steps
SAVE_STEPS = 75               # checkpoint every N optimizer steps
LOG_STEPS = 75                # log every N optimizer steps
WARMUP_RATIO = 0.03           # passed to TrainingArguments as warmup_ratio
WARMUP_RATION = WARMUP_RATIO  # misspelled legacy name, kept because the TrainingArguments cell reads it
# LoRA options
USE_LORA = True               # NOTE(review): not referenced by the cells visible here
LORA_R = 32
LORA_ALPHA = LORA_R * 2
LORA_DROPOUT = 0.05
# Narrow target modules list if memory-limited. Common: q_proj, v_proj, k_proj, o_proj
#LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
#LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj"]  # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT = True  # use_nested_quant
# Name of a torch dtype; the model-loading cell resolves it with getattr(torch, ...).
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # bnb_4bit_compute_dtype

# 4-bit options
# NOTE(review): USE_4BIT and BNB_COMPUTE_DTYPE are not referenced by the cells visible
# here; the loader always quantizes to 4-bit and reads BNB_4BIT_COMPUTE_DTYPE instead.
USE_4BIT = True
BNB_COMPUTE_DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32


In [None]:

# Load the tokenizer that belongs to MODEL_ID, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=TRUST_REMOTE_CODE,
    use_fast=True,
)

# Batch padding needs a pad token; reuse EOS when the checkpoint defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Pad token id:", tokenizer.pad_token_id, "EOS:", tokenizer.eos_token_id)


Pad token id: 151643 EOS: 151645


In [None]:
from src.data import load_or_make_dataset, _chunk_messages_by_tokens, _normalize_messages, explode_long_conversations

# Load the chat-format dataset from JSONL_PATH using the helper from src.data.
# NOTE(review): presumably falls back to a built-in toy dataset when the path is blank — confirm in src/data.py.
raw = load_or_make_dataset(JSONL_PATH)
# Slicing with [:1] yields a dict of columns, each holding only the first record.
print(raw[:1])

{'filename': ['AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry'], 'filetype': ['cryptol'], 'messages': [[{'role': 'system', 'content': 'Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after).\n'}, {'role': 'user', 'content': 'Write a Cryptol specification that defines AES encryption with key expansion and proves a property that encrypting with a raw key equals encrypting with its expanded key schedule for all valid key sizes and plaintexts.'}, {'role': 'assistant', 'content': '```cryptol\nmodule AES where\n\nimport `Common::AES\n\ntype constraint ValidKey k m = (k == 128 + m * 64, 2 >= m)\n\ntype ExpandedKey m = KeySchedule m\n\nencrypt : {k,m} ValidKey k m => [k] -> [128] -> [128]\nencrypt = aesEncrypt`{Mode = m}\n\nexpandKey : {k,m} ValidKey k m => [k] -> ExpandedKey m\nexpandKey = ExpandKey`{Mode = m}\n\nencryptWithSchedule : {k,m} ValidKey k m => ExpandedKey m -> [128] -> [128]\nencryptWithSchedule = aesEncryptWithSchedule`{Mode = m}\n\nproper

In [None]:
# --- actually apply the splitting to `raw` using MAX_SEQ_LEN = 4096 ---
# `raw` is rebound here, so re-running this cell feeds the already-processed dataset back in.
# The captured log shows that single messages longer than the limit are skipped.
raw = explode_long_conversations(raw, tokenizer, MAX_SEQ_LEN)
print("After splitting:")
print(raw[:1])

Skipping over-long single message with 4183 tokens from AES-GCM-SIV-proof/proof/asm/deps/saw-script/examples/openssl_aes/AES128TBox.cry
Skipping over-long single message with 4329 tokens from aws-lc-verification/cryptol-specs/McEliece_KEM/high-level/keccak.cry
Skipping over-long single message with 7109 tokens from aws-lc-verification/cryptol-specs/Primitive/Asymmetric/Signature/DilithiumR1BV.cry
Skipping over-long single message with 7008 tokens from aws-lc-verification/cryptol-specs/Primitive/Asymmetric/Signature/DilithiumR1Int.cry
Skipping over-long single message with 9267 tokens from aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Stream/chacha20.cry
Skipping over-long single message with 6550 tokens from aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Stream/ZUC.cry
Skipping over-long single message with 4126 tokens from aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Block/DES.cry
Skipping over-long single message with 6973 tokens from BLST

Token indices sequence length is longer than the specified maximum sequence length for this model (77220 > 32768). Running this sequence through the model will result in indexing errors


Skipping over-long single message with 77078 tokens from cryptol/tests/suiteb/aes-mct-ecb.cry
Skipping over-long single message with 5107 tokens from cryptol-specs/Common/EC/PrimeField/PFEC.cry
Skipping over-long single message with 11809 tokens from cryptol-specs/Primitive/Asymmetric/KEM/ML_KEM/Specification.cry
Skipping over-long single message with 11277 tokens from cryptol-specs/Primitive/Asymmetric/Signature/ML_DSA/Specification.cry
Skipping over-long single message with 4991 tokens from cryptol-specs/Primitive/Asymmetric/Signature/XMSS/Specification.cry
Skipping over-long single message with 6234 tokens from cryptol-specs/Primitive/Asymmetric/Signature/FALCON/1.2/falcon_parameterized.cry
Skipping over-long single message with 21234 tokens from cryptol-specs/Primitive/Asymmetric/Signature/FALCON/1.2/falcon_512.cry
Skipping over-long single message with 4108 tokens from cryptol-specs/Primitive/Asymmetric/Signature/ECDSA/Tests/ECDSA_P224_SHA3_224.cry
Skipping over-long single messag

In [None]:
def encode_chat_last_assistant_only(messages: List[Dict[str, Any]], max_len: int):
    """Tokenize a conversation and supervise only its final assistant turn.

    The conversation is rendered twice with the tokenizer's chat template:
    once up to (but excluding) the last assistant message and once including
    it. Tokens shared by both renderings are masked with -100; everything
    after them is kept as a label. Because the history is rendered without a
    generation prompt, the role header the template emits for the assistant
    turn is part of the supervised span.

    Args:
        messages: Chat turns (dicts with "role" and "content"); they are
            passed through `_normalize_messages` first.
        max_len: Maximum number of tokens to keep. Longer sequences are cut
            from the left so the assistant answer survives.

    Returns:
        A dict with equally long "input_ids", "attention_mask" and "labels"
        lists, or None when the conversation contains no assistant message.
    """
    msgs = _normalize_messages(messages)

    # Locate the final assistant message by scanning from the end.
    last_asst_idx = next(
        (i for i in range(len(msgs) - 1, -1, -1) if msgs[i].get("role") == "assistant"),
        None,
    )
    if last_asst_idx is None:
        # no assistant -> skip this example
        return None

    hist_wo = msgs[:last_asst_idx]          # may be []
    hist_w = msgs[:last_asst_idx + 1]       # at least one message (the assistant)

    # An empty history cannot be rendered, so it contributes no masked tokens.
    if hist_wo:
        ids_hist = tokenizer.apply_chat_template(
            hist_wo,
            tokenize=True,
            add_generation_prompt=False,
        )
    else:
        ids_hist = []

    # Full sequence including the last assistant turn.
    ids_full = tokenizer.apply_chat_template(
        hist_w,
        tokenize=True,
        add_generation_prompt=False,
    )

    # Mask only the tokens that really are a shared prefix. Normally this is
    # all of ids_hist; if a template renders the history differently once the
    # answer is appended, the mask stops at the first divergence instead of
    # silently hiding (or exposing) the wrong tokens.
    n_masked = 0
    for hist_tok, full_tok in zip(ids_hist, ids_full):
        if hist_tok != full_tok:
            break
        n_masked += 1

    # Append EOS only when the template did not already terminate the turn.
    # Templates that close a turn with "<eos>\n" end on the newline, and a
    # last-token check would then add a second, spurious EOS after it.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and eos_id not in ids_full[n_masked:]:
        ids_full = ids_full + [eos_id]

    input_ids = ids_full

    # labels: mask out the history, keep only the last assistant tokens
    labels = [-100] * n_masked + input_ids[n_masked:]
    attention_mask = [1] * len(input_ids)

    # Truncate from the left so the supervised tail is what gets kept.
    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        attention_mask = attention_mask[-max_len:]
        labels = labels[-max_len:]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


def ds_map_fn(ex):
    """Encode one dataset row for `Dataset.map`, flagging rows that cannot be used.

    Every returned dict carries the same four keys so that `map` sees a
    consistent schema; rows without an assistant turn come back empty with
    `_drop` set to True and are expected to be filtered out afterwards.
    """
    encoded = encode_chat_last_assistant_only(ex["messages"], MAX_SEQ_LEN)
    if encoded is not None:
        encoded["_drop"] = False
        return encoded

    # Placeholder row: same keys as a real one, marked for removal.
    return {"input_ids": [], "attention_mask": [], "labels": [], "_drop": True}


def _tokenize_split(split_ds):
    """Encode one split with `ds_map_fn` and drop the rows it flagged as unusable.

    All original columns except `messages` are removed; the helper column
    `_drop` is removed again after filtering.
    """
    encoded = split_ds.map(
        ds_map_fn,
        remove_columns=[c for c in split_ds.column_names if c != "messages"],
    )
    return encoded.filter(lambda ex: not ex["_drop"]).remove_columns("_drop")


# Stratified splitting needs a ClassLabel column. The encoded copy gets its own
# name so `raw` stays untouched and this cell can be re-run: encoding a column
# that is already a ClassLabel raises.
raw_encoded = raw.class_encode_column("filetype")

split = raw_encoded.train_test_split(test_size=0.1, seed=42, stratify_by_column="filetype")
train_raw, eval_raw = split["train"], split["test"]

train_ds = _tokenize_split(train_raw)
eval_ds = _tokenize_split(eval_raw)

# Fail with a clear message instead of an IndexError in the inspection below.
assert len(train_ds) > 0 and len(eval_ds) > 0, "no usable examples left after encoding"

# quick inspection
print("Train example lens:", len(train_ds[0]["input_ids"]), len(train_ds[0]["labels"]))
print("Eval example lens:", len(eval_ds[0]["input_ids"]), len(eval_ds[0]["labels"]))

max_train_len = max(len(ex["input_ids"]) for ex in train_ds)
max_eval_len  = max(len(ex["input_ids"]) for ex in eval_ds)
print("Max train len:", max_train_len)
print("Max eval  len:", max_eval_len)

print(train_ds, eval_ds)

Casting to class labels:   0%|          | 0/2085 [00:00<?, ? examples/s]

Map:   0%|          | 0/1876 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1876 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Filter:   0%|          | 0/209 [00:00<?, ? examples/s]

Train example lens: 116 116
Eval example lens: 264 264
Max train len: 4091
Max eval  len: 3861
Dataset({
    features: ['messages', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1827
}) Dataset({
    features: ['messages', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 205
})


In [None]:

@dataclass
class DataCollatorForCausalLMWithLabels:
    """Pad a batch of pre-tokenized examples, including their label sequences.

    `input_ids` and `attention_mask` are padded by the tokenizer; `labels`
    are padded here with `label_pad_token_id` so padded positions are ignored
    by the loss. Labels are padded on the same side the tokenizer pads the
    inputs, which keeps every label aligned with its token.

    Attributes are documented inline below.
    """
    # Any tokenizer object exposing `.pad(...)` (and optionally `.padding_side`).
    # Annotated as Any because AutoTokenizer is a factory, not the instance type.
    tokenizer: Any
    # Fill value for padded label positions (-100 is skipped by the loss).
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]):
        """Collate `features` into a padded batch with a `labels` tensor.

        Args:
            features: Examples holding "input_ids", "attention_mask" and
                "labels" as lists of ints.

        Returns:
            The tokenizer's padded batch with an added "labels" LongTensor of
            shape (batch, longest_sequence).
        """
        # Separate out labels for manual padding
        labels = [f["labels"] for f in features]
        batch_inputs = {
            "input_ids": [f["input_ids"] for f in features],
            "attention_mask": [f["attention_mask"] for f in features]
        }
        padded = self.tokenizer.pad(
            batch_inputs, padding=True, return_tensors="pt"
        )
        max_len = padded["input_ids"].shape[1]
        # Follow the tokenizer's padding side; default to right when it is absent.
        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"

        padded_labels = torch.full(
            (len(labels), max_len), self.label_pad_token_id, dtype=torch.long
        )
        for i, lab in enumerate(labels):
            n = len(lab)
            if n == 0:
                continue  # nothing to copy; the row stays fully masked
            row = torch.tensor(lab, dtype=torch.long)
            if pad_left:
                # Explicit start index: a negative slice would misbehave for n == 0.
                padded_labels[i, max_len - n:] = row
            else:
                padded_labels[i, :n] = row
        padded["labels"] = padded_labels
        return padded

# Collator instance handed to the Trainer; it pads batches with the tokenizer loaded earlier.
data_collator = DataCollatorForCausalLMWithLabels(tokenizer=tokenizer)


In [None]:
# Re-assert the allocator setting, then drop unreferenced Python objects and
# hand cached CUDA blocks back before the model is loaded.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
gc.collect()
torch.cuda.empty_cache()

In [None]:

# 4-bit quantization
# Resolve the configured dtype name (e.g. "bfloat16") to the torch dtype object.
compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

base = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,   # the only quantization switch; no separate load_in_8bit flag
        # Load non-quantized weights in the same dtype used for 4-bit compute
        # instead of a hard-coded float16 that disagrees with it.
        torch_dtype=compute_dtype,
        device_map="auto",
        use_cache=False,                  # KV cache is not needed while training
        trust_remote_code=TRUST_REMOTE_CODE,  # honor the config flag, as the tokenizer load does
        attn_implementation="sdpa",
)

# Prepare the quantized model for adapter training (PEFT helper).
base = prepare_model_for_kbit_training(base)

# Ensure model has pad token id in config for generation convenience
if getattr(base.config, "pad_token_id", None) is None:
    base.config.pad_token_id = tokenizer.pad_token_id


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print the first decoder block's submodules to see which Linear layers LoRA can target.
blk = base.model.layers[0]           # Llama/Qwen-style
print("ATTN:", blk.self_attn)         # has q_proj, k_proj, v_proj, o_proj
print("MLP:", blk.mlp)
# NOTE(review): `target_modules` is not read by the LoraConfig cell, which uses
# LORA_TARGET_MODULES directly — confirm whether this alias is still needed.
target_modules = LORA_TARGET_MODULES
# LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]
# LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

ATTN: Qwen2Attention(
  (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
  (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
  (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
  (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
)
MLP: Qwen2MLP(
  (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
  (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
  (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
  (act_fn): SiLUActivation()
)


In [None]:
# LoRA adapter configuration, driven entirely by the LORA_* constants from the config cell.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
)
# model = PeftModel.from_pretrained(base, "HUGGING FACE ADAPTERS")
# Wrap the quantized base model with freshly initialized adapters.
model = get_peft_model(base, lora_cfg)
# Prints trainable vs. total parameter counts.
model.print_trainable_parameters()

trainable params: 80,740,352 || all params: 7,696,356,864 || trainable%: 1.0491


In [None]:

# Training configuration; values come from the USER CONFIG cell.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATION,
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_total_limit=2,
    bf16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",
    report_to=["wandb"],
    # Needed by EarlyStoppingCallback; eval and save run on the same step interval.
    load_best_model_at_end=True,
    hub_model_id=OUTPUT_DIR,
    hub_strategy="every_save",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    callbacks=[
        # Stop once the monitored eval metric has not improved for 3 evaluations.
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.0
        )
    ],
)
# Trade compute for memory during backprop; the KV cache is disabled alongside it.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
model.config.use_cache = False
# NOTE(review): the attention backend was already chosen when the model was loaded —
# confirm this assignment has any effect.
model.config.attn_implementation = "sdpa"

  trainer = Trainer(


In [None]:
print("Training...")
try:
    trainer.train(resume_from_checkpoint=False)
    eval_results = trainer.evaluate()
except Exception as e:
    print(f"Training failed: {e}")
    # Re-raise so the real traceback is shown. Swallowing it only deferred the
    # failure to a NameError on `eval_results` further down.
    raise
finally:
    # Best-effort: persist whatever state exists, even after a failure.
    try:
        trainer.save_state()
        trainer.save_model(OUTPUT_DIR)
        print("Saved model to:", OUTPUT_DIR)
    finally:
        # Always close the W&B run, including when saving itself fails.
        wandb.finish()

# Only reached after a successful train + evaluate.
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Eval loss = {eval_loss:.2f}, Perplexity = {perplexity:.2f}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Training...


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
75,0.9192,0.836521
150,0.7159,0.701979
225,0.6289,0.60639
300,0.4575,0.58268
375,0.424,0.530009
450,0.4214,0.506131
525,0.3662,0.518202
600,0.3231,0.510086
675,0.3314,0.507437


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ..._v1.1st/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...adapter_model.safetensors:  13%|#2        | 41.9MB /  323MB            

  ...-7B_v1.1st/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

Saved model to: j05hr3d/SFT-Qwen2.5-Coder-7B_v1.1st


0,1
eval/loss,‚ñà‚ñÖ‚ñÉ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñà‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/samples_per_second,‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
eval/steps_per_second,‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñà‚ñÖ‚ñÉ‚ñÇ‚ñÉ‚ñÑ‚ñÇ‚ñÅ
train/learning_rate,‚ñà‚ñá‚ñÜ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÇ‚ñÅ
train/loss,‚ñà‚ñÜ‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ

0,1
eval/loss,0.50613
eval/runtime,36.098
eval/samples_per_second,5.679
eval/steps_per_second,0.72
total_flos,1.3449351539112653e+17
train/epoch,2.94967
train/global_step,675
train/grad_norm,0.40226
train/learning_rate,0.0
train/loss,0.3314


Eval loss = 0.51, Perplexity = 1.66


In [None]:

# Quick smoke test generation (short, low-temperature sampling)
model.eval()
prompt_msgs = [
    {"role": "system", "content": "Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after)."},
    {"role": "user", "content": "Implement a Caesar cipher. Define the functions `encrypt` and `decrypt` with the signature: `{n} [8] -> [n][8] -> [n][8]`."}
]

# add_generation_prompt=True -> model should produce assistant continuation.
# return_dict=True also returns the attention mask, so generate() does not
# have to guess it.
inputs = tokenizer.apply_chat_template(
    prompt_msgs,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
prompt_len = inputs["input_ids"].shape[-1]

with torch.no_grad():
    out = model.generate(
        **inputs,                 # keyword form: input_ids + attention_mask
        max_new_tokens=512,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,           # the config disables the KV cache for training; re-enable it for decoding
    )

# Keep only the newly generated tokens (everything after the prompt).
generated = out[0, prompt_len:]
print(tokenizer.decode(generated, skip_special_tokens=True))


In [None]:
# Release the Colab runtime (and its GPU) once everything above has finished.
runtime.unassign()


## Notes & References
- 🤗 Transformers `Trainer` docs (training loop, arguments)  
  https://huggingface.co/docs/transformers/main_classes/trainer
- `apply_chat_template` for model-specific chat formatting  
  https://huggingface.co/docs/transformers/main/chat_templating
- PEFT / LoRA docs  
  https://huggingface.co/docs/peft
- BitsAndBytes 4-bit quantization  
  https://github.com/TimDettmers/bitsandbytes
