
# Standalone Transformers SFT Notebook (Chat-format, Assistant-only Loss)

This notebook trains a causal language model using the **Hugging Face Transformers** `Trainer` (the training loop does not use TRL, although the setup cell still installs and imports it).  
It expects a dataset with a `messages` column: a list of chat turns like
```json
{"messages": [
  {"role": "system", "content": "You are helpful."},
  {"role": "user", "content": "Say hi."},
  {"role": "assistant", "content": "Hello!"}
]}
```
**Features**
- Uses the model's `chat_template` (`tokenizer.apply_chat_template`) to format messages.
- Masks the loss to **only** the last assistant turn (assistant-only loss) without TRL.
- Optional **LoRA** (PEFT) and optional **4-bit** quantization (bitsandbytes).
- Works with Qwen/TinyLlama/phi-3-mini etc. (set `MODEL_ID`).
- Includes a **toy dataset** if you don't have one yet.

> Tip: If your `content` pieces are structured (e.g., list of parts), the normalizer flattens text parts.


In [None]:
# Colab environment setup: install tooling and training dependencies into the kernel's environment.
# nbstripout is invoked further down in this cell (`nbstripout --install`).
%pip install nbstripout
# Keep huggingface-hub on the 0.x series (>=0.34.0, <1.0).
%pip install -U "huggingface-hub>=0.34.0,<1.0"
# Report any dependency conflicts among the packages installed so far.
%pip check
%pip install hf_transfer
# NOTE(review): `peft` is listed twice on the next line and `trl` is installed again right after; harmless but redundant.
%pip install -U  datasets accelerate peft trl bitsandbytes peft flash-attn
%pip install -U "trl>=0.10.0" "transformers>=4.44.0"
# Imports for the notebook, consolidated into one block (the original cell
# imported several names twice). Every name the original cell bound is still bound.
import os

# Opt in to hf_transfer for Hub downloads/uploads.
# The original used `!export HF_HUB_ENABLE_HF_TRANSFER=1`, which only sets the
# variable inside a throwaway subshell and never reaches this kernel. It has to
# live in os.environ, and it is set *before* huggingface_hub / datasets /
# transformers are imported because huggingface_hub reads it at import time.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gc  # used by the memory-cleanup cell (gc.collect())
import json
import math
import random
import shutil
from datetime import datetime

import torch
import wandb
from datasets import Dataset, concatenate_datasets, load_dataset
from google.colab import auth, userdata
from huggingface_hub import login, whoami
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    pipeline,
)
from trl import SFTConfig, SFTTrainer

# Authenticate this Colab runtime with the user's Google account.
auth.authenticate_user()
# Weights & Biases logging switches plus the CUDA allocator setting, applied in one go.
os.environ.update(
    {
        "WANDB_DISABLED": "false",  # set to "true" to mute W&B
        "WANDB_PROJECT": "qwen3coder-finetune-fp16",
        "WANDB_LOG_MODEL": "false",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",  # mitigate fragmentation
    }
)
# Check out (or refresh) the training repo under /content and make it the working
# directory, so relative paths used later resolve inside the checkout.
REPO_URL="https://github.com/UH-Insure/Colab-Training.git"
REPO="Colab-Training"

# Start from a fixed directory so re-running this cell does not nest checkouts.
os.chdir("/content")

# If repo exists, update it; otherwise, clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    # WARNING: the reset below throws away uncommitted edits in the checkout.
    !git reset --hard HEAD   # optional: discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    # "$REPO_URL" / "$REPO" are filled in by IPython from the Python variables above.
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

# Register the nbstripout git filter in this checkout, then list branches as a sanity check.
!nbstripout --install
!git branch -a


# Install dependencies if present
if os.path.exists("requirements.txt"):
    %pip install -r requirements.txt
if os.path.exists("pyproject.toml"):
    # Editable install of the repo's own package.
    %pip install -e .


ipython 7.34.0 requires jedi, which is not installed.
Repo 'Colab-Training' exists, pulling latest changes...
HEAD is now at 873687a Added filetype.
Already up to date.
* [32mmain[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/main[m


In [None]:
# Credentials come from Colab's secret store (`userdata`); nothing is hard-coded here.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

HF_TOKEN = userdata.get("HF_TOKEN")
WANDB_TOKEN = userdata.get("WANDB_KEY")

# Export both secrets so libraries that read the environment can find them.
for env_name, secret in (("WANDB_API_KEY", WANDB_TOKEN), ("HF_TOKEN", HF_TOKEN)):
    os.environ[env_name] = secret

# Explicit logins: W&B first, then the Hub (which also stores Git credentials for LFS).
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)

# Confirm which Hub account the token belongs to.
hf_identity = whoami(token=HF_TOKEN)
print("Logged in as:", hf_identity["name"])

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoshuareedenterprises[0m ([33mjoshuareedenterprises-university-of-houston[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in as: j05hr3d


In [None]:

# If running on Colab or a fresh env, uncomment the installs below.
# Install specific versions as needed for your system/CUDA.
# %pip install -U transformers datasets accelerate peft bitsandbytes
# %pip install -U evaluate


In [None]:

import os, json, random
from dataclasses import dataclass
from typing import List, Dict, Any

# Disable Torch Dynamo / torch.compile to avoid environment-specific issues.
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments, BitsAndBytesConfig
)

# Report interpreter, PyTorch build and GPU so the run is reproducible from the log.
import sys  # explicit import; `os.sys` only works because `os` happens to import `sys` internally

print("Python:", sys.version)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Device 0 is the only GPU reported here.
    print("Device:", torch.cuda.get_device_name(0))


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Torch: 2.8.0+cu126
CUDA available: True
Device: NVIDIA A100-SXM4-80GB


In [None]:

# ==== USER CONFIG ====
# Everything a reader is expected to tune lives in this cell.
# MODEL_ID is the Hub checkpoint used for both the tokenizer and the model.
MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"  # e.g., "Qwen/Qwen2.5-0.5B-Instruct", "microsoft/phi-3-mini-4k-instruct"
TRUST_REMOTE_CODE = True  # set True for models (e.g., Qwen) that need remote code
# Passed to TrainingArguments as both `output_dir` (local path) and `hub_model_id` (Hub repo).
OUTPUT_DIR = "j05hr3d/SFT-Qwen3-4B"

# Data: provide a path to a JSONL/JSON with a `messages` field per record
JSONL_PATH = "data/hybrid_message_format.jsonl"   # e.g., "/content/my_data.jsonl" (leave blank to use toy dataset)

# Training hyperparameters
MAX_SEQ_LEN = 4096            # adjust to your context length + memory
BATCH_SIZE = 2                # per-device train batch size
GRAD_ACC = 4  # gradient accumulation steps; sequences per optimizer step per device = BATCH_SIZE * GRAD_ACC
EPOCHS = 10
LR = 2e-4
# Evaluation, checkpointing and logging cadence, in optimizer steps.
# NOTE(review): the Trainer cell sets load_best_model_at_end=True, which presumably
# needs checkpoints to line up with evaluations -- keep SAVE_STEPS aligned with EVAL_STEPS.
EVAL_STEPS = 30
SAVE_STEPS = 30
LOG_STEPS = 30

# LoRA options
# NOTE(review): USE_LORA is not read by any cell visible in this notebook; LoRA is applied unconditionally.
USE_LORA = True
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# Narrow target modules list if memory-limited. Common: q_proj, v_proj, k_proj, o_proj
#LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
#LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj"]  # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT = True  # use_nested_quant
# Name of a torch dtype; the model-loading cell resolves it with getattr(torch, ...).
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # bnb_4bit_compute_dtype

# 4-bit options
# NOTE(review): USE_4BIT and BNB_COMPUTE_DTYPE are not read by any visible cell; the
# model-loading cell hard-codes load_in_4bit=True and uses BNB_4BIT_COMPUTE_DTYPE instead.
USE_4BIT = True
BNB_COMPUTE_DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32


In [None]:

# Fast tokenizer for MODEL_ID; its chat template drives all message formatting below.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    trust_remote_code=TRUST_REMOTE_CODE,
)

# Batching needs a pad token: reuse EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Pad token id:", tokenizer.pad_token_id, "EOS:", tokenizer.eos_token_id)


Pad token id: 151643 EOS: 151645


In [None]:

def _normalize_messages(msgs: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Normalize content fields to plain strings; keep only role & content."""
    out = []
    for m in msgs:
        role = m.get("role", "")
        content = m.get("content", "")
        if isinstance(content, list):
            # flatten text parts if given like [{"type":"text","text":"..."}]
            parts = []
            for p in content:
                if isinstance(p, dict) and p.get("type") == "text":
                    parts.append(p.get("text", ""))
                elif isinstance(p, str):
                    parts.append(p)
            content = "".join(parts)
        elif not isinstance(content, str):
            content = str(content)
        out.append({"role": role, "content": content})
    return out

def load_or_make_dataset(jsonl_path: str):
    """Return the chat dataset to train on.

    Args:
        jsonl_path: Path to a JSON/JSONL file whose records carry a
            ``messages`` column. An empty value selects the built-in toy set.

    Returns:
        A ``datasets.Dataset``: the loaded ``train`` split of the file, or a
        three-example toy dataset for a quick smoke test.

    Raises:
        AssertionError: if the loaded file has no ``messages`` column.
    """
    if not jsonl_path:
        # Toy dataset (few-shot): every example shares the same system turn.
        system_turn = {"role": "system", "content": "You are a helpful assistant."}
        qa_pairs = [
            (
                "Write a haiku about the moon.",
                "Silent silver orb\nDrifting high in velvet night\nDreams glow in cool light.",
            ),
            ("What is 17 * 12?", "17 * 12 = 204."),
            (
                "Give me a short Python function that returns the square of a number.",
                "def square(x):\n    return x * x",
            ),
        ]
        return Dataset.from_list(
            [
                {
                    "messages": [
                        dict(system_turn),
                        {"role": "user", "content": question},
                        {"role": "assistant", "content": answer},
                    ]
                }
                for question, answer in qa_pairs
            ]
        )

    # The same loader call serves both `.jsonl` files and plain JSON files,
    # so there is no need to branch on the extension.
    dataset = load_dataset("json", data_files=jsonl_path, split="train")
    # basic validation
    assert "messages" in dataset.column_names, "Dataset must have a 'messages' column."
    return dataset

# Load the chat dataset from JSONL_PATH (or the toy fallback when the path is blank).
raw = load_or_make_dataset(JSONL_PATH)
# Peek at the first record only; slicing prints a dict of columns rather than the whole dataset.
print(raw[:1])


{'filetype': ['cryptol'], 'messages': [[{'role': 'system', 'content': 'Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after).\n'}, {'role': 'user', 'content': "Write Cryptol properties to verify the behavior of the 'splitAt' function when applied to the sequence 'x'. Ensure the properties check that the output tuple correctly splits 'x' into two subsequences of equal length, and that concatenating these subsequences reconstructs the original sequence 'x'.\n`x = [1,2,3,4] : [_][8]\ny = (splitAt x) : ([2][8],[2][8])`"}, {'role': 'assistant', 'content': '```cryptol\nx = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([2][8],[2][8])\n\na = y.0 @ 0\nb = y.0 @ 1\nc = y.1 @ 0\nd = y.1 @ 1\n```'}]]}


In [None]:

def encode_chat_last_assistant_only(messages: List[Dict[str, Any]], max_len: int):
    """Tokenize a conversation and supervise only the last assistant reply.

    The conversation is cut after its last assistant turn and rendered twice
    with the tokenizer's chat template: once as the prompt (everything before
    that turn, plus the generation prompt) and once in full. Every token the
    two renderings share as a prefix is masked with ``-100``; the remaining
    tokens are the training targets.

    Compared with masking ``len(history)`` tokens blindly, this
      * also masks the assistant role header, so the loss covers the reply
        itself rather than template boilerplate, and
      * never trusts that the prompt is an exact prefix of the full rendering:
        if a template renders the two differently, masking stops at the first
        token where they diverge instead of silently misaligning.

    Args:
        messages: Chat turns; normalized with ``_normalize_messages``.
        max_len: Maximum number of tokens to keep. Longer sequences are
            truncated from the left so the supervised reply is preserved.

    Returns:
        ``{"input_ids", "attention_mask", "labels"}`` as equal-length lists of
        ints, or ``None`` when the conversation has no assistant turn or no
        token is left to supervise (callers drop such rows).
    """
    msgs = _normalize_messages(messages)

    # Index of the last assistant turn, or -1 when there is none.
    last_asst_idx = next(
        (i for i in range(len(msgs) - 1, -1, -1) if msgs[i].get("role") == "assistant"),
        -1,
    )
    if last_asst_idx == -1:
        return None

    prompt_msgs = msgs[:last_asst_idx]        # everything before the last assistant turn
    full_msgs = msgs[:last_asst_idx + 1]      # ... plus that turn

    # Render with the model's chat template. The prompt includes the generation
    # prompt so the assistant header belongs to the masked part.
    ids_prompt = list(
        tokenizer.apply_chat_template(prompt_msgs, tokenize=True, add_generation_prompt=True)
    )
    ids_full = list(
        tokenizer.apply_chat_template(full_msgs, tokenize=True, add_generation_prompt=False)
    )

    # Mask exactly the longest common prefix of the two renderings.
    mask_len = 0
    for prompt_tok, full_tok in zip(ids_prompt, ids_full):
        if prompt_tok != full_tok:
            break
        mask_len += 1

    # Make sure the reply is terminated by EOS. Some templates presumably close
    # the turn with EOS followed by whitespace, so look for EOS anywhere in the
    # supervised span; checking only the very last token would append a second EOS.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and eos_id not in ids_full[mask_len:]:
        ids_full.append(eos_id)

    input_ids = ids_full
    labels = [-100] * mask_len + input_ids[mask_len:]
    attention_mask = [1] * len(input_ids)

    # Truncate from the left if needed (keep the tail so the reply and its labels survive).
    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        attention_mask = attention_mask[-max_len:]
        labels = labels[-max_len:]

    # A row with nothing to supervise contributes no loss; let the caller drop it.
    if all(label == -100 for label in labels):
        return None

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def ds_map_fn(ex):
    """Per-example ``Dataset.map`` function: tokenize one record and flag unusable ones.

    Args:
        ex: A dataset row with a ``messages`` field.

    Returns:
        A dict with ``input_ids``, ``attention_mask``, ``labels`` and ``_drop``.
        Rows that cannot be encoded get ``_drop=True`` and empty token lists.
        Every row returns the same set of keys, because ``map`` builds one
        table from all rows and rows with missing columns break that; the
        flagged rows are removed by a ``filter`` afterwards.
    """
    encoded = encode_chat_last_assistant_only(ex["messages"], MAX_SEQ_LEN)
    if encoded is None:
        return {"input_ids": [], "attention_mask": [], "labels": [], "_drop": True}
    return {**encoded, "_drop": False}


# Split into train/eval (80/20, fixed seed) and tokenize both splits.
from datasets import ClassLabel  # local import: only needed for the idempotency check below


def _tokenize_split(split_ds):
    """Tokenize one split with `ds_map_fn` and remove rows it flagged with `_drop`.

    All original columns except `messages` are removed during the map.
    """
    drop_cols = [c for c in split_ds.column_names if c != "messages"]
    tokenized = split_ds.map(ds_map_fn, remove_columns=drop_cols)
    return tokenized.filter(lambda ex: ex["_drop"] is False).remove_columns("_drop")


if "filetype" in raw.column_names:
    # Stratified splitting needs a ClassLabel column. Encode it only once:
    # class_encode_column rejects a column that is already encoded, which would
    # make this cell fail when it is re-run.
    if not isinstance(raw.features["filetype"], ClassLabel):
        raw = raw.class_encode_column("filetype")
    split = raw.train_test_split(test_size=0.2, seed=42, stratify_by_column="filetype")
else:
    # e.g. the toy dataset has no `filetype` column: fall back to a plain random split.
    split = raw.train_test_split(test_size=0.2, seed=42)
train_raw, eval_raw = split["train"], split["test"]


train_ds = _tokenize_split(train_raw)
eval_ds = _tokenize_split(eval_raw)

# quick inspection
print("Train example lens:", len(train_ds[0]["input_ids"]), len(train_ds[0]["labels"]))
print("Eval example lens:", len(eval_ds[0]["input_ids"]), len(eval_ds[0]["labels"]))
print(train_ds, eval_ds)



Train example lens: 670 670
Eval example lens: 630 630
Dataset({
    features: ['messages', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 712
}) Dataset({
    features: ['messages', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 179
})


In [None]:

@dataclass
class DataCollatorForCausalLMWithLabels:
    """Pad a batch of pre-tokenized examples, including their ``labels``.

    ``input_ids`` and ``attention_mask`` are padded by ``tokenizer.pad`` to the
    longest sequence in the batch. ``labels`` are padded here with
    ``label_pad_token_id`` on the *same side* the tokenizer pads on, so each
    label stays aligned with its token. Padding labels on the right while the
    tokenizer pads inputs on the left would silently shift every target.

    Attributes are documented inline below.
    """

    # Tokenizer-like object exposing `.pad(...)` and, optionally, `.padding_side`.
    # Annotated as Any because `AutoTokenizer` is a factory, not the instance type.
    tokenizer: Any
    # Fill value for padded label positions (-100 by default).
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]):
        """Collate ``features`` into one padded batch of tensors.

        Args:
            features: Examples with ``input_ids``, ``attention_mask`` and
                ``labels`` as lists of ints.

        Returns:
            Whatever ``tokenizer.pad`` returns, with a ``labels`` LongTensor of
            shape ``(batch, longest_sequence)`` added.
        """
        # Labels are padded manually; only the model inputs go through tokenizer.pad.
        labels = [f["labels"] for f in features]
        batch_inputs = {
            "input_ids": [f["input_ids"] for f in features],
            "attention_mask": [f["attention_mask"] for f in features],
        }
        padded = self.tokenizer.pad(batch_inputs, padding=True, return_tensors="pt")

        max_len = padded["input_ids"].shape[1]
        padded_labels = torch.full(
            (len(labels), max_len), self.label_pad_token_id, dtype=torch.long
        )
        # Follow the tokenizer's padding side; default to right padding when unspecified.
        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"
        for row, lab in enumerate(labels):
            n = len(lab)
            if n == 0:
                continue  # nothing to copy; the row stays fully masked
            values = torch.tensor(lab, dtype=torch.long)
            if pad_left:
                padded_labels[row, max_len - n:] = values
            else:
                padded_labels[row, :n] = values
        padded["labels"] = padded_labels
        return padded

# Collator instance passed to the Trainer below; it pads each batch using the notebook's tokenizer.
data_collator = DataCollatorForCausalLMWithLabels(tokenizer=tokenizer)


In [None]:
# Free Python-side garbage and cached CUDA blocks before loading the model.
import gc  # local import so this cell does not fail with NameError on a fresh kernel

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
gc.collect()
torch.cuda.empty_cache()

In [None]:

# 4-bit quantization
# Resolve the configured dtype name (e.g. "bfloat16") to the torch dtype object.
compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    # Load non-quantized weights in the same dtype the 4-bit kernels compute in.
    # The previous hard-coded float16 disagreed with the bfloat16 compute dtype
    # and with bf16 training. (`torch_dtype` is kept rather than the newer
    # `dtype` keyword so older transformers releases still accept it.)
    torch_dtype=compute_dtype,
    device_map="auto",
    use_cache=False,
    # Honour the user-config switch instead of a hard-coded True.
    trust_remote_code=TRUST_REMOTE_CODE,
    attn_implementation="flash_attention_2",
)

# Prepare the quantized model for adapter training (PEFT helper).
base = prepare_model_for_kbit_training(base)

# Ensure model has pad token id in config for generation convenience
if getattr(base.config, "pad_token_id", None) is None:
    base.config.pad_token_id = tokenizer.pad_token_id


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Print the first decoder block's attention and MLP sub-modules so the
# projection names can be checked against LORA_TARGET_MODULES.
blk = base.model.layers[0]  # Llama/Qwen-style layout
for tag, submodule in (("ATTN:", blk.self_attn), ("MLP:", blk.mlp)):
    print(tag, submodule)
target_modules = LORA_TARGET_MODULES

ATTN: Qwen3Attention(
  (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
  (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
  (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
  (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
  (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
  (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
)
MLP: Qwen3MLP(
  (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
  (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
  (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
  (act_fn): SiLUActivation()
)


In [None]:
# Collect the adapter hyperparameters from the user config, then build the LoRA config.
lora_settings = dict(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
)
lora_cfg = LoraConfig(**lora_settings)

# Alternative: resume from adapters already on the Hub instead of starting fresh.
# model = PeftModel.from_pretrained(base, "HUGGING FACE ADAPTERS")
model = get_peft_model(base, lora_cfg)

# Report how many parameters the adapters actually train.
model.print_trainable_parameters()

trainable params: 16,515,072 || all params: 4,038,983,168 || trainable%: 0.4089


In [None]:

import inspect  # stdlib; used to detect which tokenizer keyword this Trainer version accepts

# Training configuration. Checkpoints, evaluations and logs all run on a step
# cadence taken from the user config; the best checkpoint is restored at the
# end and every saved checkpoint is pushed to the Hub repo named by OUTPUT_DIR.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    warmup_ratio=0.03,
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_total_limit=2,
    bf16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",
    report_to=["wandb"],
    load_best_model_at_end=True,
    hub_model_id=OUTPUT_DIR,
    hub_strategy="every_save",
    push_to_hub=True,
)

# Newer transformers releases deprecate `Trainer(tokenizer=...)` in favour of
# `processing_class=...`; older ones only know `tokenizer`. Pick whichever
# keyword the installed version declares so the cell works on both.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    callbacks=[
        # Stop when the monitored eval metric has not improved for 3 consecutive evaluations.
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.0
        )
    ],
    **{_tokenizer_kwarg: tokenizer},
)
# Trade compute for memory; the KV cache is switched off alongside gradient checkpointing.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
model.config.use_cache = False
# NOTE(review): the attention backend was already selected when the model was loaded;
# confirm whether assigning this config attribute afterwards has any effect.
model.config.attn_implementation = "flash_attention_2"

  trainer = Trainer(


In [None]:
# Train, evaluate, and always persist whatever state exists -- even after a failure.
print("Training...")
eval_results = None
train_error = None
try:
    trainer.train(resume_from_checkpoint=False)
    eval_results = trainer.evaluate()
except Exception as e:
    # Remember the error: it is re-raised after the best-effort save below, so the
    # real failure is reported instead of a NameError on `eval_results`.
    train_error = e
    print(f"Training failed: {e}")
finally:
    # Best effort: keep trainer state and current weights even if training broke.
    trainer.save_state()
    trainer.save_model(OUTPUT_DIR)
    print("Saved model to:", OUTPUT_DIR)

# Close the W&B run whether or not training succeeded.
wandb.finish()

if train_error is not None:
    # Surface the original exception (with its traceback) and stop "Run All" here.
    raise train_error

eval_loss = eval_results["eval_loss"]
# Perplexity is the exponential of the mean eval loss.
perplexity = math.exp(eval_loss)
print(f"Eval loss = {eval_loss:.2f}, Perplexity = {perplexity:.2f}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Training...


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss
30,1.2907,1.15685
60,1.064,1.061506
90,0.918,1.004575
120,0.9581,0.977831
150,0.9007,0.965217
180,0.819,0.947087
210,0.813,0.938453
240,0.6944,0.936823
270,0.8065,0.921646
300,0.6741,0.932755


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...wen3-4B/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...T-Qwen3-4B/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

  ...adapter_model.safetensors:  63%|######3   | 41.9MB / 66.1MB            

Saved model to: j05hr3d/SFT-Qwen3-4B


0,1
eval/loss,█▅▃▃▂▂▂▁▁▁▁▁▁
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁████████████
eval/steps_per_second,▁████████████
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/grad_norm,▂▁▃▃▁▇▃▄▄█▆▇
train/learning_rate,█▇▇▆▅▅▄▄▃▂▂▁
train/loss,█▅▄▄▄▃▃▁▃▁▁▂

0,1
eval/loss,0.92165
eval/runtime,40.694
eval/samples_per_second,4.399
eval/steps_per_second,0.565
total_flos,9.81203472314327e+16
train/epoch,4.04494
train/global_step,360
train/grad_norm,0.76446
train/learning_rate,0.00012
train/loss,0.7538


Eval loss = 0.92, Perplexity = 2.51


In [None]:

# Quick smoke test generation (short, sampled -- not greedy: do_sample=True below)
model.eval()
prompt_msgs = [
    {"role": "system", "content": "Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after)."},
    {"role": "user", "content": "Implement a function named `xor8` which takes two 8-bit words and returns their bitwise XOR. Also provide a constant `zero8` equal to 0 (8-bit)."}
]

# add_generation_prompt=True -> model should produce assistant continuation
inputs = tokenizer.apply_chat_template(
    prompt_msgs, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

with torch.no_grad():
    out = model.generate(
        inputs,
        # Single unpadded prompt: every position is a real token. Passing the mask
        # explicitly avoids the "attention mask is not set" warning, which appears
        # because pad_token_id is set to the EOS id here and so cannot be inferred.
        attention_mask=torch.ones_like(inputs),
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

# Keep only the newly generated tokens (everything after the prompt).
generated = out[0, inputs.shape[-1]:]
print(tokenizer.decode(generated, skip_special_tokens=True))


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Casting fp32 inputs back to torch.float16 for flash-attn compatibility.


```cryptol
xor8 : {a} (fin a) => [a] -> [a] -> [a]
xor8 x y = [ x @ i ^ y @ i | i <- [0..a-1] ]

zero8 : {a} (fin a) => [a]
zero8 = 0
```


In [None]:
from google.colab import runtime
#runtime.unassign()


## Notes & References
- 🤗 Transformers `Trainer` docs (training loop, arguments)  
  https://huggingface.co/docs/transformers/main_classes/trainer
- `apply_chat_template` for model-specific chat formatting  
  https://huggingface.co/docs/transformers/main/chat_templating
- PEFT / LoRA docs  
  https://huggingface.co/docs/peft
- BitsAndBytes 4-bit quantization  
  https://github.com/TimDettmers/bitsandbytes
