In [1]:
# Show the GPU model, driver version, and current memory use of this runtime.
!nvidia-smi

Wed Jan 28 09:01:51 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install -U transformers peft trl accelerate bitsandbytes datasets


Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting trl
  Downloading trl-0.27.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.3.4-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading transformers-5.0.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.27.1-py3-none-any.whl (532 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0

In [3]:
import os
import torch
import time

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig


In [4]:
# ===== Precision setup: keep this run away from AMP / bfloat16 =====
# Floating-point tensors created after this call default to float16.
torch.set_default_dtype(torch.float16)

# Flags exported to the process environment before the model and trainer are built.
# NOTE(review): ACCELERATE_MIXED_PRECISION is the only name here that I can tie
# to a library (Hugging Face Accelerate); confirm the three TORCH* variables
# are actually read by anything before relying on them.
os.environ.update(
    {
        "ACCELERATE_MIXED_PRECISION": "no",
        "TORCHAMP_DISABLE": "1",
        "TORCH_DISABLE_FOREACH": "1",
        "TORCH_DISABLE_FUSED": "1",
    }
)

print("AMP & BF16 disabled")


AMP & BF16 disabled


In [5]:
# Checkpoint to fine-tune (the repo name indicates a 4-bit bitsandbytes build).
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized causal LM and let device_map="auto" place it.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Matching tokenizer, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

print("Model loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/146 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Model loaded


In [6]:
# Projection layers that receive LoRA adapters, grouped by where they sit.
ATTENTION_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]
MLP_PROJECTIONS = ["gate_proj", "up_proj", "down_proj"]

# Rank-32 adapters with alpha 64, no dropout and no bias terms, for causal LM.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    target_modules=ATTENTION_PROJECTIONS + MLP_PROJECTIONS,
)

# Wrap the loaded model with the adapters (rebinds `model` to the wrapped model).
model = get_peft_model(model, lora_config)

# Only parameters whose name contains ".lora_" stay trainable; all others are frozen.
for param_name, weight in model.named_parameters():
    is_adapter_weight = ".lora_" in param_name
    weight.requires_grad_(is_adapter_weight)

model.enable_input_require_grads()

print("LoRA applied")


LoRA applied


In [7]:
# ===== TASK-E MEMORY OPTIMIZATION =====

# Step 1: turn on gradient checkpointing for the model.
model.gradient_checkpointing_enable()

# Steps 2-3: switch off the attention cache and the optional
# attention / hidden-state outputs on the model config.
for config_flag in ("use_cache", "output_attentions", "output_hidden_states"):
    setattr(model.config, config_flag, False)

print("Gradient checkpointing ENABLED")
print("Attention cache DISABLED")


Gradient checkpointing ENABLED
Attention cache DISABLED


In [8]:
# Training data: a single JSONL file fetched from the laion/OIG dataset repo.
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"

# Parse it with the generic "json" loader and keep only the first 1% of the
# train split so the run stays short.
dataset = load_dataset("json", split="train[:1%]", data_files={"train": url})

print("Dataset loaded")


unified_chip2.jsonl:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded


In [9]:
# Short 5-step SFT run; fp16 and bf16 mixed precision are both switched off.
training_args = SFTConfig(
    # Bookkeeping
    output_dir="task_e_outputs",
    report_to="none",
    seed=3407,
    logging_steps=1,
    # Schedule and batch shape
    max_steps=5,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    # Optimizer and precision
    optim="adamw_torch",
    fp16=False,
    bf16=False,
    # Worker processes for dataset preprocessing
    dataset_num_proc=2,
)

# Bind model, tokenizer and data to the trainer.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)


Adding EOS to train dataset (num_proc=2):   0%|          | 0/2103 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2103 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/2103 [00:00<?, ? examples/s]

In [10]:
# Run training and record peak VRAM, wall-clock time, and training loss.

# Clear the peak-memory counter so the reading below covers only this run.
torch.cuda.reset_peak_memory_stats()

# Time with perf_counter(): unlike time.time() it is monotonic, so a system
# clock adjustment cannot distort the measured duration. CUDA work is queued
# asynchronously, so synchronize before starting and before stopping the clock
# to keep all GPU work for this run inside the timed window.
torch.cuda.synchronize()
start_time = time.perf_counter()
result = trainer.train()
torch.cuda.synchronize()
end_time = time.perf_counter()

# max_memory_allocated() is in bytes; dividing by 1024**2 gives MiB
# (the report below labels it "MB").
task_e_vram = torch.cuda.max_memory_allocated() / (1024 ** 2)
task_e_time = end_time - start_time
task_e_loss = result.training_loss

print("\n===== TASK E RESULTS =====")
print(f"Loss  : {task_e_loss:.6f}")
print(f"Time  : {task_e_time:.2f} sec")
print(f"VRAM  : {task_e_vram:.2f} MB")


Step,Training Loss
1,2.235009
2,2.057095
3,2.03986
4,2.147752
5,2.725113



===== TASK E RESULTS =====
Loss  : 2.240966
Time  : 12.55 sec
VRAM  : 1337.56 MB


**Task E summary**
- Enabled gradient checkpointing to reduce activation memory.
- Disabled the attention KV cache and the unneeded attention/hidden-state outputs.
- Training completed all 5 steps without instability.
- On a 1B model with batch size 1, the memory savings were modest,
  which is expected because the activation footprint is already low.
- Demonstrated the intended memory-optimization strategy and its trade-off:
  gradient checkpointing recomputes activations during the backward pass,
  spending extra compute time to save memory.