In [11]:
import torch
import gc
import os

def print_gpu_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**2  # in MB
        reserved = torch.cuda.memory_reserved() / 1024**2    # in MB
        print(f"Allocated Memory: {allocated:.2f} MB")
        print(f"Reserved Memory: {reserved:.2f} MB")
    else:
        print("No CUDA GPU detected.")

print("Before clearing:")
print_gpu_memory()

gc.collect()
torch.cuda.empty_cache()

print("\nAfter clearing:")
print_gpu_memory()

print("\nRun nvidia-smi for full details:")
os.system("nvidia-smi")


Before clearing:
Allocated Memory: 3005.86 MB
Reserved Memory: 3990.00 MB

After clearing:
Allocated Memory: 1676.95 MB
Reserved Memory: 2976.00 MB

Run nvidia-smi for full details:
Sat Aug  9 18:45:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.64.03              Driver Version: 575.64.03      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...    Off |   00000000:01:00.0  On |                  N/A |
| N/A   53C    P4             10W /   80W |    3671MiB /   6141MiB |     23%      Default |
|                                         |       

0

In [13]:
import gc
import torch

if torch.cuda.is_available():
    print("Before clearing:")
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

    gc.collect()
    torch.cuda.empty_cache()

    print("\nAfter clearing:")
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")


Before clearing:
Allocated: 1676.95 MB
Cached: 2976.00 MB

After clearing:
Allocated: 0.00 MB
Cached: 0.00 MB


In [1]:
import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Clear GPU memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

model_name = "openai/gpt-oss-20b"

# Show GPU info
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    total_vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"🔥 GPU: {gpu_name} ({total_vram:.1f} GB VRAM)")
else:
    print("⚠ No GPU detected, running on CPU only")

# Memory map — keep GPU usage small
max_memory = {
    0: "3GiB",       # GPU limit
    "cpu": "13GiB"   # Rest in CPU RAM
}

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model in 4-bit mode
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    max_memory=max_memory,
    load_in_4bit=True,        # 4-bit quantization
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    offload_folder="offload"  # make sure this is on SSD
)

# Inference
messages = [{"role": "user", "content": "Who are you?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to("cuda" if torch.cuda.is_available() else "cpu")

outputs = model.generate(**inputs, max_new_tokens=40)
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

print("🤖 Model Response:", response)


  from .autonotebook import tqdm as notebook_tqdm


🔥 GPU: NVIDIA GeForce RTX 4050 Laptop GPU (5.6 GB VRAM)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'