# Part 0: Setup

In [1]:
# Install PEFT along with dependencies
!pip install -q peft transformers accelerate bitsandbytes

In [2]:
import os
from dotenv import load_dotenv
import torch
import platform
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [3]:
# RUN THIS CELL ONLY IF RUNNING ON PACE-ICE
#
# Redirects the Hugging Face, Triton, TorchInductor and NLTK caches into
# ~/scratch so that large downloads do not count against the home quota.
# The cell imports what it needs itself so it can be run before any other cell.
import os
import sys

# environment variable -> cache directory (with ~ still unexpanded)
dirs = {
    "HF_HOME":"~/scratch/hf_cache",
    "TRITON_CACHE_DIR":"~/scratch/triton_cache",
    "TORCHINDUCTOR_CACHE_DIR":"~/scratch/inductor_cache",
    'NLTK_DATA':"~/scratch/nltk_data"
}

for name, raw_path in dirs.items():
    path = os.path.expanduser(raw_path)
    print(name)
    print(path)
    os.makedirs(path, exist_ok=True)
    # making sure the cache dirs are rwx for owner (and nobody else)
    os.chmod(path, 0o700)
    os.environ[name] = path
print("Make sure the cache files are in ~/scratch/ so quota doesn't exceed limit!")

# huggingface_hub resolves HF_HOME when it is first imported. If it (or
# transformers, which imports it) is already loaded, the override above may be
# ignored and downloads can still land in the default cache location.
hf_modules_loaded = [m for m in ("huggingface_hub", "transformers") if m in sys.modules]
if hf_modules_loaded:
    print(
        "WARNING: " + ", ".join(hf_modules_loaded) + " was imported before HF_HOME was set; "
        "the Hugging Face cache may still use the default location. "
        "Restart the kernel and run this cell before the import cell "
        "(or move this cell above the imports)."
    )

HF_HOME
/home/hice1/yhsu72/scratch/hf_cache
TRITON_CACHE_DIR
/home/hice1/yhsu72/scratch/triton_cache
TORCHINDUCTOR_CACHE_DIR
/home/hice1/yhsu72/scratch/inductor_cache
NLTK_DATA
/home/hice1/yhsu72/scratch/nltk_data
Make sure the cache files are in ~/scratch/ so quota doesn't exceed limit!


In [None]:
# Optional: paste your own Hugging Face token here. Leave it empty to use the
# HF_TOKEN that is already in the environment or stored in a .env file.
# An empty value is never exported: load_dotenv() does not override variables
# that already exist, so HF_TOKEN='' would mask the real token from .env.
hf_token_override = ''
if hf_token_override:
    os.environ['HF_TOKEN'] = hf_token_override

In [5]:
# Read variables from a local .env file (if one exists) into os.environ.
# With default arguments, variables that are already set are left untouched.
load_dotenv()

# Report success only when a non-empty token is actually available.
if os.environ.get("HF_TOKEN"):
    print("‚úÖ Hugging Face token loaded from environment.")
else:
    print(
        "WARNING: HF_TOKEN is not set. Add it to .env or to the cell above "
        "(or log in with `huggingface-cli login`) before loading gated models."
    )

✅ Hugging Face token loaded from environment.


In [6]:
print("=== üß† Environment Info ===")
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print("-----------------------------")

# Check for CUDA (NVIDIA GPUs)
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"‚úÖ CUDA is available. Number of GPUs: {num_gpus}")

    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        total_mem = torch.cuda.get_device_properties(i).total_memory / (1024**3)
        print(f"  ‚Ä¢ GPU {i}: {gpu_name} ({total_mem:.2f} GB VRAM)")

    # Also show current GPU and free memory
    current_gpu = torch.cuda.current_device()
    print(f"\nUsing GPU: {torch.cuda.get_device_name(current_gpu)}")
    free_mem, total_mem = torch.cuda.mem_get_info()
    print(f"Available VRAM: {free_mem/1e9:.2f} GB / {total_mem/1e9:.2f} GB")

# Check for Apple Silicon (MPS)
elif torch.backends.mps.is_available():
    print("‚úÖ Running on Apple Silicon (MPS backend).")

# Check for ROCm (AMD GPUs)
elif torch.version.hip is not None:
    print("‚úÖ ROCm (AMD GPU) detected.")

# Otherwise fallback to CPU
else:
    print("‚ö†Ô∏è No GPU detected ‚Äî running on CPU only.")
    print("This will be very slow for large models like Llama-3.1-8B.")

print("-----------------------------")

# Confirm torch default device
default_device = "cuda" if torch.cuda.is_available() else (
    "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Default torch device: {default_device}")

=== 🧠 Environment Info ===
Python version: 3.10.13
PyTorch version: 2.8.0+cu128
-----------------------------
✅ CUDA is available. Number of GPUs: 2
  • GPU 0: NVIDIA H200 (139.80 GB VRAM)
  • GPU 1: NVIDIA H200 (139.80 GB VRAM)

Using GPU: NVIDIA H200
Available VRAM: 149.56 GB / 150.11 GB
-----------------------------
Default torch device: cuda


In [7]:

# --- 3. Model name on Hugging Face Hub ---
model_name = "meta-llama/Llama-3.1-8B"

# --- 4. (Optional) Authenticate if model is gated/private ---
# from huggingface_hub import login
# login(token="YOUR_HF_TOKEN")

print("Loading tokenizer and model‚Ä¶")

# --- 5. Load tokenizer ---
# Tokenizer converts text ↔ tokens. Must match model for correct vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- 6. Define quantization configuration ---
# This allows loading the model in 4-bit precision to save VRAM and enable QLoRA fine-tuning.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Quantize model weights to 4 bits instead of 16
    bnb_4bit_quant_type="nf4",              # "NormalFloat4" – more accurate 4-bit representation
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for math (fast, widely supported)
    bnb_4bit_use_double_quant=True          # Extra quantization layer to reduce memory further
)

# --- 7. Load model with efficient settings ---
# trust_remote_code is intentionally left at its default (False): it would let
# code shipped in the model repository run locally, and this checkpoint is
# loaded through the AutoModelForCausalLM class that transformers provides.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",              # Automatically distributes layers across GPUs/CPU
    quantization_config=bnb_config, # Apply the quantization config defined above
    dtype=torch.bfloat16,           # Keep remaining layers in bfloat16 precision (safe default)
    low_cpu_mem_usage=True,         # Stream weights directly to GPU to reduce CPU RAM footprint
)

print("‚úÖ Model loaded successfully!")


Loading tokenizer and model…


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model loaded successfully!


In [8]:

# --- 8. Simple inference test ---
# Sends one instruction-style prompt through the model and prints the decoded
# continuation (prompt included, special tokens stripped).
prompt = """### Instruction:
Explain the difference between left-wing and right-wing economic policies.

### Response:"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling is enabled below, so seed the torch RNG first; otherwise every
# Restart & Run All prints a different completion. Results should then be
# repeatable on the same hardware/software stack.
torch.manual_seed(42)

outputs = model.generate(
    **inputs,
    max_new_tokens=150,                  # control output length
    do_sample=True,                      # enables some randomness
    temperature=0.7,                     # mild creativity
    top_p=0.9,                           # nucleus sampling
    pad_token_id=tokenizer.eos_token_id, # use EOS as the padding id during generation
    repetition_penalty=1.2               # prevent repeated text
)

# outputs[0] is the single generated sequence for the single prompt
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

### Instruction:
Explain the difference between left-wing and right-wing economic policies.

### Response: 
The main differences are in how government should regulate business. The Left believes that businesses need to be regulated heavily by a central authority, so as not to harm workers or consumers (e.g., minimum wage laws). Meanwhile, the Right generally opposes such regulations on principle because they think it is better for people to make their own decisions about wages etc...


# Part 1: PEFT Setup

In [9]:
from peft import LoraConfig, PeftModel, get_peft_model

# LoRA adapter settings applied to the attention query/value projections.
lora_config = LoraConfig(
    r=16,                # rank of the LoRA matrices
    lora_alpha=32,       # scaling factor
    target_modules=["q_proj", "v_proj"],  # which layers to fine-tune
    lora_dropout=0.05,   # dropout for LoRA
    bias="none",         # keep bias frozen
    task_type="CAUSAL_LM" # type of task
)

# NOTE(review): the base model is loaded in 4-bit; PEFT's quantization guide
# recommends prepare_model_for_kbit_training(model) before wrapping when the
# model is going to be trained — confirm whether a later cell handles this.

# Wrap base model with PEFT exactly once. Because the result is assigned back
# to `model`, re-running this cell would otherwise wrap an already-wrapped model.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # confirm only LoRA params are trainable

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848
