In [4]:
# Import the necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Sanity-check the GPU stack: the installed PyTorch version, the CUDA version
# reported by torch, and whether a CUDA device is currently usable
print(
    torch.__version__,
    torch.version.cuda,
    torch.cuda.is_available(),
    sep="\n",  # one value per line
)

2.9.0+cu130
13.0
True


In [None]:
# 4-bit quantization settings (bitsandbytes) applied when the model is loaded
bnb_config = BitsAndBytesConfig(
    # Load the model weights in 4-bit precision
    load_in_4bit=True,
    # NF4 ("NormalFloat4"): 4-bit data type intended for normally distributed weights
    bnb_4bit_quant_type="nf4",
    # Nested ("double") quantization: the quantization constants produced by the
    # first quantization pass are quantized again to save additional memory
    bnb_4bit_use_double_quant=True,
    # Weights are stored in 4-bit, but computation is carried out in float16
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the pretrained Mistral checkpoint, quantizing it on load with `bnb_config`
model_id = "mistralai/Mistral-7B-v0.3"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # automatic device placement (use CUDA if available)
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Verify the quantization took effect: list every submodule whose type string
# contains "Linear" or "4bit", together with its concrete class
for name, module in model.named_modules():
    module_type = type(module)
    type_repr = str(module_type)
    if any(marker in type_repr for marker in ("Linear", "4bit")):
        print(f"{name} -> {module_type}")

model.layers.0.self_attn.q_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.self_attn.k_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.self_attn.v_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.self_attn.o_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.mlp.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.mlp.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.0.mlp.down_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.self_attn.q_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.self_attn.k_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.self_attn.v_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.self_attn.o_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.mlp.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.layers.1.mlp.up_proj -> <class 'bitsandbytes.nn.

In [None]:
# Set up the tokenizer from `model_id`, the same identifier the model is loaded from
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer ships without a pad token — confirm
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Experimenting with the tokenizer: encode a sentence, then show the token IDs,
# the token strings those IDs map to, and the text decoded back from the IDs
sentence = "What's the craic?"
tokens = tokenizer(sentence)
# Pull the IDs out once. Reusing double quotes inside a double-quoted f-string
# (f"...{tokens["input_ids"]}") is only valid on Python 3.12+ (PEP 701) and is a
# SyntaxError on earlier interpreters; a plain name keeps the cell portable.
input_ids = tokens["input_ids"]
print(f"Input IDs: {input_ids}")
print(f"Tokens (Encoded): {tokenizer.convert_ids_to_tokens(input_ids)}")
print(f"Original (Decoded): {tokenizer.decode(input_ids)}")

Input IDs: [1, 2592, 29510, 29481, 1040, 1045, 1288, 1062, 29572]
Tokens (Encoded): ['<s>', '▁What', "'", 's', '▁the', '▁c', 'ra', 'ic', '?']
Original (Decoded): <s> What's the craic?
