In [1]:
# Import the necessary libraries
import torch
from transformers import BitsAndBytesConfig, Mistral3ForConditionalGeneration, MistralCommonBackend

In [2]:
# Sanity-check the PyTorch install before loading anything heavy. Prints, in
# order: the torch version, the CUDA version torch reports, and whether
# torch can see a usable CUDA device.
for cuda_fact in (torch.__version__, torch.version.cuda, torch.cuda.is_available()):
    print(cuda_fact)

2.9.0+cu130
13.0
True


In [3]:
# bitsandbytes settings used to load the checkpoint with 4-bit weights.
bnb_config = BitsAndBytesConfig(
    # Store the quantizable weights in 4 bits instead of the checkpoint's
    # native precision.
    load_in_4bit=True,
    # Matrix multiplications run in float16 after the 4-bit weights are
    # dequantized on the fly.
    # NOTE(review): if the checkpoint ships in bfloat16, torch.bfloat16 may be
    # the safer compute dtype -- confirm against the model card.
    bnb_4bit_compute_dtype=torch.float16,
    # NF4 ("NormalFloat4") is a 4-bit data type designed for weights that are
    # roughly normally distributed.
    bnb_4bit_quant_type="nf4",
    # Double quantization: the quantization constants themselves are quantized
    # a second time to save a little more memory.
    bnb_4bit_use_double_quant=True,
)

In [4]:
# Hub identifier of the checkpoint; the tokenizer cell reuses it.
model_id = "mistralai/Ministral-3-3B-Base-2512"

# Load the checkpoint with the 4-bit quantization settings from `bnb_config`.
model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id,
    # Let the loader decide device placement (GPU when one is available).
    device_map="auto",
    # Apply the quantization config defined in the previous cell.
    quantization_config=bnb_config,
)

Unrecognized keys in `rope_parameters` for 'rope_type'='yarn': {'max_position_embeddings'}


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/458 [00:00<?, ?it/s]

In [5]:
# Confirm the 4-bit load worked: walk every submodule and print the ones whose
# type string mentions "Linear" or "4bit". Note this prints one line per
# matching layer, so the output is long.
for name, module in model.named_modules():
    module_cls = type(module)
    if any(marker in str(module_cls) for marker in ("Linear", "4bit")):
        print(f"{name} -> {module_cls}")

model.vision_tower.transformer.layers.0.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.down_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.k_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.v_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.q_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.o_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transforme

In [6]:
# Load the Mistral-specific tokenizer that belongs to the same checkpoint.
tokenizer = MistralCommonBackend.from_pretrained(model_id)

# len(tokenizer) is reported as the vocabulary size, i.e. how many distinct
# tokens the tokenizer knows.
vocab_size = len(tokenizer)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 131072


In [7]:
# Experimenting with the tokenizer: encode one sentence, then show the raw
# IDs, the token strings they map to, and the round-tripped text.
sample_sentence = "What's the craic?"  # sample sentence
sample_tokens = tokenizer(sample_sentence)  # tokenise the sentence

# Look the IDs up once and reuse them. Writing sample_tokens["input_ids"]
# inside a double-quoted f-string only parses on Python 3.12+ (PEP 701) and is
# a SyntaxError on earlier versions; a plain local keeps the cell portable.
sample_ids = sample_tokens["input_ids"]
print(f"Input IDs: {sample_ids}")
print(f"Tokens (Encoded): {tokenizer.convert_ids_to_tokens(sample_ids)}")
print(f"Original (Decoded): {tokenizer.decode(sample_ids)}")

Input IDs: [1, 7493, 1681, 1278, 20547, 1290, 1063]
Tokens (Encoded): ['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?


In [10]:
# Experimenting with batching: tokenise several sentences at once, padding
# them to a common length and returning PyTorch tensors.
sample_sentences = ["Sound lad", "That's grand", "Ye eejit"]  # sample sentences
sample_batch = tokenizer(
    sample_sentences,
    padding=True,
    return_tensors="pt"
)

# Look the IDs up once and reuse them. Writing sample_batch["input_ids"]
# inside a double-quoted f-string only parses on Python 3.12+ (PEP 701) and is
# a SyntaxError on earlier versions; a plain local keeps the cell portable.
batch_ids = sample_batch["input_ids"]
print(f"Input IDs: {batch_ids}")

# Show each row of the batch as token strings and as decoded text so the
# padding tokens are visible.
for ids in batch_ids:
    print(f"\nTokens (Encoded): {tokenizer.convert_ids_to_tokens(ids)}")
    print(f"Original (Decoded): {tokenizer.decode(ids)}")

Input IDs: tensor([[   11,    11,     1, 33795, 21154],
        [   11,     1,  9842,  1681,  4186],
        [    1, 42414,  1324,  9472,  1276]])

Tokens (Encoded): ['<pad>', '<pad>', '<s>', 'Sound', ' lad']
Original (Decoded): <pad><pad><s>Sound lad

Tokens (Encoded): ['<pad>', '<s>', 'That', "'s", ' grand']
Original (Decoded): <pad><s>That's grand

Tokens (Encoded): ['<s>', 'Ye', ' e', 'ej', 'it']
Original (Decoded): <s>Ye eejit


In [None]:
# Test the quantized model before fine-tuning: sample a short continuation for
# a few prompts to get a qualitative baseline.
prompt_1 = "What's the craic?"
prompt_2 = "What's the story lad?"
prompt_3 = "How are ye getting on?"
prompts = [prompt_1, prompt_2, prompt_3]

# Sampling is stochastic; seed the generators so re-running this cell on the
# same setup gives the same completions and before/after fine-tuning
# comparisons are meaningful.
torch.manual_seed(42)

# Send the inputs to wherever the model actually lives rather than a
# hard-coded "cuda": the model was placed with device_map="auto", so its
# device is decided at load time.
inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(
        **inputs,
        max_new_tokens=30,  # limit new token generation
        do_sample=True,  # sample from distribution
    )

# Each decoded sequence contains the prompt followed by the continuation.
for i, output in enumerate(outputs):
    response = tokenizer.decode(output, skip_special_tokens=True)
    print(f"\nResponse {i+1}: {response}")


Response 1: What's the craic? (Irish for what's up)

1/30/2017

## 1st Annual U.S. History Test

The

Response 2: What's the story lad? A girl gets pregnant and doesn't want to carry the child. She has to have an abortion and goes to a party. He is an alcoholic who

Response 3: How are ye getting on? It certainly was an interesting way to begin the weekend . . . although it took me a little bit of looking into the scriptures and books of faith
