In [1]:
# Import the necessary libraries
import torch
from transformers import BitsAndBytesConfig, Mistral3ForConditionalGeneration, MistralCommonBackend

In [2]:
# Check CUDA is working
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.9.0+cu130
13.0
True


In [3]:
# Config for loading the model in 4 bits
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # original is 32 bit
    bnb_4bit_quant_type="nf4", # gaussian distribution
    bnb_4bit_use_double_quant=True, # 32 -> 8 -> 4 bits
    bnb_4bit_compute_dtype=torch.float16 # compute in float16
)

In [4]:
# Load the model with our config
model_id = "mistralai/Ministral-3-3B-Base-2512"
model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto", # use CUDA if available
)

Unrecognized keys in `rope_parameters` for 'rope_type'='yarn': {'max_position_embeddings'}


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/458 [00:00<?, ?it/s]

In [5]:
# Check the model is loaded in 4bit
for name, module in model.named_modules():
    if "Linear" in str(type(module)) or "4bit" in str(type(module)):
        print(f"{name} -> {type(module)}")

model.vision_tower.transformer.layers.0.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.down_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.k_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.v_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.q_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.o_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transforme

In [None]:
# Setup for the tokenizer
tokenizer = MistralCommonBackend.from_pretrained(model_id) # tokenisation specific to Mistral models
print(f"Vocabulary Size: {len(tokenizer)}")

Vocabulary Size: 131072


In [7]:
# Experimenting with tokenizer
sentence = "What's the craic?"
tokens = tokenizer(sentence)
print(f"Input IDs: {tokens["input_ids"]}")
print(f"Tokens (Encoded): {tokenizer.convert_ids_to_tokens(tokens["input_ids"])}")
print(f"Original (Decoded): {tokenizer.decode(tokens["input_ids"])}")

Input IDs: [1, 7493, 1681, 1278, 20547, 1290, 1063]
Tokens (Encoded): ['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?


In [8]:
# Experimenting with batching
sentences = ["Sound lad", "That's grand", "Ye eejit"]
batch = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
print(f"Input IDs: {batch["input_ids"]}")
print(f"Tokens (Encoded):")
for ids in batch["input_ids"]:
    print(f"{tokenizer.convert_ids_to_tokens(tokens["input_ids"])}")
    print(f"Original (Decoded): {tokenizer.decode(tokens["input_ids"])}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Input IDs: tensor([[   11,    11,     1, 33795, 21154],
        [   11,     1,  9842,  1681,  4186],
        [    1, 42414,  1324,  9472,  1276]])
Tokens (Encoded):
['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?
['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?
['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?


In [9]:
# Test the quantized model before fine-tuning
prompt_1 = "What's the craic?"
prompt_2 = "What's the story lad?"
prompt_3 = "How are ye getting on?"
prompts = [prompt_1, prompt_2]
inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(
        **inputs, 
        max_new_tokens=128, 
        do_sample=True,
    )
for i, output in enumerate(outputs):
    response = tokenizer.decode(output, skip_special_tokens=True)
    print(response)

What's the craic? - Friday 10th May 2008

For those times when the usual Friday Cuckoo site doesn't live up to its alliterating title, we've created this one for the weekend instead!

Question 1 -
I'm a huge fan of the Beatles, from the early days to the late era. I can't stand what's happening with the band at the moment. I love them - if not for their work, for their friends on this planet. My most recent purchase was the CD of "I've Just Seen a Face," a 1980 live concert featuring four of them with two
What's the story lad? We hear you have no idea where you're going, nor a place to stay. You're feeling lost, you feel confused. You feel you're drifting along and nothing is real and your life is in disarray.

Welcome home, mate. Your journey, your reality, your confusion is that of your higher self.

The higher self is the spiritual realm in which you reside, the part of your mind that holds a grand vision of what you want to be. A vision of which you are to grow and to expand, and w