In [1]:
# Import the necessary libraries
import torch
from transformers import BitsAndBytesConfig, Mistral3ForConditionalGeneration, MistralCommonBackend, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, prepare_model_for_kbit_training

In [2]:
# Check CUDA is working
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.9.0+cu126
12.6
True


In [3]:
# Config for loading the model in 4 bits
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # original is 32 bit
    bnb_4bit_quant_type="nf4", # gaussian distribution
    bnb_4bit_use_double_quant=True, # 32 -> 8 -> 4 bits
    bnb_4bit_compute_dtype=torch.float16 # compute in float16
)

In [4]:
# Load the model with our config
model_id = "mistralai/Ministral-3-3B-Base-2512"
model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config, # use our quantization config
    device_map="auto", # use CUDA if available
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

Unrecognized keys in `rope_parameters` for 'rope_type'='yarn': {'max_position_embeddings'}


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/458 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

In [5]:
# Check the model is loaded in 4bit
for name, module in model.named_modules():
    if "Linear" in str(type(module)) or "4bit" in str(type(module)):
        print(f"{name} -> {type(module)}")

model.vision_tower.transformer.layers.0.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.feed_forward.down_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.k_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.v_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.q_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.0.attention.o_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.gate_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transformer.layers.1.feed_forward.up_proj -> <class 'bitsandbytes.nn.modules.Linear4bit'>
model.vision_tower.transforme

In [6]:
# Enable gradient checkpointing
model.gradient_checkpointing_enable()
# Quantization-aware training
model = prepare_model_for_kbit_training(model)

In [7]:
# Setup for the tokenizer
tokenizer = MistralCommonBackend.from_pretrained(model_id) # tokenization specific to Mistral models
print(f"Vocabulary Size: {len(tokenizer)}") # number of tokens known to the model

tekken.json:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Vocabulary Size: 131072


In [8]:
# Experimenting with tokenizer
sample_sentence = "What's the craic?" # sample sentence
sample_tokens = tokenizer(sample_sentence) # tokenize the sentence
print(f"Input IDs: {sample_tokens["input_ids"]}")
print(f"Tokens (Encoded): {tokenizer.convert_ids_to_tokens(sample_tokens["input_ids"])}")
print(f"Original (Decoded): {tokenizer.decode(sample_tokens["input_ids"])}")

Input IDs: [1, 7493, 1681, 1278, 20547, 1290, 1063]
Tokens (Encoded): ['<s>', 'What', "'s", ' the', ' cra', 'ic', '?']
Original (Decoded): <s>What's the craic?


In [9]:
# Experimenting with batching
sample_sentences = ["Sound lad", "That's grand", "Ye eejit"] # sample sentences
sample_batch = tokenizer(
    sample_sentences,
    padding=True,
    return_tensors="pt"
)
print(f"Input IDs: {sample_batch["input_ids"]}")
for ids in sample_batch["input_ids"]:
    print(f"\nTokens (Encoded): {tokenizer.convert_ids_to_tokens(ids)}")
    print(f"Original (Decoded): {tokenizer.decode(ids)}")

Input IDs: tensor([[   11,    11,     1, 33795, 21154],
        [   11,     1,  9842,  1681,  4186],
        [    1, 42414,  1324,  9472,  1276]])

Tokens (Encoded): ['<pad>', '<pad>', '<s>', 'Sound', ' lad']
Original (Decoded): <pad><pad><s>Sound lad

Tokens (Encoded): ['<pad>', '<s>', 'That', "'s", ' grand']
Original (Decoded): <pad><s>That's grand

Tokens (Encoded): ['<s>', 'Ye', ' e', 'ej', 'it']
Original (Decoded): <s>Ye eejit


In [10]:
# Test the quantized model before fine-tuning
prompt_1 = "What's the craic?"
prompt_2 = "What's the story?"
prompt_3 = "How are ye getting on?"
prompts = [prompt_1, prompt_2, prompt_3]
inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=30, # limit new token generation
        do_sample=False, # deterministic output
    )
for i, output in enumerate(outputs):
    response = tokenizer.decode(output, skip_special_tokens=True)
    print(f"\nResponse {i+1}: {response}")


Response 1: What's the craic? (Irish for "what's the weather like?") It's a sunny 20 degrees Celsius in Dublin, Ireland, and the weather is

Response 2: What's the story? The story is that the world is in a state of flux. The world is changing. The world is evolving. The world is transforming. The world

Response 3: How are ye getting on? I hope you are enjoying the course so far. I am going to give you a little break from the course and give you a little quiz to see


In [11]:
# Load the dataset and view first entry
dataset = load_dataset("json", data_files="training_data.jsonl", split="train")
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'text': "What's the craic?\nNot much lad, yourself?"}


In [12]:
# Tokenize the dataset
def tokenize(sample):
  return tokenizer(
      sample["text"],
      truncation=True,
      padding="max_length",
      max_length=64
  )
tokenized_dataset = dataset.map(tokenize, batched=True)
print(tokenized_dataset[0]["input_ids"])
print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1, 7493, 1681, 1278, 20547, 1290, 9551, 5484, 3315, 21154, 1044, 14019, 1063]
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><s>What's the craic?
Not much lad, yourself?


In [13]:
# Lora Configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 33,751,040 || all params: 3,882,841,088 || trainable%: 0.8692


In [15]:
# Model Training
training_args = TrainingArguments(
    output_dir="./qlora-checkpoint",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=50,
    save_total_limit=1,
    report_to="none"
)
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=3, training_loss=4.84968630472819, metrics={'train_runtime': 18.9451, 'train_samples_per_second': 1.584, 'train_steps_per_second': 0.158, 'total_flos': 40091764654080.0, 'train_loss': 4.84968630472819, 'epoch': 3.0})

In [19]:
# Save the trained LoRA
peft_model.save_pretrained("mistral-qlora-craic")
tokenizer.save_pretrained("mistral-qlora-craic")

('mistral-qlora-craic/tekken.json',)

In [20]:
# Load the base quantized model
base_model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
# Load the trained LoRA
model = PeftModel.from_pretrained(base_model, "mistral-qlora-craic")
tokenizer = MistralCommonBackend.from_pretrained("mistral-qlora-craic")

Unrecognized keys in `rope_parameters` for 'rope_type'='yarn': {'max_position_embeddings'}


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/458 [00:00<?, ?it/s]

In [21]:
# Evaluate trained results
prompt_1 = "What's the craic?"
prompt_2 = "What's the story?"
prompt_3 = "How are ye getting on?"
prompts = [prompt_1, prompt_2, prompt_3]
inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        do_sample=False,
    )
for i, output in enumerate(outputs):
    response = tokenizer.decode(output, skip_special_tokens=True)
    print(f"\nResponse {i+1}: {response}")


Response 1: What's the craic? I'm back again, and I'm going to be talking about the most important thing in the world, which is the weather. I'm going to

Response 2: What's the story? The story is that the world is a dangerous place, and we need to be careful. We need to be careful because the world is a dangerous place

Response 3: How are ye getting on? I'm not sure if I'm doing this right, but I'm trying to get the code to work. I'm trying to get the code to
