## Kaggle is slow - you'll have to wait **5 minutes** for it to install.


In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 8000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/210k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
# Move the model to the desired device
device = torch.device("cuda:0")
model.to(device)

Unsloth 2025.3.19 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 5120, padding_idx=151665)
        (layers): ModuleList(
          (0-47): 48 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

### Data Prep


In [5]:
import pandas as pd
train_df = pd.read_csv("/kaggle/input/fpt-ai-residency/finetuning_data.csv")
# # Take 300 sample for finetuning only
# sample_df = train_df.sample(n=, random_state=42)
# sample_df

In [6]:
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
print(dataset)

Dataset({
    features: ['task_id', 'question', 'choices', 'answer', 'text'],
    num_rows: 3813
})


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Fixed major bug in latest Unsloth
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = None,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "paged_adamw_8bit", # Save more memory
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc

    ),
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/3813 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [8]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

Map (num_proc=4):   0%|          | 0/3813 [00:00<?, ? examples/s]

We verify masking is actually done:

In [9]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|im_start|>system\nYou are a coding assistant that helps to answer multiple choice questions about software development. \nExtract the final answer as a single alphabet option (A,B,C,D,E,F) in json format<|im_end|>\n<|im_start|>user\nQuestion: Match the following.\n      List-I                  List-II\nA. Lexical analysis       1. Graph coloring\nB. Parsing                2. DFA minimization\nC. Register allocation    3. Post-order traversal\nD. Expression evaluation  4. Production tree\nChoices:[\'A: A – 2, B – 3, C – 1, D – 4\', \'B: A – 2, B – 1, C – 4, D – 3\', \'C: A – 2, B – 4, C – 1, D – 3\', \'D: A – 2, B – 3, C – 4, D – 1\']<|im_end|>\n<|im_start|>assistant\n```json\n{"answer": "C"}\n```<|im_end|>\n'

In [10]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                          ```json\n{"answer": "C"}\n```<|im_end|>\n'

We can see the System and Instruction prompts are successfully masked!

In [11]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
9.604 GB of memory reserved.


We fixed a major gradient accumulation bug in all trainers. See [blog](https://unsloth.ai/blog/gradient) for more details.

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,813 | Num Epochs = 1 | Total steps = 476
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 68,812,800/14,000,000,000 (0.49% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.5557
2,0.6313
3,0.627
4,0.3874
5,0.2738
6,0.1745
7,0.1135
8,0.0849
9,0.0769
10,0.1317


In [21]:
trainer_stats

TrainOutput(global_step=476, training_loss=0.05810104402960004, metrics={'train_runtime': 9650.3564, 'train_samples_per_second': 0.395, 'train_steps_per_second': 0.049, 'total_flos': 1.0663652876476416e+17, 'train_loss': 0.05810104402960004})

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [13]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference        

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 5120, padding_idx=151665)
        (layers): ModuleList(
          (0-47): 48 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [14]:
import re
def get_answer(input_text, model=model, tokenizer=tokenizer, max_attempts=2):
    for attempt in range(max_attempts):
        instruction = """You are a coding assistant that helps to answer multiple choice questions about software development. 
        Extract the final answer as a single alphabet option (A,B,C,D,E,F) in json format
        """
        messages = [
            {'role': "system", "content": instruction},
            {"role": "user", "content": input_text},
        ]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize = True,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")
    
        try:
            outputs = model.generate(input_ids = inputs, max_new_tokens = 512, use_cache = True,
                                     temperature = 0.1, min_p = 0.1)
            generated_text = tokenizer.batch_decode(outputs)
            # Extract only the assistant's response
            assistant_response = generated_text[0].split("<|im_start|>assistant")[-1].split("<|im_end|>")[0]
            print(assistant_response)
            
            match = re.search(
                        r'```json\s*{\s*"answer"\s*:\s*"([^"]+)"\s*}\s*```',
                        assistant_response,
                    )
            if match:
                return match.group(1)
        except Exception as e:
            print(
                f"Warning: Error occurred: {e}. (Attempt {attempt + 1}/{max_attempts})"
            )
    print(f"Warning: Failed to get a valid answer after {max_attempts} attempts")
    return None

In [None]:
input_text = """Question: If a sorted array of integers is guaranteed to not contain duplicate values, in order to search for a specific value which of the following algorithms is the most efficient for this task?
(A) Bubble Sort
(B) Linear Search
(C) Insertion Sort
(D) Binary Search
"""
get_answer(input_text, model, tokenizer)

# Inference

In [15]:
prompt_template = """
The following are multiple choice questions (with answers) about software development.
Note: Alphabet should be one option in (A,B,C,D,E,F,G)
Question: {question}
{multiple_choices}
```json
{{"answer": "Your selected option here"}}
``` 
"""

CORRECT_FORMAT_PROMPT = """
You are given the question available choices and the chosen choice by the LLM, but it is not in the desired format. 
The desired format should be only the JSON with an alphabetic answer. Please reanswer the question again and convert the given answer to the correct format.
You can consider llm output as reference, should think step by step

Example:
Given:
Question: This is a sample question
Choices: ["Option A", "Option B", "Option C", "Option D"]
LLM's answer: "The correct answer is Option B"

Desired output:
```json
{{"answer": "B"}}
```

Your task is to extract the correct letter (A, B, C, or D) from the LLM's answer and format it as shown in the example above.

Question: {question}
Choices: {choices}
LLM's answer: {answer}
Desired output:
NOTE: Only one option is selected
"""
i2choices = {i: chr(65 + i) for i in range(7)}  # A to G

In [16]:
import pandas as pd
import os


def correct_format(question, choices, llm_output):
    input_text = CORRECT_FORMAT_PROMPT.format(
        question=question, choices=choices, answer=llm_output
    )

    system_message = "The answer selected from the multiple choice options should be one alphabet option (A,B,C,D,E,F)."
    messages = [
            {'role': "system", "content": system_message},
            {"role": "user", "content": input_text},
        ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

 
    outputs = model.generate(input_ids = inputs, max_new_tokens = 5000, use_cache = True,
                             temperature = 0.01, min_p = 0.1)
    generated_text = tokenizer.batch_decode(outputs)
    # Extract only the assistant's response
    assistant_response = generated_text[0].split("<|im_start|>assistant")[-1].split("<|im_end|>")[0]
    print(assistant_response)

    content = assistant_response
    match = re.search(r'```json\s*{\s*"answer"\s*:\s*"([^"]+)"\s*}\s*```', content)
    return match.group(1) if match else None
    
def correct_wrong_formats(df, wrong_format_answer):
    correct_format_answer = {}
    successful_corrections = 0
    none_corrections = 0
    for index, llm_output in wrong_format_answer.items():
        try:
            question = df.iloc[index]["question"]
            choices = df.iloc[index]["choices"]
            corrected_output = correct_format(question, choices, llm_output)
            if corrected_output:
                correct_format_answer[index] = corrected_output
                successful_corrections += 1
            else:
                correct_format_answer[index] = "A"
                none_corrections += 1
        except IndexError:
            print(f"Warning: Index {index} is out of bounds. Defaulting to 'A'")
            correct_format_answer[index] = "A"
            none_corrections += 1
    print(f"Successfully corrected: {successful_corrections}")
    print(f"Defaulted to 'A': {none_corrections}")
    return correct_format_answer


def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df["choices"] = df["choices"].apply(lambda x: eval(x))
    df["choices"] = df["choices"].apply(
        lambda x: [f"{i2choices[i]}: {x[i]}" for i in range(len(x))]
    )
    return df



def process_answers(df, prompt_template=prompt_template):
    choices_answer = []
    wrong_format_answer = {}

    # Check if temporary file exists and load previously processed answers
    temp_file_path = "output/temp_choices_answer.csv"
    start_index = 0
    os.makedirs("output", exist_ok=True)

    if os.path.exists(temp_file_path):
        print(f"Found existing file {temp_file_path}, loading previous answers...")
        temp_df = pd.read_csv(temp_file_path, header=None, names=["index", "answer"])

        # Convert index to integer for proper comparison
        temp_df["index"] = temp_df["index"].astype(int)

        # Get the last processed index
        if not temp_df.empty:
            last_processed_index = temp_df["index"].max()
            start_index = last_processed_index + 1

            # Load previous answers
            for _, row in temp_df.iterrows():
                idx = row["index"]
                answer = row["answer"]

                # Extend choices_answer list if needed
                while len(choices_answer) <= idx:
                    choices_answer.append(None)

                choices_answer[idx] = answer

                # Check if answer is in wrong format
                if answer not in i2choices.values():
                    wrong_format_answer[idx] = answer

            print(
                f"Loaded {len(temp_df)} previous answers. Continuing from index {start_index}."
            )
        else:
            # File exists but is empty
            print("Temporary file exists but is empty. Starting from the beginning.")
            # Clear the file to start fresh
            open(temp_file_path, "w").close()
    else:
        # Create a new file
        open(temp_file_path, "w").close()
        print("Starting new inference process.")

    # Continue processing from where we left off
    for i, row in df.iloc[start_index:].iterrows():
        print(i)
        input_text = prompt_template.format(
            question=row["question"], multiple_choices=row["choices"]
        )
        answer = get_answer(input_text)

        # Extend choices_answer list if needed
        while len(choices_answer) <= i:
            choices_answer.append(None)

        choices_answer[i] = answer

        # Write answers to temporary CSV file
        with open(temp_file_path, "a") as f:
            f.write(f"{i},{answer}\n")

        if answer not in i2choices.values():
            print(f"Wrong format answer at index {i}")
            wrong_format_answer[i] = answer

    return choices_answer, wrong_format_answer
def main(template):
    test_df = load_and_preprocess_data("/kaggle/input/fpt-ai-residency/b6_test_data.csv")

    choices_answer, wrong_format_answer = process_answers(
        test_df, prompt_template=template
    )
    correct_format_answer = correct_wrong_formats(test_df, wrong_format_answer)

    for i, answer in correct_format_answer.items():
        choices_answer[i] = answer

    submission = pd.DataFrame({"task_id": test_df["task_id"], "answer": choices_answer})
    submission.to_csv("output/submission.csv", index=False)
    print("Submission saved to output/submission.csv")

In [17]:
main(prompt_template)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Starting new inference process.
0

```json
{"answer": "D"}
```
1

```json
{"answer": "D"}
```
2

```json
{"answer": "C"}
```
3

```json
{"answer": "B"}
```
4

```json
{"answer": "D"}
```
5

```json
{"answer": "D"}
```
6

```json
{"answer": "D"}
```
7

```json
{"answer": "A"}
```
8

```json
{"answer": "A"}
```
9

```json
{"answer": "A"}
```
10

```json
{"answer": "A"}
```
11

```json
{"answer": "A"}
```
12

```json
{"answer": "A"}
```
13

```json
{"answer": "D"}
```
14

```json
{"answer": "D"}
```
15

```json
{"answer": "A"}
```
16

```json
{"answer": "C"}
```
17

```json
{"answer": "B"}
```
18

```json
{"answer": "C"}
```
19

```json
{"answer": "B"}
```
20

```json
{"answer": "A"}
```
21

```json
{"answer": "B"}
```
22

```json
{"answer": "A"}
```
23

```json
{"answer": "D"}
```
24

```json
{"answer": "B"}
```
25

```json
{"answer": "A"}
```
26

```json
{"answer": "D"}
```
27

```json
{"answer": "D"}
```
28

```json
{"answer": "A"}
```
29

```json
{"answer": "C"}
```
30

```json
{"answ

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [19]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Describe a tall tower in the capital of France."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)