# Math Question Answer Verification Competition

## Starter Code

Borrowed from [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR)

In [None]:
# %%capture
# This cell will take time
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
pip install wandb

In [4]:
import wandb
wandb.login()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "/content/drive/MyDrive/lora",
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
# )

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

## Load model and wrap with LoRA adapters

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Competition dataset

In [11]:
# download and load competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [12]:
prompt = """You are a skilled mathematician, tasked with verifying if a given answer to a math question is correct. Carefully review the question, the provided answer, and the solution steps. Respond only with 'True' if the answer is correct or 'False' if the answer is incorrect, based strictly on the solution.

### Question:
{}

### Provided Answer:
{}

### Solution:
{}

### Final Answer (Is the provided answer correct? Reply with 'True' or 'False' only):
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    question = examples["question"]
    ans = examples["answer"]
    solution = examples["solution"]
    output = examples["is_correct"]
    texts = []
    for ques, ans_text, sol, correct in zip(question, ans, solution, output):
        # Generate formatted prompt and add EOS_TOKEN to ensure stopping
        text = prompt.format(ques, ans_text, sol, correct) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}





In [15]:
from datasets import Dataset

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
        ## increase batch size to 4
        per_device_train_batch_size = 4,
        ## increase gradient acc to 8
        gradient_accumulation_steps = 8,
        ## increase warmup_steps to 100
        warmup_steps = 30,
        ## changed num_epochs to 3
        num_train_epochs = 1, # Set this for 1 full training run.
        ## changed max_steps to 150
        #max_steps = 100,
        ## lower learning rate to 1e-4
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 100,
        optim = "adamw_torch",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", # Use this for WandB etc
        run_name = "DLMidterm"
    )

import random

# Define the number of samples to select per iteration
total_samples = 1000000
chunk_size = 10000  # Number of samples to train on in each iteration
total_iterations = 2  # Number of iterations

# Loop through the dataset in chunks
for i in range(total_iterations):
    # Randomly select 10,000 indices within the total dataset range
    sampled_indices = random.sample(range(total_samples), chunk_size)

    # Use select to get a subset of the dataset as a Dataset object
    sampled_train_dataset = dataset['train'].select(sampled_indices)

    # Apply the formatting function to the sampled dataset
    train_dataset = sampled_train_dataset.map(formatting_prompts_func, batched=True)

    # Initialize the Trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=12,
        packing=False,  # Can make training 5x faster for short sequences
        args=training_args
    )

    # Train the model on this chunk
    trainer.train()

    # Optional: Save the model checkpoint after each chunk
    trainer.save_model(f"outputs/checkpoint-{i + 1}")

    print(f"Completed training on random chunk {i + 1} of {total_iterations}")


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/10000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 312
 "-____-"     Number of trainable parameters = 83,886,080
[34m[1mwandb[0m: Currently logged in as: [33mzyf20010627[0m ([33mzyf20010627-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss
100,0.6843
200,0.5867
300,0.5585


Completed training on random chunk 1 of 2


Map (num_proc=12):   0%|          | 0/10000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 312
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
100,0.4608
200,0.4538
300,0.5122


Completed training on random chunk 2 of 2


In [None]:
# Apply the formatting function to the sampled dataset
train_dataset = dataset['train'].map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [None]:
#print a smaple training example
train_dataset['text'][0]

"You are a skilled mathematician, tasked with verifying if a given answer to a math question is correct. Carefully review the question, the provided answer, and the solution steps. Respond only with 'True' if the answer is correct or 'False' if the answer is incorrect, based strictly on the solution.\n\n### Question:\nA line is parameterized by\n\\[\\begin{pmatrix} x \\\\ y \\end{pmatrix} = \\begin{pmatrix} 2 \\\\ 3 \\end{pmatrix} + t \\begin{pmatrix} -1 \\\\ 5 \\end{pmatrix}.\\]A second line is parameterized by\n\\[\\begin{pmatrix} x \\\\ y \\end{pmatrix} = \\begin{pmatrix} 0 \\\\ 7 \\end{pmatrix} + u \\begin{pmatrix} -1 \\\\ 4 \\end{pmatrix}.\\]Find the point where the lines intersect.\n\n### Provided Answer:\n(2/3,4/3)\n\n### Solution (please analyze this to verify the answer):\nFirst, we need to solve the system of equations\n\\[\n\\begin{aligned}\n2 - t &= s\\\\\n3 + 5t &= 7 + 4s\n\\end{aligned}\n\\]\nby eliminating s.\nWe'll use sympy.\n<llm-code>\nfrom sympy import symbols, solv

## SFT

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.1819
20,0.6498
30,0.6698
40,0.6522
50,0.6393
60,0.6424
70,0.6535
80,0.5937
90,0.6416
100,0.6238


In [20]:
torch.cuda.empty_cache()


## inference

In [16]:
# Sample inferene data point
test_dataset = dataset['test']

sample_ques = test_dataset['question'][0]
sample_ans = test_dataset['answer'][0]
sample_sol = test_dataset['solution'][0]
sample_label = test_dataset['is_correct'][0]


In [18]:

# Running inference on single test
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
input_prompt = prompt.format(
        sample_ques, # ques
        sample_ans, # given answer
        sample_sol, # solution
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )

print("Input Promt:\n", input_prompt)

inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

# Check the structure of inputs

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # 1 because of batch
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode([outputs, skip_special_tokens=True)

response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

Input Promt:
 You are a skilled mathematician, tasked with verifying if a given answer to a math question is correct. Carefully review the question, the provided answer, and the solution steps. Respond only with 'True' if the answer is correct or 'False' if the answer is incorrect, based strictly on the solution.

### Question:
The Parker family needs to leave the house by 5 pm for a dinner party. Mrs. Parker was waiting to get into the bathroom at 2:30 pm. Her oldest daughter used the bathroom for 45 minutes and her youngest daughter used the bathroom for another 30 minutes. Then her husband used it for 20 minutes. How much time will Mrs. Parker have to use the bathroom to leave on time?

### Provided Answer:
205

### Solution:
Let's solve this problem using Python code.
<llm-code>
minutes_per_hour = 60
minutes_left_before_5 = 5 * minutes_per_hour
total_time_spent_by_family = 45 + 30 + 20
minutes_before_5_after_family = minutes_left_before_5 - total_time_spent_by_family
minutes_before

['True']

In [14]:
model2, tokenizer2 = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/lora",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [19]:
import torch
import csv
FastLanguageModel.for_inference(model)
# Parameters
batch_size = 16  # Adjust based on GPU memory
total_samples = len(test_dataset['question'])

# Initialize list to store results
results = []

# Loop through the test dataset in batches
for start_idx in range(0, total_samples, batch_size):
    end_idx = min(start_idx + batch_size, total_samples)

    # Prepare a batch of input prompts
    batch_questions = test_dataset['question'][start_idx:end_idx]
    batch_answers = test_dataset['answer'][start_idx:end_idx]
    batch_solutions = test_dataset['solution'][start_idx:end_idx]

    # Format prompts for the batch
    input_prompts = [
        prompt.format(question, answer, solution, "")
        for question, answer, solution in zip(batch_questions, batch_answers, batch_solutions)
    ]

    # Tokenize and move inputs to device in a single batch
    inputs = tokenizer(input_prompts, return_tensors="pt", padding=True).to("cuda")

    # Run inference in batch
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode outputs
    responses = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Process each response in the batch
    for idx, response in enumerate(responses):
        # Convert response to boolean
        model_prediction = response.strip() == "True"

        # Store result with ID and prediction
        test_case_id = start_idx + idx
        results.append([test_case_id, model_prediction])

# Write results to CSV file
with open("predictions3.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "Prediction"])  # Write header
    writer.writerows(results)

print("Predictions saved to predictions2.csv")


Predictions saved to predictions2.csv


## saving model

In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

In [None]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
