# Math Question Answer Verification Competition

## Starter Code

Borrowed from [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR)

In [1]:
# # %%capture
# # This cell will take time
!pip install unsloth
# # Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth
  Downloading unsloth-2024.11.6-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.5-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Download

Found existing installation: unsloth 2024.11.6
Uninstalling unsloth-2024.11.6:
  Successfully uninstalled unsloth-2024.11.6
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-a6aw1y7x/unsloth_27debb5b62d849449e10e4c58eab201d
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-a6aw1y7x/unsloth_27debb5b62d849449e10e4c58eab201d
  Resolved https://github.com/unslothai/unsloth.git to commit d8ff860c842095f4729fdd1d5aedf567a9e2c4da
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.6-py3-none-a

In [10]:
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

In [11]:
from unsloth import FastLanguageModel
import torch

# 加载Meta-Llama-3.1-8B预训练模型
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Load model and wrap with LoRA adapters

In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

## Competition dataset

In [13]:
# download and load competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset
# print(dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [14]:
# prompt = """You are a great mathematician and you are tasked with finding if an answer to a given maths question is correct or not.
# Your answer should follow these steps
# 1. Read the question carefully
# 2. Check the given answer
# 3. Solve step by step
# 4. Compare your solution with given answer
# 5. Respond ONLY with 'True' if correct, 'False' if incorrect.



# ### Question:
# {}

# ### Answer:
# {}

# ### Step-by-step verification:
# {}

# ### Output:
# {}"""


# prompt = """You are a great mathematician and you are tasked with finding if an answer to a given math question is correct or not.
# Your answer should follow these steps
# 1. Read the question carefully
# ### Question:
# {}
# 2. Check the given answer
# ### Answer:
# {}
# 3. Solve step by step
# ### Step-by-step verification:
# {}
# 4. Compare your solution with given answer
# 5. Respond ONLY with 'True' if correct, 'False' if incorrect.
# ### Output:
# {}

# ### Example 1:
# ### Question:
# What is the radius of the circle inscribed in triangle $ABC$ if $AB = 22, AC=12,$ and $BC=14$? Express your answer in simplest radical form.

# ### Answer:
# 3.16227766016838

# ### Step-by-step verification:
# The circle is inscribed in a triangle, and we know the sides of the triangle.
# To use the inradius formula, we need to know the area of the triangle.
# We can use Heron's formula to calculate the area.
# <llm-code>
# import math
# from sympy import *

# AB, AC, BC = 22, 12, 14

# # Calculate the semiperimeter and area using Heron's formula
# s = (AB + AC + BC) / 2
# K = sqrt(s * (s - AB) * (s - AC) * (s - BC))

# print(K)
# </llm-code>
# <llm-code-output>
# 75.8946638440411
# </llm-code-output>
# Let's now use the formula for the radius of the inscribed circle.
# <llm-code>
# r = K / s
# print(r)
# </llm-code>
# <llm-code-output>
# 3.16227766016838
# </llm-code-output>
# The answer is \boxed{3.16227766016838}

# ### Output:
# True
# """


# EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# def formatting_prompts_func(examples):
#     question = examples["question"]
#     ans       = examples["answer"]
#     solution  = examples["solution"]
#     output      = examples["is_correct"]
#     texts = []
#     for instruction, input,solution, output in zip(question, ans,solution, output):
#         # Must add EOS_TOKEN, otherwise your generation will go on forever!
#         # text = prompt.format(instruction, input, output) + EOS_TOKEN
#         text = prompt.format(instruction, input, solution, output) + EOS_TOKEN
#         texts.append(text)
#     return { "text" : texts, }








prompt = """You are a great mathematician and you are tasked with finding if an answer to a given math question is correct or not.
Your answer should follow these steps
1. Read the question carefully
### Question:
{}
2. Check the given answer
### Answer:
{}
3. Solve step by step
### Step-by-step verification:
{}
4. Compare your solution with given answer
5. Respond ONLY with 'True' if correct, 'False' if incorrect.
### Output:
{}

### Example 1:
### Question:
What is the radius of the circle inscribed in triangle $ABC$ if $AB = 22, AC=12,$ and $BC=14$? Express your answer in simplest radical form.

### Answer:
3.16227766016838

### Step-by-step verification:
The circle is inscribed in a triangle, and we know the sides of the triangle.
To use the inradius formula, we need to know the area of the triangle.
We can use Heron's formula to calculate the area.
<llm-code>
import math
from sympy import *

AB, AC, BC = 22, 12, 14

# Calculate the semiperimeter and area using Heron's formula
s = (AB + AC + BC) / 2
K = sqrt(s * (s - AB) * (s - AC) * (s - BC))

print(K)
</llm-code>
<llm-code-output>
75.8946638440411
</llm-code-output>
Let's now use the formula for the radius of the inscribed circle.
<llm-code>
r = K / s
print(r)
</llm-code>
<llm-code-output>
3.16227766016838
</llm-code-output>
The answer is 3.16227766016838

### Output:
True
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    question = examples["question"]
    ans       = examples["answer"]
    solution  = examples["solution"]
    output      = examples["is_correct"]
    texts = []
    for instruction, input,solution, output in zip(question, ans,solution, output):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # text = prompt.format(instruction, input, output) + EOS_TOKEN
        text = prompt.format(instruction, input, solution, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


In [15]:
# Process the training dataset and generate prompt for each datapoint
train_dataset = dataset['train'].map(formatting_prompts_func,
                                     batched = True,
                                     batch_size=64,
                                     num_proc=4,
                                     load_from_cache_file=False,)

Map (num_proc=4):   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [16]:
#print a smaple training example
train_dataset['text'][0]
train_dataset['text'][1]

"You are a great mathematician and you are tasked with finding if an answer to a given math question is correct or not.\nYour answer should follow these steps\n1. Read the question carefully\n### Question:\nIf $x + y = 16$ and $x-y = 2$, what is the value of $x^2 - y^2$?\n2. Check the given answer\n### Answer:\n32\n3. Solve step by step\n### Step-by-step verification:\nWe can solve this problem using Python's sympy library to do symbolic algebra.\n<llm-code>\nfrom sympy import symbols, Eq, solve\n\nx, y = symbols('x, y')\n# Define the equations\neq1 = Eq(x + y, 16)\neq2 = Eq(x - y, 2)\n\n# Solve the system of equations\nsolutions = solve((eq1, eq2), (x, y))\n\n# Computing x^2-y^2 using the values of x and y computed from the system of equations\nx_val, y_val = solutions[x], solutions[y]\nx_val**2 - y_val**2\n</llm-code>\n<llm-code-output>\n32\n</llm-code-output>\nSo $x^2-y^2$ is \\boxed{32}.\n4. Compare your solution with given answer\n5. Respond ONLY with 'True' if correct, 'False' if

## SFT

In [19]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# 与训练有关的重要参数都在这边调
training_args = TrainingArguments(
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 10, # Set this for 1 full training run.
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        eval_steps = 10,  # 每 10 个步骤进行一次评估
        seed = 42,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
)

In [20]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
torch.cuda.empty_cache()
# small_train_dataset = train_dataset.select(range(10000))  # 取数据集的一个小样本例如使用前10000个样本
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)

Map (num_proc=4):   0%|          | 0/1000000 [00:00<?, ? examples/s]

TimeoutError: 

In [None]:
PYTORCH_CUDA_ALLOC_CONF=expandable_segments=True
torch.cuda.empty_cache()
trainer_stats = trainer.train()

## Saving model

In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

In [None]:
# Sample inferene data point
test_dataset = dataset['test']
sample_ques = test_dataset['question'][0]
sample_sols = test_dataset['solution'][0]
sample_ans = test_dataset['answer'][0]

## inference

In [None]:
# del outputs
# del tokenizer
# del trainer
# del trainer_stats

In [None]:
torch.cuda.empty_cache()
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # llm_int8_enable_fp32_cpu_offload=True
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


In [None]:
# Running inference on single test
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
input_prompt = prompt.format(
        sample_ques, # ques
        sample_ans, # given answer
        sample_sols, # solution
        "", # output - leave this blank for generation! LLM willl generate is it is True or False
    )

print("Input Promt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # 1 because of batch
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, top_p=0.9).to("cuda")
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode([outputs, skip_special_tokens=True)

response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
# response

In [None]:
!pip install tqdm pandas

import os
import torch
from tqdm import tqdm
import pandas as pd
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# def create_submission(model, tokenizer, test_dataset, batch_size=4):
#     FastLanguageModel.for_inference(model)
#     predictions = []

#     # Process test dataset in batches with progress bar
#     for i in tqdm(range(0, len(test_dataset), batch_size), desc="Processing test dataset"):
#         # Clear cache periodically
#         if i % 100 == 0:
#             torch.cuda.empty_cache()

#         batch_end = min(i + batch_size, len(test_dataset))
#         batch_questions = test_dataset['question'][i:batch_end]
#         batch_solutions = test_dataset['solution'][i:batch_end]
#         batch_answers = test_dataset['answer'][i:batch_end]

#         # Create prompts for the batch
#         batch_prompts = [
#             prompt.format(q, a, s,"")
#             for q, s, a in zip(batch_questions,batch_solutions,batch_answers)
#         ]

#         try:
#             # Tokenize batch
#             inputs = tokenizer(
#                 batch_prompts,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=2048
#             ).to("cuda")

#             inputs = {k: v.to("cuda") for k, v in inputs.items()}

#             # Generate predictions
#             with torch.inference_mode():
#                 outputs = model.generate(
#                     **inputs,
#                     max_new_tokens=64,
#                     top_p=0.9,
#                     use_cache=True,
#                     pad_token_id=tokenizer.eos_token_id
#                 ).to("cuda")


#             # Process outputs
#             for j in range(len(outputs)):
#                 input_len = len(inputs['input_ids'][j])
#                 response = tokenizer.decode(outputs[j][input_len:], skip_special_tokens=True).strip()
#                 prediction = True if 'True' in response.split()[0] else False
#                 predictions.append(prediction)

#             # Clean up
#             del inputs, outputs

#         except RuntimeError as e:
#             print(f"\nError in batch processing at index {i}. Switching to single example processing.")
#             # Process examples one by one if batch fails
#             for q, a,s in zip(batch_questions, batch_answers,batch_solutions):
#                 try:
#                     single_prompt = prompt.format(q, a, s,"")
#                     inputs = tokenizer([single_prompt], return_tensors="pt", truncation=True, max_length=2048)
#                     with torch.inference_mode():
#                         outputs = model.generate(**inputs, max_new_tokens=64, top_p=0.9, use_cache=True)
#                     response = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True).strip()
#                     prediction = True if 'True' in response.split()[0] else False
#                     predictions.append(prediction)
#                     del inputs, outputs
#                     torch.cuda.empty_cache()
#                 except Exception as e:
#                     print(f"\nError processing single example: {e}")
#                     predictions.append(False)  # Default prediction in case of error

#     # Create final submission DataFrame
#     submission_df = pd.DataFrame({
#         'ID': range(len(predictions)),
#         'is_correct': predictions
#     })

#     if len(submission_df) != len(test_dataset):
#         print(f"\nWarning: Number of predictions ({len(predictions)}) doesn't match test dataset size ({len(test_dataset)})")

#     # Save submission
#     submission_df.to_csv('submission.csv', index=False)
#     print("\nSubmission saved successfully!")
#     print(f"Total predictions: {len(predictions)}")
#     print("\nFirst few predictions:")
#     print(submission_df.head())

#     return submission_df

# # Clear GPU memory before running
# print("Clearing GPU memory...")
# torch.cuda.empty_cache()

# # Create submission
# test_dataset = dataset['test']
# submission = create_submission(
#     model=model,
#     tokenizer=tokenizer,
#     test_dataset=test_dataset,
#     batch_size=25
# )

In [None]:
import gc
import torch
from tqdm import tqdm
import pandas as pd
gc.collect()
torch.cuda.empty_cache()

if torch.cuda.is_available():
    # Clear any remaining memory in torch's cache
    with torch.cuda.device('cuda'):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [None]:
def create_submission(model, tokenizer, test_dataset, batch_size=4):
    # Move model to CUDA and set to eval mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    FastLanguageModel.for_inference(model)

    predictions = []

    # Process test dataset in batches with progress bar
    for i in tqdm(range(0, len(test_dataset), batch_size), desc="Processing test dataset"):
        # Clear cache periodically
        if i % 100 == 0:
            torch.cuda.empty_cache()

        batch_end = min(i + batch_size, len(test_dataset))
        batch_questions = test_dataset['question'][i:batch_end]
        batch_solutions = test_dataset['solution'][i:batch_end]
        batch_answers = test_dataset['answer'][i:batch_end]

        # Create prompts for the batch
        batch_prompts = [
            prompt.format(q, a, s, "")
            for q, s, a in zip(batch_questions, batch_solutions, batch_answers)
        ]

        try:
            # Tokenize batch and move to device
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024
            )

            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate predictions
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=64,
                    top_p=0.9,
                    use_cache=True,
                    pad_token_id=tokenizer.eos_token_id
                )

                # Move outputs to CPU for processing
                outputs = outputs.cpu()

            # Process outputs
            for j in range(len(outputs)):
                input_len = len(inputs['input_ids'][j])
                response = tokenizer.decode(outputs[j][input_len:], skip_special_tokens=True).strip()
                prediction = True if 'True' in response.split()[0] else False
                predictions.append(prediction)

            # Clean up
            del inputs, outputs
            torch.cuda.empty_cache()

        except Exception as e:
            print(f"\nError in batch processing at index {i}: {str(e)}")
            print("Switching to single example processing.")

            # Process examples one by one if batch fails
            for q, a, s in zip(batch_questions, batch_answers, batch_solutions):
                try:
                    single_prompt = prompt.format(q, a, s, "")
                    inputs = tokenizer([single_prompt], return_tensors="pt", truncation=True, max_length=2048)
                    inputs = {k: v.to(device) for k, v in inputs.items()}

                    with torch.inference_mode():
                        outputs = model.generate(
                            **inputs,
                            max_new_tokens=64,
                            top_p=0.9,
                            use_cache=True
                        ).cpu()

                    response = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True).strip()
                    prediction = True if 'True' in response.split()[0] else False
                    predictions.append(prediction)

                    del inputs, outputs
                    torch.cuda.empty_cache()

                except Exception as e:
                    print(f"\nError processing single example: {str(e)}")
                    predictions.append(False)  # Default prediction in case of error

    # Create final submission DataFrame
    submission_df = pd.DataFrame({
        'ID': range(len(predictions)),
        'is_correct': predictions
    })

    if len(submission_df) != len(test_dataset):
        print(f"\nWarning: Number of predictions ({len(predictions)}) doesn't match test dataset size ({len(test_dataset)})")

    # Save submission
    submission_df.to_csv('submission.csv', index=False)
    print("\nSubmission saved successfully!")
    print(f"Total predictions: {len(predictions)}")
    print("\nFirst few predictions:")
    print(submission_df.head())

    return submission_df

# Clear GPU memory before running
print("Clearing GPU memory...")
torch.cuda.empty_cache()

# Create submission
test_dataset = dataset['test']
submission = create_submission(
    model=model,
    tokenizer=tokenizer,
    test_dataset=test_dataset,
    batch_size=16
)