# Math Question Answer Verification Competition

## TEAM LLM training notebook

 ## INSTALLATION


In [None]:
# Installations without capture
# to see the output
# `uv` is installed first because the two commands below use `uv pip`.
!pip install uv

# Unsloth is installed from its GitHub repository; the remaining packages are
# listed without version pins.
!uv pip install  \
  "unsloth @ git+https://github.com/unslothai/unsloth.git" \
  "xformers" "trl" "peft" \
  "accelerate" "bitsandbytes" "transformers"
!uv pip install unsloth_zoo

In [None]:
from unsloth import FastLanguageModel
import torch
# Sequence lengths we tried:
# 1024 - did well, but some samples were longer than this
# 2048 - excessive, produced 62% accuracy
# 4094 - too high, no samples have such a high seq length
# 8192 - didn't need to try it at all, RAM crash!
# NOTE(review): 1048 is only slightly above 1024 and is not a power of two -
# confirm it is intentional and not a typo.
max_seq_length = 1048 # to fit all samples :)
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# Load the base Llama 3.1 8B checkpoint and its tokenizer, passing through the
# max_seq_length, dtype and load_in_4bit settings defined earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

## Training Prompts

In [None]:
# We changed the training prompt to focus more on the solution.
# The model did well even with the prompt in the starter notebook,
# but this is even better.
# The three `{}` placeholders are filled, in order, with the problem, the
# solution and the expected output by formatting_prompts_func.
training_prompt = """You are a meticulous math expert. Your goal is to verify the *entire reasoning* of the solution, not just the final answer.
Read the problem, then carefully analyze each step of the solution for logical errors, calculation mistakes, or incorrect reasoning.
Based on your step-by-step analysis, determine if the solution is correct.

Answer with ONLY True or False.

### Problem:
{}

### Solution:
{}

### Output:
{}
"""

# End-of-sequence token; formatting_prompts_func appends it to every training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: A batch with parallel "question", "solution" and
            "is_correct" columns.

    Returns:
        A dict with one key, "text", holding one string per example: the
        training prompt filled with the question, solution and label,
        followed by the EOS token.
    """
    rows = zip(examples["question"], examples["solution"], examples["is_correct"])
    return {
        "text": [
            training_prompt.format(problem, worked_solution, label) + EOS_TOKEN
            for problem, worked_solution, label in rows
        ],
    }

## Competition dataset

In [None]:
from datasets import load_dataset
# Load the competition dataset; the bare expression below makes the notebook
# display it.
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
dataset

In [None]:
# Set the train dataset to the train split
train_dataset = dataset['train']

# Shuffle the training dataset (fixed seed so the split is reproducible)
shuffled_train = train_dataset.shuffle(seed=42)

# Select the first 2,000 shuffled samples for validation
validation_set = shuffled_train.select(range(2000))

# Select the next 80,000 samples for the new training set.
# We also retrained on the next 82,000 after this to get better results,
# which led to about 2% improvement in accuracy.
training_set = shuffled_train.select(range(2000, 82000))

In [None]:
# Add the "text" column (filled-in training prompt + EOS) to every training row.
training_set = training_set.map(formatting_prompts_func, batched = True)

## Load model and wrap with LoRA adapters

In [None]:
# We tried many different combinations of r, from 1 to 128,
# and tried keeping lora_alpha at 2*r, r and r/2.
# r = 32 with lora_alpha = r/2 (i.e. 16) did great!
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    # Adapters are attached to the attention projections and the MLP projections.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 500,
    use_rslora = False,
    loftq_config = None,
)

## SFT

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the pre-formatted "text" column of training_set.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = training_set,
    dataset_text_field = "text", # The column in the dataset that contains the pre-formatted text to be trained on
    max_seq_length = max_seq_length,
    dataset_num_proc = 2, # The number of CPU processes to use for tokenizing and preparing the data. (Speeds up preprocessing)
    packing = False, # Packing (combining several samples into one sequence) is disabled; each sample is its own sequence
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # with batch size 2 this gives 8 samples per optimizer step per device
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled, depending on hardware support
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine", # improved accuracy over linear
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

## TRAINING


In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

## SAVING MODEL

In [None]:
from google.colab import drive
# Mount Google Drive so the checkpoint can be saved under /content/drive.
drive.mount('/content/drive')

In [None]:
import os

# Define the path to save the model checkpoint in Google Drive
save_path = "/content/drive/MyDrive/soham-model"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the trainer's model and the tokenizer side by side in save_path,
# so the directory can later be passed to from_pretrained.
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model checkpoint and tokenizer saved to: {save_path}")

In [None]:
import shutil
import os
from IPython.display import FileLink


# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model and tokenizer to save_path again.
# NOTE(review): an earlier version of this comment mentioned the Kaggle
# environment, but save_path is whatever was assigned earlier in the
# notebook - confirm the intended target.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Compress the saved model directory into soham-model-updated.zip
# (created relative to the current working directory)
shutil.make_archive("soham-model-updated", 'zip', save_path
                    )

# Optional download link for a ZIP file (disabled).
# NOTE(review): the file name below does not match the archive created above.
# display(FileLink(r'lora_model_50.zip'))


## INFERENCE

## INFERENCE PROMPT

In [None]:
# Improved inference prompt: identical to the training prompt except that
# nothing follows "### Output:", so the model has to generate the verdict.
inference_prompt = """You are a meticulous math expert. Your goal is to verify the *entire reasoning* of the solution, not just the final answer.
Read the problem, then carefully analyze each step of the solution for logical errors, calculation mistakes, or incorrect reasoning.
Based on your step-by-step analysis, determine if the solution is correct.

Answer with ONLY True or False.

### Problem:
{}

### Solution:
{}

### Output:
"""


def formatting_prompts_func_inference(examples):
    """Build the inference prompt for a batch of examples.

    Only the "question" and "solution" columns are read. The label column is
    deliberately not touched, so the function also works on batches that
    have no "is_correct" field.

    Args:
        examples: A batch with parallel "question" and "solution" columns.

    Returns:
        A dict with one key, "text", holding one prompt per example.
    """
    return {
        "text": [
            inference_prompt.format(problem, worked_solution)
            for problem, worked_solution in zip(examples["question"], examples["solution"])
        ],
    }

## VALIDATION DATASET



In [None]:
# Add the "text" column (inference prompt without the answer) to every validation row.
validation_dataset = validation_set.map(formatting_prompts_func_inference, batched = True,)

In [None]:
# Running inference on a single validation sample

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

example_no=0

# Index the row first and then the column, so only one example is read
# instead of materializing the whole "text" column.
input_prompt = validation_dataset[example_no]['text']

print("Input Prompt:\n", input_prompt)
inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # index 1 is the sequence length; index 0 is the batch dimension
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# To see the prompt together with the generation, decode the full sequences instead:
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Keep only the newly generated tokens by slicing off the prompt tokens
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

In [None]:
## Running inference on the full validation set

# Switching the model to inference mode does not depend on the sample,
# so it is done once before the loop.
FastLanguageModel.for_inference(model)

final_response = []
# Fetch the prompt column once and iterate over it directly, rather than
# re-reading the whole column on every iteration.
for input_prompt in validation_dataset['text']:
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Index 1 is the sequence length; index 0 is the batch dimension
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

## VALIDATION ACCURACY

In [None]:
def extract_last_true_false(s):
    """Return the verdict given by the last standalone True/False in *s*.

    Matching is case-insensitive and respects word boundaries, so "untrue"
    does not count as "true".

    Args:
        s: Text generated by the model.

    Returns:
        True if the last match is "true", False if it is "false" or if the
        text contains neither word.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having run `import re` first.
    import re

    matches = re.findall(r'\b(True|False)\b', s, flags=re.IGNORECASE)
    if not matches:
        return False
    return matches[-1].lower() == 'true'

In [None]:
import re
# Turn every generated response into a boolean verdict.
validation_prediction_list = list(map(extract_last_true_false, final_response))

In [None]:
# The predictions were generated from validation_dataset (the 2,000-sample
# slice carved out of the shuffled train split), so the labels must come from
# that same object to stay aligned row by row.
validation_truth_list = validation_dataset['is_correct']

In [None]:
# Predictions and labels are compared position by position, so they must
# contain the same number of entries.
assert len(validation_prediction_list) == len(validation_truth_list), "Lists must have the same length."

# Count the positions where the prediction matches the label
correct_predictions = sum(
    1
    for pred, truth in zip(validation_prediction_list, validation_truth_list)
    if pred == truth
)

# Fraction of matching positions
accuracy = correct_predictions / len(validation_truth_list)

print(f"Validation Accuracy: {accuracy:.2%}") # we got around 79% here!


In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
# Filled with one boolean per test example by the loop further down
predictions = []

# Improved inference prompt: nothing follows "### Output:", so the model
# has to generate the verdict itself.
inference_prompt = """You are a meticulous math expert. Your goal is to verify the *entire reasoning* of the solution, not just the final answer.
Read the problem, then carefully analyze each step of the solution for logical errors, calculation mistakes, or incorrect reasoning.
Based on your step-by-step analysis, determine if the solution is correct.

Answer with ONLY True or False.

### Problem:
{}

### Solution:
{}

### Output:
"""

# Parses the decoded model output, which can contain
# more text after the True/False verdict.
def parse_output(response_text):
    """Return True when the text after the last "Output:\\n" contains "true".

    The check is a case-insensitive substring test. If the marker is absent,
    the whole text is searched.
    """
    _, _, generated = response_text.rpartition("Output:\n")
    return 'true' in generated.lower()

# One prompt per test example; generation is capped at 8 new tokens.
for example in tqdm(test_dataset):
    prompt = inference_prompt.format(example["question"], str(example["solution"]))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)
    # The decoded text still contains the prompt; parse_output looks only at
    # what follows the last "Output:\n" marker.
    response_text = tokenizer.batch_decode(outputs)[0]

    predictions.append(parse_output(response_text))

# Submission table: a running ID plus the predicted label
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Write the table without the pandas index column
submission.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")

In [None]:
# We reloaded the model to train on the next 82,000 samples later.
# The checkpoint is read from save_path, the directory the model and
# tokenizer were saved to earlier in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = save_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Select the next 82,000 shuffled samples (indices 82,000-163,999) for the
# second training round; they follow directly after the first training slice.
new_training_set = shuffled_train.select(range(82000, 164000))


In [None]:
# Map the formatting function over the new training set to add the "text"
# column (filled-in training prompt + EOS).
new_training_set = new_training_set.map(formatting_prompts_func, batched = True,)

In [None]:
# The trainer created earlier is still bound to the first model object and to
# the first training slice, so calling trainer.train() on it again would
# ignore both the reloaded checkpoint and new_training_set. Build a fresh
# trainer around the reloaded model and the new data instead.

# Make sure the model is in training mode (inference mode may have been
# enabled for validation/test generation).
FastLanguageModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = new_training_set,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    # Same hyperparameters as the first training round
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()

In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
# Filled with one boolean per test example by the loop further down
predictions = []

# Improved inference prompt: nothing follows "### Output:", so the model
# has to generate the verdict itself.
inference_prompt = """You are a meticulous math expert. Your goal is to verify the *entire reasoning* of the solution, not just the final answer.
Read the problem, then carefully analyze each step of the solution for logical errors, calculation mistakes, or incorrect reasoning.
Based on your step-by-step analysis, determine if the solution is correct.

Answer with ONLY True or False.

### Problem:
{}

### Solution:
{}

### Output:
"""

def parse_output(response_text):
    """Map decoded model output to a boolean verdict.

    Only the text after the last "Output:\\n" marker is inspected (the whole
    text when the marker is missing). The result is True when that text
    contains "true" in any letter case, otherwise False.
    """
    answer_section = response_text.rpartition("Output:\n")[2]
    return 'true' in answer_section.lower()

# Imported here because this cell calls files.download(); relying on an
# import in a later cell raises NameError when cells run top to bottom.
from google.colab import files

# The model was reloaded and trained since the last generation run, so switch
# it to inference mode once before generating.
FastLanguageModel.for_inference(model)

for example in tqdm(test_dataset):
    question = example["question"]
    solution = example["solution"]

    prompt = inference_prompt.format(question, str(solution))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the prediction (a few tokens are enough for True/False)
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)
    # The decoded text still contains the prompt; parse_output looks only at
    # what follows the last "Output:\n" marker.
    response_text = tokenizer.batch_decode(outputs)[0]

    # Parse the prediction and add it to our list
    prediction = parse_output(response_text)
    predictions.append(prediction)

# Create the submission DataFrame
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Write and download through one path variable so the file that is
# downloaded is always the file that was just written.
file_path = 'submission.csv'
submission.to_csv(file_path, index=False)

files.download(file_path)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")

In [None]:
# Directory for the final model; note that this rebinds save_path, which
# earlier pointed at the Google Drive checkpoint.
save_path = "/content/10hr-submission-model"

# Save model and tokenizer side by side so the directory can be passed to
# from_pretrained later.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# The check-mark emoji was stored as mis-decoded bytes; restored here.
print("✅ Model and tokenizer saved to", save_path)

In [None]:
import os
import shutil
from google.colab import files

# Zip the folder. make_archive appends ".zip" to the base name and returns
# the path of the archive it created, so use that instead of rebuilding it.
zip_path = shutil.make_archive(save_path, 'zip', save_path)

# Download the zip file
files.download(zip_path)
# The arrow emoji was stored as mis-decoded bytes; restored here.
print("⬇️ Download started! Your model zip file is ready.")

In [None]:
import zipfile
import os

# Archive to restore from, and the directory to restore it into
zip_path = "/content/10hr-submission-model.zip"
extract_path = "/content/10hr-submission-model"

# Create the folder if it does not exist
os.makedirs(extract_path, exist_ok=True)

# Unzip; the context manager closes the archive even if extraction fails
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# The check-mark emoji was stored as mis-decoded bytes; restored here.
print(f"✅ Unzipped to: {extract_path}")

In [None]:
# Directory the final model was saved to / unzipped into
save_path = "/content/10hr-submission-model"

# Load the model and tokenizer from that directory, passing through the
# max_seq_length, dtype and load_in_4bit settings defined earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = save_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Prepare the loaded model for faster inference
FastLanguageModel.for_inference(model)

print(f"Model and tokenizer loaded from: {save_path}")

In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Load the official test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
# Filled with one boolean per test example by the loop further down
predictions = []

# Create the prompt template for inference (no answer included): nothing
# follows "### Output:", so the model has to generate the verdict itself.
inference_prompt = """You are a meticulous math expert. Your goal is to verify the *entire reasoning* of the solution, not just the final answer.
Read the problem, then carefully analyze each step of the solution for logical errors, calculation mistakes, or incorrect reasoning.
Based on your step-by-step analysis, determine if the solution is correct.

Answer with ONLY True or False.

### Problem:
{}

### Solution:
{}

### Output:
"""

def parse_output(response_text):
    """Decide True/False from the decoded model output.

    Everything up to and including the last "Output:\\n" is discarded (nothing
    is discarded when the marker does not occur). The remainder is lowercased
    and searched for the substring "true".
    """
    tail = response_text.rpartition("Output:\n")[-1]
    return 'true' in tail.lower()

# Imported here because this cell calls files.download(); that keeps the cell
# from depending on an import made in a different cell.
from google.colab import files

for example in tqdm(test_dataset):
    question = example["question"]
    solution = example["solution"]

    prompt = inference_prompt.format(question, str(solution))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the prediction (a few tokens are enough for True/False)
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)
    # The decoded text still contains the prompt; parse_output looks only at
    # what follows the last "Output:\n" marker.
    response_text = tokenizer.batch_decode(outputs)[0]

    # Parse the prediction and add it to our list
    prediction = parse_output(response_text)
    predictions.append(prediction)

# Create the submission DataFrame
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Write and download through one path variable so the file that is
# downloaded is always the file that was just written.
file_path = 'submission.csv'
submission.to_csv(file_path, index=False)

files.download(file_path)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")