# Math Question Answer Verification Competition

## Checkpoint Inference Notebook

**_Authors_**
- Chin - CPK286
- Divyansh Agarwal - DA3245
- Rohaan - RNA3535

## Loading a checkpoint model for inference

In [None]:
from unsloth import FastLanguageModel
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from datasets import load_dataset
from tqdm import tqdm



# Settings intended for model loading.
load_in_4bit = False  # flag for 4-bit loading; disabled here
dtype = torch.float16  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
max_seq_length = 2048  # any value may be chosen

### Option (1): Load from files

In [None]:
# Load the checkpoint stored under outputs/.
# The settings declared earlier (max_seq_length, dtype, load_in_4bit) are passed
# explicitly; without them the loader falls back to its own defaults and the
# declared values have no effect.
# NOTE(review): Unsloth's documented default is load_in_4bit=True, so passing
# False here loads full-precision weights and needs more GPU memory - confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="outputs/",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

### Option (2): Load the weights from the Hugging Face Hub

In [None]:
# Load the checkpoint from the Hub repository cinna1rolls/dlmid-cp3-72339.
# The settings declared earlier (max_seq_length, dtype, load_in_4bit) are passed
# explicitly; without them the loader falls back to its own defaults and the
# declared values have no effect.
# NOTE(review): Unsloth's documented default is load_in_4bit=True, so passing
# False here loads full-precision weights and needs more GPU memory - confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="cinna1rolls/dlmid-cp3-72339",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# download and load competition dataset

# Fetch the competition dataset by its Hub identifier.
dataset = load_dataset(path="ad6398/nyu-dl-teach-maths-comp")

# Left as the cell's final expression so the notebook shows the dataset summary.
dataset

In [None]:
# Pull the two named splits out of the loaded dataset in one step.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [None]:
# Prompt template with three positional {} slots, filled in order via
# str.format with the question, the answer and the solution text.
# Written as explicit pieces so every newline - and the trailing space that
# ends the first line - is visible and cannot be stripped by an editor.
generation_prompt = (
    "You are a highly skilled mathematician. Determine if the provided Answer and Explanation to a math question is correct or incorrect. Return True if it’s correct and False if it’s wrong. \n"
    "\n"
    "### Question:\n"
    "{}\n"
    "\n"
    "### Answer:\n"
    "{}\n"
    "\n"
    "### Explanation:\n"
    "{}\n"
    "\n"
    "### True/False:\n"
)


## Inference

### Compute Accuracy on Validation Set

In [None]:
# Hold out 0.1% of the training rows for validation; the fixed seed makes the
# split repeatable. train_dataset is rebound to the remaining rows.
split_dataset = train_dataset.train_test_split(seed=42, test_size=0.001)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

In [None]:


# Switch the model to Unsloth's inference path and put it in eval mode.
FastLanguageModel.for_inference(model)
model.eval()

# Batched generation with a decoder-only model needs left padding: with right
# padding the new tokens would be generated after pad tokens, and the
# `outputs[:, prompt_len:]` slice below would not isolate them for shorter prompts.
tokenizer.padding_side = "left"


def batch_infer(batch_questions, batch_answers, batch_solutions):
    """Generate the model's short True/False verdict for a batch of examples.

    Args:
        batch_questions: Sequence of question strings.
        batch_answers: Sequence of proposed answers, aligned with the questions.
        batch_solutions: Sequence of explanation texts, aligned with the questions.

    Returns:
        A list of decoded strings, one per example, containing only the newly
        generated tokens (at most 3), with special tokens removed. An empty
        batch returns an empty list.
    """
    input_prompts = [
        generation_prompt.format(q, a, s)
        for q, a, s in zip(batch_questions, batch_answers, batch_solutions)
    ]
    if not input_prompts:
        # Nothing to tokenize or generate for an empty batch.
        return []

    # Tokenize the whole batch at once, padded to the longest prompt.
    inputs = tokenizer(input_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Every padded prompt has this length, so generated tokens start at this column.
    prompt_len = inputs['input_ids'].shape[1]

    with torch.amp.autocast("cuda"):
        outputs = model.generate(**inputs, max_new_tokens=3, use_cache=True)

    # Drop the prompt columns and decode only the continuation.
    return tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

In [None]:
# Prepare data for batching: pull each column out of the validation split once
val_questions = val_dataset['question']
val_answers = val_dataset['answer']
val_solutions = val_dataset['solution']

# Get predictions with progress bar, one batch of at most `batch_size` rows at a time
val_results = []
batch_size = 32
for i in tqdm(range(0, len(val_questions), batch_size), desc="Predicting"):
    # batch_infer takes exactly three sequences; the batch size is implied by
    # the slice length, so passing it as a fourth argument raised TypeError.
    batch_results = batch_infer(
        val_questions[i:i + batch_size],
        val_answers[i:i + batch_size],
        val_solutions[i:i + batch_size],
    )
    val_results.extend(batch_results)


In [None]:
# Convert each raw generation to a boolean label: True when the text, ignoring
# leading whitespace, starts with 'T'; anything else (including an empty
# generation, which previously raised IndexError) counts as False.
val_results_bool = [response.lstrip().startswith('T') for response in val_results]

In [None]:
# Score the validation predictions against the reference labels.
y_true = val_dataset['is_correct']
accuracy = accuracy_score(y_true, val_results_bool)
print(f"Accuracy: {accuracy:.3f}")

# Confusion matrix with the label order pinned to (True, False) so the rows
# and columns line up with the display labels below.
cm = confusion_matrix(y_true, val_results_bool, labels=[True, False])

fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['True', 'False'])
disp.plot(ax=ax, cmap="Blues")
plt.title("Confusion Matrix")
plt.show()


### Inference on the full test dataset

In [None]:
# Prepare data for batching: pull each column out of the test split once
sample_questions = test_dataset['question']
sample_answers = test_dataset['answer']
sample_solutions = test_dataset['solution']

# Get predictions with progress bar, one batch of at most `batch_size` rows at a time
results = []
batch_size = 32
for i in tqdm(range(0, len(sample_questions), batch_size), desc="Predicting"):
    # batch_infer takes exactly three sequences; the batch size is implied by
    # the slice length, so passing it as a fourth argument raised TypeError.
    batch_results = batch_infer(
        sample_questions[i:i + batch_size],
        sample_answers[i:i + batch_size],
        sample_solutions[i:i + batch_size],
    )
    results.extend(batch_results)

# Left as the cell's final expression so the notebook shows the raw generations.
results

In [None]:
# Peek at one raw value: show the first complete decoded string. The previous
# form depended on a loop index left over from an earlier cell and displayed
# only a single character of that string.
results[0] if results else None

In [None]:
# Convert each raw generation to the string label written to the submission:
# 'True' when the text, ignoring leading whitespace, starts with 'T'; anything
# else (including an empty generation, which previously raised IndexError)
# becomes 'False'.
results_bool = [
    'True' if response.lstrip().startswith('T') else 'False'
    for response in results
]

In [None]:
# Display the first ten string labels as a quick sanity check
results_bool[:10]

### Comparing against a previous submission

In [None]:
import csv

old_file = 'results_cpk286_3.csv'

# Read the label column (second field) of the earlier submission.
# newline="" is what the csv module asks for on files it reads.
with open(old_file, mode="r", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    # Blank or short rows carry no label, so they are skipped instead of raising IndexError
    old_preds = [row[1] for row in reader if len(row) > 1]

# zip() stops at the shorter sequence, so report a length mismatch instead of
# silently comparing only part of the predictions.
if len(old_preds) != len(results_bool):
    print(
        f"Warning: {len(old_preds)} old predictions vs {len(results_bool)} new ones; "
        "only the overlapping rows are compared."
    )

differences = sum(old != new for old, new in zip(old_preds, results_bool))
# The denominator is the number of rows actually compared; guard against zero rows.
total_predictions = min(len(old_preds), len(results_bool))
difference_percentage = (differences / total_predictions) * 100 if total_predictions else 0.0

print(f"Total differences: {differences}")
print(f"Difference percentage: {difference_percentage:.2f}%")


### Writing File

In [None]:
# Imported in this cell as well so the file can be written even when the
# optional comparison cell (which also imports csv) was not run.
import csv

with open('results_cpk286_4.csv', mode="w", newline="") as file:
    writer = csv.writer(file)
    # Header row first, then one row per prediction: (zero-based row index, label)
    writer.writerow(['ID', 'is_correct'])
    writer.writerows(enumerate(results_bool))