In [1]:
import os

os.environ['HF_HOME'] = 'E:/Models/hf_cache'
os.environ['HUGGINGFACE_HUB_CACHE'] = 'E:/Models/hf_cache'

In [2]:
import pandas as pd

aqua_train_path = 'E:/Data/AQuA Dataset/train.json'

df = pd.read_json(aqua_train_path, lines = True)
df = df.head(5000)

In [4]:
df.shape

(5000, 4)

In [17]:
df.head()

Unnamed: 0,question,options,rationale,correct
0,"Two friends plan to walk along a 43-km trail, ...","[A)21, B)21.5, C)22, D)22.5, E)23]","If Q complete x kilometers, then P completes 1...",E
1,"In the coordinate plane, points (x, 1) and (5,...","[A)4 and 1, B)1 and 5, C)5 and 1, D)3 and 5, E...",Line k passes through the origin and has slope...,C
2,"For all numbers p and q, the operation @ is de...","[A)II, B)I and II, C)I and III, D)II and III, ...",p@q = p^2 - pq=p(p-q).... so p@q will be zero ...,B
3,Carl is facing very difficult financial times ...,"[A)$1600, B)$2000, C)$2150, D)$2500, E)$12000]","Usually, you are given the annual rate of inte...",A
4,The speed at which a man can row a boat in sti...,"[A)18 seconds, B)27 seconds, C)26 seconds, D)1...",Speed of the boat downstream = 25 +11\n= 36 km...,E


In [15]:
from sklearn.metrics import accuracy_score
def solve_rate(true_labels, predictions):
    if len(true_labels) == 0:
        return 0.0 # Avoid division by zero

    correct_count = 0
    # Ensure we only iterate up to the length of the shorter list to avoid errors
    num_samples = len(true_labels)

    for i in range(num_samples):
        # A prediction is correct only if it is not None AND it matches the label
        if predictions[i] is not None and predictions[i] == true_labels[i]:
            correct_count += 1
            
    return correct_count / num_samples

In [7]:
def create_cot_prompt(question, options):
    # The options in the dataframe are a list of strings like ['A)21', 'B)21.5', ...].
    # We'll format them into a more readable string for the prompt.
    options_str = " ".join(options)

    # This is the main prompt template with your few-shot examples
    prompt = f"""Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is?
Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: If 10 is added to each number, then the mean of the numbers also increases by 10. So the new mean would be
50. The answer is (a).

Q: If a / b = 3/4 and 8a + 5b = 22,then find the value of a.
Answer Choices: (a) 1/2 (b) 3/2 (c) 5/2 (d) 4/2 (e) 7/2
A: If a / b = 3/4, then b = 4a / 3. So 8a + 5(4a / 3) = 22. This simplifies to 8a + 20a / 3 = 22, which means 44a / 3
= 22. So a is equal to 3/2. The answer is (b).

Q: A person is traveling at 20 km/hr and reached his destiny in 2.5 hr then find the distance?
Answer Choices: (a) 53 km (b) 55 km (c) 52 km (d) 60 km (e) 50 km
A: The distance that the person traveled would have been 20 km/hr * 2.5 hrs = 50 km. The answer is (e).

Q: How many keystrokes are needed to type the numbers from 1 to 500?
Answer Choices: (a) 1156 (b) 1392 (c) 1480 (d) 1562 (e) 1788
A: There are 9 one-digit numbers from 1 to 9. There are 90 two-digit numbers from 10 to 99. There are 401
three-digit numbers from 100 to 500. 9 + 90(2) + 401(3) = 1392. The answer is (b).

Q: {question}
Answer Choices: {options_str}
A:"""
    return prompt

In [8]:
import re
def extract_answer(text):
    match = re.search(r"[Tt]he answer is:?\s*([\w\d\(\)\.\$]+)", text)
    
    if match:
        return match.group(1).strip('.()').upper() # since labels are in uppercase only
    return None # Return None if no match is found

In [9]:
example_text = "The answer is (b)."
print(extract_answer(example_text))

B


In [10]:
import torch

device = torch.device("cuda")
print(device)

cuda


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# 2. Pass it to the model loader
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-2.8b",
    quantization_config=nf4_config,
    device_map=device
)

In [12]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [13]:
predictions = []
true_labels = []
output_file_path = "aqua_results.txt"

# Open the file in write mode to save the outputs
with open(output_file_path, 'w', encoding='utf-8') as f:
    # Loop through each row in our dataframe subset
    for i, (index, row) in enumerate(df.iterrows()):
        question = row['question']
        options = row['options']
        correct_label = row['correct']

        prompt = create_cot_prompt(question, options)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate a response
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        rationale = full_response[len(prompt):].strip()
        predicted_answer = extract_answer(rationale)
        
        f.write(f"Question: {question}\n")
        f.write(f"Rationale: {rationale}\n")
        f.write(f"Answer: {predicted_answer}\n")
        f.write("-" * 50 + "\n\n")

        print(f"({i+1}/{len(df)}) Predicted: {predicted_answer} | Correct: {correct_label}")

        # Store results for final accuracy calculation
        if predicted_answer:
            predictions.append(predicted_answer)
        # We always add the true label to keep the lists aligned for scoring
        # (even if the model failed to produce a valid answer)
        true_labels.append(correct_label)

print(f"\nEvaluation complete. All outputs saved to '{output_file_path}'")


(1/5000) Predicted: A | Correct: E
(2/5000) Predicted: None | Correct: C
(3/5000) Predicted: A | Correct: B
(4/5000) Predicted: A | Correct: A
(5/5000) Predicted: C | Correct: E
(6/5000) Predicted: None | Correct: A
(7/5000) Predicted: A | Correct: A
(8/5000) Predicted: A | Correct: E
(9/5000) Predicted: None | Correct: B
(10/5000) Predicted: None | Correct: C
(11/5000) Predicted: A | Correct: A
(12/5000) Predicted: A | Correct: C
(13/5000) Predicted: A | Correct: B
(14/5000) Predicted: None | Correct: D
(15/5000) Predicted: A | Correct: C
(16/5000) Predicted: B | Correct: B
(17/5000) Predicted: C | Correct: D
(18/5000) Predicted: A | Correct: B
(19/5000) Predicted: A | Correct: A
(20/5000) Predicted: E | Correct: E
(21/5000) Predicted: None | Correct: D
(22/5000) Predicted: None | Correct: C
(23/5000) Predicted: None | Correct: C
(24/5000) Predicted: A | Correct: B
(25/5000) Predicted: A | Correct: A
(26/5000) Predicted: A | Correct: C
(27/5000) Predicted: A | Correct: A
(28/5000) Pre

In [16]:
# --- Calculate and Display Final Accuracy ---

if len(predictions) > 0:
    accuracy = solve_rate(true_labels, predictions)
    print(f"\\nFinal Accuracy on {len(predictions)} samples: {accuracy * 100:.2f}%")
else:
    print("\\nCould not extract any answers to calculate accuracy.")

IndexError: list index out of range