In [None]:
!pip install -U transformers datasets accelerate


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

# opt1.3b

In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load OPT-1.3B Model (FP16, No Quantization)
# -----------------------
model_name = "facebook/opt-1.3b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",              # Uses GPU automatically if available
    torch_dtype=torch.float16       # FP16 to reduce memory usage
)
model.eval()

# -----------------------
# 2. Load Dataset (Math QA - Multiple Choice Math Problems)
# -----------------------
dataset = load_dataset("math_qa")
test_set = dataset['test']

# -----------------------
# 3. Prompt Template for Mathematical Reasoning MCQs
# -----------------------
def generate_prompt(problem, choices):
    prompt = f"Problem: {problem}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on MathQA with OPT-1.3B...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['Problem']
    choices = [opt.split(') ')[1] for opt in example['options'].split('\n') if ') ' in opt]

    correct_answer_label = example['correct']
    answer_idx = ord(correct_answer_label) - ord('a')
    correct_answer = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = correct_answer.lower() in output.lower()

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": f"{correct_answer_label.upper()}. {correct_answer}",
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on MathQA (Mathematical Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("opt_mathqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to opt_mathqa_predictions.json")


2025-06-17 15:55:14.262249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750175714.285485     218 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750175714.292611     218 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.44k [00:00<?, ?B/s]

math_qa.py:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

The repository for math_qa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math_qa.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29837 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2985 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4475 [00:00<?, ? examples/s]

Running evaluation on MathQA with OPT-1.3B...



100%|██████████| 2985/2985 [1:08:01<00:00,  1.37s/it]


Final Accuracy on MathQA (Mathematical Reasoning Task): 0.8931

Predictions saved to opt_mathqa_predictions.json





# Falcon 1b

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

model_name = "tiiuae/falcon-rw-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

dataset = load_dataset("math_qa")
test_set = dataset['test']

def generate_prompt(problem, choices):
    prompt = f"Problem: {problem}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

def get_model_output(prompt, max_new_tokens=64):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

correct = 0
total = 0
predictions = []

print(f"Running evaluation on MathQA with {model_name}...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['Problem']
    choices = [opt.split(') ')[1] for opt in example['options'].split('\n') if ') ' in opt]
    correct_answer_label = example['correct']
    answer_idx = ord(correct_answer_label) - ord('a')
    correct_answer = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""
    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)
    is_correct = correct_answer.lower() in output.lower()
    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": f"{correct_answer_label.upper()}. {correct_answer}",
        "correct": is_correct
    })
    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on MathQA: {accuracy:.4f}")

with open("falcon_rw1b_mathqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

2025-06-18 05:41:01.984866: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750225262.497943      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750225262.614857      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.44k [00:00<?, ?B/s]

math_qa.py:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

The repository for math_qa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math_qa.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29837 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2985 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4475 [00:00<?, ? examples/s]

Running evaluation on MathQA with tiiuae/falcon-rw-1b...



  0%|          | 0/2985 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/2985 [00:02<2:27:35,  2.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/2985 [00:04<1:50:06,  2.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/2985 [00:06<1:38:06,  1.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/2985 [00:08<1:32:25,  1.86s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/2985 [00:09<1:29:01,  1.79s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 6/2985 [00:11<1:27:11,  1.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 7/2985 [00:13<1:25:42,  1.73s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 8/2985 [00:14<1:26:59,  1.75s/it]Setting `pad_tok


Final Accuracy on MathQA: 0.8007





# GPT-Neo-1.3B

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Pythia-1B Model (FP16, No Quantization)
# -----------------------
model_name = "EleutherAI/gpt-neo-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

# -----------------------
# 2. Load Dataset (Math QA - Multiple Choice Math Problems)
# -----------------------
dataset = load_dataset("math_qa")
test_set = dataset['test']

# -----------------------
# 3. Prompt Template for Mathematical Reasoning MCQs
# -----------------------
def generate_prompt(problem, choices):
    prompt = f"Problem: {problem}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print(f"Running evaluation on MathQA with {model_name}...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['Problem']
    choices = [opt.split(') ')[1] for opt in example['options'].split('\n') if ') ' in opt]

    correct_answer_label = example['correct']
    answer_idx = ord(correct_answer_label) - ord('a')
    correct_answer = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = correct_answer.lower() in output.lower()

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": f"{correct_answer_label.upper()}. {correct_answer}",
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on MathQA with {model_name}: {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("pythia1b_mathqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to pythia1b_mathqa_predictions.json")


Running evaluation on MathQA with EleutherAI/gpt-neo-1.3B...



  0%|          | 0/2985 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/2985 [00:02<1:40:53,  2.03s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/2985 [00:04<1:39:40,  2.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/2985 [00:06<1:40:09,  2.02s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/2985 [00:08<1:39:45,  2.01s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/2985 [00:10<1:39:28,  2.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/2985 [00:12<1:39:14,  2.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/2985 [00:14<1:39:35,  2.01s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 8/2985 [00:16<1:3


Final Accuracy on MathQA with EleutherAI/gpt-neo-1.3B: 0.8536

Predictions saved to pythia1b_mathqa_predictions.json





# StableLM 2 (1.6B)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Pythia-1B Model (FP16, No Quantization)
# -----------------------
model_name = "stabilityai/stablelm-2-1_6b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

# -----------------------
# 2. Load Dataset (Math QA - Multiple Choice Math Problems)
# -----------------------
dataset = load_dataset("math_qa")
test_set = dataset['test']

# -----------------------
# 3. Prompt Template for Mathematical Reasoning MCQs
# -----------------------
def generate_prompt(problem, choices):
    prompt = f"Problem: {problem}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print(f"Running evaluation on MathQA with {model_name}...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['Problem']
    choices = [opt.split(') ')[1] for opt in example['options'].split('\n') if ') ' in opt]

    correct_answer_label = example['correct']
    answer_idx = ord(correct_answer_label) - ord('a')
    correct_answer = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = correct_answer.lower() in output.lower()

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": f"{correct_answer_label.upper()}. {correct_answer}",
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on MathQA with {model_name}: {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("pythia1b_mathqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to pythia1b_mathqa_predictions.json")


tokenizer_config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/784 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.29G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Running evaluation on MathQA with stabilityai/stablelm-2-1_6b...



  0%|          | 0/2985 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 1/2985 [00:01<1:12:55,  1.47s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 2/2985 [00:03<1:17:38,  1.56s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 3/2985 [00:03<45:40,  1.09it/s]  Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 4/2985 [00:04<58:48,  1.18s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 5/2985 [00:06<1:04:59,  1.31s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 6/2985 [00:07<1:06:25,  1.34s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 7/2985 [00:09<1:07:19,  1.36s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
Setting `pad_token_id` to `e


Final Accuracy on MathQA with stabilityai/stablelm-2-1_6b: 0.7970

Predictions saved to pythia1b_mathqa_predictions.json





# pythia-1b

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Pythia-1B Model (FP16, No Quantization)
# -----------------------
model_name = "EleutherAI/pythia-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

# -----------------------
# 2. Load Dataset (Math QA - Multiple Choice Math Problems)
# -----------------------
dataset = load_dataset("math_qa")
test_set = dataset['test']

# -----------------------
# 3. Prompt Template for Mathematical Reasoning MCQs
# -----------------------
def generate_prompt(problem, choices):
    prompt = f"Problem: {problem}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print(f"Running evaluation on MathQA with {model_name}...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['Problem']
    choices = [opt.split(') ')[1] for opt in example['options'].split('\n') if ') ' in opt]

    correct_answer_label = example['correct']
    answer_idx = ord(correct_answer_label) - ord('a')
    correct_answer = choices[answer_idx] if 0 <= answer_idx < len(choices) else ""

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = correct_answer.lower() in output.lower()

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": f"{correct_answer_label.upper()}. {correct_answer}",
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on MathQA with {model_name}: {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("pythia1b_mathqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to pythia1b_mathqa_predictions.json")


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Running evaluation on MathQA with EleutherAI/pythia-1b...



  0%|          | 0/2985 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 1/2985 [00:01<50:17,  1.01s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 2/2985 [00:01<46:53,  1.06it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 3/2985 [00:02<45:34,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 4/2985 [00:03<44:31,  1.12it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 5/2985 [00:04<45:32,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 6/2985 [00:05<46:23,  1.07it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 7/2985 [00:06<46:15,  1.07it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 8/2985 [00:07<46:31,  1.07it/s]Setting `pad_token_id` to `eos_t


Final Accuracy on MathQA with EleutherAI/pythia-1b: 0.8191

Predictions saved to pythia1b_mathqa_predictions.json



