In [1]:
!pip install --upgrade datasets transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled

In [2]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.1

**OPT 1.3b**

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm
import json



# -----------------------
# 1. Load OPT-1.3B Model
# -----------------------
model_name = "facebook/opt-1.3b"

# Configure quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # If 8-bit is still too large, try 4-bit:
    # load_in_4bit=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)
# When using load_in_8bit or load_in_4bit, the model is automatically moved to the GPU
# So, you don't need the explicit model.to(device) call
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device) # This line is no longer necessary with load_in_8bit/4bit

model.eval()

# -----------------------
# 2. Load Dataset (CommonsenseQA - Reasoning MCQs)
# -----------------------
dataset = load_dataset("commonsense_qa")
test_set = dataset['validation']

# -----------------------
# 3. Prompt Template for Reasoning MCQs
# -----------------------
def generate_prompt(question, choices):
    prompt = f"Question: {question}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    # Ensure inputs are on the correct device.
    # When using 8-bit or 4-bit, the model is likely on GPU,
    # so inputs should also be on GPU.
    device = model.device # Get the actual device of the model
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on CommonsenseQA with OPT-1.3B...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['question']
    # Add the check for the structure of 'choices' to handle potential dictionary format
    if isinstance(example['choices'], dict):
        # Assuming the dictionary structure has 'text' and 'label' as lists
        choices_list = [{'text': t, 'label': l} for t, l in zip(example['choices']['text'], example['choices']['label'])]
    else:
        # Otherwise, assume it's already a list of dictionaries
        choices_list = example['choices']

    # Now, safely extract 'text' from the list of dictionaries
    choices = [choice['text'] for choice in choices_list]
    correct_answer = example['answerKey']

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = (correct_answer in output) or (any(opt.lower() in output.lower() for opt in choices if opt == correct_answer))

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": correct_answer,
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on CommonsenseQA (Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("opt_commonsenseqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to opt_commonsenseqa_predictions.json")

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

2025-06-16 17:17:37.886098: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750094258.117933      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750094258.186481      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Running evaluation on CommonsenseQA with OPT-1.3B...



100%|██████████| 1221/1221 [2:17:27<00:00,  6.75s/it] 


Final Accuracy on CommonsenseQA (Reasoning Task): 0.9558

Predictions saved to opt_commonsenseqa_predictions.json





**STABLE LM 2**

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Model (StableLM 2 - 1.6B)
# -----------------------
model_name = "stabilityai/stablelm-2-zephyr-1_6b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# -----------------------
# 2. Load Dataset (CommonsenseQA - Reasoning MCQs)
# -----------------------
dataset = load_dataset("commonsense_qa")
test_set = dataset['validation']

# -----------------------
# 3. Prompt Template for Reasoning MCQs
# -----------------------
def generate_prompt(question, choices):
    prompt = f"Question: {question}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on CommonsenseQA with StableLM 2...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['question']
    # Add the check for the structure of 'choices' to handle potential dictionary format
    if isinstance(example['choices'], dict):
        # Assuming the dictionary structure has 'text' and 'label' as lists
        choices_list = [{'text': t, 'label': l} for t, l in zip(example['choices']['text'], example['choices']['label'])]
    else:
        # Otherwise, assume it's already a list of dictionaries
        choices_list = example['choices']

    # Now, safely extract 'text' from the list of dictionaries
    choices = [choice['text'] for choice in choices_list]

    correct_answer = example['answerKey']

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = correct_answer.lower() in output.lower()

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": correct_answer,
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on CommonsenseQA (Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("stablelm_commonsenseqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to stablelm_commonsenseqa_predictions.json")

tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/784 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

2025-06-17 01:32:06.327424: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750123926.518988      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750123926.579121      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/3.29G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Running evaluation on CommonsenseQA with StableLM 2...



  0%|          | 0/1221 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 1/1221 [00:02<57:30,  2.83s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 2/1221 [00:04<46:43,  2.30s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 3/1221 [00:06<42:54,  2.11s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 4/1221 [00:08<40:50,  2.01s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 5/1221 [00:10<39:36,  1.95s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  0%|          | 6/1221 [00:12<39:03,  1.93s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  1%|          | 7/1221 [00:14<38:49,  1.92s/it]Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  1%|          | 8/1221 [00:15<38:22,  1


Final Accuracy on CommonsenseQA (Reasoning Task): 0.9943

Predictions saved to stablelm_commonsenseqa_predictions.json





**gpt-neo-1.3B**

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load gpt-neo-1.3B Model
# -----------------------
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# -----------------------
# 2. Load Dataset (CommonsenseQA - Reasoning MCQs)
# -----------------------
dataset = load_dataset("commonsense_qa")
test_set = dataset['validation']

# -----------------------
# 3. Prompt Template for Reasoning MCQs
# -----------------------
def generate_prompt(question, choices):
    prompt = f"Question: {question}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on CommonsenseQA with gpt-neo-1.3B...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['question']
    # Add the check for the structure of 'choices' to handle potential dictionary format
    if isinstance(example['choices'], dict):
        # Assuming the dictionary structure has 'text' and 'label' as lists
        choices_list = [{'text': t, 'label': l} for t, l in zip(example['choices']['text'], example['choices']['label'])]
    else:
        # Otherwise, assume it's already a list of dictionaries
        choices_list = example['choices']

    # Now, safely extract 'text' from the list of dictionaries
    choices = [choice['text'] for choice in choices_list]
    correct_answer = example['answerKey']

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = (correct_answer in output) or (any(opt.lower() in output.lower() for opt in choices if opt == correct_answer))

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": correct_answer,
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on CommonsenseQA (Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("gptneo_commonsenseqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to gptneo_commonsenseqa_predictions.json")

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Running evaluation on CommonsenseQA with gpt-neo-1.3B...



  0%|          | 0/1221 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/1221 [00:02<47:55,  2.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/1221 [00:04<45:47,  2.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/1221 [00:06<44:16,  2.18s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/1221 [00:07<30:54,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/1221 [00:09<34:21,  1.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/1221 [00:11<36:24,  1.80s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 7/1221 [00:13<37:36,  1.86s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 8/1221 [00:15<38:12,  1.89s/it]


Final Accuracy on CommonsenseQA (Reasoning Task): 0.9066

Predictions saved to gptneo_commonsenseqa_predictions.json





**falcon-rw-1b**

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Falcon-rw-1b Model
# -----------------------
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# -----------------------
# 2. Load Dataset (CommonsenseQA - Reasoning MCQs)
# -----------------------
dataset = load_dataset("commonsense_qa")
test_set = dataset['validation']

# -----------------------
# 3. Prompt Template for Reasoning MCQs
# -----------------------
def generate_prompt(question, choices):
    prompt = f"Question: {question}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on CommonsenseQA with Falcon-rw-1b...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['question']
    # Add the check for the structure of 'choices'
    if isinstance(example['choices'], dict):
        # Assuming the dictionary structure has 'text' and 'label' as lists
        choices_list = [{'text': t, 'label': l} for t, l in zip(example['choices']['text'], example['choices']['label'])]
    else:
        # Otherwise, assume it's already a list of dictionaries
        choices_list = example['choices']

    # Now, safely extract 'text' from the list of dictionaries
    choices = [choice['text'] for choice in choices_list]
    correct_answer = example['answerKey']

    prompt = generate_prompt(question, choices)
    output = get_model_output(prompt)

    is_correct = (correct_answer in output) or (any(opt.lower() in output.lower() for opt in choices if opt == correct_answer))

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices,
        "ground_truth_answer": correct_answer,
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on CommonsenseQA (Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("falcon_commonsenseqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to falcon_commonsenseqa_predictions.json")

tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Running evaluation on CommonsenseQA with Falcon-rw-1b...




  0%|          | 0/1221 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 1/1221 [00:01<36:28,  1.79s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 2/1221 [00:03<34:53,  1.72s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 3/1221 [00:05<34:22,  1.69s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 4/1221 [00:06<34:14,  1.69s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 5/1221 [00:08<34:24,  1.70s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  0%|          | 6/1221 [00:10<34:11,  1.69s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  1%|          | 7/1221 [00:11<34:06,  1.69s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

  1%|          | 8/1221 [00:13<33:58,  1.68s/it


Final Accuracy on CommonsenseQA (Reasoning Task): 0.7477

Predictions saved to falcon_commonsenseqa_predictions.json





**pythia-1b**

In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json

# -----------------------
# 1. Load Model
# -----------------------
model_name = "EleutherAI/pythia-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# -----------------------
# 2. Load Dataset (CommonsenseQA - Reasoning MCQs)
# -----------------------
dataset = load_dataset("commonsense_qa")
test_set = dataset['validation']

# -----------------------
# 3. Prompt Template for Reasoning MCQs
# -----------------------
def generate_prompt(question, choices):
    prompt = f"Question: {question}\nChoices:\n"
    for idx, choice in enumerate(choices):
        prompt += f"{chr(65+idx)}. {choice}\n"
    prompt += "Answer:"
    return prompt

# -----------------------
# 4. Generate Model Output
# -----------------------
def get_model_output(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(prompt):].strip()

# -----------------------
# 5. Evaluation
# -----------------------
correct = 0
total = 0
predictions = []

print("Running evaluation on CommonsenseQA Reasoning Task...\n")

for example in tqdm(test_set, total=len(test_set)):
    question = example['question']
    # Check if 'choices' is a dictionary and convert it to a list if needed
    # Based on dataset inspection, 'choices' can sometimes be a dict with 'text' and 'label' keys
    # We want a list of dictionaries, each containing a 'text' key
    if isinstance(example['choices'], dict):
        # Assuming the dictionary structure has 'text' and 'label' as lists
        choices = [{'text': t, 'label': l} for t, l in zip(example['choices']['text'], example['choices']['label'])]
    else:
        # Otherwise, assume it's already a list of dictionaries
        choices = example['choices']

    # Now, safely extract 'text' from the (now confirmed) list of dictionaries
    choices_text = [choice['text'] for choice in choices]
    correct_answer = example['answerKey']

    prompt = generate_prompt(question, choices_text) # Pass choices_text to generate_prompt
    output = get_model_output(prompt)

    is_correct = (correct_answer in output) or (any(opt.lower() in output.lower() for opt in choices_text if opt == correct_answer)) # Use choices_text here

    predictions.append({
        "question": question,
        "prompt": prompt,
        "output": output,
        "choices": choices_text, # Save choices_text
        "ground_truth_answer": correct_answer,
        "correct": is_correct
    })

    total += 1
    correct += int(is_correct)

accuracy = correct / total if total > 0 else 0.0
print(f"\nFinal Accuracy on CommonsenseQA (Reasoning Task): {accuracy:.4f}")

# -----------------------
# 6. Save Predictions
# -----------------------
with open("commonsenseqa_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("\nPredictions saved to commonsenseqa_predictions.json")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Running evaluation on CommonsenseQA Reasoning Task...



  0%|          | 0/1221 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 1/1221 [00:01<24:39,  1.21s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 2/1221 [00:02<24:04,  1.18s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 3/1221 [00:03<23:54,  1.18s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 4/1221 [00:04<24:14,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 5/1221 [00:05<24:13,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 6/1221 [00:07<24:06,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 7/1221 [00:08<24:01,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 8/1221 [00:09<24:03,  1.19s/it]Setting `pad_token_id` to `eos_t