In [1]:
import re
import json
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import os

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_SAMPLES = 500
CHECKPOINT_INTERVAL = 50
CHECKPOINT_FILE = "gsm8k_baseline_checkpoint.jsonl"
OUTPUT_FILE = "gsm8k_baseline.jsonl"

print(f"Using device: {DEVICE}")
print(f"Loading dataset...")

# Load dataset
dataset = load_dataset("openai/gsm8k", "main", split="test")
dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))

print(f"Evaluating on {len(dataset)} samples")
print(f"Loading model: {MODEL_NAME}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
).eval()

def normalize_numeric_answer(answer):
    """
    Normalize numeric answers for comparison by:
    - Removing commas (thousands separators)
    - Removing dollar signs and percent signs
    - Removing whitespace
    - Converting to string for comparison
    """
    if not answer:
        return ""

    # Convert to string if not already
    answer = str(answer).strip()

    # Remove common formatting characters
    answer = answer.replace(',', '')
    answer = answer.replace('$', '')
    answer = answer.replace('%', '')
    answer = answer.replace(' ', '')

    # Try to convert to float then back to string to normalize
    # This handles cases like "42.0" vs "42"
    try:
        num = float(answer)

        # Check for infinity or NaN
        if not (-1e15 < num < 1e15):
            return answer

        # If it's a whole number, convert to int
        if num == int(num):
            return str(int(num))
        return str(num)
    except (ValueError, OverflowError):
        return answer

def extract_answer(text):
    """
    Extract answer from model output.
    Priority order:
    1. Content inside \boxed{}
    2. Last number in the text
    """
    # First try to find answer in \boxed{}
    boxed_match = re.search(r"\\boxed\{([^}]+)\}", text)
    if boxed_match:
        return normalize_numeric_answer(boxed_match.group(1))

    # Fallback: extract last number from text
    nums = re.findall(r"-?\d+\.?\d*", text)
    if nums:
        return normalize_numeric_answer(nums[-1])

    return ""

def extract_gold_answer(answer_text):
    """
    Extract the gold answer from GSM8K format.
    GSM8K answers end with #### followed by the final answer.
    """
    # GSM8K format: solution text followed by #### answer
    parts = answer_text.strip().split("####")
    if len(parts) >= 2:
        gold = parts[-1].strip()
    else:
        # Fallback: take last number
        nums = re.findall(r"-?\d+\.?\d*", answer_text)
        gold = nums[-1] if nums else ""

    return normalize_numeric_answer(gold)

# Check if checkpoint exists and load progress
start_idx = 0
results = []
correct = 0

if os.path.exists(CHECKPOINT_FILE):
    print(f"Loading checkpoint from {CHECKPOINT_FILE}")
    with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            result = json.loads(line)
            results.append(result)
            if result["correct"]:
                correct += 1
    start_idx = len(results)
    print(f"Resuming from sample {start_idx}")

# Evaluation loop
print("Starting evaluation...")
for idx in tqdm(range(start_idx, len(dataset)), initial=start_idx, total=len(dataset)):
    ex = dataset[idx]
    prompt = ex['question']

    # Prepare messages
    messages = [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": prompt}
    ]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize and generate
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the generated tokens
    generated_ids = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Extract answers
    pred_ans = extract_answer(response)
    gold_ans = extract_gold_answer(ex["answer"])

    # Compare
    is_correct = (pred_ans == gold_ans) and (pred_ans != "")
    if is_correct:
        correct += 1

    # Print progress
    status = "✓" if is_correct else "✗"
    print(f"\n{status} [{idx+1}/{len(dataset)}] Gold: {gold_ans} | Pred: {pred_ans}")

    # Store result
    results.append({
        "question": prompt,
        "gold": gold_ans,
        "predicted": pred_ans,
        "model_output": response,
        "correct": is_correct
    })

    # Save checkpoint periodically
    if (idx + 1) % CHECKPOINT_INTERVAL == 0:
        with open(CHECKPOINT_FILE, "w", encoding="utf-8") as f:
            for r in results:
                json.dump(r, f, ensure_ascii=False)
                f.write("\n")
        print(f"Checkpoint saved at sample {idx+1}")

# Save final results
print(f"\nSaving final results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for r in results:
        json.dump(r, f, ensure_ascii=False)
        f.write("\n")

# Calculate and display final accuracy
accuracy = correct / len(dataset)
print(f"\n{'='*60}")
print(f"GSM8K BASELINE EVALUATION COMPLETE")
print(f"{'='*60}")
print(f"Model: {MODEL_NAME}")
print(f"Samples evaluated: {len(dataset)}")
print(f"Correct: {correct}")
print(f"Incorrect: {len(dataset) - correct}")
print(f"Accuracy: {accuracy:.2%} ({correct}/{len(dataset)})")
print(f"{'='*60}")

# Save summary
summary = {
    "model": MODEL_NAME,
    "total_samples": len(dataset),
    "correct": correct,
    "incorrect": len(dataset) - correct,
    "accuracy": accuracy
}

with open("gsm8k_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"Summary saved to gsm8k_summary.json")

Using device: cuda
Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Evaluating on 500 samples
Loading model: Qwen/Qwen2.5-Math-1.5B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Starting evaluation...


  0%|          | 1/500 [00:19<2:40:56, 19.35s/it]


✓ [1/500] Gold: 18 | Pred: 18


  0%|          | 2/500 [00:29<1:56:50, 14.08s/it]


✓ [2/500] Gold: 3 | Pred: 3


  1%|          | 3/500 [00:46<2:06:38, 15.29s/it]


✗ [3/500] Gold: 70000 | Pred: 120000


  1%|          | 4/500 [00:52<1:35:42, 11.58s/it]


✗ [4/500] Gold: 540 | Pred: 180


  1%|          | 5/500 [01:06<1:44:23, 12.65s/it]


✓ [5/500] Gold: 20 | Pred: 20


  1%|          | 6/500 [01:18<1:40:20, 12.19s/it]


✓ [6/500] Gold: 64 | Pred: 64


  1%|▏         | 7/500 [01:39<2:05:44, 15.30s/it]


✗ [7/500] Gold: 260 | Pred: 777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777


  2%|▏         | 8/500 [01:55<2:06:07, 15.38s/it]


✗ [8/500] Gold: 160 | Pred: 140


  2%|▏         | 9/500 [02:17<2:22:05, 17.36s/it]


✗ [9/500] Gold: 45 | Pred: 31


  2%|▏         | 10/500 [02:29<2:08:58, 15.79s/it]


✓ [10/500] Gold: 460 | Pred: 460


  2%|▏         | 11/500 [02:41<1:59:25, 14.65s/it]


✓ [11/500] Gold: 366 | Pred: 366


  2%|▏         | 12/500 [02:51<1:46:53, 13.14s/it]


✓ [12/500] Gold: 694 | Pred: 694


  3%|▎         | 13/500 [03:04<1:46:06, 13.07s/it]


✗ [13/500] Gold: 13 | Pred: 12


  3%|▎         | 14/500 [03:25<2:06:21, 15.60s/it]


✓ [14/500] Gold: 18 | Pred: 18


  3%|▎         | 15/500 [03:41<2:06:37, 15.67s/it]


✓ [15/500] Gold: 60 | Pred: 60


  3%|▎         | 16/500 [03:54<2:00:30, 14.94s/it]


✗ [16/500] Gold: 125 | Pred: 96


  3%|▎         | 17/500 [04:03<1:45:40, 13.13s/it]


✓ [17/500] Gold: 230 | Pred: 230


  4%|▎         | 18/500 [04:18<1:49:48, 13.67s/it]


✓ [18/500] Gold: 57500 | Pred: 57500


  4%|▍         | 19/500 [04:33<1:51:47, 13.94s/it]


✓ [19/500] Gold: 7 | Pred: 7


  4%|▍         | 20/500 [04:49<1:57:00, 14.63s/it]


✓ [20/500] Gold: 6 | Pred: 6


  4%|▍         | 21/500 [05:06<2:01:48, 15.26s/it]


✓ [21/500] Gold: 15 | Pred: 15


  4%|▍         | 22/500 [05:14<1:44:26, 13.11s/it]


✗ [22/500] Gold: 14 | Pred: 23


  5%|▍         | 23/500 [05:22<1:33:05, 11.71s/it]


✓ [23/500] Gold: 7 | Pred: 7


  5%|▍         | 24/500 [05:33<1:30:39, 11.43s/it]


✓ [24/500] Gold: 8 | Pred: 8


  5%|▌         | 25/500 [05:41<1:23:31, 10.55s/it]


✓ [25/500] Gold: 26 | Pred: 26


  5%|▌         | 26/500 [05:54<1:28:37, 11.22s/it]


✓ [26/500] Gold: 2 | Pred: 2


  5%|▌         | 27/500 [06:10<1:38:37, 12.51s/it]


✓ [27/500] Gold: 243 | Pred: 243


  6%|▌         | 28/500 [06:21<1:35:37, 12.16s/it]


✓ [28/500] Gold: 16 | Pred: 16


  6%|▌         | 29/500 [06:30<1:27:54, 11.20s/it]


✓ [29/500] Gold: 25 | Pred: 25


  6%|▌         | 30/500 [06:41<1:27:17, 11.14s/it]


✓ [30/500] Gold: 104 | Pred: 104


  6%|▌         | 31/500 [06:53<1:28:28, 11.32s/it]


✓ [31/500] Gold: 109 | Pred: 109


  6%|▋         | 32/500 [07:04<1:28:36, 11.36s/it]


✓ [32/500] Gold: 80 | Pred: 80


  7%|▋         | 33/500 [07:14<1:24:23, 10.84s/it]


✓ [33/500] Gold: 35 | Pred: 35


  7%|▋         | 34/500 [07:26<1:26:31, 11.14s/it]


✓ [34/500] Gold: 70 | Pred: 70


  7%|▋         | 35/500 [07:35<1:21:14, 10.48s/it]


✓ [35/500] Gold: 23 | Pred: 23


  7%|▋         | 36/500 [07:46<1:22:17, 10.64s/it]


✓ [36/500] Gold: 9 | Pred: 9


  7%|▋         | 37/500 [07:58<1:25:51, 11.13s/it]


✓ [37/500] Gold: 75 | Pred: 75


  8%|▊         | 38/500 [08:19<1:48:13, 14.06s/it]


✗ [38/500] Gold: 2 | Pred: 0


  8%|▊         | 39/500 [08:29<1:39:42, 12.98s/it]


✓ [39/500] Gold: 10 | Pred: 10


  8%|▊         | 40/500 [08:45<1:45:52, 13.81s/it]


✓ [40/500] Gold: 18 | Pred: 18


  8%|▊         | 41/500 [08:53<1:32:40, 12.11s/it]


✓ [41/500] Gold: 8 | Pred: 8


  8%|▊         | 42/500 [09:09<1:40:36, 13.18s/it]


✓ [42/500] Gold: 200 | Pred: 200


  9%|▊         | 43/500 [09:18<1:31:48, 12.05s/it]


✓ [43/500] Gold: 26 | Pred: 26


  9%|▉         | 44/500 [09:35<1:42:22, 13.47s/it]


✓ [44/500] Gold: 48 | Pred: 48


  9%|▉         | 45/500 [09:50<1:46:18, 14.02s/it]


✗ [45/500] Gold: 20 | Pred: -160


  9%|▉         | 46/500 [10:04<1:46:01, 14.01s/it]


✓ [46/500] Gold: 104 | Pred: 104


  9%|▉         | 47/500 [10:19<1:46:48, 14.15s/it]


✓ [47/500] Gold: 163 | Pred: 163


 10%|▉         | 48/500 [10:39<1:59:39, 15.88s/it]


✓ [48/500] Gold: 800 | Pred: 800


 10%|▉         | 49/500 [10:47<1:42:13, 13.60s/it]


✓ [49/500] Gold: 8 | Pred: 8


 10%|█         | 50/500 [10:58<1:36:37, 12.88s/it]


✓ [50/500] Gold: 30 | Pred: 30
Checkpoint saved at sample 50


 10%|█         | 51/500 [11:11<1:36:47, 12.93s/it]


✓ [51/500] Gold: 294 | Pred: 294


 10%|█         | 52/500 [11:35<2:00:36, 16.15s/it]


✗ [52/500] Gold: 5 | Pred: 15


 11%|█         | 53/500 [11:48<1:54:33, 15.38s/it]


✓ [53/500] Gold: 15 | Pred: 15


 11%|█         | 54/500 [12:07<2:01:01, 16.28s/it]


✓ [54/500] Gold: 40 | Pred: 40


 11%|█         | 55/500 [12:21<1:56:53, 15.76s/it]


✓ [55/500] Gold: 40 | Pred: 40


 11%|█         | 56/500 [12:32<1:45:10, 14.21s/it]


✓ [56/500] Gold: 14 | Pred: 14


 11%|█▏        | 57/500 [12:41<1:34:04, 12.74s/it]


✓ [57/500] Gold: 3 | Pred: 3


 12%|█▏        | 58/500 [12:54<1:34:41, 12.85s/it]


✓ [58/500] Gold: 83 | Pred: 83


 12%|█▏        | 59/500 [13:05<1:29:40, 12.20s/it]


✗ [59/500] Gold: 57 | Pred: 54


 12%|█▏        | 60/500 [13:12<1:17:36, 10.58s/it]


✓ [60/500] Gold: 187 | Pred: 187


 12%|█▏        | 61/500 [13:20<1:12:57,  9.97s/it]


✓ [61/500] Gold: 17 | Pred: 17


 12%|█▏        | 62/500 [13:30<1:13:00, 10.00s/it]


✓ [62/500] Gold: 1430 | Pred: 1430


 13%|█▎        | 63/500 [13:47<1:26:20, 11.86s/it]


✗ [63/500] Gold: 25000 | Pred: 50000


 13%|█▎        | 64/500 [14:03<1:35:28, 13.14s/it]


✓ [64/500] Gold: 1596 | Pred: 1596


 13%|█▎        | 65/500 [14:15<1:33:13, 12.86s/it]


✓ [65/500] Gold: 300 | Pred: 300


 13%|█▎        | 66/500 [14:28<1:32:31, 12.79s/it]


✓ [66/500] Gold: 36 | Pred: 36


 13%|█▎        | 67/500 [14:39<1:30:18, 12.51s/it]


✓ [67/500] Gold: 48 | Pred: 48


 14%|█▎        | 68/500 [14:52<1:30:13, 12.53s/it]


✓ [68/500] Gold: 595 | Pred: 595


 14%|█▍        | 69/500 [15:03<1:26:50, 12.09s/it]


✓ [69/500] Gold: 36 | Pred: 36


 14%|█▍        | 70/500 [15:13<1:22:01, 11.45s/it]


✓ [70/500] Gold: 60 | Pred: 60


 14%|█▍        | 71/500 [15:28<1:28:59, 12.45s/it]


✓ [71/500] Gold: 7425 | Pred: 7425


 14%|█▍        | 72/500 [15:36<1:19:07, 11.09s/it]


✓ [72/500] Gold: 60 | Pred: 60


 15%|█▍        | 73/500 [15:48<1:21:16, 11.42s/it]


✓ [73/500] Gold: 221 | Pred: 221


 15%|█▍        | 74/500 [16:02<1:26:58, 12.25s/it]


✓ [74/500] Gold: 255 | Pred: 255


 15%|█▌        | 75/500 [16:15<1:28:07, 12.44s/it]


✓ [75/500] Gold: 88 | Pred: 88


 15%|█▌        | 76/500 [16:28<1:30:00, 12.74s/it]


✓ [76/500] Gold: 60 | Pred: 60


 15%|█▌        | 77/500 [16:49<1:47:03, 15.19s/it]


✓ [77/500] Gold: 5 | Pred: 5


 16%|█▌        | 78/500 [17:05<1:46:45, 15.18s/it]


✓ [78/500] Gold: 100 | Pred: 100


 16%|█▌        | 79/500 [17:17<1:40:24, 14.31s/it]


✓ [79/500] Gold: 6 | Pred: 6


 16%|█▌        | 80/500 [17:25<1:27:16, 12.47s/it]


✓ [80/500] Gold: 70 | Pred: 70


 16%|█▌        | 81/500 [17:39<1:30:42, 12.99s/it]


✓ [81/500] Gold: 10 | Pred: 10


 16%|█▋        | 82/500 [17:52<1:30:16, 12.96s/it]


✓ [82/500] Gold: 17 | Pred: 17


 17%|█▋        | 83/500 [18:05<1:30:19, 13.00s/it]


✓ [83/500] Gold: 623 | Pred: 623


 17%|█▋        | 84/500 [18:13<1:19:48, 11.51s/it]


✓ [84/500] Gold: 600 | Pred: 600


 17%|█▋        | 85/500 [18:25<1:19:53, 11.55s/it]


✓ [85/500] Gold: 15 | Pred: 15


 17%|█▋        | 86/500 [18:33<1:12:04, 10.45s/it]


✓ [86/500] Gold: 44 | Pred: 44


 17%|█▋        | 87/500 [18:42<1:09:55, 10.16s/it]


✓ [87/500] Gold: 22 | Pred: 22


 18%|█▊        | 88/500 [18:52<1:09:46, 10.16s/it]


✗ [88/500] Gold: 9360 | Pred: 798.6


 18%|█▊        | 89/500 [19:02<1:08:44, 10.03s/it]


✓ [89/500] Gold: 8000 | Pred: 8000


 18%|█▊        | 90/500 [19:14<1:12:05, 10.55s/it]


✗ [90/500] Gold: 24 | Pred: 18


 18%|█▊        | 91/500 [19:25<1:14:02, 10.86s/it]


✓ [91/500] Gold: 225 | Pred: 225


 18%|█▊        | 92/500 [19:38<1:18:06, 11.49s/it]


✓ [92/500] Gold: 28 | Pred: 28


 19%|█▊        | 93/500 [19:53<1:24:44, 12.49s/it]


✓ [93/500] Gold: 4 | Pred: 4


 19%|█▉        | 94/500 [20:12<1:38:09, 14.50s/it]


✗ [94/500] Gold: 36 | Pred: 36.36


 19%|█▉        | 95/500 [20:29<1:41:32, 15.04s/it]


✓ [95/500] Gold: 348 | Pred: 348


 19%|█▉        | 96/500 [20:46<1:46:10, 15.77s/it]


✓ [96/500] Gold: 40 | Pred: 40


 19%|█▉        | 97/500 [20:55<1:31:40, 13.65s/it]


✓ [97/500] Gold: 3 | Pred: 3


 20%|█▉        | 98/500 [21:07<1:28:17, 13.18s/it]


✓ [98/500] Gold: 12 | Pred: 12


 20%|█▉        | 99/500 [21:23<1:34:22, 14.12s/it]


✓ [99/500] Gold: 5 | Pred: 5


 20%|██        | 100/500 [21:34<1:27:55, 13.19s/it]


✗ [100/500] Gold: 58 | Pred: 40
Checkpoint saved at sample 100


 20%|██        | 101/500 [21:43<1:19:17, 11.92s/it]


✗ [101/500] Gold: 175 | Pred: 695


 20%|██        | 102/500 [21:55<1:18:52, 11.89s/it]


✗ [102/500] Gold: 6 | Pred: 8.5


 21%|██        | 103/500 [22:07<1:18:42, 11.89s/it]


✗ [103/500] Gold: 26 | Pred: 15


 21%|██        | 104/500 [22:17<1:13:48, 11.18s/it]


✓ [104/500] Gold: 140 | Pred: 140


 21%|██        | 105/500 [22:29<1:17:01, 11.70s/it]


✓ [105/500] Gold: 500 | Pred: 500


 21%|██        | 106/500 [22:40<1:13:57, 11.26s/it]


✓ [106/500] Gold: 20 | Pred: 20


 21%|██▏       | 107/500 [22:53<1:18:47, 12.03s/it]


✓ [107/500] Gold: 72 | Pred: 72


 22%|██▏       | 108/500 [23:07<1:21:55, 12.54s/it]


✓ [108/500] Gold: 3 | Pred: 3


 22%|██▏       | 109/500 [23:19<1:19:20, 12.18s/it]


✓ [109/500] Gold: 50 | Pred: 50


 22%|██▏       | 110/500 [23:31<1:19:17, 12.20s/it]


✓ [110/500] Gold: 28 | Pred: 28


 22%|██▏       | 111/500 [23:42<1:17:26, 11.94s/it]


✓ [111/500] Gold: 45 | Pred: 45


 22%|██▏       | 112/500 [23:59<1:26:54, 13.44s/it]


✓ [112/500] Gold: 16 | Pred: 16


 23%|██▎       | 113/500 [24:09<1:20:09, 12.43s/it]


✓ [113/500] Gold: 24 | Pred: 24


 23%|██▎       | 114/500 [24:18<1:13:16, 11.39s/it]


✓ [114/500] Gold: 25 | Pred: 25


 23%|██▎       | 115/500 [24:34<1:21:20, 12.68s/it]


✓ [115/500] Gold: 6 | Pred: 6


 23%|██▎       | 116/500 [24:53<1:33:03, 14.54s/it]


✗ [116/500] Gold: 90 | Pred: 0


 23%|██▎       | 117/500 [25:04<1:25:51, 13.45s/it]


✓ [117/500] Gold: 42 | Pred: 42


 24%|██▎       | 118/500 [25:11<1:13:47, 11.59s/it]


✓ [118/500] Gold: 360 | Pred: 360


 24%|██▍       | 119/500 [25:22<1:11:48, 11.31s/it]


✓ [119/500] Gold: 4 | Pred: 4


 24%|██▍       | 120/500 [25:43<1:30:38, 14.31s/it]


✗ [120/500] Gold: 95200 | Pred: 430


 24%|██▍       | 121/500 [25:53<1:22:51, 13.12s/it]


✓ [121/500] Gold: 240 | Pred: 240


 24%|██▍       | 122/500 [26:04<1:19:04, 12.55s/it]


✓ [122/500] Gold: 27 | Pred: 27


 25%|██▍       | 123/500 [26:17<1:18:28, 12.49s/it]


✓ [123/500] Gold: 48 | Pred: 48


 25%|██▍       | 124/500 [26:26<1:12:35, 11.58s/it]


✓ [124/500] Gold: 50 | Pred: 50


 25%|██▌       | 125/500 [26:38<1:13:12, 11.71s/it]


✓ [125/500] Gold: 10 | Pred: 10


 25%|██▌       | 126/500 [26:46<1:06:06, 10.61s/it]


✓ [126/500] Gold: 10 | Pred: 10


 25%|██▌       | 127/500 [26:56<1:03:50, 10.27s/it]


✓ [127/500] Gold: 82 | Pred: 82


 26%|██▌       | 128/500 [27:05<1:01:20,  9.89s/it]


✓ [128/500] Gold: 120 | Pred: 120


 26%|██▌       | 129/500 [27:19<1:09:21, 11.22s/it]


✓ [129/500] Gold: 880 | Pred: 880


 26%|██▌       | 130/500 [27:33<1:14:52, 12.14s/it]


✓ [130/500] Gold: 10000 | Pred: 10000


 26%|██▌       | 131/500 [27:41<1:07:17, 10.94s/it]


✓ [131/500] Gold: 30 | Pred: 30


 26%|██▋       | 132/500 [27:50<1:01:53, 10.09s/it]


✓ [132/500] Gold: 940 | Pred: 940


 27%|██▋       | 133/500 [28:06<1:12:48, 11.90s/it]


✓ [133/500] Gold: 60 | Pred: 60


 27%|██▋       | 134/500 [28:16<1:10:15, 11.52s/it]


✓ [134/500] Gold: 13 | Pred: 13


 27%|██▋       | 135/500 [28:27<1:07:56, 11.17s/it]


✓ [135/500] Gold: 720 | Pred: 720


 27%|██▋       | 136/500 [28:34<1:01:18, 10.11s/it]


✓ [136/500] Gold: 40 | Pred: 40


 27%|██▋       | 137/500 [28:41<55:47,  9.22s/it]  


✓ [137/500] Gold: 6 | Pred: 6


 28%|██▊       | 138/500 [28:57<1:07:45, 11.23s/it]


✓ [138/500] Gold: 29 | Pred: 29


 28%|██▊       | 139/500 [29:11<1:12:22, 12.03s/it]


✓ [139/500] Gold: 105 | Pred: 105


 28%|██▊       | 140/500 [29:20<1:05:56, 10.99s/it]


✓ [140/500] Gold: 70 | Pred: 70


 28%|██▊       | 141/500 [29:31<1:06:23, 11.10s/it]


✓ [141/500] Gold: 20 | Pred: 20


 28%|██▊       | 142/500 [29:48<1:16:35, 12.84s/it]


✓ [142/500] Gold: 400 | Pred: 400


 29%|██▊       | 143/500 [29:58<1:10:53, 11.91s/it]


✓ [143/500] Gold: 140 | Pred: 140


 29%|██▉       | 144/500 [30:09<1:10:01, 11.80s/it]


✗ [144/500] Gold: 16 | Pred: 16.3


 29%|██▉       | 145/500 [30:24<1:14:41, 12.62s/it]


✓ [145/500] Gold: 20 | Pred: 20


 29%|██▉       | 146/500 [30:38<1:17:14, 13.09s/it]


✓ [146/500] Gold: 4000 | Pred: 4000


 29%|██▉       | 147/500 [30:47<1:09:37, 11.83s/it]


✓ [147/500] Gold: 2125 | Pred: 2125


 30%|██▉       | 148/500 [31:00<1:11:02, 12.11s/it]


✗ [148/500] Gold: 75 | Pred: 60


 30%|██▉       | 149/500 [31:09<1:05:18, 11.16s/it]


✓ [149/500] Gold: 30 | Pred: 30


 30%|███       | 150/500 [31:18<1:02:12, 10.66s/it]


✓ [150/500] Gold: 16 | Pred: 16
Checkpoint saved at sample 150


 30%|███       | 151/500 [31:34<1:11:05, 12.22s/it]


✓ [151/500] Gold: 4 | Pred: 4


 30%|███       | 152/500 [31:55<1:25:49, 14.80s/it]


✓ [152/500] Gold: 5 | Pred: 5


 31%|███       | 153/500 [32:03<1:13:38, 12.73s/it]


✓ [153/500] Gold: 4 | Pred: 4


 31%|███       | 154/500 [32:22<1:23:54, 14.55s/it]


✓ [154/500] Gold: 48 | Pred: 48


 31%|███       | 155/500 [32:34<1:19:34, 13.84s/it]


✓ [155/500] Gold: 272 | Pred: 272


 31%|███       | 156/500 [32:50<1:22:57, 14.47s/it]


✓ [156/500] Gold: 280 | Pred: 280


 31%|███▏      | 157/500 [32:59<1:13:00, 12.77s/it]


✓ [157/500] Gold: 1400 | Pred: 1400


 32%|███▏      | 158/500 [33:12<1:14:49, 13.13s/it]


✓ [158/500] Gold: 80 | Pred: 80


 32%|███▏      | 159/500 [33:22<1:07:41, 11.91s/it]


✓ [159/500] Gold: 34 | Pred: 34


 32%|███▏      | 160/500 [33:31<1:03:07, 11.14s/it]


✗ [160/500] Gold: 15 | Pred: 3


 32%|███▏      | 161/500 [33:40<58:42, 10.39s/it]  


✓ [161/500] Gold: 16 | Pred: 16


 32%|███▏      | 162/500 [33:59<1:13:57, 13.13s/it]


✓ [162/500] Gold: 32 | Pred: 32


 33%|███▎      | 163/500 [34:14<1:16:56, 13.70s/it]


✗ [163/500] Gold: 92 | Pred: 259.49


 33%|███▎      | 164/500 [34:25<1:11:13, 12.72s/it]


✗ [164/500] Gold: 50 | Pred: 54


 33%|███▎      | 165/500 [34:46<1:25:07, 15.25s/it]


✗ [165/500] Gold: 15 | Pred: 3


 33%|███▎      | 166/500 [35:02<1:26:02, 15.46s/it]


✓ [166/500] Gold: 77 | Pred: 77


 33%|███▎      | 167/500 [35:11<1:16:05, 13.71s/it]


✓ [167/500] Gold: 5 | Pred: 5


 34%|███▎      | 168/500 [35:25<1:15:08, 13.58s/it]


✓ [168/500] Gold: 16 | Pred: 16


 34%|███▍      | 169/500 [35:31<1:03:57, 11.59s/it]


✓ [169/500] Gold: 18 | Pred: 18


 34%|███▍      | 170/500 [35:42<1:02:30, 11.36s/it]


✓ [170/500] Gold: 120 | Pred: 120


 34%|███▍      | 171/500 [35:55<1:04:03, 11.68s/it]


✓ [171/500] Gold: 150 | Pred: 150


 34%|███▍      | 172/500 [36:06<1:03:00, 11.53s/it]


✓ [172/500] Gold: 1210 | Pred: 1210


 35%|███▍      | 173/500 [36:20<1:07:36, 12.41s/it]


✗ [173/500] Gold: 51 | Pred: 43


 35%|███▍      | 174/500 [36:35<1:10:58, 13.06s/it]


✓ [174/500] Gold: 18000 | Pred: 18000


 35%|███▌      | 175/500 [36:49<1:11:59, 13.29s/it]


✓ [175/500] Gold: 95 | Pred: 95


 35%|███▌      | 176/500 [37:05<1:15:45, 14.03s/it]


✓ [176/500] Gold: 15 | Pred: 15


 35%|███▌      | 177/500 [37:12<1:04:46, 12.03s/it]


✓ [177/500] Gold: 100 | Pred: 100


 36%|███▌      | 178/500 [37:32<1:17:18, 14.41s/it]


✗ [178/500] Gold: 350 | Pred: 450


 36%|███▌      | 179/500 [37:46<1:16:38, 14.33s/it]


✓ [179/500] Gold: 122 | Pred: 122


 36%|███▌      | 180/500 [37:57<1:11:17, 13.37s/it]


✓ [180/500] Gold: 130 | Pred: 130


 36%|███▌      | 181/500 [38:04<1:00:21, 11.35s/it]


✓ [181/500] Gold: 20 | Pred: 20


 36%|███▋      | 182/500 [38:15<1:00:42, 11.45s/it]


✓ [182/500] Gold: 160 | Pred: 160


 37%|███▋      | 183/500 [38:27<1:00:19, 11.42s/it]


✓ [183/500] Gold: 23 | Pred: 23


 37%|███▋      | 184/500 [38:40<1:02:59, 11.96s/it]


✓ [184/500] Gold: 2 | Pred: 2


 37%|███▋      | 185/500 [38:57<1:10:56, 13.51s/it]


✓ [185/500] Gold: 25 | Pred: 25


 37%|███▋      | 186/500 [39:06<1:03:17, 12.09s/it]


✓ [186/500] Gold: 30 | Pred: 30


 37%|███▋      | 187/500 [39:27<1:17:20, 14.83s/it]


✓ [187/500] Gold: 5 | Pred: 5


 38%|███▊      | 188/500 [39:44<1:19:35, 15.31s/it]


✗ [188/500] Gold: 106 | Pred: 106.12


 38%|███▊      | 189/500 [39:57<1:15:53, 14.64s/it]


✓ [189/500] Gold: 50 | Pred: 50


 38%|███▊      | 190/500 [40:07<1:09:10, 13.39s/it]


✓ [190/500] Gold: 34 | Pred: 34


 38%|███▊      | 191/500 [40:16<1:02:06, 12.06s/it]


✓ [191/500] Gold: 360 | Pred: 360


 38%|███▊      | 192/500 [40:24<56:09, 10.94s/it]  


✓ [192/500] Gold: 5 | Pred: 5


 39%|███▊      | 193/500 [40:41<1:04:23, 12.58s/it]


✓ [193/500] Gold: 91 | Pred: 91


 39%|███▉      | 194/500 [40:53<1:04:12, 12.59s/it]


✓ [194/500] Gold: 24 | Pred: 24


 39%|███▉      | 195/500 [41:01<56:57, 11.20s/it]  


✓ [195/500] Gold: 10 | Pred: 10


 39%|███▉      | 196/500 [41:11<53:37, 10.58s/it]


✓ [196/500] Gold: 12 | Pred: 12


 39%|███▉      | 197/500 [41:18<49:23,  9.78s/it]


✓ [197/500] Gold: 120 | Pred: 120


 40%|███▉      | 198/500 [41:31<53:46, 10.68s/it]


✓ [198/500] Gold: 6277 | Pred: 6277


 40%|███▉      | 199/500 [41:47<1:01:41, 12.30s/it]


✓ [199/500] Gold: 320 | Pred: 320


 40%|████      | 200/500 [42:09<1:16:09, 15.23s/it]


✗ [200/500] Gold: 7500 | Pred: 1000
Checkpoint saved at sample 200


 40%|████      | 201/500 [42:22<1:12:11, 14.49s/it]


✓ [201/500] Gold: 55 | Pred: 55


 40%|████      | 202/500 [42:32<1:05:00, 13.09s/it]


✗ [202/500] Gold: 114200 | Pred: 109000


 41%|████      | 203/500 [42:41<59:31, 12.02s/it]  


✓ [203/500] Gold: 100 | Pred: 100


 41%|████      | 204/500 [42:58<1:05:18, 13.24s/it]


✓ [204/500] Gold: 31 | Pred: 31


 41%|████      | 205/500 [43:09<1:01:44, 12.56s/it]


✓ [205/500] Gold: 98 | Pred: 98


 41%|████      | 206/500 [43:24<1:06:05, 13.49s/it]


✗ [206/500] Gold: 98 | Pred: 110


 41%|████▏     | 207/500 [43:39<1:08:16, 13.98s/it]


✓ [207/500] Gold: 860 | Pred: 860


 42%|████▏     | 208/500 [43:51<1:04:35, 13.27s/it]


✓ [208/500] Gold: 2600 | Pred: 2600


 42%|████▏     | 209/500 [43:59<56:42, 11.69s/it]  


✓ [209/500] Gold: 76 | Pred: 76


 42%|████▏     | 210/500 [44:12<58:11, 12.04s/it]


✗ [210/500] Gold: 145 | Pred: 155


 42%|████▏     | 211/500 [44:29<1:05:00, 13.50s/it]


✓ [211/500] Gold: 10 | Pred: 10


 42%|████▏     | 212/500 [44:42<1:04:33, 13.45s/it]


✓ [212/500] Gold: 4 | Pred: 4


 43%|████▎     | 213/500 [44:54<1:02:54, 13.15s/it]


✓ [213/500] Gold: 5 | Pred: 5


 43%|████▎     | 214/500 [45:08<1:03:24, 13.30s/it]


✓ [214/500] Gold: 250 | Pred: 250


 43%|████▎     | 215/500 [45:24<1:07:14, 14.16s/it]


✓ [215/500] Gold: 8 | Pred: 8


 43%|████▎     | 216/500 [45:32<57:19, 12.11s/it]  


✓ [216/500] Gold: 44 | Pred: 44


 43%|████▎     | 217/500 [45:46<1:00:47, 12.89s/it]


✓ [217/500] Gold: 220 | Pred: 220


 44%|████▎     | 218/500 [45:55<54:24, 11.58s/it]  


✓ [218/500] Gold: 15 | Pred: 15


 44%|████▍     | 219/500 [46:10<59:28, 12.70s/it]


✓ [219/500] Gold: 45 | Pred: 45


 44%|████▍     | 220/500 [46:23<59:43, 12.80s/it]


✓ [220/500] Gold: 54 | Pred: 54


 44%|████▍     | 221/500 [46:34<57:03, 12.27s/it]


✓ [221/500] Gold: 70 | Pred: 70


 44%|████▍     | 222/500 [46:45<55:06, 11.89s/it]


✓ [222/500] Gold: 90 | Pred: 90


 45%|████▍     | 223/500 [46:53<49:33, 10.73s/it]


✓ [223/500] Gold: 140 | Pred: 140


 45%|████▍     | 224/500 [47:04<48:43, 10.59s/it]


✓ [224/500] Gold: 20000 | Pred: 20000


 45%|████▌     | 225/500 [47:14<48:53, 10.67s/it]


✓ [225/500] Gold: 180 | Pred: 180


 45%|████▌     | 226/500 [47:28<52:04, 11.40s/it]


✗ [226/500] Gold: 9 | Pred: 18


 45%|████▌     | 227/500 [47:44<58:27, 12.85s/it]


✓ [227/500] Gold: 33 | Pred: 33


 46%|████▌     | 228/500 [48:00<1:03:07, 13.92s/it]


✓ [228/500] Gold: 9 | Pred: 9


 46%|████▌     | 229/500 [48:12<1:00:34, 13.41s/it]


✓ [229/500] Gold: 1 | Pred: 1


 46%|████▌     | 230/500 [48:30<1:06:24, 14.76s/it]


✓ [230/500] Gold: 21 | Pred: 21


 46%|████▌     | 231/500 [48:44<1:04:11, 14.32s/it]


✓ [231/500] Gold: 276000 | Pred: 276000


 46%|████▋     | 232/500 [48:52<55:52, 12.51s/it]  


✓ [232/500] Gold: 50 | Pred: 50


 47%|████▋     | 233/500 [49:02<52:49, 11.87s/it]


✓ [233/500] Gold: 75 | Pred: 75


 47%|████▋     | 234/500 [49:17<56:29, 12.74s/it]


✓ [234/500] Gold: 12 | Pred: 12


 47%|████▋     | 235/500 [49:28<53:49, 12.19s/it]


✗ [235/500] Gold: 21 | Pred: 28


 47%|████▋     | 236/500 [49:36<48:26, 11.01s/it]


✓ [236/500] Gold: 10 | Pred: 10


 47%|████▋     | 237/500 [49:48<49:13, 11.23s/it]


✓ [237/500] Gold: 31 | Pred: 31


 48%|████▊     | 238/500 [49:55<43:32,  9.97s/it]


✓ [238/500] Gold: 90 | Pred: 90


 48%|████▊     | 239/500 [50:06<44:24, 10.21s/it]


✓ [239/500] Gold: 68 | Pred: 68


 48%|████▊     | 240/500 [50:22<51:53, 11.98s/it]


✓ [240/500] Gold: 280 | Pred: 280


 48%|████▊     | 241/500 [50:32<49:34, 11.48s/it]


✓ [241/500] Gold: 21 | Pred: 21


 48%|████▊     | 242/500 [50:41<45:31, 10.59s/it]


✓ [242/500] Gold: 6 | Pred: 6


 49%|████▊     | 243/500 [50:52<46:44, 10.91s/it]


✓ [243/500] Gold: 3 | Pred: 3


 49%|████▉     | 244/500 [51:02<44:47, 10.50s/it]


✓ [244/500] Gold: 250 | Pred: 250


 49%|████▉     | 245/500 [51:17<50:26, 11.87s/it]


✓ [245/500] Gold: 20 | Pred: 20


 49%|████▉     | 246/500 [51:27<47:21, 11.19s/it]


✓ [246/500] Gold: 7 | Pred: 7


 49%|████▉     | 247/500 [51:42<52:17, 12.40s/it]


✓ [247/500] Gold: 27000 | Pred: 27000


 50%|████▉     | 248/500 [51:50<46:28, 11.07s/it]


✓ [248/500] Gold: 32 | Pred: 32


 50%|████▉     | 249/500 [51:58<42:52, 10.25s/it]


✓ [249/500] Gold: 300 | Pred: 300


 50%|█████     | 250/500 [52:14<49:40, 11.92s/it]


✓ [250/500] Gold: 5600 | Pred: 5600
Checkpoint saved at sample 250


 50%|█████     | 251/500 [52:29<53:26, 12.88s/it]


✓ [251/500] Gold: 17 | Pred: 17


 50%|█████     | 252/500 [52:37<46:53, 11.35s/it]


✓ [252/500] Gold: 70 | Pred: 70


 51%|█████     | 253/500 [52:51<50:48, 12.34s/it]


✓ [253/500] Gold: 73 | Pred: 73


 51%|█████     | 254/500 [53:00<45:54, 11.20s/it]


✓ [254/500] Gold: 18 | Pred: 18


 51%|█████     | 255/500 [53:10<44:53, 11.00s/it]


✓ [255/500] Gold: 84 | Pred: 84


 51%|█████     | 256/500 [53:26<50:41, 12.46s/it]


✓ [256/500] Gold: 192 | Pred: 192


 51%|█████▏    | 257/500 [53:44<56:22, 13.92s/it]


✓ [257/500] Gold: 45 | Pred: 45


 52%|█████▏    | 258/500 [54:00<59:32, 14.76s/it]


✓ [258/500] Gold: 5600 | Pred: 5600


 52%|█████▏    | 259/500 [54:09<51:39, 12.86s/it]


✓ [259/500] Gold: 6 | Pred: 6


 52%|█████▏    | 260/500 [54:23<53:02, 13.26s/it]


✓ [260/500] Gold: 168 | Pred: 168


 52%|█████▏    | 261/500 [54:39<55:42, 13.99s/it]


✓ [261/500] Gold: 11 | Pred: 11


 52%|█████▏    | 262/500 [54:51<53:04, 13.38s/it]


✓ [262/500] Gold: 62 | Pred: 62


 53%|█████▎    | 263/500 [54:59<46:18, 11.73s/it]


✓ [263/500] Gold: 270 | Pred: 270


 53%|█████▎    | 264/500 [55:20<57:24, 14.59s/it]


✗ [264/500] Gold: 8 | Pred: 17


 53%|█████▎    | 265/500 [55:31<52:39, 13.45s/it]


✓ [265/500] Gold: 400 | Pred: 400


 53%|█████▎    | 266/500 [55:45<54:04, 13.87s/it]


✓ [266/500] Gold: 9500 | Pred: 9500


 53%|█████▎    | 267/500 [55:59<54:04, 13.93s/it]


✗ [267/500] Gold: 118000 | Pred: 182000


 54%|█████▎    | 268/500 [56:16<57:08, 14.78s/it]


✗ [268/500] Gold: 91 | Pred: 63


 54%|█████▍    | 269/500 [56:27<51:41, 13.43s/it]


✓ [269/500] Gold: 1375 | Pred: 1375


 54%|█████▍    | 270/500 [56:33<43:53, 11.45s/it]


✓ [270/500] Gold: 4 | Pred: 4


 54%|█████▍    | 271/500 [56:42<40:53, 10.72s/it]


✓ [271/500] Gold: 762 | Pred: 762


 54%|█████▍    | 272/500 [56:57<44:47, 11.79s/it]


✓ [272/500] Gold: 20 | Pred: 20


 55%|█████▍    | 273/500 [57:06<42:14, 11.17s/it]


✓ [273/500] Gold: 5 | Pred: 5


 55%|█████▍    | 274/500 [57:23<48:40, 12.92s/it]


✓ [274/500] Gold: 315 | Pred: 315


 55%|█████▌    | 275/500 [57:36<47:51, 12.76s/it]


✓ [275/500] Gold: 3200 | Pred: 3200


 55%|█████▌    | 276/500 [57:54<54:18, 14.55s/it]


✓ [276/500] Gold: 138 | Pred: 138


 55%|█████▌    | 277/500 [58:12<57:53, 15.57s/it]


✓ [277/500] Gold: 9 | Pred: 9


 56%|█████▌    | 278/500 [58:27<55:57, 15.12s/it]


✓ [278/500] Gold: 4 | Pred: 4


 56%|█████▌    | 279/500 [58:37<50:23, 13.68s/it]


✓ [279/500] Gold: 40 | Pred: 40


 56%|█████▌    | 280/500 [58:48<47:34, 12.98s/it]


✓ [280/500] Gold: 6 | Pred: 6


 56%|█████▌    | 281/500 [58:59<44:36, 12.22s/it]


✓ [281/500] Gold: 7 | Pred: 7


 56%|█████▋    | 282/500 [59:16<49:36, 13.66s/it]


✓ [282/500] Gold: 2450 | Pred: 2450


 57%|█████▋    | 283/500 [59:27<47:05, 13.02s/it]


✓ [283/500] Gold: 195 | Pred: 195


 57%|█████▋    | 284/500 [59:48<55:42, 15.47s/it]


✗ [284/500] Gold: 68 | Pred: 


 57%|█████▋    | 285/500 [1:00:00<51:07, 14.27s/it]


✓ [285/500] Gold: 360 | Pred: 360


 57%|█████▋    | 286/500 [1:00:07<43:22, 12.16s/it]


✓ [286/500] Gold: 21 | Pred: 21


 57%|█████▋    | 287/500 [1:00:21<44:47, 12.62s/it]


✓ [287/500] Gold: 90 | Pred: 90


 58%|█████▊    | 288/500 [1:00:33<43:52, 12.42s/it]


✓ [288/500] Gold: 8 | Pred: 8


 58%|█████▊    | 289/500 [1:00:42<40:02, 11.39s/it]


✓ [289/500] Gold: 3 | Pred: 3


 58%|█████▊    | 290/500 [1:00:56<43:05, 12.31s/it]


✓ [290/500] Gold: 16 | Pred: 16


 58%|█████▊    | 291/500 [1:01:09<43:42, 12.55s/it]


✓ [291/500] Gold: 390 | Pred: 390


 58%|█████▊    | 292/500 [1:01:24<46:01, 13.28s/it]


✓ [292/500] Gold: 2 | Pred: 2


 59%|█████▊    | 293/500 [1:01:39<46:58, 13.62s/it]


✓ [293/500] Gold: 75 | Pred: 75


 59%|█████▉    | 294/500 [1:01:54<48:43, 14.19s/it]


✗ [294/500] Gold: 83 | Pred: 82


 59%|█████▉    | 295/500 [1:02:04<44:07, 12.92s/it]


✓ [295/500] Gold: 3 | Pred: 3


 59%|█████▉    | 296/500 [1:02:15<42:06, 12.39s/it]


✓ [296/500] Gold: 370 | Pred: 370


 59%|█████▉    | 297/500 [1:02:32<46:16, 13.68s/it]


✓ [297/500] Gold: 3 | Pred: 3


 60%|█████▉    | 298/500 [1:02:41<41:36, 12.36s/it]


✓ [298/500] Gold: 55 | Pred: 55


 60%|█████▉    | 299/500 [1:02:54<41:46, 12.47s/it]


✗ [299/500] Gold: 500 | Pred: 1100


 60%|██████    | 300/500 [1:03:10<44:51, 13.46s/it]


✗ [300/500] Gold: 31800 | Pred: 37500
Checkpoint saved at sample 300


 60%|██████    | 301/500 [1:03:19<40:20, 12.16s/it]


✓ [301/500] Gold: 78 | Pred: 78


 60%|██████    | 302/500 [1:03:32<41:05, 12.45s/it]


✗ [302/500] Gold: 8 | Pred: 16


 61%|██████    | 303/500 [1:03:41<37:36, 11.46s/it]


✓ [303/500] Gold: 15 | Pred: 15


 61%|██████    | 304/500 [1:04:01<45:16, 13.86s/it]


✗ [304/500] Gold: 1300 | Pred: 900


 61%|██████    | 305/500 [1:04:12<42:18, 13.02s/it]


✗ [305/500] Gold: 3200 | Pred: 3300


 61%|██████    | 306/500 [1:04:19<36:18, 11.23s/it]


✓ [306/500] Gold: 4 | Pred: 4


 61%|██████▏   | 307/500 [1:04:29<35:12, 10.95s/it]


✓ [307/500] Gold: 10 | Pred: 10


 62%|██████▏   | 308/500 [1:04:41<35:59, 11.25s/it]


✗ [308/500] Gold: 16 | Pred: 1


 62%|██████▏   | 309/500 [1:04:51<34:40, 10.89s/it]


✓ [309/500] Gold: 6 | Pred: 6


 62%|██████▏   | 310/500 [1:05:03<35:27, 11.20s/it]


✓ [310/500] Gold: 8 | Pred: 8


 62%|██████▏   | 311/500 [1:05:16<37:25, 11.88s/it]


✗ [311/500] Gold: 2050 | Pred: 1050


 62%|██████▏   | 312/500 [1:05:28<36:40, 11.71s/it]


✓ [312/500] Gold: 91 | Pred: 91


 63%|██████▎   | 313/500 [1:05:34<31:50, 10.22s/it]


✓ [313/500] Gold: 32 | Pred: 32


 63%|██████▎   | 314/500 [1:05:50<36:59, 11.93s/it]


✓ [314/500] Gold: 120000 | Pred: 120000


 63%|██████▎   | 315/500 [1:06:08<42:13, 13.69s/it]


✓ [315/500] Gold: 30 | Pred: 30


 63%|██████▎   | 316/500 [1:06:22<41:56, 13.68s/it]


✓ [316/500] Gold: 14 | Pred: 14


 63%|██████▎   | 317/500 [1:06:35<41:43, 13.68s/it]


✓ [317/500] Gold: 156 | Pred: 156


 64%|██████▎   | 318/500 [1:06:47<39:47, 13.12s/it]


✓ [318/500] Gold: 12 | Pred: 12


 64%|██████▍   | 319/500 [1:07:02<40:52, 13.55s/it]


✓ [319/500] Gold: 123 | Pred: 123


 64%|██████▍   | 320/500 [1:07:12<37:29, 12.50s/it]


✓ [320/500] Gold: 15 | Pred: 15


 64%|██████▍   | 321/500 [1:07:27<39:55, 13.38s/it]


✓ [321/500] Gold: 8 | Pred: 8


 64%|██████▍   | 322/500 [1:07:37<36:11, 12.20s/it]


✓ [322/500] Gold: 1 | Pred: 1


 65%|██████▍   | 323/500 [1:07:50<36:53, 12.51s/it]


✓ [323/500] Gold: 9 | Pred: 9


 65%|██████▍   | 324/500 [1:08:06<40:01, 13.64s/it]


✓ [324/500] Gold: 75 | Pred: 75


 65%|██████▌   | 325/500 [1:08:15<35:15, 12.09s/it]


✓ [325/500] Gold: 14 | Pred: 14


 65%|██████▌   | 326/500 [1:08:32<39:42, 13.69s/it]


✓ [326/500] Gold: 224000 | Pred: 224000


 65%|██████▌   | 327/500 [1:08:41<35:16, 12.23s/it]


✓ [327/500] Gold: 14 | Pred: 14


 66%|██████▌   | 328/500 [1:08:53<34:29, 12.03s/it]


✓ [328/500] Gold: 31 | Pred: 31


 66%|██████▌   | 329/500 [1:09:03<32:39, 11.46s/it]


✓ [329/500] Gold: 2 | Pred: 2


 66%|██████▌   | 330/500 [1:09:10<29:08, 10.29s/it]


✓ [330/500] Gold: 14 | Pred: 14


 66%|██████▌   | 331/500 [1:09:22<30:11, 10.72s/it]


✓ [331/500] Gold: 31 | Pred: 31


 66%|██████▋   | 332/500 [1:09:43<38:53, 13.89s/it]


✗ [332/500] Gold: 8400 | Pred: 36000


 67%|██████▋   | 333/500 [1:09:51<33:32, 12.05s/it]


✓ [333/500] Gold: 44 | Pred: 44


 67%|██████▋   | 334/500 [1:10:04<33:53, 12.25s/it]


✓ [334/500] Gold: 100 | Pred: 100


 67%|██████▋   | 335/500 [1:10:11<29:52, 10.86s/it]


✓ [335/500] Gold: 6 | Pred: 6


 67%|██████▋   | 336/500 [1:10:24<30:55, 11.32s/it]


✓ [336/500] Gold: 310 | Pred: 310


 67%|██████▋   | 337/500 [1:10:38<33:02, 12.16s/it]


✓ [337/500] Gold: 72 | Pred: 72


 68%|██████▊   | 338/500 [1:10:48<31:26, 11.65s/it]


✓ [338/500] Gold: 1 | Pred: 1


 68%|██████▊   | 339/500 [1:11:00<30:58, 11.55s/it]


✓ [339/500] Gold: 60 | Pred: 60


 68%|██████▊   | 340/500 [1:11:12<31:08, 11.68s/it]


✓ [340/500] Gold: 160 | Pred: 160


 68%|██████▊   | 341/500 [1:11:23<30:38, 11.56s/it]


✓ [341/500] Gold: 4 | Pred: 4


 68%|██████▊   | 342/500 [1:11:44<37:40, 14.31s/it]


✓ [342/500] Gold: 260 | Pred: 260


 69%|██████▊   | 343/500 [1:11:54<34:27, 13.17s/it]


✓ [343/500] Gold: 87 | Pred: 87


 69%|██████▉   | 344/500 [1:12:05<32:28, 12.49s/it]


✓ [344/500] Gold: 180000 | Pred: 180000


 69%|██████▉   | 345/500 [1:12:19<33:18, 12.89s/it]


✓ [345/500] Gold: 2 | Pred: 2


 69%|██████▉   | 346/500 [1:12:28<29:52, 11.64s/it]


✓ [346/500] Gold: 310 | Pred: 310


 69%|██████▉   | 347/500 [1:12:41<31:19, 12.29s/it]


✓ [347/500] Gold: 9 | Pred: 9


 70%|██████▉   | 348/500 [1:12:52<29:47, 11.76s/it]


✗ [348/500] Gold: 36 | Pred: 24


 70%|██████▉   | 349/500 [1:13:08<32:58, 13.11s/it]


✓ [349/500] Gold: 10 | Pred: 10


 70%|███████   | 350/500 [1:13:19<31:13, 12.49s/it]


✓ [350/500] Gold: 2640 | Pred: 2640
Checkpoint saved at sample 350


 70%|███████   | 351/500 [1:13:35<33:23, 13.45s/it]


✓ [351/500] Gold: 8 | Pred: 8


 70%|███████   | 352/500 [1:13:46<31:22, 12.72s/it]


✓ [352/500] Gold: 10 | Pred: 10


 71%|███████   | 353/500 [1:13:53<26:42, 10.90s/it]


✓ [353/500] Gold: 21 | Pred: 21


 71%|███████   | 354/500 [1:14:06<28:10, 11.58s/it]


✓ [354/500] Gold: 20 | Pred: 20


 71%|███████   | 355/500 [1:14:21<30:41, 12.70s/it]


✓ [355/500] Gold: 45 | Pred: 45


 71%|███████   | 356/500 [1:14:31<28:33, 11.90s/it]


✓ [356/500] Gold: 34 | Pred: 34


 71%|███████▏  | 357/500 [1:14:44<29:17, 12.29s/it]


✓ [357/500] Gold: 21 | Pred: 21


 72%|███████▏  | 358/500 [1:14:56<28:26, 12.02s/it]


✗ [358/500] Gold: 2 | Pred: 12


 72%|███████▏  | 359/500 [1:15:11<30:23, 12.94s/it]


✓ [359/500] Gold: 20 | Pred: 20


 72%|███████▏  | 360/500 [1:15:27<32:09, 13.78s/it]


✗ [360/500] Gold: 4 | Pred: \frac{16


 72%|███████▏  | 361/500 [1:15:35<28:09, 12.15s/it]


✓ [361/500] Gold: 25 | Pred: 25


 72%|███████▏  | 362/500 [1:15:50<29:52, 12.99s/it]


✓ [362/500] Gold: 20 | Pred: 20


 73%|███████▎  | 363/500 [1:16:05<31:24, 13.76s/it]


✓ [363/500] Gold: 23 | Pred: 23


 73%|███████▎  | 364/500 [1:16:20<31:44, 14.01s/it]


✓ [364/500] Gold: 6 | Pred: 6


 73%|███████▎  | 365/500 [1:16:29<27:57, 12.42s/it]


✓ [365/500] Gold: 49 | Pred: 49


 73%|███████▎  | 366/500 [1:16:37<25:05, 11.24s/it]


✓ [366/500] Gold: 18 | Pred: 18


 73%|███████▎  | 367/500 [1:16:46<23:05, 10.42s/it]


✓ [367/500] Gold: 9 | Pred: 9


 74%|███████▎  | 368/500 [1:17:00<25:46, 11.72s/it]


✓ [368/500] Gold: 19 | Pred: 19


 74%|███████▍  | 369/500 [1:17:11<24:34, 11.25s/it]


✗ [369/500] Gold: 18 | Pred: 10


 74%|███████▍  | 370/500 [1:17:29<29:12, 13.48s/it]


✗ [370/500] Gold: 1198 | Pred: 1245


 74%|███████▍  | 371/500 [1:17:39<26:26, 12.30s/it]


✗ [371/500] Gold: 320 | Pred: 500


 74%|███████▍  | 372/500 [1:17:51<26:08, 12.25s/it]


✓ [372/500] Gold: 50 | Pred: 50


 75%|███████▍  | 373/500 [1:17:59<23:01, 10.88s/it]


✓ [373/500] Gold: 5 | Pred: 5


 75%|███████▍  | 374/500 [1:18:07<21:22, 10.18s/it]


✓ [374/500] Gold: 240000 | Pred: 240000


 75%|███████▌  | 375/500 [1:18:22<24:05, 11.56s/it]


✓ [375/500] Gold: 45 | Pred: 45


 75%|███████▌  | 376/500 [1:18:29<21:12, 10.26s/it]


✓ [376/500] Gold: 48 | Pred: 48


 75%|███████▌  | 377/500 [1:18:41<21:50, 10.66s/it]


✓ [377/500] Gold: 15 | Pred: 15


 76%|███████▌  | 378/500 [1:18:52<21:53, 10.76s/it]


✓ [378/500] Gold: 50 | Pred: 50


 76%|███████▌  | 379/500 [1:19:02<21:40, 10.74s/it]


✓ [379/500] Gold: 15 | Pred: 15


 76%|███████▌  | 380/500 [1:19:10<19:38,  9.82s/it]


✓ [380/500] Gold: 21 | Pred: 21


 76%|███████▌  | 381/500 [1:19:25<22:26, 11.31s/it]


✗ [381/500] Gold: 803 | Pred: 760.44


 76%|███████▋  | 382/500 [1:19:39<23:47, 12.10s/it]


✓ [382/500] Gold: 67 | Pred: 67


 77%|███████▋  | 383/500 [1:19:54<25:28, 13.06s/it]


✓ [383/500] Gold: 350 | Pred: 350


 77%|███████▋  | 384/500 [1:20:10<27:07, 14.03s/it]


✓ [384/500] Gold: 2 | Pred: 2


 77%|███████▋  | 385/500 [1:20:25<27:10, 14.18s/it]


✓ [385/500] Gold: 32 | Pred: 32


 77%|███████▋  | 386/500 [1:20:40<27:22, 14.41s/it]


✓ [386/500] Gold: 16 | Pred: 16


 77%|███████▋  | 387/500 [1:20:50<24:46, 13.16s/it]


✓ [387/500] Gold: 80 | Pred: 80


 78%|███████▊  | 388/500 [1:21:00<22:57, 12.30s/it]


✓ [388/500] Gold: 36 | Pred: 36


 78%|███████▊  | 389/500 [1:21:10<21:19, 11.52s/it]


✓ [389/500] Gold: 88 | Pred: 88


 78%|███████▊  | 390/500 [1:21:20<20:21, 11.11s/it]


✓ [390/500] Gold: 6 | Pred: 6


 78%|███████▊  | 391/500 [1:21:29<18:58, 10.45s/it]


✓ [391/500] Gold: 12 | Pred: 12


 78%|███████▊  | 392/500 [1:21:42<20:05, 11.16s/it]


✓ [392/500] Gold: 15 | Pred: 15


 79%|███████▊  | 393/500 [1:21:57<21:50, 12.25s/it]


✗ [393/500] Gold: 34 | Pred: 60


 79%|███████▉  | 394/500 [1:22:13<23:47, 13.47s/it]


✓ [394/500] Gold: 20 | Pred: 20


 79%|███████▉  | 395/500 [1:22:21<20:45, 11.86s/it]


✓ [395/500] Gold: 92 | Pred: 92


 79%|███████▉  | 396/500 [1:22:34<20:51, 12.03s/it]


✗ [396/500] Gold: 38 | Pred: 4


 79%|███████▉  | 397/500 [1:22:45<20:19, 11.84s/it]


✓ [397/500] Gold: 3 | Pred: 3


 80%|███████▉  | 398/500 [1:22:55<18:58, 11.16s/it]


✓ [398/500] Gold: 25 | Pred: 25


 80%|███████▉  | 399/500 [1:23:07<19:35, 11.64s/it]


✓ [399/500] Gold: 168 | Pred: 168


 80%|████████  | 400/500 [1:23:18<19:01, 11.42s/it]


✓ [400/500] Gold: 12 | Pred: 12
Checkpoint saved at sample 400


 80%|████████  | 401/500 [1:23:26<16:50, 10.20s/it]


✓ [401/500] Gold: 48 | Pred: 48


 80%|████████  | 402/500 [1:23:37<17:00, 10.41s/it]


✓ [402/500] Gold: 14400 | Pred: 14400


 81%|████████  | 403/500 [1:23:53<19:33, 12.10s/it]


✓ [403/500] Gold: 4 | Pred: 4


 81%|████████  | 404/500 [1:24:14<23:47, 14.87s/it]


✗ [404/500] Gold: 81 | Pred: 135


 81%|████████  | 405/500 [1:24:26<22:10, 14.01s/it]


✗ [405/500] Gold: 22 | Pred: \frac{88


 81%|████████  | 406/500 [1:24:44<24:02, 15.34s/it]


✓ [406/500] Gold: 50 | Pred: 50


 81%|████████▏ | 407/500 [1:24:59<23:13, 14.99s/it]


✗ [407/500] Gold: 200 | Pred: 140


 82%|████████▏ | 408/500 [1:25:14<23:22, 15.24s/it]


✓ [408/500] Gold: 2000 | Pred: 2000


 82%|████████▏ | 409/500 [1:25:28<22:12, 14.64s/it]


✓ [409/500] Gold: 20 | Pred: 20


 82%|████████▏ | 410/500 [1:25:49<24:59, 16.66s/it]


✗ [410/500] Gold: 168000 | Pred: 42000


 82%|████████▏ | 411/500 [1:26:01<22:37, 15.25s/it]


✓ [411/500] Gold: 3 | Pred: 3


 82%|████████▏ | 412/500 [1:26:12<20:19, 13.86s/it]


✓ [412/500] Gold: 1110 | Pred: 1110


 83%|████████▎ | 413/500 [1:26:21<18:07, 12.50s/it]


✓ [413/500] Gold: 5 | Pred: 5


 83%|████████▎ | 414/500 [1:26:32<17:15, 12.05s/it]


✓ [414/500] Gold: 25 | Pred: 25


 83%|████████▎ | 415/500 [1:26:42<16:06, 11.37s/it]


✓ [415/500] Gold: 56 | Pred: 56


 83%|████████▎ | 416/500 [1:26:56<17:18, 12.36s/it]


✓ [416/500] Gold: 350 | Pred: 350


 83%|████████▎ | 417/500 [1:27:11<17:59, 13.00s/it]


✓ [417/500] Gold: 56 | Pred: 56


 84%|████████▎ | 418/500 [1:27:24<17:49, 13.04s/it]


✓ [418/500] Gold: 3140 | Pred: 3140


 84%|████████▍ | 419/500 [1:27:33<15:49, 11.72s/it]


✓ [419/500] Gold: 40 | Pred: 40


 84%|████████▍ | 420/500 [1:27:44<15:22, 11.53s/it]


✓ [420/500] Gold: 3000 | Pred: 3000


 84%|████████▍ | 421/500 [1:27:53<14:24, 10.94s/it]


✓ [421/500] Gold: 17000 | Pred: 17000


 84%|████████▍ | 422/500 [1:28:08<15:42, 12.08s/it]


✓ [422/500] Gold: 12 | Pred: 12


 85%|████████▍ | 423/500 [1:28:20<15:37, 12.17s/it]


✗ [423/500] Gold: 284 | Pred: 240


 85%|████████▍ | 424/500 [1:28:40<18:25, 14.54s/it]


✗ [424/500] Gold: 8 | Pred: 6


 85%|████████▌ | 425/500 [1:28:50<16:26, 13.15s/it]


✓ [425/500] Gold: 570 | Pred: 570


 85%|████████▌ | 426/500 [1:29:04<16:14, 13.17s/it]


✓ [426/500] Gold: 150 | Pred: 150


 85%|████████▌ | 427/500 [1:29:15<15:25, 12.68s/it]


✓ [427/500] Gold: 11 | Pred: 11


 86%|████████▌ | 428/500 [1:29:36<18:05, 15.08s/it]


✗ [428/500] Gold: 150 | Pred: 3


 86%|████████▌ | 429/500 [1:29:50<17:23, 14.69s/it]


✗ [429/500] Gold: 26 | Pred: 44


 86%|████████▌ | 430/500 [1:30:04<16:52, 14.46s/it]


✓ [430/500] Gold: 13 | Pred: 13


 86%|████████▌ | 431/500 [1:30:16<15:51, 13.79s/it]


✓ [431/500] Gold: 132 | Pred: 132


 86%|████████▋ | 432/500 [1:30:24<13:51, 12.22s/it]


✓ [432/500] Gold: 1 | Pred: 1


 87%|████████▋ | 433/500 [1:30:35<13:00, 11.66s/it]


✓ [433/500] Gold: 30 | Pred: 30


 87%|████████▋ | 434/500 [1:30:46<12:41, 11.54s/it]


✓ [434/500] Gold: 6 | Pred: 6


 87%|████████▋ | 435/500 [1:31:07<15:41, 14.48s/it]


✗ [435/500] Gold: 5 | Pred: 99


 87%|████████▋ | 436/500 [1:31:15<13:16, 12.44s/it]


✓ [436/500] Gold: 5 | Pred: 5


 87%|████████▋ | 437/500 [1:31:25<12:24, 11.81s/it]


✓ [437/500] Gold: 15 | Pred: 15


 88%|████████▊ | 438/500 [1:31:35<11:29, 11.11s/it]


✓ [438/500] Gold: 7 | Pred: 7


 88%|████████▊ | 439/500 [1:31:50<12:27, 12.26s/it]


✓ [439/500] Gold: 2 | Pred: 2


 88%|████████▊ | 440/500 [1:32:00<11:47, 11.78s/it]


✗ [440/500] Gold: 17 | Pred: 12


 88%|████████▊ | 441/500 [1:32:16<12:40, 12.89s/it]


✓ [441/500] Gold: 98 | Pred: 98


 88%|████████▊ | 442/500 [1:32:30<12:50, 13.28s/it]


✓ [442/500] Gold: 80 | Pred: 80


 89%|████████▊ | 443/500 [1:32:45<13:01, 13.71s/it]


✓ [443/500] Gold: 49 | Pred: 49


 89%|████████▉ | 444/500 [1:33:04<14:15, 15.28s/it]


✓ [444/500] Gold: 59 | Pred: 59


 89%|████████▉ | 445/500 [1:33:14<12:36, 13.75s/it]


✓ [445/500] Gold: 20 | Pred: 20


 89%|████████▉ | 446/500 [1:33:33<13:53, 15.43s/it]


✓ [446/500] Gold: 6 | Pred: 6


 89%|████████▉ | 447/500 [1:33:44<12:29, 14.13s/it]


✓ [447/500] Gold: 2 | Pred: 2


 90%|████████▉ | 448/500 [1:33:51<10:24, 12.01s/it]


✓ [448/500] Gold: 5 | Pred: 5


 90%|████████▉ | 449/500 [1:34:07<11:02, 12.99s/it]


✓ [449/500] Gold: 539 | Pred: 539


 90%|█████████ | 450/500 [1:34:16<09:48, 11.77s/it]


✓ [450/500] Gold: 112 | Pred: 112
Checkpoint saved at sample 450


 90%|█████████ | 451/500 [1:34:31<10:28, 12.84s/it]


✗ [451/500] Gold: 4 | Pred: 6


 90%|█████████ | 452/500 [1:34:48<11:10, 13.98s/it]


✓ [452/500] Gold: 11050 | Pred: 11050


 91%|█████████ | 453/500 [1:35:02<11:09, 14.24s/it]


✓ [453/500] Gold: 50 | Pred: 50


 91%|█████████ | 454/500 [1:35:22<12:05, 15.77s/it]


✓ [454/500] Gold: 6400 | Pred: 6400


 91%|█████████ | 455/500 [1:35:31<10:27, 13.94s/it]


✗ [455/500] Gold: 150 | Pred: 240


 91%|█████████ | 456/500 [1:35:51<11:27, 15.63s/it]


✓ [456/500] Gold: 1920 | Pred: 1920


 91%|█████████▏| 457/500 [1:36:02<10:13, 14.28s/it]


✓ [457/500] Gold: 78 | Pred: 78


 92%|█████████▏| 458/500 [1:36:11<08:50, 12.62s/it]


✓ [458/500] Gold: 45 | Pred: 45


 92%|█████████▏| 459/500 [1:36:21<08:07, 11.89s/it]


✓ [459/500] Gold: 35 | Pred: 35


 92%|█████████▏| 460/500 [1:36:39<09:05, 13.63s/it]


✓ [460/500] Gold: 2 | Pred: 2


 92%|█████████▏| 461/500 [1:36:52<08:44, 13.45s/it]


✓ [461/500] Gold: 84 | Pred: 84


 92%|█████████▏| 462/500 [1:37:01<07:44, 12.21s/it]


✓ [462/500] Gold: 9 | Pred: 9


 93%|█████████▎| 463/500 [1:37:08<06:38, 10.76s/it]


✓ [463/500] Gold: 71 | Pred: 71


 93%|█████████▎| 464/500 [1:37:16<05:48,  9.67s/it]


✓ [464/500] Gold: 18 | Pred: 18


 93%|█████████▎| 465/500 [1:37:30<06:24, 10.99s/it]


✗ [465/500] Gold: 6 | Pred: 1


 93%|█████████▎| 466/500 [1:37:47<07:18, 12.90s/it]


✓ [466/500] Gold: 30 | Pred: 30


 93%|█████████▎| 467/500 [1:37:59<06:56, 12.61s/it]


✓ [467/500] Gold: 1 | Pred: 1


 94%|█████████▎| 468/500 [1:38:11<06:38, 12.47s/it]


✓ [468/500] Gold: 1200 | Pred: 1200


 94%|█████████▍| 469/500 [1:38:26<06:47, 13.16s/it]


✓ [469/500] Gold: 120 | Pred: 120


 94%|█████████▍| 470/500 [1:38:35<05:58, 11.96s/it]


✓ [470/500] Gold: 4 | Pred: 4


 94%|█████████▍| 471/500 [1:38:45<05:33, 11.50s/it]


✓ [471/500] Gold: 3 | Pred: 3


 94%|█████████▍| 472/500 [1:39:01<05:55, 12.69s/it]


✓ [472/500] Gold: 80 | Pred: 80


 95%|█████████▍| 473/500 [1:39:11<05:17, 11.76s/it]


✓ [473/500] Gold: 6 | Pred: 6


 95%|█████████▍| 474/500 [1:39:23<05:09, 11.92s/it]


✓ [474/500] Gold: 10 | Pred: 10


 95%|█████████▌| 475/500 [1:39:31<04:27, 10.69s/it]


✓ [475/500] Gold: 80 | Pred: 80


 95%|█████████▌| 476/500 [1:39:40<04:04, 10.18s/it]


✓ [476/500] Gold: 20 | Pred: 20


 95%|█████████▌| 477/500 [1:39:48<03:39,  9.56s/it]


✓ [477/500] Gold: 5 | Pred: 5


 96%|█████████▌| 478/500 [1:40:01<03:54, 10.64s/it]


✓ [478/500] Gold: 20 | Pred: 20


 96%|█████████▌| 479/500 [1:40:10<03:32, 10.10s/it]


✓ [479/500] Gold: 621 | Pred: 621


 96%|█████████▌| 480/500 [1:40:23<03:38, 10.94s/it]


✓ [480/500] Gold: 15400 | Pred: 15400


 96%|█████████▌| 481/500 [1:40:33<03:22, 10.66s/it]


✓ [481/500] Gold: 11 | Pred: 11


 96%|█████████▋| 482/500 [1:40:44<03:17, 10.99s/it]


✓ [482/500] Gold: 84 | Pred: 84


 97%|█████████▋| 483/500 [1:40:52<02:51, 10.10s/it]


✓ [483/500] Gold: 26 | Pred: 26


 97%|█████████▋| 484/500 [1:41:04<02:48, 10.52s/it]


✓ [484/500] Gold: 40 | Pred: 40


 97%|█████████▋| 485/500 [1:41:14<02:37, 10.49s/it]


✓ [485/500] Gold: 240 | Pred: 240


 97%|█████████▋| 486/500 [1:41:26<02:30, 10.75s/it]


✓ [486/500] Gold: 220 | Pred: 220


 97%|█████████▋| 487/500 [1:41:34<02:11, 10.09s/it]


✓ [487/500] Gold: 6 | Pred: 6


 98%|█████████▊| 488/500 [1:41:48<02:15, 11.29s/it]


✓ [488/500] Gold: 4 | Pred: 4


 98%|█████████▊| 489/500 [1:41:57<01:55, 10.54s/it]


✓ [489/500] Gold: 6 | Pred: 6


 98%|█████████▊| 490/500 [1:42:04<01:33,  9.37s/it]


✓ [490/500] Gold: -10 | Pred: -10


 98%|█████████▊| 491/500 [1:42:12<01:21,  9.10s/it]


✓ [491/500] Gold: 4 | Pred: 4


 98%|█████████▊| 492/500 [1:42:21<01:11,  8.97s/it]


✓ [492/500] Gold: 16 | Pred: 16


 99%|█████████▊| 493/500 [1:42:30<01:02,  8.93s/it]


✓ [493/500] Gold: 32 | Pred: 32


 99%|█████████▉| 494/500 [1:42:46<01:07, 11.26s/it]


✗ [494/500] Gold: 25 | Pred: 50


 99%|█████████▉| 495/500 [1:42:55<00:52, 10.48s/it]


✗ [495/500] Gold: 21 | Pred: 53


 99%|█████████▉| 496/500 [1:43:04<00:40, 10.05s/it]


✓ [496/500] Gold: 200 | Pred: 200


 99%|█████████▉| 497/500 [1:43:11<00:27,  9.23s/it]


✓ [497/500] Gold: 38 | Pred: 38


100%|█████████▉| 498/500 [1:43:23<00:20, 10.03s/it]


✓ [498/500] Gold: 112 | Pred: 112


100%|█████████▉| 499/500 [1:43:35<00:10, 10.52s/it]


✓ [499/500] Gold: 40 | Pred: 40


100%|██████████| 500/500 [1:43:47<00:00, 12.46s/it]


✗ [500/500] Gold: 10 | Pred: 8
Checkpoint saved at sample 500

Saving final results to gsm8k_baseline.jsonl

GSM8K BASELINE EVALUATION COMPLETE
Model: Qwen/Qwen2.5-Math-1.5B-Instruct
Samples evaluated: 500
Correct: 425
Incorrect: 75
Accuracy: 85.00% (425/500)
Summary saved to gsm8k_summary.json



