In [1]:
%%capture
%pip install -U transformers datasets accelerate peft trl bitsandbytes wandb gradio

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

In [3]:
# Load Hugging Face and Weights & Biases tokens
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
wb_token = user_secrets.get_secret("wandb")

login(token=hf_token)
wandb.login(key=wb_token)

run = wandb.init(
    project='Fine-tune Gemma-2- 2B on MMLU', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muu0712[0m ([33muu0712-engineering-student-council[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
base_model = "google/gemma-2-2b-it"
dataset = "openai/gsm8k"# Updated to cais/mmlu
dataset2 = "openai/gsm8k"
new_model = "gemma-2b-mmlu-pro-openai/gsm8k"

torch_dtype = torch.float16
attn_implementation = "eager"

In [5]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [6]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [21]:
print(dataset)


openai/gsm8k


In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 🔹 Step 1: Load the dataset
dataset = load_dataset("openai/gsm8k", "main")

# 🔹 Step 2: Format the prompt (Question-Answer pair)
def format_prompt(example):
    return {
        "text": f"Question: {example['question']}\nAnswer: {example['answer']}"
    }

formatted_dataset = dataset["train"].map(format_prompt)

# 🔹 Step 3: Load the Gemma tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as PAD token

# 🔹 Step 4: Tokenize the prompts
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding=False)

tokenized_dataset = formatted_dataset.map(tokenize, remove_columns=formatted_dataset.column_names)

# ✅ Now tokenized_dataset is ready for fine-tuning


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
print(tokenized_dataset[0])
print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))


{'input_ids': [2, 9413, 235292, 101268, 7596, 35382, 577, 235248, 235310, 235321, 576, 1070, 4078, 575, 4623, 235269, 578, 1492, 1284, 7596, 3933, 685, 1767, 35382, 575, 2782, 235265, 2250, 1767, 35382, 1498, 101268, 4874, 29911, 575, 4623, 578, 2782, 235336, 108, 1261, 235292, 101268, 7596, 235248, 235310, 235321, 235283, 235284, 589, 3245, 235310, 235321, 235283, 235284, 235293, 235284, 235310, 2492, 235284, 235310, 35382, 575, 2782, 235265, 108, 140199, 7596, 235248, 235310, 235321, 235340, 235284, 235310, 589, 3245, 235310, 235321, 235340, 235284, 235310, 235293, 235324, 235284, 2492, 235324, 235284, 35382, 29911, 575, 4623, 578, 2782, 235265, 108, 3308, 235248, 235324, 235284], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
print (dataset[1])

{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?', 'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'}


In [7]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
)

model = get_peft_model(model, peft_config)

In [9]:
# for seperate split

In [10]:
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
import wandb

# Initialize wandb manually
wandb.init(project="myv", name="run_gemma_gsm8k_noeval")

# Define your model names
base_model = "google/gemma-2b"
new_model = "gemma-2b-gsm8k-lora"

# Define training arguments
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=50,
    warmup_steps=100,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    save_strategy="epoch",
    save_total_limit=3,
    run_name="gemma_gsm8k_noeval",
    evaluation_strategy="no",   # ✅ Disable evaluation
)

# Start training
print("\n🚀 Starting fine-tuning on full GSM8K dataset (no evaluation)...\n")

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,  # Use full dataset for training
    eval_dataset=None,                # ✅ No evaluation
    peft_config=peft_config,
    args=training_arguments,
)

trainer.train()

# Save final model
trainer.save_model(f"{new_model}_final")





🚀 Starting fine-tuning on full GSM8K dataset (no evaluation)...



Truncating train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,1.1091
100,0.9451
150,0.9101
200,0.9545
250,0.9071
300,0.9579
350,0.9244
400,0.9179
450,0.9135
500,0.908


In [14]:
import torch

# Save the model and tokenizer
model.save_pretrained("gemma-2b-gsm8k-finetuned")
tokenizer.save_pretrained("gemma-2b-gsm8k-finetuned")


('gemma-2b-gsm8k-finetuned/tokenizer_config.json',
 'gemma-2b-gsm8k-finetuned/special_tokens_map.json',
 'gemma-2b-gsm8k-finetuned/tokenizer.model',
 'gemma-2b-gsm8k-finetuned/added_tokens.json',
 'gemma-2b-gsm8k-finetuned/tokenizer.json')

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model
model_path = "gemma-2b-gsm8k-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).to("cuda")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
def generate_answer(question):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_length=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example Question
test_question = "If a train travels 120 miles in 2 hours, what is its average speed?"
generated_answer = generate_answer(test_question)

print("🔹 Model's Answer:\n", generated_answer)


🔹 Model's Answer:
 Question: If a train travels 120 miles in 2 hours, what is its average speed?
Answer: The train travels 120 miles / 2 hours = <<120/2=60>>60 miles per hour.
#### 60 miles per hour
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60 mph
#### 60


In [17]:
from huggingface_hub import create_repo

repo_name = "Working-gemma-2-2b-it-gsm8k"  # Change this if needed
create_repo(repo_name, repo_type="model", exist_ok=True)


RepoUrl('https://huggingface.co/sparky353454/Working-gemma-2-2b-it-gsm8k', endpoint='https://huggingface.co', repo_type='model', repo_id='sparky353454/Working-gemma-2-2b-it-gsm8k')

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/kaggle/working/gemma-2b-gsm8k-finetuned"  # Your saved model path

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Push to Hugging Face Hub
model.push_to_hub("sparky353454/gemma-2b-gsm8k")
tokenizer.push_to_hub("sparky353454/gemma-2b-gsm8k")
print("completed ")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

completed 


In [None]:
!pip install gradio

In [10]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model from Hugging Face
model_name = "sparky353454/gemma-2b-gsm8k"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Function to generate response
def generate_answer(question):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()

# Create Gradio UI
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter Your Math Question"),
    outputs=gr.Textbox(label="Model's Answer"),
    title="Gemma 2B - GSM8K Math Solver",
    description="Enter a mathematical reasoning question, and the fine-tuned Gemma 2B model will generate the answer."
)

# Launch the app
iface.launch()


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://2700bf3668f2c8378b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [5]:
import getpass

# Prompt for GitHub token
token = getpass.getpass("Enter your GitHub token (starts with 'ghp_' or 'github_pat_...'): ")

# Use EleutherAI repo instead of OpenAI
repo_url = f"https://{token}:x-oauth-basic@github.com/EleutherAI/lm-evaluation-harness.git"

# Clone it
!git clone {repo_url}


Enter your GitHub token (starts with 'ghp_' or 'github_pat_...'):  ········


Cloning into 'lm-evaluation-harness'...
remote: Enumerating objects: 49581, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 49581 (delta 10), reused 2 (delta 2), pack-reused 49563 (from 2)[K
Receiving objects: 100% (49581/49581), 29.58 MiB | 22.00 MiB/s, done.
Resolving deltas: 100% (34299/34299), done.


In [None]:
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


#git token

In [6]:
%cd lm-evaluation-harness
!pip install -e .


/kaggle/working/lm-evaluation-harness
Obtaining file:///kaggle/working/lm-evaluation-harness
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting evaluate (from lm_eval==0.4.8)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jsonlines (from lm_eval==0.4.8)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pytablewriter (from lm_eval==0.4.8)
  Downloading pytablewriter-1.2.1-py3-none-any.whl.metadata (38 kB)
Collecting rouge-score>=0.0.4 (from lm_eval==0.4.8)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu>=1.5.0 (from lm_eval==0.4.8)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
#for finetunned model command  for bench marks

In [None]:
# !python -m lm_eval \
#   --model hf \
#   --model_args pretrained=sparky353454/gemma-2b-gsm8k,use_auth_token=True \
#   --tasks gsm8k \
#   --device cuda \
#   --output_path results.json


In [None]:
# for base model benchmarks 

In [None]:
# !python -m lm_eval \
#   --model hf \
#   --model_args pretrained=google/gemma-2-2b-it,use_auth_token=True \
#   --tasks gsm8k \
#   --device cuda \
#   --output_path results_base.json


In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Your model names
lora_model_name = "sparky353454/gemma-2b-gsm8k"   # Fine-tuned LoRA model
base_model_name = "google/gemma-2-2b-it"              # Base model

# Load PEFT config from the LoRA model
peft_config = PeftConfig.from_pretrained(lora_model_name)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Load the LoRA adapter weights
model = PeftModel.from_pretrained(base_model, lora_model_name)

# Merge LoRA into base weights
model = model.merge_and_unload()

# Save the merged model locally
merged_model_dir = "gemma-2b-gsm8k-merged"
model.save_pretrained(merged_model_dir)

# Save tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(merged_model_dir)

print(f"✅ Merged model saved to: {merged_model_dir}")


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

✅ Merged model saved to: gemma-2b-gsm8k-merged


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to the merged model directory
model_path = "/kaggle/working/gemma-2b-gsm8k-merged"  # Make sure this folder contains config.json, pytorch_model.bin, tokenizer files, etc.

# Load the merged model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Push to Hugging Face Hub
model.push_to_hub("sparky353454/gemma-2-2b-it-gsm8k-merged")
tokenizer.push_to_hub("sparky353454/gemma-2-2b-it-gsm8k-merged")

print("✅ Merged model and tokenizer pushed to Hugging Face Hub successfully!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

✅ Merged model and tokenizer pushed to Hugging Face Hub successfully!


In [6]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model from Hugging Face
model_name = "sparky353454/gemma-2-2b-it-gsm8k-merged"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Function to generate response
def generate_answer(question):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()

# Create Gradio UI
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter Your Math Question"),
    outputs=gr.Textbox(label="Model's Answer"),
    title="Gemma 2B - GSM8K Math Solver",
    description="Enter a mathematical reasoning question, and the fine-tuned Gemma 2B model will generate the answer."
)

# Launch the app
iface.launch()


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://bd59d572e775b9bf8f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [8]:
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.4556|±  |0.0137|
|     |       |strict-match    |     5|exact_match|↑  |0.4496|±  |0.0137|


#basemodel benchmarks

SyntaxError: invalid character '↑' (U+2191) (<ipython-input-8-24723f08d6c5>, line 3)