In [41]:
import re
import pandas as pd

data = pd.read_csv("llama-3.2-3b-gsm8k-fullpreprocessed.csv")

In [44]:
data.isnull().sum()

question        0
response        0
answer          0
ground_truth    0
dtype: int64

In [43]:
data = data.dropna(subset = ["response"])
data = data.dropna(subset = ["ground_truth"])
data

Unnamed: 0,question,response,answer,ground_truth
0,Natalia sold clips to 48 of her friends in Apr...,## Step 1: Calculate the number of clips sold ...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...,72.0
1,Weng earns $12 an hour for babysitting. Yester...,## Step 1: Determine Weng's hourly rate\nWeng ...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...,10.0
2,Betty is saving money for a new wallet which c...,## Step 1: Calculate how much money Betty init...,"In the beginning, Betty has only 100 / 2 = $<<...",5.0
3,"Julie is reading a 120-page book. Yesterday, s...",## Step 1: Calculate the total number of pages...,Maila read 12 x 2 = <<12*2=24>>24 pages today....,42.0
4,James writes a 3-page letter to 2 different fr...,## Step 1: Determine how many times James writ...,He writes each friend 3*2=<<3*2=6>>6 pages a w...,624.0
...,...,...,...,...
8792,John had a son James when he was 19. James is...,"{'id': 'chatcmpl-zjgtfdyb0frqi68vdn7xb', 'obje...",Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...,8.0
8793,There are some oranges in a basket. Ana spends...,"{'id': 'chatcmpl-nnfi2o160fys577rgg31', 'objec...",There are 60 minutes in an hour. Ana peels an ...,5.0
8794,Mark's car breaks down and he needs to get a n...,"{'id': 'chatcmpl-3trlmpl1dbl01bw4natk8el', 'ob...",The discount on the radiator was 400*.8=$<<400...,230.0
8795,"Farmer Brown has 20 animals on his farm, all e...","{'id': 'chatcmpl-tesufrk2o49wted4mfljxs', 'obj...",Let C be the number of chickens.\nThere are 20...,5.0


In [45]:
def check_row_accuracy(row):
    """
    For a given row, build a regex pattern that verifies the presence of
    the ground truth value in the message.
    
    This function does two key things:
      1. Converts the ground truth to a float.
      2. If the float is an integer (like 11.0), it allows for matching either "11" or "11.0".
      3. Otherwise, it does a straightforward match.
    """
    try:
        # Get the numeric ground truth value
        gt_val = float(row["ground_truth"])
    except (ValueError, TypeError):
        # In case of conversion failure, we consider it not accurate.
        return 0

    # Create a regex pattern using word boundaries to avoid partial matches.
    # For integers: match either the pure integer or the integer followed by a decimal part (like .0 or .000)
    if gt_val.is_integer():
        # Use int(gt_val) for canonical integer representation.
        pattern = rf"\b{int(gt_val)}(?:\.0+)?\b"
    else:
        # For non-integer numbers, just match the float as is.
        pattern = rf"\b{gt_val}\b"
    
    # Search for the pattern in the message.
    if re.search(pattern, str(row["response"])):
        return 1
    else:
        return 0

# Apply the accuracy check row by row and create a new column "automated_accuracy"
data["automated_accuracy"] = data[:7000].apply(check_row_accuracy, axis=1)

# Optionally, print a sample to verify
sample_columns = data[["response", "ground_truth", "automated_accuracy"]].head(10)
print(sample_columns)

# Compare with your manually set "Accurate" column (if desired)
print("\nComparison with manual accuracy:")
comparison = data[["ground_truth", "automated_accuracy"]]
print(comparison.head(10))


                                            response  ground_truth  \
0  ## Step 1: Calculate the number of clips sold ...          72.0   
1  ## Step 1: Determine Weng's hourly rate\nWeng ...          10.0   
2  ## Step 1: Calculate how much money Betty init...           5.0   
3  ## Step 1: Calculate the total number of pages...          42.0   
4  ## Step 1: Determine how many times James writ...         624.0   
5  ## Step 1: Calculate the number of purple flow...          35.0   
6  ## Step 1: Calculate the number of slices from...          48.0   
7  ## Step 1: First, we need to establish the ini...          16.0   
8  ## Step 1: Calculate the total amount spent by...          41.0   
9  ## Step 1: Calculate Tina's regular pay for th...         990.0   

   automated_accuracy  
0                 1.0  
1                 1.0  
2                 1.0  
3                 1.0  
4                 1.0  
5                 1.0  
6                 1.0  
7                 1.0  
8            

In [46]:
data

Unnamed: 0,question,response,answer,ground_truth,automated_accuracy
0,Natalia sold clips to 48 of her friends in Apr...,## Step 1: Calculate the number of clips sold ...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...,72.0,1.0
1,Weng earns $12 an hour for babysitting. Yester...,## Step 1: Determine Weng's hourly rate\nWeng ...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...,10.0,1.0
2,Betty is saving money for a new wallet which c...,## Step 1: Calculate how much money Betty init...,"In the beginning, Betty has only 100 / 2 = $<<...",5.0,1.0
3,"Julie is reading a 120-page book. Yesterday, s...",## Step 1: Calculate the total number of pages...,Maila read 12 x 2 = <<12*2=24>>24 pages today....,42.0,1.0
4,James writes a 3-page letter to 2 different fr...,## Step 1: Determine how many times James writ...,He writes each friend 3*2=<<3*2=6>>6 pages a w...,624.0,1.0
...,...,...,...,...,...
8792,John had a son James when he was 19. James is...,"{'id': 'chatcmpl-zjgtfdyb0frqi68vdn7xb', 'obje...",Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...,8.0,
8793,There are some oranges in a basket. Ana spends...,"{'id': 'chatcmpl-nnfi2o160fys577rgg31', 'objec...",There are 60 minutes in an hour. Ana peels an ...,5.0,
8794,Mark's car breaks down and he needs to get a n...,"{'id': 'chatcmpl-3trlmpl1dbl01bw4natk8el', 'ob...",The discount on the radiator was 400*.8=$<<400...,230.0,
8795,"Farmer Brown has 20 animals on his farm, all e...","{'id': 'chatcmpl-tesufrk2o49wted4mfljxs', 'obj...",Let C be the number of chickens.\nThere are 20...,5.0,


In [47]:
data["automated_accuracy"].value_counts()

automated_accuracy
1.0    6455
0.0     545
Name: count, dtype: int64

In [48]:
6455/8792 * 100

73.41901728844404