## 使用新数据

In [1]:
from main_utils import *
mydevice = 'cuda'

In [3]:
import os
import random
import re

class create_test_data:
    def __init__(self, non_overlap_data_path, num_test_samples=100) -> None:
        if not os.path.exists(non_overlap_data_path):
            raise ValueError("There is no nonoverlap data file")
        self.non_overlap_data_path = non_overlap_data_path
        self.num_test_samples = num_test_samples
        self.samples = None
        # Open non-overlap data path to get a bunch of test samples
        with open(self.non_overlap_data_path, 'r') as f:
            lines = f.readlines()
            random.shuffle(lines)
            self.samples = lines[:self.num_test_samples]
            
        
    def create_prompt(self):
        """
        To create prompt data file like: 'Ta+b=' .
        The output should be correct answer and judgement
        """
        with open(f'prompt.txt', 'w') as f2:
            for line in self.samples:
                prompt = line.split('=')[0]+'=\n'
                f2.write(prompt)
                    
    def create_add_noise_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=d', where 'd' means wrong answer
        The output should be judgement
        """
        with open('add_noise_judge_prompt.txt', 'w') as f3:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'noise_add')
                f3.write(new_prompt + '?\n')
                
    def create_extra_num_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=cd', where 'd' means extra number
        The output should be judgement
        """
        with open('extra_num_judge_prompt.txt', 'w') as f4:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'extra_num')
                f4.write(new_prompt + '?\n')

    
    def modify_result(self, expression, addend, mode):
        # 使用正则表达式提取表达式中的数字
        match = re.match(r'(\d+)\+(\d+)=(\d+)', expression)
    
        if match:
            # 提取数字并计算新的结果
            num1 = int(match.group(1))
            num2 = int(match.group(2))
            result = int(match.group(3))
            num_digit = len(match.group(3).strip())
            if mode == 'extra_num':
                if random.uniform(0,1)>0.5 and random.uniform(0,1)<0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={result}{extra}"
                elif random.uniform(0,1)>0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={extra}{result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            elif mode == 'noise_add':
                if random.uniform(0,1)>0.5:
                    # 决定添加错误的位置
                    wrong_loc = random.randint(0, num_digit)
        
                    new_result = result + addend * (10**wrong_loc)

                    # 构建新的表达式
                    new_expression = f"F{num1}+{num2}={new_result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            
            else:
                return "Invalid modify pattern"
        
            return new_expression
        else:
            return "Invalid expression format"

In [4]:
non_overlap_data_path = './data/get_data_with_label/train_3digit_bilabeled10000_nonoverlap.txt'
num_examples = 10000
new_creator = create_test_data(non_overlap_data_path, num_examples)
new_creator.create_prompt()
new_creator.create_extra_num_judge_prompt()
new_creator.create_add_noise_judge_prompt()

加载模型

In [5]:
from model import GPTConfig, GPT
import torch

# init from a model saved in a specific directory
ckpt_path = 'ckpt_acc_bilabel(0.6p).pt'
checkpoint = torch.load(ckpt_path, map_location=mydevice)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
model.to(mydevice)

number of parameters: 10.63M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(16, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=False)
          (c_proj): Linear(in_features=384, out_features=384, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=16, bias=False)
)

In [6]:
encode, decode = get_encode_decode('./data/addition_bilabel/meta.pkl')

Loading meta from ./data/addition_bilabel/meta.pkl...


In [7]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./prompt.txt',
    'device': mydevice,
    'temperature': 0.8
}
eval_addition_batch(config, model, ctx, encode, decode, judge=True)

evaluating addition from: FILE:./prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 20597.15it/s]
100%|██████████| 81/81 [00:01<00:00, 52.61it/s]


Judgement accuracy of 10000 examples: 9531/10000 (95.30999999999999%)
accuracy of 10000 examples: 9532/10000 (95.32000000000001%)
{'carry0': 95.96199524940617, 'carry1': 93.80505825518614, 'carry2': 96.03174603174604, 'carry3': 96.6903073286052}


(95.30999999999999,
 95.32000000000001,
 {'carry0': 95.96199524940617,
  'carry1': 93.80505825518614,
  'carry2': 96.03174603174604,
  'carry3': 96.6903073286052})

测试extra number

In [8]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./extra_num_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./extra_num_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 16871.84it/s]
100%|██████████| 82/82 [00:00<00:00, 198.25it/s]

Judgement accuracy of 10000 examples: 9388/10000 (93.88%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 4114/10000
False Positive Examples: 89/10000
True Negative Examples: 5274/10000
False Negative Examples: 523/10000
{'carry0': 93.58669833729216, 'carry1': 92.55470304063654, 'carry2': 94.38775510204081, 'carry3': 96.53270291568164}





(93.88,
 0.0,
 {'carry0': 93.58669833729216,
  'carry1': 92.55470304063654,
  'carry2': 94.38775510204081,
  'carry3': 96.53270291568164})

测试add_noise

In [9]:
ctx = nullcontext()
config={
    'start': 'FILE:./add_noise_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./add_noise_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 22073.09it/s]
100%|██████████| 83/83 [00:00<00:00, 192.77it/s]

Judgement accuracy of 10000 examples: 6691/10000 (66.91%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 4453/10000
False Positive Examples: 2764/10000
True Negative Examples: 2238/10000
False Negative Examples: 545/10000
{'carry0': 64.66745843230403, 'carry1': 65.70048309178745, 'carry2': 67.3469387755102, 'carry3': 72.02521670606778}





(66.91,
 0.0,
 {'carry0': 64.66745843230403,
  'carry1': 65.70048309178745,
  'carry2': 67.3469387755102,
  'carry3': 72.02521670606778})

## 新数据：20%positive + 80%negative

In [10]:
import os
import random
import re

class create_test_data:
    def __init__(self, non_overlap_data_path, num_test_samples=100) -> None:
        if not os.path.exists(non_overlap_data_path):
            raise ValueError("There is no nonoverlap data file")
        self.non_overlap_data_path = non_overlap_data_path
        self.num_test_samples = num_test_samples
        self.samples = None
        # Open non-overlap data path to get a bunch of test samples
        with open(self.non_overlap_data_path, 'r') as f:
            lines = f.readlines()
            random.shuffle(lines)
            self.samples = lines[:self.num_test_samples]
            
        
    def create_prompt(self):
        """
        To create prompt data file like: 'Ta+b=' .
        The output should be correct answer and judgement
        """
        with open(f'prompt.txt', 'w') as f2:
            for line in self.samples:
                prompt = line.split('=')[0]+'=\n'
                f2.write(prompt)
                    
    def create_add_noise_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=d', where 'd' means wrong answer
        The output should be judgement
        """
        with open('add_noise_judge_prompt.txt', 'w') as f3:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'noise_add')
                f3.write(new_prompt + '?\n')
                
    def create_extra_num_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=cd', where 'd' means extra number
        The output should be judgement
        """
        with open('extra_num_judge_prompt.txt', 'w') as f4:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'extra_num')
                f4.write(new_prompt + '?\n')

    
    def modify_result(self, expression, addend, mode):
        # 使用正则表达式提取表达式中的数字
        match = re.match(r'(\d+)\+(\d+)=(\d+)', expression)
    
        if match:
            # 提取数字并计算新的结果
            num1 = int(match.group(1))
            num2 = int(match.group(2))
            result = int(match.group(3))
            num_digit = len(match.group(3).strip())
            if mode == 'extra_num':
                if random.uniform(0,1)>0.5 and random.uniform(0,1)<0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={result}{extra}"
                elif random.uniform(0,1)>0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={extra}{result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            elif mode == 'noise_add':
                if random.uniform(0,1)>0.5:
                    # 决定添加错误的位置
                    wrong_loc = random.randint(0, num_digit)
        
                    new_result = result + addend * (10**wrong_loc)

                    # 构建新的表达式
                    new_expression = f"F{num1}+{num2}={new_result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            
            else:
                return "Invalid modify pattern"
        
            return new_expression
        else:
            return "Invalid expression format"

In [11]:
non_overlap_data_path = './data/get_data_with_label/train_3digit_bilabeled10000_nonoverlap.txt'
num_examples = 10000
new_creator = create_test_data(non_overlap_data_path, num_examples)
new_creator.create_prompt()
new_creator.create_extra_num_judge_prompt()
new_creator.create_add_noise_judge_prompt()

In [12]:
from model import GPTConfig, GPT
import torch

# init from a model saved in a specific directory
ckpt_path = 'ckpt_judge_acc_bilabel(0.2p).pt'
checkpoint = torch.load(ckpt_path, map_location=mydevice)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
model.to(mydevice)

number of parameters: 10.63M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(16, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=False)
          (c_proj): Linear(in_features=384, out_features=384, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=16, bias=False)
)

In [13]:
encode, decode = get_encode_decode('./data/addition_bilabel/meta.pkl')

Loading meta from ./data/addition_bilabel/meta.pkl...


In [14]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./prompt.txt',
    'device': mydevice,
    'temperature': 0.8
}
eval_addition_batch(config, model, ctx, encode, decode, judge=True)

evaluating addition from: FILE:./prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 21464.04it/s]
100%|██████████| 80/80 [00:01<00:00, 58.01it/s]

Judgement accuracy of 10000 examples: 9451/10000 (94.51%)
accuracy of 10000 examples: 9451/10000 (94.51%)
{'carry0': 96.17253948967192, 'carry1': 92.01692524682652, 'carry2': 95.63205091119468, 'carry3': 96.15384615384616}





(94.51,
 94.51,
 {'carry0': 96.17253948967192,
  'carry1': 92.01692524682652,
  'carry2': 95.63205091119468,
  'carry3': 96.15384615384616})

In [16]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./extra_num_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./extra_num_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 20338.96it/s]
100%|██████████| 82/82 [00:00<00:00, 246.95it/s]

Judgement accuracy of 10000 examples: 8829/10000 (88.29%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 3487/10000
False Positive Examples: 15/10000
True Negative Examples: 5342/10000
False Negative Examples: 1156/10000
{'carry0': 88.63912515188336, 'carry1': 87.67277856135401, 'carry2': 88.05322533989008, 'carry3': 90.08875739644971}





(88.29,
 0.0,
 {'carry0': 88.63912515188336,
  'carry1': 87.67277856135401,
  'carry2': 88.05322533989008,
  'carry3': 90.08875739644971})

In [19]:
ctx = nullcontext()
config={
    'start': 'FILE:./add_noise_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./add_noise_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 25607.16it/s]
100%|██████████| 83/83 [00:00<00:00, 277.73it/s]


Judgement accuracy of 10000 examples: 6672/10000 (66.72%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 3746/10000
False Positive Examples: 2102/10000
True Negative Examples: 2926/10000
False Negative Examples: 1226/10000
{'carry0': 61.05710814094775, 'carry1': 65.8392101551481, 'carry2': 68.03586925079549, 'carry3': 72.55917159763314}


(66.72,
 0.0,
 {'carry0': 61.05710814094775,
  'carry1': 65.8392101551481,
  'carry2': 68.03586925079549,
  'carry3': 72.55917159763314})

## 测试一个positive instances对应5个不同的negative instances

In [1]:
import os
import random
import re

class create_test_data:
    def __init__(self, non_overlap_data_path, num_test_samples=100) -> None:
        if not os.path.exists(non_overlap_data_path):
            raise ValueError("There is no nonoverlap data file")
        self.non_overlap_data_path = non_overlap_data_path
        self.num_test_samples = num_test_samples
        self.samples = None
        # Open non-overlap data path to get a bunch of test samples
        with open(self.non_overlap_data_path, 'r') as f:
            lines = f.readlines()
            random.shuffle(lines)
            self.samples = lines[:self.num_test_samples]
            
        
    def create_prompt(self):
        """
        To create prompt data file like: 'Ta+b=' .
        The output should be correct answer and judgement
        """
        with open(f'prompt.txt', 'w') as f2:
            for line in self.samples:
                prompt = line.split('=')[0]+'=\n'
                f2.write(prompt)
                    
    def create_add_noise_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=d', where 'd' means wrong answer
        The output should be judgement
        """
        with open('add_noise_judge_prompt.txt', 'w') as f3:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'noise_add')
                f3.write(new_prompt + '?\n')
                
    def create_extra_num_judge_prompt(self):
        """
        To create prompt data file like: 'a+b=c' and 'a+b=cd', where 'd' means extra number
        The output should be judgement
        """
        with open('extra_num_judge_prompt.txt', 'w') as f4:
            for line in self.samples:
                # 取出表达式部分
                prompt = line.split('T')[1].strip()
                # prompt = prompt.split('=')[0].strip()
                new_prompt = self.modify_result(prompt, random.randint(1, 9), 'extra_num')
                f4.write(new_prompt + '?\n')

    
    def modify_result(self, expression, addend, mode):
        # 使用正则表达式提取表达式中的数字
        match = re.match(r'(\d+)\+(\d+)=(\d+)', expression)
    
        if match:
            # 提取数字并计算新的结果
            num1 = int(match.group(1))
            num2 = int(match.group(2))
            result = int(match.group(3))
            num_digit = len(match.group(3).strip())
            if mode == 'extra_num':
                if random.uniform(0,1)>0.5 and random.uniform(0,1)<0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={result}{extra}"
                elif random.uniform(0,1)>0.75:
                    extra = random.randint(1, 9)
                    new_expression = f"F{num1}+{num2}={extra}{result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            elif mode == 'noise_add':
                if random.uniform(0,1)>0.5:
                    # 决定添加错误的位置
                    wrong_loc = random.randint(0, num_digit)
        
                    new_result = result + addend * (10**wrong_loc)

                    # 构建新的表达式
                    new_expression = f"F{num1}+{num2}={new_result}"
                else:
                    new_expression = f"T{num1}+{num2}={result}"
            
            else:
                return "Invalid modify pattern"
        
            return new_expression
        else:
            return "Invalid expression format"

In [2]:
non_overlap_data_path = './data/get_data_with_label/train_3digit_bilabeled10000_nonoverlap.txt'
num_examples = 10000
new_creator = create_test_data(non_overlap_data_path, num_examples)
new_creator.create_prompt()
new_creator.create_extra_num_judge_prompt()
new_creator.create_add_noise_judge_prompt()

In [5]:
from model import GPTConfig, GPT
import torch

# init from a model saved in a specific directory
ckpt_path = 'addition-bilabel-5neg-0.7p.pt'
checkpoint = torch.load(ckpt_path, map_location=mydevice)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
model.to(mydevice)

number of parameters: 10.63M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(16, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=False)
          (c_proj): Linear(in_features=384, out_features=384, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=16, bias=False)
)

In [6]:
encode, decode = get_encode_decode('./data/addition_bilabel_5neg/meta.pkl')

Loading meta from ./data/addition_bilabel_5neg/meta.pkl...


In [7]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./prompt.txt',
    'device': mydevice,
    'temperature': 0.8
}
eval_addition_batch(config, model, ctx, encode, decode, judge=True)

evaluating addition from: FILE:./prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 21119.06it/s]
100%|██████████| 81/81 [00:01<00:00, 48.89it/s]

Judgement accuracy of 10000 examples: 4689/10000 (46.89%)
accuracy of 10000 examples: 9600/10000 (96.0%)
{'carry0': 96.74170616113744, 'carry1': 94.65584778959149, 'carry2': 96.94207586004117, 'carry3': 96.2602842183994}





(46.89,
 96.0,
 {'carry0': 96.74170616113744,
  'carry1': 94.65584778959149,
  'carry2': 96.94207586004117,
  'carry3': 96.2602842183994})

In [8]:
from contextlib import nullcontext
ctx = nullcontext()
config={
    'start': 'FILE:./extra_num_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./extra_num_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 23437.23it/s]
100%|██████████| 82/82 [00:00<00:00, 272.24it/s]

Judgement accuracy of 10000 examples: 8391/10000 (83.91%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 3081/10000
False Positive Examples: 10/10000
True Negative Examples: 5310/10000
False Negative Examples: 1599/10000
{'carry0': 85.66350710900474, 'carry1': 83.04420817011751, 'carry2': 83.44604528079977, 'carry3': 85.19072550486163}





(83.91,
 0.0,
 {'carry0': 85.66350710900474,
  'carry1': 83.04420817011751,
  'carry2': 83.44604528079977,
  'carry3': 85.19072550486163})

In [9]:
ctx = nullcontext()
config={
    'start': 'FILE:./add_noise_judge_prompt.txt',
    'device': mydevice,
}
eval_judge_batch(config, model, ctx, encode, decode, max_new_tokens=1)

evaluating addition from: FILE:./add_noise_judge_prompt.txt


100%|██████████| 10000/10000 [00:00<00:00, 31638.93it/s]
100%|██████████| 81/81 [00:00<00:00, 273.09it/s]


Judgement accuracy of 10000 examples: 6700/10000 (67.0%)
No judging probability of 10000 examples: 0/10000 (0.0%)
True Positive Examples: 3294/10000
False Positive Examples: 1597/10000
True Negative Examples: 3406/10000
False Negative Examples: 1703/10000
{'carry0': 65.3436018957346, 'carry1': 65.5288192501399, 'carry2': 67.59776536312849, 'carry3': 71.50336574420344}


(67.0,
 0.0,
 {'carry0': 65.3436018957346,
  'carry1': 65.5288192501399,
  'carry2': 67.59776536312849,
  'carry3': 71.50336574420344})