# Setup

In [38]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import getpass
import os
import sys

FOLDER = 'original_year_localization'

PROJECT_NAME = 'ood-prediction'
DATA_DIR = f'/nlp/scr/suzeva/data'
MODEL_DIR = '/nlp/scr/suzeva/models'

sys.path.append(f'/nlp/scr/suzeva/{PROJECT_NAME}/src')
os.environ["HF_HOME"] = '/nlp/scr/suzeva/models'
os.environ["HF_HUB"] = '/nlp/scr/suzeva/models'

CORE_LIB_DIR = f'/nlp/scr/hij/core'
RAVEL_LIB_DIR = f'/nlp/scr/hij/internal-ravel/src'
PYVENE_LIB_DIR = f'/nlp/scr/hij/pyvene'
import sys
sys.path.append(CORE_LIB_DIR)
sys.path.append(RAVEL_LIB_DIR)
sys.path.append(PYVENE_LIB_DIR)

In [40]:
import numpy as np
import random
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(0)

In [41]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Models

In [42]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


model_id = "allenai/OLMo-2-0425-1B"
revision = "stage1-step10000-tokens21B"
tokenizer = AutoTokenizer.from_pretrained(
    model_id, padding_side='left', revision=revision,
    cache_dir=MODEL_DIR)
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
      model_id, low_cpu_mem_usage=True, device_map='auto',
      revision=revision,
      torch_dtype=torch.bfloat16, cache_dir=MODEL_DIR)
model = model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


# Behavioral Testing

In [43]:
from generation_utils import generate_distribution_batched


zero_count_years = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1029, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1058, 1059, 1060, 1061, 1063, 1064, 1069, 1070, 1073, 1074, 1077, 1078, 1080, 1082, 1083, 1084, 1085, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1097, 1098, 1099, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1128, 1129, 1130, 1131, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1141, 1142, 1143, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1155, 1156, 1157, 1160, 1162, 1163, 1164, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1213, 1214, 1217, 1218, 1219, 1221, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1237, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1259, 1261, 1263, 1264, 1265, 1267, 1268, 1269, 1270, 1273, 1275, 1276, 1277, 1278, 1280, 1281, 1282, 1284, 1285, 1287, 1288, 1289, 1294, 1297, 1301, 1303, 1304, 1307, 1313, 1314, 1317, 1318, 1319, 1321, 1322, 1324, 1325, 1326, 1328, 1329, 1331, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1341, 1342, 1345, 1346, 1350, 1351, 1352, 1353, 1355, 1358, 1359, 1362, 1363, 1364, 1365, 1367, 1368, 1369, 1370, 1372, 1373, 1376, 1380, 1383, 1385, 1386, 1387, 1389, 1392, 1393, 1395, 1398, 1399, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1410, 1411, 1412, 1414, 1415, 1417, 1419, 1420, 1422, 1423, 1424, 1425, 1426, 1427, 1429, 1430, 1431, 1434, 1435, 1438, 1443, 1447, 1449, 1451, 1455, 1456, 1457, 1458, 1459, 1461, 1462, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472, 1474, 1475, 1476, 1480, 1481, 1484, 1487, 1488, 1495, 1497, 1499, 1501, 1504, 1505, 1507, 1508, 1510, 1519, 1523, 1526, 1529, 1532, 1541, 1546, 1549, 1554, 1561, 1565, 1567, 1569, 1570, 1574, 1575, 1581, 1583, 1587, 1588, 1591, 1593, 1594, 1597, 1601, 1610, 1612, 1615, 1631, 1654, 1656, 1671, 1684, 1690, 1691, 1694, 1698, 1713, 1734, 1753, 1762, 2026, 2028, 2029, 2032, 2033, 2037, 2039, 2041, 2042, 2044, 2046, 2047, 2048, 2051, 2052, 2053, 2054, 2056, 2058, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2085, 2086, 2087, 2088, 2089, 2091, 2092, 2094, 2095, 2096, 2097, 2098, 2099, 2101, 2102, 2103, 2104, 2106, 2107, 2108, 2109, 2110, 2112, 2114, 2115, 2117, 2118, 2119, 2120, 2121, 2122, 2123, 2124, 2126, 2127, 2129, 2130, 2131, 2132, 2133, 2134, 2135, 2136, 2137, 2138, 2139, 2141, 2142, 2143, 2144, 2145, 2146, 2147, 2148, 2149, 2150, 2151, 2152, 2154, 2155, 2156, 2157, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 2173, 2174, 2175, 2176, 2177, 2178, 2180, 2181, 2182, 2184, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 2271, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297, 2298, 2299, 2300, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2342, 2343, 2344, 2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, 2373, 2374, 2375, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421, 2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491, 2492, 2493, 2494, 2495, 2496, 2497, 2498, 2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535, 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 2544, 2545, 2546, 2547, 2548, 2549, 2550, 2551, 2552, 2553, 2554, 2555, 2556, 2557, 2558, 2559, 2560, 2561, 2562, 2563, 2564, 2565, 2566, 2567, 2568, 2569, 2570, 2571, 2572, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2582, 2583, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2594, 2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619, 2620, 2622, 2623, 2624, 2625, 2626, 2627, 2628, 2629, 2630, 2631, 2632, 2633, 2634, 2635, 2636, 2637, 2638, 2639, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2653, 2654, 2655, 2656, 2657, 2658, 2659, 2660, 2661, 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, 2672, 2673, 2674, 2675, 2676, 2677, 2678, 2679, 2680, 2681, 2682, 2683, 2684, 2685, 2686, 2687, 2688, 2689, 2690, 2691, 2692, 2693, 2694, 2695, 2696, 2697, 2698, 2699, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, 2800, 2801, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2862, 2863, 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924, 2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932, 2933, 2934, 2935, 2936, 2937, 2938, 2939, 2940, 2941, 2942, 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2950, 2951, 2952, 2953, 2954, 2955, 2956, 2957, 2958, 2959, 2960, 2961, 2962, 2963, 2964, 2965, 2966, 2967, 2968, 2969, 2970, 2971, 2972, 2973, 2974, 2975, 2976, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, 2996, 2997, 2998, 2999]
prompt_template = ['In {year}, there']
year_value = list(set(range(1000, 3000)) - set(zero_count_years))
year_value = range(1950, 2050)
print(len(year_value))
prompts = [prompt_template[0].format(year=year) for year in year_value]

100


In [44]:
outputs = generate_distribution_batched(model, tokenizer, [prompt_template[0].format(year=i) for i in range(1000, 3000)])

100%|██████████| 63/63 [00:01<00:00, 37.30it/s]


In [45]:
for dist in outputs:
  print(dist)
  break

[(' were', 0.376953125), (' are', 0.259765625), (' was', 0.111328125), (' will', 0.09521484375), (' is', 0.055908203125), (' would', 0.019287109375), (' may', 0.01416015625), ("'s", 0.006683349609375), (' have', 0.006378173828125), (' might', 0.005279541015625)]


In [46]:
len(prompts)

100

In [None]:
# Create intervention data
import json
import os
import random

random.seed(0)
random.shuffle(prompts)

data = {
    'train': {'correct': prompts[:50], 'wrong': []},
    'val': {'correct': prompts[50:100], 'wrong': []},
    'test': {'correct': prompts[50:100], 'wrong': []},
}

# Create directory if it doesn't exist
os.makedirs(FOLDER, exist_ok=True)

# Save the data
output_file = f'{FOLDER}/year_{model_id.split("/")[-1]}-revision{revision}_in_distribution_split_0.json'
with open(output_file, 'w') as f:
    json.dump(data, f)

print(f"Data saved to: {output_file}")

Data saved to: original_year_localization/year_OLMo-2-0425-1B-revisionstage1-step10000-tokens21B_in_distribution_split_0.json


# Localizing Representations of Year

In [None]:
import json

from data_utils import load_intervention_data, _BASE_TEMPLATE
from generation_utils import generate_batched

sample_size = 512
split_type = ''
SPLIT_ID = '1'
mode = 'das'
data_split = json.load(open(os.path.join(f'{FOLDER}/year_{model_id.split("/")[-1]}-revision{revision}_in_distribution_split_0.json')))

verified_examples = data_split['train']['correct'][:sample_size]
print(verified_examples[:2])

intervention_prompt_to_output = generate_batched(model, tokenizer, [p for s in data_split for k in ('correct', 'wrong') for p in data_split[s][k]], max_new_tokens=1)
prompt_to_vars = {p: {'input': p,
                      'label': intervention_prompt_to_output[p],
                      'split': _BASE_TEMPLATE}
                 for s in data_split for k in ('correct', 'wrong') for p in data_split[s][k]}


def get_tense(be_word):
  be_word = be_word.lower().strip()
  # We distinguish present and future here even though English does not.
  if be_word == 'will':
    return 'future'
  elif be_word == 'is' or be_word == 'are':
    return 'present'
  elif be_word.endswith('ed') or be_word == 'was' or be_word == 'were':
    return 'past'
  else:
    raise ValueError(f'Unknown tense for {be_word}')


def set_tense(be_word, tense):
  normalize_be_word = be_word.lower().strip()
  tense_table = {
      'future': {'will': 'will', 'is': 'will', 'are': 'will', 'was': 'will', 'were': 'will'},
      'present': {'will': 'is', 'is': 'is', 'are': 'are', 'was': 'is', 'were': 'are'},
      'past': {'will': 'was', 'is': 'was', 'are': 'were', 'was': 'was', 'were': 'were'},
  }
  new_be_word =  tense_table[tense][normalize_be_word]
  if be_word.startswith(' '):
    new_be_word = ' ' + new_be_word
  return new_be_word


split_to_raw_example, split_to_dataset = load_intervention_data(
    mode, verified_examples, data_split, prompt_to_vars,
    inv_label_fn=lambda x, y: set_tense(x['label'], get_tense(y['label'])),
    filter_fn=lambda x, y: random.random() < 0.005 or get_tense(x['label']) != get_tense(y['label']),
    max_example_per_split=20480,
    max_example_per_eval_split=10)

['In 1973, there', 'In 1958, there']
Total #prompts=150
Set prompt_max_length=8


100%|██████████| 5/5 [00:00<00:00, 42.01it/s]

mode=das, #base_examples=50, #source_examples=50+50+0=100
BEFORE SPLIT: #Training examples=50*50=872, #Validation examples==0, #Test examples=50*(50+0)=946





AFTER SPLIT KEPT: #Training examples=min(50*50, 20480)=872, #Validation examples==0, #Test examples=50*10+0*10=500
#Splits=1 training split plus 1,024 test splits=51


In [49]:
len(split_to_dataset['das-train'])

872

In [50]:
split_to_dataset['das-train'][6]

{'input': 'In 2030, there',
 'label': ' will',
 'source_input': 'In 1974, there',
 'source_label': ' was',
 'inv_label': ' was',
 'split': 'BASE_TEMPLATE',
 'source_split': 'BASE_TEMPLATE'}

In [None]:
SCR_MODEL_DIR = f'/nlp/scr/suzeva/{FOLDER}'

In [52]:
import gc


gc.collect()
torch.cuda.empty_cache()

In [73]:
import collections
import gc
import re

from tqdm import tqdm, trange
from transformers import get_linear_schedule_with_warmup
from datasets import concatenate_datasets
from torch.nn import CrossEntropyLoss
from causal_interventions import compute_string_based_metrics

import pyvene as pv
from utils.intervention_utils import LowRankRotatedSpaceIntervention, get_intervention_config, train_intervention_step, remove_invalid_token_id, remove_all_forward_hooks

from utils.dataset_utils import get_multitask_dataloader
from utils.metric_utils import compute_cross_entropy_loss
from causal_interventions import eval_with_interventions_batched, compute_metrics


def train_alignment(config):
  print('Training Tasks: %s' % config['training_tasks'])
  concat_split_to_dataset = {f'joint-{split}':
      concatenate_datasets([split_to_dataset[f'{task_name}-{split}'].select(
          np.random.choice(len(split_to_dataset[f'{task_name}-{split}']),
                           size=(1024 if config['training_tasks'][task_name] == 'match_base' else len(split_to_dataset[f'{task_name}-{split}'])),
                           replace=False))
                            for task_name in config['training_tasks']
                            # repeat
                            for _ in range(1 if isinstance(config['training_tasks'][task_name], str) or split != 'train'
                                             else config['training_tasks'][task_name][1])
                            if f'{task_name}-{split}' in split_to_dataset])
      for split in ('train',)}
  inv_task = '|'.join([task_name for task_name, label in config['training_tasks'].items()
                       if label == 'match_source' or 'match_source' in label])
  inv_task = inv_task.split('|')
  print('Training tasks matching source label: %s' % inv_task)
  print('#Training examples: %d' % len(concat_split_to_dataset['joint-train']))
  max_train_example = int(config['max_train_percentage'] * len(concat_split_to_dataset['joint-train']))
  train_dataloader = get_multitask_dataloader(
      concat_split_to_dataset['joint-train'].select(range(max_train_example)),
      tokenizer=tokenizer,
      batch_size=TRAINING_BATCH_SIZE, prompt_max_length=INPUT_MAX_LEN,
      output_max_length=config['max_output_tokens'] + int(tokenizer.bos_token is not None),
      # The set of splits to load as cause tasks
      cause_tasks=[BASE_TEMPLATE, SOURCE_TEMPLATE],
      first_n=config['max_output_tokens'])


  # Create Model
  split_to_inv_locations = config['split_to_inv_locations']
  intervenable_config = get_intervention_config(
      type(model), config['intervenable_config']['intervenable_representation_type'],
      config['intervenable_config']['intervenable_layer'],
      config['intervenable_config']['intervenable_interventions_type'],
      intervention_dimension=config['intervention_dimension'])
  intervenable = pv.IntervenableModel(intervenable_config, model)
  intervenable.set_device("cuda")
  intervenable.disable_model_gradients()

  # Training
  epochs = config['training_epoch']
  gradient_accumulation_steps = 1
  total_step = 0

  warm_up_steps = 0 # 0.1 * t_total
  regularization_coefficient = config['regularization_coefficient']
  optimizer_params = []
  for k, v in intervenable.interventions.items():
      if isinstance(v[0], LowRankRotatedSpaceIntervention):
        optimizer_params += [{'params': v[0].rotate_layer.parameters()}]
      else:
        raise NotImplementedError
  optimizer = torch.optim.AdamW(
      optimizer_params, lr=config['init_lr'], weight_decay=0)
  scheduler = get_linear_schedule_with_warmup(
      optimizer, num_warmup_steps=warm_up_steps,
      num_training_steps=int(10 * len(train_dataloader))
  )

  #intervenable.model.train() # train enables drop-off but no grads
  print("base model trainable parameters: ", pv.count_parameters(intervenable.model))
  print("intervention trainable parameters: ", intervenable.count_parameters())
  train_iterator = trange(0, int(epochs), desc="Epoch")

  num_output_tokens = config['max_output_tokens']
  for epoch in train_iterator:
      epoch_iterator = tqdm(
          train_dataloader, desc=f"Epoch: {epoch}", position=0, leave=True
      )
      aggreated_stats = collections.defaultdict(list)
      for step, inputs in enumerate(epoch_iterator):
          for k, v in inputs.items():
              if v is not None and isinstance(v, torch.Tensor):
                  inputs[k] = v.to("cuda")
          b_s = inputs["input_ids"].shape[0]
          position_ids = {f'{prefix}position_ids': intervenable.model.prepare_inputs_for_generation(
                  input_ids=inputs[f"{prefix}input_ids"], attention_mask=inputs[f"{prefix}attention_mask"])['position_ids']
                  for prefix in ('', 'source_')}
          inputs.update(position_ids)
          for key in inputs:
            if key in ('input_ids', 'source_input_ids', 'attention_mask', 'source_attention_mask', 'position_ids', 'source_position_ids'):
              inputs[key] = inputs[key].to(device)

          counterfactual_outputs = train_intervention_step(
              intervenable, inputs, split_to_inv_locations, pad_token_id=tokenizer.pad_token_id)
          eval_metrics = compute_metrics(
              {'inv_outputs': [counterfactual_outputs.logits[:, -num_output_tokens-1:-1]]},
              [inputs['labels'][:, :num_output_tokens]],
              last_n_tokens=num_output_tokens,
              pad_token_id=tokenizer.pad_token_id,
          )
          loss = compute_cross_entropy_loss(
              counterfactual_outputs.logits,
              inputs["labels"][:, :num_output_tokens],
              next_n_tokens=num_output_tokens,
              pad_token_id=tokenizer.pad_token_id,
          )
          aggreated_stats['loss'].append(loss.item())
          aggreated_stats['acc'].append(eval_metrics['inv_outputs']["accuracy"])
          epoch_iterator.set_postfix({k: round(np.mean(aggreated_stats[k]), 2) for k in aggreated_stats})

          if step < 3:
            print('\nTokens to intervene:')
            intervention_locations = [split_to_inv_locations[inputs["split"][i]]['inv_position'] for i in range(len(inputs["split"]))]
            source_intervention_locations = [split_to_inv_locations[inputs["source_split"][i]]['inv_position'] for i in range(len(inputs["split"]))]
            print(inputs['input'][:3])
            print(inputs['source_input'][:3])
            print('Base:', tokenizer.batch_decode([inputs['input_ids'][i][intervention_locations[i]] for i in range(len(inputs["split"]))]))
            print('Source:', tokenizer.batch_decode([inputs['source_input_ids'][i][source_intervention_locations[i]] for i in range(len(inputs["split"]))]))
            print('Output:', tokenizer.batch_decode(torch.argmax(counterfactual_outputs.logits[:, -num_output_tokens-1:-1], dim=-1)))
            print('Label     :', tokenizer.batch_decode(remove_invalid_token_id(inputs['labels'][:, :num_output_tokens], tokenizer.pad_token_id)))
            print('Base Label:', tokenizer.batch_decode(remove_invalid_token_id(inputs['base_labels'][:, :num_output_tokens], tokenizer.pad_token_id)))

          if gradient_accumulation_steps > 1:
              loss = loss / gradient_accumulation_steps
          if total_step % gradient_accumulation_steps == 0:
              if not (gradient_accumulation_steps > 1 and total_step == 0):
                  loss.backward()
                  optimizer.step()
                  scheduler.step()
                  intervenable.set_zero_grad()
          total_step += 1
  return intervenable, intervenable_config


def run_exp(config):
  inv_tasks = '+'.join([''.join(re.findall(r'[A-Za-z]+', t)) + ('' if isinstance(l, str) else str(l[1])) for t, l in config['training_tasks'].items() if l == 'match_source' or 'match_source' in l])
  control_tasks = '+'.join([''.join(re.findall(r'[A-Za-z]+', t)) for t, l in config['training_tasks'].items() if l == 'match_base' or 'match_base' in l])
  task_compressed = ((inv_tasks + '_ex_' + control_tasks) if control_tasks else inv_tasks).replace('AZaz', '')
  das_type = 'multi_das' if len(config['training_tasks']) > 1 else 'das_baseline'
  if config['intervenable_config']['intervenable_interventions_type'] == LowRankRotatedSpaceIntervention:
    das_type = das_type.replace('das', 'daslora')
  split_to_inv_locations = config['split_to_inv_locations']
  input_len = list(split_to_inv_locations.values())[0]['max_input_length']
  inv_pos = min([x['inv_position'][0] for x in split_to_inv_locations.values()])
  inv_loc_name = 'len%d_pos%s' % (input_len, 'e' if inv_pos != input_len - 1 else 'f')
  training_data_percentage = int(config['max_train_percentage'] * 100)
  suffix = f"_example{len(verified_examples)}_{config['intervenable_config']['intervenable_representation_type']}"
  layer = '%s_%s' % (min(config['intervenable_config']['intervenable_layer']), max(config['intervenable_config']['intervenable_layer'])) if isinstance(config['intervenable_config']['intervenable_layer'], list) else config['intervenable_config']['intervenable_layer']
  model_name = model.name_or_path.split('/')[-1] + f'_{revision}'
  run_name = f"{model_name}-layer{layer}-dim{config['intervention_dimension']}-{das_type}_{config['max_output_tokens']}tok_{task_compressed}_id-{SPLIT_ID}_{inv_loc_name}_ep{config['training_epoch']}{suffix}"
  config['run_name_prefix'] = run_name#.rsplit('_ep', 1)[0]
  print(run_name)
  log_file_path = os.path.join(SCR_MODEL_DIR, 'logs', f'{run_name}.log')
  if True:
      print(run_name)
      intervenable, intervenable_config = train_alignment(config)
      # Save model
      torch.save({k: v[0].rotate_layer.weight for k, v in intervenable.interventions.items()},
                 os.path.join(SCR_MODEL_DIR, f'{run_name}.pt'))
      print('Model saved to %s' % os.path.join(SCR_MODEL_DIR, f'{run_name}.pt'))
      gc.collect()
      torch.cuda.empty_cache()
      # eval
      with torch.no_grad():
        split_to_eval_metrics = eval_with_interventions_batched(
            intervenable, eval_split_to_dataset,
            split_to_inv_locations,
            tokenizer,
            compute_metrics_fn=compute_metrics,
            max_new_tokens=config['max_output_tokens'],
            eval_batch_size=EVAL_BATCH_SIZE,
            inference_mode='generate',
            debug_print=True,
          )
      print('Mean IIA: %.4f' % np.mean(
          [v['metrics']['labels']['inv_outputs']['accuracy'] for k, v in split_to_eval_metrics.items()]))
      print('Mean correct IIA: %.4f' % np.mean(
          [v['metrics']['labels']['inv_outputs']['accuracy'] for k, v in split_to_eval_metrics.items() if '-correct' in k]))
      print('Mean wrong IIA: %.4f' % np.mean(
          [v['metrics']['labels']['inv_outputs']['accuracy'] for k, v in split_to_eval_metrics.items() if '-wrong' in k]))
  # Save model.
  #torch.save({k: v[0].rotate_layer.weight for k, v in intervenable.interventions.items()},
  #           os.path.join(SCR_MODEL_DIR, f'{config["run_name_prefix"]}.pt'))
  #print('Model saved to %s' % os.path.join(SCR_MODEL_DIR, f'{config["run_name_prefix"]}.pt'))
  # logging
  json.dump(split_to_eval_metrics, open(os.path.join(SCR_MODEL_DIR, f'{run_name}_evalall.json'), 'w'))
  print('Saved to %s' % os.path.join(SCR_MODEL_DIR, f'{run_name}.json'))
  remove_all_forward_hooks(intervenable)
  return intervenable



assert mode == 'das'

INPUT_MAX_LEN = 8
TRAINING_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16

from data_utils import _BASE_TEMPLATE, _SOURCE_TEMPLATE
BASE_TEMPLATE = _BASE_TEMPLATE
SOURCE_TEMPLATE = _SOURCE_TEMPLATE

SPLIT_TO_INV_LOCATIONS = {
    split: {'max_input_length': INPUT_MAX_LEN,
             'inv_position': [INPUT_MAX_LEN - 3]}
    for split in list(split_to_dataset) + [BASE_TEMPLATE, SOURCE_TEMPLATE]
}

training_tasks_list = [
  {'das': 'match_source'}
]

eval_split_to_dataset = {k: v for k, v in split_to_dataset.items()
                         if k.endswith('-test')
                         }

model = model.eval()



for inv_layer in [[i] for i in range(10)]:
  for lr in [1e-4]:
    for inv_dim in [16, 32]:
      # train
      for training_tasks in training_tasks_list:
        config = {
            'regularization_coefficient': 0,
            'intervention_dimension': inv_dim,
            'max_output_tokens': 1,
            'intervenable_config': {
              'intervenable_layer': inv_layer,
              'intervenable_representation_type': 'block_output',
              'intervenable_unit': 'pos',
              'max_number_of_units': 1,
              'intervenable_interventions_type': LowRankRotatedSpaceIntervention,
            },
            'training_tasks': training_tasks,
            'training_epoch': 1,
            'split_to_inv_locations': SPLIT_TO_INV_LOCATIONS,
            'split_to_labels': None,
            'max_train_percentage': 1.0 if len(training_tasks) <= 3 else 1.0,
            'init_lr': lr,
        }
        intervenable = run_exp(config)

OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 753.34 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15300.38 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:03, 14.15it/s, loss=4.13, acc=0]


Tokens to intervene:
['In 2036, there', 'In 1958, there', 'In 2049, there']
['In 1958, there', 'In 2028, there', 'In 1998, there']
Base: ['6', '8', '9', '0', '6', '3', '6', '0', '9', '3', '6', '8', '4', '3', '7', '9']
Source: ['8', '8', '8', '6', '8', '9', '8', '5', '6', '4', '1', '0', '0', '7', '9', '4']
Output: [' will', ' was', ' will', ' were', ' was', ' was', ' will', ' will', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will']
Label     : [' was', ' will', ' was', ' will', ' will', ' will', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' was', ' was', ' was']
Base Label: [' will', ' was', ' will', ' were', ' was', ' was', ' will', ' will', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will']

Tokens to intervene:
['In 1978, there', 'In 2043, there', 'In 2017, there']
['In 2037, there', 'In 1954, there', 'In 2049, there']
Base: ['8', '3', '7', '8', '7', '1', '2', '4', '4', '0', '7', '4', '5', '0', '1', '1']
Source: ['7', '4', '9', '6', 

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 14.59it/s, loss=3.74, acc=0.05]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 691.01 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8826.06 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:12,  2.55it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:01<00:16,  1.79it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' was', ' were', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' were', ' were']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' 

Test:   9%|▉         | 3/32 [00:01<00:13,  2.21it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' will', ' will', ' was', ' was', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  16%|█▌        | 5/32 [00:02<00:10,  2.52it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.5}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  19%|█▉        | 6/32 [00:02<00:10,  2.46it/s]


 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  22%|██▏       | 7/32 [00:03<00:11,  2.25it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  25%|██▌       | 8/32 [00:03<00:09,  2.65it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.1}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  31%|███▏      | 10/32 [00:04<00:07,  2.91it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  34%|███▍      | 11/32 [00:04<00:07,  2.72it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  41%|████      | 13/32 [00:05<00:06,  2.75it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.4}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.4}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  44%|████▍     | 14/32 [00:05<00:06,  2.60it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  50%|█████     | 16/32 [00:06<00:05,  2.75it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  53%|█████▎    | 17/32 [00:06<00:05,  2.54it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  59%|█████▉    | 19/32 [00:07<00:04,  2.60it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  62%|██████▎   | 20/32 [00:07<00:04,  2.94it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  66%|██████▌   | 21/32 [00:08<00:04,  2.69it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  69%|██████▉   | 22/32 [00:08<00:04,  2.50it/s]


 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  72%|███████▏  | 23/32 [00:09<00:04,  2.10it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  75%|███████▌  | 24/32 [00:09<00:04,  1.95it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 1.0}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:10<00:03,  2.22it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  81%|████████▏ | 26/32 [00:10<00:02,  2.14it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  88%|████████▊ | 28/32 [00:11<00:01,  2.41it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  91%|█████████ | 29/32 [00:11<00:01,  2.39it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.4}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}}


Test:  94%|█████████▍| 30/32 [00:12<00:00,  2.75it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  97%|█████████▋| 31/32 [00:12<00:00,  2.63it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.49it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean IIA: 0.1900
Mean correct IIA: 0.1900
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 1148.92 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15332.06 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.44it/s, loss=4.18, acc=0.04]


Tokens to intervene:
['In 1998, there', 'In 1956, there', 'In 1956, there']
['In 2034, there', 'In 2028, there', 'In 2043, there']
Base: ['8', '6', '6', '3', '0', '8', '6', '6', '4', '7', '9', '3', '1', '7', '9', '7']
Source: ['4', '8', '3', '7', '7', '7', '3', '6', '4', '5', '3', '1', '9', '3', '3', '3']
Output: [' was', ' was', ' was', ' were', ' will', ' was', ' will', ' was', ' was', ' were', ' were', ' will', ' was', ' were', ' was', ' was']
Label     : [' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' were', ' will', ' was', ' will', ' will', ' will', ' will']
Base Label: [' was', ' was', ' was', ' were', ' will', ' was', ' will', ' was', ' was', ' were', ' were', ' will', ' was', ' were', ' was', ' was']

Tokens to intervene:
['In 2043, there', 'In 2041, there', 'In 2017, there']
['In 1958, there', 'In 1993, there', 'In 2037, there']
Base: ['3', '1', '7', '0', '2', '4', '3', '6', '3', '4', '4', '6', '4', '4', '8', '8']
Source: ['8', '3', '7', '3'

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 14.16it/s, loss=3.67, acc=0.11]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 709.45 examples/s] 
Map: 100%|██████████| 500/500 [00:00<00:00, 6269.57 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:11,  2.77it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:14,  2.02it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' were', ' were']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' w

Test:   9%|▉         | 3/32 [00:01<00:10,  2.64it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' will', ' will', ' was', ' was', ' will', ' was', ' will', ' were', ' was', ' were', ' was', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will'

Test:  12%|█▎        | 4/32 [00:01<00:11,  2.36it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  19%|█▉        | 6/32 [00:02<00:10,  2.38it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.1}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  22%|██▏       | 7/32 [00:02<00:11,  2.26it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  25%|██▌       | 8/32 [00:03<00:09,  2.42it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  31%|███▏      | 10/32 [00:04<00:08,  2.54it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  34%|███▍      | 11/32 [00:04<00:08,  2.43it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  38%|███▊      | 12/32 [00:04<00:08,  2.39it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.3}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  41%|████      | 13/32 [00:05<00:07,  2.63it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  44%|████▍     | 14/32 [00:05<00:07,  2.40it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  50%|█████     | 16/32 [00:06<00:06,  2.48it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  56%|█████▋    | 18/32 [00:07<00:05,  2.67it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  59%|█████▉    | 19/32 [00:07<00:05,  2.52it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  66%|██████▌   | 21/32 [00:08<00:04,  2.51it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}


Test:  69%|██████▉   | 22/32 [00:08<00:04,  2.45it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  72%|███████▏  | 23/32 [00:09<00:03,  2.77it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  78%|███████▊  | 25/32 [00:10<00:02,  2.65it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 1.0}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  81%|████████▏ | 26/32 [00:10<00:02,  2.34it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  84%|████████▍ | 27/32 [00:10<00:02,  2.33it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  91%|█████████ | 29/32 [00:11<00:01,  2.48it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  94%|█████████▍| 30/32 [00:12<00:00,  2.66it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.4}, 'accuracy': 0.5}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  97%|█████████▋| 31/32 [00:12<00:00,  2.48it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.51it/s]



 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}
Mean IIA: 0.2640
Mean correct IIA: 0.2640
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source l

Map: 100%|██████████| 872/872 [00:00<00:00, 1177.44 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15384.82 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:03, 14.46it/s, loss=3.46, acc=0.12]


Tokens to intervene:
['In 2007, there', 'In 2033, there', 'In 2036, there']
['In 2038, there', 'In 2007, there', 'In 1961, there']
Base: ['7', '3', '6', '4', '1', '6', '7', '0', '6', '3', '1', '3', '8', '7', '3', '6']
Source: ['8', '7', '1', '7', '3', '4', '9', '7', '3', '6', '6', '1', '7', '0', '6', '4']
Output: [' were', ' will', ' will', ' will', ' was', ' will', ' will', ' will', ' were', ' will', ' was', ' were', ' were', ' were', ' was', ' will']
Label     : [' were', ' was', ' was', ' was', ' will', ' was', ' was', ' was', ' will', ' was', ' will', ' were', ' was', ' were', ' will', ' was']
Base Label: [' were', ' will', ' will', ' will', ' was', ' will', ' will', ' will', ' were', ' will', ' was', ' were', ' will', ' were', ' was', ' will']

Tokens to intervene:
['In 2030, there', 'In 1969, there', 'In 2028, there']
['In 2009, there', 'In 2036, there', 'In 1998, there']
Base: ['0', '9', '8', '6', '0', '5', '0', '7', '6', '0', '4', '1', '1', '1', '2', '0']
Source: ['9', '6', '8

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 16.23it/s, loss=3.67, acc=0.11]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.33s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 1311.11 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 14680.49 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:09,  3.35it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:11,  2.66it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' was', ' were', '

Test:   9%|▉         | 3/32 [00:00<00:09,  3.20it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' will', ' is', ' was', ' was', ' will', ' is', ' will', ' were', ' was', ' were', ' was', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']



Test:  12%|█▎        | 4/32 [00:01<00:09,  2.80it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  16%|█▌        | 5/32 [00:01<00:08,  3.19it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.2}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}


Test:  19%|█▉        | 6/32 [00:02<00:09,  2.66it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  22%|██▏       | 7/32 [00:02<00:10,  2.48it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  28%|██▊       | 9/32 [00:03<00:08,  2.69it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.5}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.1}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  31%|███▏      | 10/32 [00:03<00:07,  3.03it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  34%|███▍      | 11/32 [00:03<00:07,  2.73it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}


Test:  38%|███▊      | 12/32 [00:04<00:08,  2.33it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.5}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  41%|████      | 13/32 [00:04<00:07,  2.69it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  44%|████▍     | 14/32 [00:05<00:07,  2.37it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}


Test:  47%|████▋     | 15/32 [00:05<00:06,  2.69it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  50%|█████     | 16/32 [00:05<00:06,  2.55it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  53%|█████▎    | 17/32 [00:06<00:06,  2.36it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.1}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  59%|█████▉    | 19/32 [00:07<00:05,  2.52it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  62%|██████▎   | 20/32 [00:07<00:04,  2.86it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  66%|██████▌   | 21/32 [00:07<00:04,  2.69it/s]


 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  69%|██████▉   | 22/32 [00:08<00:04,  2.39it/s]


 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  72%|███████▏  | 23/32 [00:08<00:03,  2.73it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  75%|███████▌  | 24/32 [00:09<00:03,  2.51it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:09<00:02,  2.79it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  81%|████████▏ | 26/32 [00:09<00:02,  2.57it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  88%|████████▊ | 28/32 [00:10<00:01,  2.66it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  91%|█████████ | 29/32 [00:11<00:01,  2.56it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.2}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}}


Test:  94%|█████████▍| 30/32 [00:11<00:00,  2.85it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  97%|█████████▋| 31/32 [00:11<00:00,  2.59it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.66it/s]


Mean IIA: 0.3180
Mean correct IIA: 0.3180
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 1166.21 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15404.39 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:05, 10.32it/s, loss=3.89, acc=0]


Tokens to intervene:
['In 2044, there', 'In 2028, there', 'In 2049, there']
['In 2031, there', 'In 2038, there', 'In 2009, there']
Base: ['4', '8', '9', '7', '9', '2', '0', '6', '1', '1', '1', '4', '0', '1', '6', '3']
Source: ['1', '8', '9', '2', '3', '3', '8', '4', '8', '4', '0', '6', '6', '9', '8', '1']
Output: [' will', ' will', ' will', ' will', ' was', ' was', ' will', ' will', ' was', ' was', ' was', ' was', ' were', ' was', ' were', ' was']
Label     : [' was', ' was', ' was', ' was', ' will', ' will', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Base Label: [' will', ' will', ' will', ' will', ' was', ' was', ' will', ' will', ' was', ' was', ' was', ' was', ' were', ' was', ' were', ' was']

Tokens to intervene:
['In 2009, there', 'In 2004, there', 'In 2044, there']
['In 2030, there', 'In 2030, there', 'In 1966, there']
Base: ['9', '4', '4', '3', '2', '0', '3', '1', '3', '3', '1', '3', '3', '1', '8', '5']
Source: ['0', '0', '6', '9',

Epoch: 0:   4%|▎         | 2/54 [00:00<00:05, 10.32it/s, loss=3.97, acc=0]


Tokens to intervene:
['In 2034, there', 'In 1997, there', 'In 2043, there']
['In 1971, there', 'In 2034, there', 'In 1984, there']
Base: ['4', '7', '3', '3', '0', '8', '6', '8', '8', '4', '4', '1', '4', '6', '0', '1']
Source: ['1', '4', '4', '8', '9', '7', '0', '3', '8', '8', '3', '9', '8', '8', '9', '1']
Output: [' will', ' was', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' was', ' will', ' was', ' was', ' will', ' were', ' was']
Label     : [' was', ' will', ' was', ' was', ' was', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' will']
Base Label: [' will', ' was', ' will', ' will', ' will', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' was', ' will', ' were', ' was']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 12.74it/s, loss=3.55, acc=0.17]
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 1273.66 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 14773.36 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:08,  3.52it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:11,  2.64it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:00<00:09,  3.10it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' were', ' was', ' were', ' was', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']



Test:  12%|█▎        | 4/32 [00:01<00:10,  2.65it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  16%|█▌        | 5/32 [00:01<00:08,  3.02it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:02<00:10,  2.41it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  25%|██▌       | 8/32 [00:02<00:08,  2.78it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  28%|██▊       | 9/32 [00:03<00:08,  2.64it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.5}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  31%|███▏      | 10/32 [00:03<00:07,  2.90it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  34%|███▍      | 11/32 [00:04<00:09,  2.31it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  38%|███▊      | 12/32 [00:04<00:09,  2.08it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.3}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  41%|████      | 13/32 [00:05<00:08,  2.34it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  44%|████▍     | 14/32 [00:05<00:08,  2.23it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:05<00:06,  2.51it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:06<00:06,  2.36it/s]


 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  53%|█████▎    | 17/32 [00:06<00:06,  2.36it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.1}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  56%|█████▋    | 18/32 [00:07<00:05,  2.64it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  59%|█████▉    | 19/32 [00:07<00:05,  2.51it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:07<00:04,  2.84it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  66%|██████▌   | 21/32 [00:08<00:04,  2.44it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  72%|███████▏  | 23/32 [00:08<00:03,  2.78it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  75%|███████▌  | 24/32 [00:09<00:03,  2.53it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  78%|███████▊  | 25/32 [00:09<00:02,  2.88it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 1.0}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:10<00:02,  2.45it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}


Test:  84%|████████▍ | 27/32 [00:10<00:02,  2.31it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:11<00:01,  2.53it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  94%|█████████▍| 30/32 [00:11<00:00,  2.87it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.1}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.5}, 'accuracy': 0.6}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  97%|█████████▋| 31/32 [00:12<00:00,  2.48it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.57it/s]


Mean IIA: 0.4160
Mean correct IIA: 0.4160
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer1_1-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 1169.27 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15359.43 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:02, 18.15it/s, loss=3.58, acc=0]


Tokens to intervene:
['In 1958, there', 'In 2007, there', 'In 1963, there']
['In 2033, there', 'In 2034, there', 'In 2026, there']
Base: ['8', '7', '3', '4', '3', '6', '3', '7', '3', '8', '4', '3', '0', '9', '3', '6']
Source: ['3', '4', '6', '6', '4', '0', '6', '4', '1', '6', '7', '0', '4', '7', '3', '7']
Output: [' was', ' were', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' were', ' was', ' will']
Label     : [' will', ' will', ' will', ' will', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' was']
Base Label: [' was', ' were', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' were', ' was', ' will']

Tokens to intervene:
['In 2034, there', 'In 2037, there', 'In 2044, there']
['In 1965, there', 'In 1980, there', 'In 2009, there']
Base: ['4', '7', '4', '2', '2', '7', '4', '0', '6', '3', '4', '1', '2', '4', '2', '4']
Source: ['5', '0', '9',

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 14.19it/s, loss=3.06, acc=0.27]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.81s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 688.96 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8545.96 examples/s]
Test:   0%|          | 0/32 [00:00<?, ?it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:   3%|▎         | 1/32 [00:00<00:11,  2.81it/s]


Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' was', ' was', ' were', ' were', ' was', ' was']
Labels:
           Base Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' was', ' was', ' was', ' was', ' was', ' was']

 'source-In

Test:   6%|▋         | 2/32 [00:00<00:13,  2.25it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:01<00:10,  2.85it/s]


 'source-In 1975, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was',

Test:  16%|█▌        | 5/32 [00:01<00:09,  2.85it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.3}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  19%|█▉        | 6/32 [00:02<00:09,  2.68it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  25%|██▌       | 8/32 [00:02<00:08,  2.96it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  28%|██▊       | 9/32 [00:03<00:08,  2.59it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:03<00:07,  2.88it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.5}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  34%|███▍      | 11/32 [00:04<00:08,  2.51it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  38%|███▊      | 12/32 [00:04<00:08,  2.33it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.5}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  41%|████      | 13/32 [00:04<00:07,  2.57it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  44%|████▍     | 14/32 [00:05<00:07,  2.32it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:06<00:06,  2.57it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  53%|█████▎    | 17/32 [00:06<00:06,  2.28it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  56%|█████▋    | 18/32 [00:07<00:05,  2.46it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  59%|█████▉    | 19/32 [00:07<00:06,  2.10it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:08<00:05,  2.34it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  66%|██████▌   | 21/32 [00:08<00:04,  2.29it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.5}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  72%|███████▏  | 23/32 [00:09<00:03,  2.64it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:09<00:03,  2.38it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:09<00:02,  2.73it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:10<00:02,  2.60it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  84%|████████▍ | 27/32 [00:10<00:02,  2.37it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:11<00:01,  2.73it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:11<00:01,  2.40it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.6}, 'accuracy': 0.7}}


Test:  97%|█████████▋| 31/32 [00:12<00:00,  2.60it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.4}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.55it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}
Mean IIA: 0.5280
Mean correct IIA: 0.5280
Mean wrong IIA: nan





Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 1100.79 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 14754.38 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:03, 15.86it/s, loss=3.72, acc=0]


Tokens to intervene:
['In 2028, there', 'In 2030, there', 'In 2044, there']
['In 1960, there', 'In 1957, there', 'In 2009, there']
Base: ['8', '0', '4', '6', '6', '7', '9', '5', '1', '8', '7', '3', '7', '9', '6', '0']
Source: ['0', '7', '9', '7', '4', '8', '2', '3', '2', '3', '6', '7', '4', '6', '3', '1']
Output: [' were', ' will', ' will', ' were', ' was', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' was', ' was', ' will']
Label     : [' was', ' was', ' was', ' will', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' will', ' will', ' was']
Base Label: [' will', ' will', ' will', ' were', ' was', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' was', ' was', ' will']

Tokens to intervene:
['In 2030, there', 'In 2030, there', 'In 2004, there']
['In 1964, there', 'In 2009, there', 'In 2033, there']
Base: ['0', '0', '4', '0', '1', '4', '3', '4', '6', '3', '8', '6', '7', '4', '7', '8']
Source: ['4', '9', '3', '4', 

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 17.42it/s, loss=2.76, acc=0.32]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 1249.33 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 14879.75 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:08,  3.48it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:11,  2.69it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:00<00:09,  3.20it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']



Test:  12%|█▎        | 4/32 [00:01<00:13,  2.02it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.3}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  16%|█▌        | 5/32 [00:02<00:11,  2.37it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:02<00:11,  2.19it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  25%|██▌       | 8/32 [00:03<00:11,  2.02it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  28%|██▊       | 9/32 [00:04<00:12,  1.87it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.2}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:04<00:10,  2.04it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.4}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  34%|███▍      | 11/32 [00:05<00:10,  1.98it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  41%|████      | 13/32 [00:05<00:08,  2.37it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.3}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  44%|████▍     | 14/32 [00:06<00:07,  2.36it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:06<00:06,  2.68it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:07<00:06,  2.46it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'accuracy': 1.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  56%|█████▋    | 18/32 [00:07<00:05,  2.56it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  59%|█████▉    | 19/32 [00:08<00:05,  2.49it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:08<00:04,  2.83it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  69%|██████▉   | 22/32 [00:09<00:03,  2.53it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.4}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  72%|███████▏  | 23/32 [00:09<00:03,  2.75it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:10<00:03,  2.48it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:10<00:02,  2.85it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  84%|████████▍ | 27/32 [00:11<00:01,  2.67it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:11<00:01,  3.04it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:11<00:01,  2.61it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.6}, 'accuracy': 0.7}}


Test:  94%|█████████▍| 30/32 [00:12<00:00,  2.92it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.2}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test: 100%|██████████| 32/32 [00:12<00:00,  2.48it/s]



 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}
Mean IIA: 0.5340
Mean correct IIA: 0.5340
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer2_2-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source l

Map: 100%|██████████| 872/872 [00:01<00:00, 625.19 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8431.73 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.59it/s, loss=3.46, acc=0]


Tokens to intervene:
['In 2004, there', 'In 2037, there', 'In 2036, there']
['In 2033, there', 'In 1960, there', 'In 2013, there']
Base: ['4', '7', '6', '8', '9', '4', '5', '4', '8', '4', '3', '3', '1', '8', '4', '4']
Source: ['3', '0', '3', '6', '6', '4', '4', '0', '0', '7', '3', '6', '6', '4', '1', '3']
Output: [' were', ' will', ' will', ' was', ' was', ' will', ' was', ' will', ' were', ' were', ' will', ' was', ' will', ' was', ' will', ' will']
Label     : [' will', ' was', ' was', ' will', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' was', ' will', ' was', ' was']
Base Label: [' were', ' will', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' were', ' will', ' was', ' will', ' was', ' will', ' will']

Tokens to intervene:
['In 2028, there', 'In 2028, there', 'In 1980, there']
['In 1987, there', 'In 1972, there', 'In 2037, there']
Base: ['8', '8', '0', '7', '4', '3', '6', '6', '7', '1', '4', '0', '4', '6', '0', '7']
Source: ['7', '2', '7', '

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.59it/s, loss=3.76, acc=0]


Tokens to intervene:
['In 1971, there', 'In 2028, there', 'In 1970, there']
['In 2026, there', 'In 2013, there', 'In 2036, there']
Base: ['1', '8', '0', '1', '6', '3', '6', '9', '4', '7', '0', '9', '9', '6', '7', '7']
Source: ['6', '3', '6', '1', '3', '1', '8', '6', '9', '1', '7', '4', '1', '4', '4', '9']
Output: [' was', ' were', ' were', ' was', ' were', ' was', ' were', ' was', ' was', ' will', ' were', ' will', ' will', ' will', ' were', ' was']
Label     : [' will', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' will']
Base Label: [' was', ' will', ' were', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' were', ' will', ' will', ' will', ' were', ' was']


Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 16.67it/s, loss=2.81, acc=0.31]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.24s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 1218.54 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 13756.87 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:10,  2.90it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:13,  2.20it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:01<00:11,  2.50it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' was', ' will', ' was', ' were', ' will', ' were', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']


Test:  12%|█▎        | 4/32 [00:01<00:12,  2.27it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.2}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}


Test:  16%|█▌        | 5/32 [00:01<00:10,  2.64it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:02<00:11,  2.33it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:03<00:11,  2.12it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  25%|██▌       | 8/32 [00:03<00:09,  2.43it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  28%|██▊       | 9/32 [00:03<00:09,  2.38it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.1}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}


Test:  31%|███▏      | 10/32 [00:04<00:08,  2.64it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.4}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  34%|███▍      | 11/32 [00:04<00:08,  2.50it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  38%|███▊      | 12/32 [00:05<00:08,  2.34it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.2}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.5}, 'accuracy': 0.5}}


Test:  41%|████      | 13/32 [00:05<00:07,  2.58it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:05<00:07,  2.41it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:06<00:06,  2.72it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:06<00:06,  2.39it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.4}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  53%|█████▎    | 17/32 [00:07<00:07,  2.11it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  56%|█████▋    | 18/32 [00:07<00:06,  2.31it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  59%|█████▉    | 19/32 [00:07<00:05,  2.26it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:08<00:04,  2.63it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  66%|██████▌   | 21/32 [00:08<00:04,  2.49it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.4}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  72%|███████▏  | 23/32 [00:09<00:03,  2.47it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:09<00:03,  2.42it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:10<00:02,  2.75it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:10<00:02,  2.39it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  88%|████████▊ | 28/32 [00:11<00:01,  2.52it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:12<00:01,  2.43it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  94%|█████████▍| 30/32 [00:12<00:00,  2.77it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.6}, 'accuracy': 0.7}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  97%|█████████▋| 31/32 [00:12<00:00,  2.27it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.3}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test: 100%|██████████| 32/32 [00:13<00:00,  2.41it/s]


Mean IIA: 0.5560
Mean correct IIA: 0.5560
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 914.78 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 12884.00 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:03, 13.98it/s, loss=3.75, acc=0]


Tokens to intervene:
['In 2049, there', 'In 2026, there', 'In 2030, there']
['In 1963, there', 'In 1972, there', 'In 1981, there']
Base: ['9', '6', '0', '3', '7', '6', '4', '3', '4', '6', '7', '8', '6', '8', '4', '9']
Source: ['3', '2', '1', '6', '8', '6', '0', '2', '3', '8', '9', '6', '4', '4', '3', '9']
Output: [' will', ' will', ' will', ' was', ' will', ' were', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' were', ' will', ' was']
Label     : [' was', ' was', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' was', ' was', ' will']
Base Label: [' will', ' will', ' will', ' was', ' will', ' were', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' was']

Tokens to intervene:
['In 1980, there', 'In 2033, there', 'In 2033, there']
['In 2037, there', 'In 2038, there', 'In 1961, there']
Base: ['0', '3', '3', '4', '5', '8', '5', '1', '4', '6', '2', '9', '3', '6', '4', '7']
Source: ['7', '8', '1', '1'

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 13.50it/s, loss=2.36, acc=0.36]
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.00s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 685.40 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8515.59 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:11,  2.73it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was

Test:   6%|▋         | 2/32 [00:00<00:15,  1.99it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' were', ' were']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']

Test:   9%|▉         | 3/32 [00:01<00:13,  2.21it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:01<00:14,  1.98it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.0}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  16%|█▌        | 5/32 [00:02<00:11,  2.34it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:02<00:11,  2.24it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  22%|██▏       | 7/32 [00:03<00:12,  2.05it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.6, 'token_accuracy': 0.6, 'class_0_accuracy': 0.6}, 'accuracy': 0.6}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  25%|██▌       | 8/32 [00:03<00:10,  2.21it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.0}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  28%|██▊       | 9/32 [00:04<00:12,  1.84it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:04<00:11,  1.97it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.3}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.4}, 'accuracy': 0.4}}


Test:  34%|███▍      | 11/32 [00:05<00:12,  1.68it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  38%|███▊      | 12/32 [00:06<00:13,  1.53it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.0}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  41%|████      | 13/32 [00:06<00:11,  1.71it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.6}, 'accuracy': 0.9}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:07<00:11,  1.57it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:07<00:09,  1.77it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:08<00:09,  1.63it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.5, 'token_accuracy': 0.5, 'class_0_accuracy': 0.3}, 'accuracy': 0.5}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  56%|█████▋    | 18/32 [00:09<00:08,  1.71it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.4, 'token_accuracy': 0.4, 'class_0_accuracy': 0.2}, 'accuracy': 0.4}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}


Test:  59%|█████▉    | 19/32 [00:10<00:08,  1.57it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  62%|██████▎   | 20/32 [00:11<00:06,  1.77it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.5}, 'accuracy': 0.7}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  66%|██████▌   | 21/32 [00:11<00:06,  1.61it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.2}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  72%|███████▏  | 23/32 [00:12<00:05,  1.70it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:13<00:05,  1.56it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}}


Test:  78%|███████▊  | 25/32 [00:14<00:04,  1.74it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:14<00:03,  1.59it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  84%|████████▍ | 27/32 [00:15<00:03,  1.49it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:16<00:02,  1.69it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  91%|█████████ | 29/32 [00:16<00:01,  1.53it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}}


Test:  94%|█████████▍| 30/32 [00:17<00:01,  1.72it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  97%|█████████▋| 31/32 [00:18<00:00,  1.56it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test: 100%|██████████| 32/32 [00:18<00:00,  1.73it/s]


Mean IIA: 0.6700
Mean correct IIA: 0.6700
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer3_3-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:00<00:00, 939.61 examples/s] 
Map: 100%|██████████| 872/872 [00:00<00:00, 10706.33 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.13it/s, loss=3.77, acc=0.03]


Tokens to intervene:
['In 1979, there', 'In 2043, there', 'In 2013, there']
['In 2049, there', 'In 1984, there', 'In 2044, there']
Base: ['9', '3', '3', '1', '9', '4', '3', '9', '1', '0', '2', '9', '9', '9', '4', '0']
Source: ['9', '4', '4', '1', '3', '8', '7', '4', '6', '3', '9', '7', '1', '9', '7', '8']
Output: [' was', ' will', ' were', ' will', ' was', ' will', ' will', ' was', ' was', ' were', ' were', ' will', ' will', ' will', ' was', ' were']
Label     : [' will', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' will', ' was', ' was', ' was', ' will', ' will']
Base Label: [' was', ' will', ' were', ' will', ' was', ' will', ' will', ' was', ' was', ' were', ' were', ' will', ' will', ' will', ' was', ' were']

Tokens to intervene:
['In 2009, there', 'In 2036, there', 'In 1993, there']
['In 2033, there', 'In 2016, there', 'In 2026, there']
Base: ['9', '6', '3', '2', '8', '4', '3', '3', '6', '4', '6', '6', '7', '4', '4', '1']
Source: ['3', '6', '6',

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.13it/s, loss=4.02, acc=0.02]


Tokens to intervene:
['In 2037, there', 'In 1966, there', 'In 1971, there']
['In 1998, there', 'In 2037, there', 'In 2044, there']
Base: ['7', '6', '1', '1', '0', '4', '6', '6', '3', '2', '8', '9', '4', '8', '4', '6']
Source: ['8', '7', '4', '3', '4', '8', '1', '7', '8', '4', '6', '3', '8', '8', '3', '8']
Output: [' will', ' was', ' was', ' was', ' were', ' were', ' were', ' was', ' will', ' was', ' was', ' was', ' was', ' were', ' was', ' were']
Label     : [' was', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' will', ' will', ' was', ' will', ' was']
Base Label: [' will', ' was', ' was', ' was', ' were', ' were', ' will', ' was', ' will', ' was', ' was', ' was', ' was', ' will', ' was', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 14.24it/s, loss=2.09, acc=0.45]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 1063.17 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 9957.28 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:13,  2.38it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:18,  1.61it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' were', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']


Test:   9%|▉         | 3/32 [00:01<00:15,  1.89it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:02<00:17,  1.60it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:02<00:15,  1.77it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:03<00:16,  1.59it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  25%|██▌       | 8/32 [00:04<00:13,  1.71it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  28%|██▊       | 9/32 [00:05<00:14,  1.58it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  31%|███▏      | 10/32 [00:05<00:12,  1.73it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  34%|███▍      | 11/32 [00:06<00:13,  1.53it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  38%|███▊      | 12/32 [00:07<00:14,  1.40it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:08<00:12,  1.55it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:09<00:11,  1.52it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  50%|█████     | 16/32 [00:10<00:11,  1.38it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  53%|█████▎    | 17/32 [00:11<00:11,  1.27it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  56%|█████▋    | 18/32 [00:11<00:09,  1.44it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  59%|█████▉    | 19/32 [00:12<00:10,  1.30it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  62%|██████▎   | 20/32 [00:13<00:08,  1.44it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:14<00:08,  1.30it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  69%|██████▉   | 22/32 [00:15<00:08,  1.21it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  72%|███████▏  | 23/32 [00:15<00:06,  1.37it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  75%|███████▌  | 24/32 [00:16<00:06,  1.25it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  78%|███████▊  | 25/32 [00:17<00:04,  1.40it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  81%|████████▏ | 26/32 [00:18<00:04,  1.25it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  84%|████████▍ | 27/32 [00:19<00:04,  1.19it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:19<00:02,  1.35it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  94%|█████████▍| 30/32 [00:21<00:01,  1.38it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  97%|█████████▋| 31/32 [00:22<00:00,  1.24it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test: 100%|██████████| 32/32 [00:22<00:00,  1.41it/s]


Mean IIA: 0.8960
Mean correct IIA: 0.8960
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 792.85 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 7416.40 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   2%|▏         | 1/54 [00:00<00:05,  9.19it/s, loss=4, acc=0]   


Tokens to intervene:
['In 2044, there', 'In 2036, there', 'In 2007, there']
['In 1980, there', 'In 1998, there', 'In 2036, there']
Base: ['4', '6', '7', '6', '7', '4', '4', '3', '8', '9', '8', '1', '0', '2', '8', '3']
Source: ['0', '8', '6', '4', '4', '6', '2', '7', '0', '9', '3', '8', '1', '4', '9', '8']
Output: [' will', ' will', ' were', ' will', ' were', ' will', ' will', ' will', ' will', ' were', ' was', ' was', ' were', ' was', ' was', ' was']
Label     : [' was', ' was', ' will', ' was', ' will', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Base Label: [' will', ' will', ' were', ' will', ' were', ' will', ' will', ' will', ' will', ' were', ' was', ' was', ' were', ' was', ' was', ' was']

Tokens to intervene:
['In 2034, there', 'In 1956, there', 'In 2037, there']
['In 2017, there', 'In 2049, there', 'In 1965, there']
Base: ['4', '6', '7', '7', '8', '9', '3', '0', '3', '0', '3', '0', '4', '2', '1', '9']
Source: ['7', '9', '5',

Epoch: 0:   7%|▋         | 4/54 [00:00<00:05,  8.96it/s, loss=3.74, acc=0.03]


Tokens to intervene:
['In 2037, there', 'In 2049, there', 'In 1994, there']
['In 1966, there', 'In 1970, there', 'In 2036, there']
Base: ['7', '9', '4', '2', '3', '5', '9', '3', '8', '8', '0', '2', '9', '2', '8', '9']
Source: ['6', '0', '6', '0', '8', '0', '8', '8', '6', '7', '2', '1', '8', '6', '6', '7']
Output: [' will', ' will', ' was', ' were', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' was', ' were', ' was', ' was', ' will']
Label     : [' was', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' will', ' was', ' will', ' will', ' will', ' will', ' was']
Base Label: [' will', ' will', ' was', ' were', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' was', ' were', ' was', ' was', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:05<00:00,  9.53it/s, loss=1.74, acc=0.58]
Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 878.73 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8380.96 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:16,  1.88it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:24,  1.22it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']

 

Test:   9%|▉         | 3/32 [00:02<00:20,  1.43it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:23,  1.21it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.36it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:21,  1.20it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.29it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.18it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.34it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:17,  1.20it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:14,  1.29it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  44%|████▍     | 14/32 [00:11<00:15,  1.17it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  47%|████▋     | 15/32 [00:11<00:12,  1.32it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  50%|█████     | 16/32 [00:12<00:13,  1.21it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:13<00:13,  1.12it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  56%|█████▋    | 18/32 [00:14<00:11,  1.27it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:16<00:09,  1.32it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  66%|██████▌   | 21/32 [00:17<00:09,  1.20it/s]


 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  72%|███████▏  | 23/32 [00:18<00:06,  1.31it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  75%|███████▌  | 24/32 [00:19<00:06,  1.19it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:20<00:05,  1.32it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  81%|████████▏ | 26/32 [00:21<00:05,  1.18it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  88%|████████▊ | 28/32 [00:22<00:03,  1.26it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:23<00:02,  1.15it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}}


Test:  94%|█████████▍| 30/32 [00:24<00:01,  1.29it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:25<00:00,  1.24it/s]



 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}
Mean IIA: 0.9580
Mean correct IIA: 0.9580
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer4_4-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source l

Map: 100%|██████████| 872/872 [00:01<00:00, 795.24 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 9208.18 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.32it/s, loss=3.84, acc=0.03]


Tokens to intervene:
['In 1994, there', 'In 1997, there', 'In 1978, there']
['In 2041, there', 'In 2037, there', 'In 2030, there']
Base: ['4', '7', '8', '4', '1', '1', '7', '9', '1', '3', '1', '3', '9', '2', '0', '3']
Source: ['1', '7', '0', '3', '4', '4', '1', '3', '7', '6', '8', '4', '6', '6', '9', '6']
Output: [' was', ' was', ' was', ' was', ' will', ' was', ' will', ' were', ' will', ' will', ' was', ' will', ' were', ' was', ' were', ' will']
Label     : [' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' was']
Base Label: [' was', ' was', ' was', ' was', ' will', ' was', ' will', ' were', ' will', ' will', ' was', ' will', ' were', ' was', ' were', ' will']

Tokens to intervene:
['In 2036, there', 'In 2044, there', 'In 1987, there']
['In 2007, there', 'In 1956, there', 'In 2044, there']
Base: ['6', '4', '7', '3', '6', '9', '2', '4', '8', '9', '0', '8', '9', '8', '7', '8']
Source: ['7', '6', '4', '9

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.32it/s, loss=3.75, acc=0.02]


Tokens to intervene:
['In 2026, there', 'In 2037, there', 'In 1980, there']
['In 1951, there', 'In 1994, there', 'In 2036, there']
Base: ['6', '7', '0', '6', '0', '0', '9', '4', '7', '7', '7', '8', '3', '6', '4', '6']
Source: ['1', '4', '6', '7', '7', '7', '4', '1', '4', '3', '1', '9', '1', '7', '3', '3']
Output: [' will', ' will', ' were', ' will', ' will', ' were', ' was', ' was', ' will', ' was', ' will', ' were', ' will', ' was', ' will', ' will']
Label     : [' was', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' was', ' was']
Base Label: [' will', ' will', ' were', ' will', ' will', ' were', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' was', ' will', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 12.25it/s, loss=2.22, acc=0.43]
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.41s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 916.29 examples/s] 
Map: 100%|██████████| 500/500 [00:00<00:00, 7877.84 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:17,  1.80it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:24,  1.21it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:02<00:20,  1.42it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:23,  1.21it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.38it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:22,  1.18it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  22%|██▏       | 7/32 [00:05<00:22,  1.11it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.27it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.19it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.33it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:17,  1.20it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  38%|███▊      | 12/32 [00:09<00:17,  1.12it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:15,  1.24it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  44%|████▍     | 14/32 [00:11<00:15,  1.13it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  47%|████▋     | 15/32 [00:12<00:13,  1.27it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  50%|█████     | 16/32 [00:13<00:13,  1.17it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:14<00:13,  1.12it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  56%|█████▋    | 18/32 [00:14<00:11,  1.26it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:16<00:09,  1.27it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  66%|██████▌   | 21/32 [00:17<00:09,  1.18it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  69%|██████▉   | 22/32 [00:18<00:08,  1.12it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  72%|███████▏  | 23/32 [00:18<00:07,  1.27it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  78%|███████▊  | 25/32 [00:20<00:05,  1.31it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  81%|████████▏ | 26/32 [00:21<00:05,  1.19it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  88%|████████▊ | 28/32 [00:23<00:03,  1.25it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:24<00:02,  1.17it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}}


Test:  94%|█████████▍| 30/32 [00:24<00:01,  1.32it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:26<00:00,  1.23it/s]



 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}
Mean IIA: 0.9080
Mean correct IIA: 0.9080
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source l

Map: 100%|██████████| 872/872 [00:01<00:00, 807.38 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8327.00 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   2%|▏         | 1/54 [00:00<00:16,  3.26it/s, loss=4.27, acc=0]


Tokens to intervene:
['In 2033, there', 'In 2020, there', 'In 1958, there']
['In 1979, there', 'In 2030, there', 'In 2033, there']
Base: ['3', '0', '8', '7', '9', '3', '0', '1', '0', '0', '9', '1', '1', '4', '2', '0']
Source: ['9', '0', '3', '6', '4', '1', '6', '0', '8', '8', '8', '4', '3', '2', '8', '7']
Output: [' will', ' were', ' was', ' will', ' was', ' was', ' were', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will']
Label     : [' was', ' will', ' will', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' was', ' was', ' will', ' was', ' will', ' was']
Base Label: [' will', ' were', ' was', ' will', ' was', ' was', ' were', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will']

Tokens to intervene:
['In 2041, there', 'In 1978, there', 'In 1952, there']
['In 2002, there', 'In 2041, there', 'In 2036, there']
Base: ['1', '8', '2', '7', '1', '6', '4', '7', '7', '6', '0', '6', '6', '0', '3', '1']
Source: ['2', '1', '6', '

Epoch: 0:   7%|▋         | 4/54 [00:00<00:07,  7.01it/s, loss=3.77, acc=0.01]


Tokens to intervene:
['In 1970, there', 'In 1974, there', 'In 2043, there']
['In 2041, there', 'In 2037, there', 'In 1960, there']
Base: ['0', '4', '3', '1', '0', '8', '8', '6', '4', '0', '6', '3', '1', '4', '4', '4']
Source: ['1', '7', '0', '3', '8', '0', '4', '6', '4', '3', '4', '8', '0', '1', '9', '1']
Output: [' were', ' was', ' will', ' was', ' will', ' will', ' was', ' were', ' will', ' will', ' was', ' will', ' will', ' will', ' was', ' was']
Label     : [' will', ' will', ' was', ' will', ' was', ' was', ' will', ' will', ' was', ' was', ' will', ' was', ' was', ' was', ' will', ' will']
Base Label: [' were', ' was', ' will', ' was', ' will', ' will', ' was', ' were', ' will', ' will', ' was', ' will', ' will', ' will', ' was', ' was']


Epoch: 0: 100%|██████████| 54/54 [00:05<00:00, 10.35it/s, loss=1.74, acc=0.57]
Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.22s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 918.45 examples/s] 
Map: 100%|██████████| 500/500 [00:00<00:00, 7139.80 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:16,  1.93it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:23,  1.25it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' w

Test:   9%|▉         | 3/32 [00:02<00:19,  1.48it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:22,  1.26it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:18,  1.43it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  19%|█▉        | 6/32 [00:04<00:20,  1.28it/s]


 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  22%|██▏       | 7/32 [00:05<00:21,  1.15it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.30it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.19it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.33it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:17,  1.20it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  38%|███▊      | 12/32 [00:09<00:17,  1.13it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:14,  1.30it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:11<00:14,  1.22it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  47%|████▋     | 15/32 [00:11<00:12,  1.36it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  50%|█████     | 16/32 [00:12<00:12,  1.25it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:13<00:13,  1.14it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  56%|█████▋    | 18/32 [00:14<00:10,  1.30it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  59%|█████▉    | 19/32 [00:15<00:10,  1.20it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  62%|██████▎   | 20/32 [00:15<00:09,  1.33it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:16<00:09,  1.22it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:17<00:08,  1.15it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  72%|███████▏  | 23/32 [00:18<00:06,  1.32it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  75%|███████▌  | 24/32 [00:19<00:06,  1.18it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:19<00:05,  1.33it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  81%|████████▏ | 26/32 [00:20<00:05,  1.19it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  84%|████████▍ | 27/32 [00:21<00:04,  1.11it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  88%|████████▊ | 28/32 [00:22<00:03,  1.24it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  91%|█████████ | 29/32 [00:23<00:02,  1.15it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}}


Test:  94%|█████████▍| 30/32 [00:23<00:01,  1.29it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  97%|█████████▋| 31/32 [00:25<00:00,  1.18it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:25<00:00,  1.25it/s]


Mean IIA: 0.9680
Mean correct IIA: 0.9680
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer5_5-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 800.77 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 9157.80 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 10.89it/s, loss=3.13, acc=0.03]


Tokens to intervene:
['In 2026, there', 'In 2028, there', 'In 2030, there']
['In 1956, there', 'In 1973, there', 'In 1972, there']
Base: ['6', '8', '0', '3', '6', '2', '2', '4', '1', '6', '4', '9', '0', '6', '6', '8']
Source: ['6', '3', '2', '1', '7', '1', '3', '8', '4', '3', '7', '6', '6', '2', '0', '1']
Output: [' will', ' will', ' will', ' were', ' will', ' were', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' was']
Label     : [' was', ' was', ' was', ' were', ' was', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' was', ' was', ' will']
Base Label: [' will', ' will', ' will', ' were', ' will', ' were', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' was']

Tokens to intervene:
['In 2030, there', 'In 2036, there', 'In 2037, there']
['In 1984, there', 'In 1987, there', 'In 1958, there']
Base: ['0', '6', '7', '7', '1', '3', '4', '3', '4', '4', '0', '1', '9', '3', '4', '1']
Source: ['4', '7', '8', '

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 10.89it/s, loss=3.2, acc=0.02] 


Tokens to intervene:
['In 1980, there', 'In 2036, there', 'In 2026, there']
['In 2026, there', 'In 2004, there', 'In 1951, there']
Base: ['0', '6', '6', '8', '0', '4', '7', '3', '6', '3', '3', '1', '0', '6', '6', '6']
Source: ['6', '4', '1', '0', '4', '8', '0', '2', '1', '1', '5', '4', '6', '2', '0', '7']
Output: [' were', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' will', ' were', ' were', ' will', ' were']
Label     : [' will', ' was', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' was', ' was', ' will']
Base Label: [' were', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' will', ' were', ' will', ' will', ' were']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 11.68it/s, loss=1.98, acc=0.48]
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.63s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 647.11 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6485.86 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:17,  1.76it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:26,  1.15it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']



Test:   9%|▉         | 3/32 [00:02<00:21,  1.36it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:23,  1.20it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.39it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:21,  1.21it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:05<00:22,  1.13it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.28it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.19it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.34it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  34%|███▍      | 11/32 [00:08<00:17,  1.22it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  38%|███▊      | 12/32 [00:09<00:17,  1.14it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:15,  1.26it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:11<00:15,  1.19it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:11<00:13,  1.30it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:13<00:13,  1.17it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  56%|█████▋    | 18/32 [00:14<00:11,  1.22it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  59%|█████▉    | 19/32 [00:15<00:11,  1.14it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  62%|██████▎   | 20/32 [00:16<00:09,  1.28it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:17<00:09,  1.15it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:18<00:09,  1.06it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  72%|███████▏  | 23/32 [00:19<00:07,  1.21it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  75%|███████▌  | 24/32 [00:20<00:07,  1.12it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:20<00:05,  1.28it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  81%|████████▏ | 26/32 [00:21<00:05,  1.18it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  84%|████████▍ | 27/32 [00:22<00:04,  1.12it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:23<00:03,  1.27it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  94%|█████████▍| 30/32 [00:24<00:01,  1.31it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  97%|█████████▋| 31/32 [00:25<00:00,  1.19it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:26<00:00,  1.22it/s]


Mean IIA: 0.9000
Mean correct IIA: 0.9000
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 825.90 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8484.07 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.09it/s, loss=3.56, acc=0]


Tokens to intervene:
['In 1957, there', 'In 2036, there', 'In 1984, there']
['In 2033, there', 'In 1978, there', 'In 2036, there']
Base: ['7', '6', '4', '6', '3', '3', '7', '3', '2', '4', '6', '3', '4', '3', '1', '7']
Source: ['3', '8', '6', '7', '9', '8', '0', '1', '0', '1', '7', '5', '7', '3', '6', '2']
Output: [' was', ' will', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' will', ' will', ' will', ' was', ' will', ' was', ' will']
Label     : [' will', ' was', ' will', ' was', ' was', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' will', ' was', ' will', ' was']
Base Label: [' was', ' will', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' will', ' will', ' will', ' was', ' will', ' was', ' will']

Tokens to intervene:
['In 2028, there', 'In 2049, there', 'In 2031, there']
['In 1973, there', 'In 1993, there', 'In 2030, there']
Base: ['8', '9', '1', '4', '9', '1', '0', '3', '8', '0', '1', '8', '8', '4', '7', '9']
Source: ['3', '3', '0', '8', 

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.09it/s, loss=3.61, acc=0.04]


Tokens to intervene:
['In 2033, there', 'In 2049, there', 'In 1970, there']
['In 1960, there', 'In 1961, there', 'In 2026, there']
Base: ['3', '9', '0', '3', '8', '9', '4', '0', '1', '1', '0', '6', '7', '3', '9', '8']
Source: ['0', '1', '6', '7', '6', '1', '6', '3', '2', '6', '3', '9', '4', '1', '0', '0']
Output: [' was', ' will', ' were', ' was', ' were', ' will', ' will', ' were', ' will', ' was', ' were', ' was', ' was', ' were', ' was', ' were']
Label     : [' was', ' was', ' will', ' was', ' was', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' was']
Base Label: [' will', ' will', ' were', ' will', ' will', ' will', ' will', ' were', ' will', ' was', ' were', ' was', ' was', ' were', ' was', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:05<00:00, 10.36it/s, loss=1.6, acc=0.59] 
Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.22s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 653.28 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6484.42 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:17,  1.79it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:24,  1.23it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' w

Test:   9%|▉         | 3/32 [00:02<00:19,  1.46it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:22,  1.25it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.37it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  19%|█▉        | 6/32 [00:04<00:21,  1.22it/s]


 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  22%|██▏       | 7/32 [00:05<00:22,  1.13it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.30it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.19it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.31it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:18,  1.16it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  38%|███▊      | 12/32 [00:09<00:18,  1.07it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:15,  1.23it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:12<00:13,  1.29it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  50%|█████     | 16/32 [00:13<00:13,  1.17it/s]


 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:14<00:13,  1.08it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  56%|█████▋    | 18/32 [00:14<00:11,  1.23it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  59%|█████▉    | 19/32 [00:15<00:11,  1.13it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  62%|██████▎   | 20/32 [00:16<00:09,  1.28it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:17<00:09,  1.17it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:18<00:09,  1.09it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  72%|███████▏  | 23/32 [00:18<00:07,  1.23it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  75%|███████▌  | 24/32 [00:20<00:06,  1.14it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:20<00:05,  1.29it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  81%|████████▏ | 26/32 [00:21<00:05,  1.17it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  84%|████████▍ | 27/32 [00:22<00:04,  1.12it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  88%|████████▊ | 28/32 [00:23<00:03,  1.27it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  91%|█████████ | 29/32 [00:24<00:02,  1.16it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}}


Test:  94%|█████████▍| 30/32 [00:24<00:01,  1.30it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  97%|█████████▋| 31/32 [00:25<00:00,  1.18it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:26<00:00,  1.22it/s]


Mean IIA: 0.9720
Mean correct IIA: 0.9720
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer6_6-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 817.75 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 9057.29 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.09it/s, loss=3.66, acc=0]


Tokens to intervene:
['In 2034, there', 'In 2034, there', 'In 2036, there']
['In 1966, there', 'In 1979, there', 'In 1966, there']
Base: ['4', '4', '6', '1', '4', '4', '8', '4', '0', '8', '6', '9', '3', '1', '7', '3']
Source: ['6', '9', '6', '1', '3', '4', '6', '6', '3', '7', '8', '4', '3', '3', '6', '3']
Output: [' will', ' will', ' will', ' will', ' were', ' will', ' was', ' was', ' were', ' was', ' will', ' was', ' was', ' was', ' will', ' was']
Label     : [' was', ' was', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' will', ' was', ' will', ' will', ' will', ' was', ' will']
Base Label: [' will', ' will', ' will', ' will', ' were', ' will', ' was', ' was', ' were', ' was', ' will', ' was', ' was', ' was', ' will', ' was']

Tokens to intervene:
['In 1951, there', 'In 2041, there', 'In 2026, there']
['In 2044, there', 'In 1979, there', 'In 1997, there']
Base: ['1', '1', '6', '4', '0', '9', '4', '4', '1', '4', '5', '9', '6', '9', '3', '7']
Source: ['4', '9', '7', '7'

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 12.09it/s, loss=3.53, acc=0]


Tokens to intervene:
['In 2041, there', 'In 1951, there', 'In 2033, there']
['In 1966, there', 'In 2043, there', 'In 1987, there']
Base: ['1', '1', '3', '4', '8', '9', '1', '3', '8', '3', '7', '1', '0', '0', '4', '4']
Source: ['6', '3', '7', '0', '0', '1', '1', '2', '4', '6', '8', '3', '6', '6', '1', '0']
Output: [' will', ' was', ' will', ' will', ' were', ' will', ' will', ' will', ' was', ' will', ' will', ' was', ' were', ' were', ' were', ' was']
Label     : [' was', ' will', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' will', ' will']
Base Label: [' will', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' will', ' was', ' were', ' were', ' were', ' was']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 12.38it/s, loss=1.9, acc=0.48] 
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 677.94 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8230.26 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:17,  1.73it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:25,  1.19it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']



Test:   9%|▉         | 3/32 [00:02<00:20,  1.44it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:22,  1.25it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.41it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:21,  1.22it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.27it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:20,  1.15it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.30it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  34%|███▍      | 11/32 [00:08<00:18,  1.16it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  38%|███▊      | 12/32 [00:09<00:17,  1.12it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:14,  1.28it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  44%|████▍     | 14/32 [00:11<00:18,  1.00s/it]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  47%|████▋     | 15/32 [00:12<00:14,  1.15it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  50%|█████     | 16/32 [00:13<00:15,  1.06it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  56%|█████▋    | 18/32 [00:15<00:11,  1.22it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  59%|█████▉    | 19/32 [00:16<00:11,  1.16it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  62%|██████▎   | 20/32 [00:16<00:09,  1.32it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  66%|██████▌   | 21/32 [00:17<00:09,  1.21it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:18<00:09,  1.11it/s]


 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  72%|███████▏  | 23/32 [00:19<00:07,  1.25it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  78%|███████▊  | 25/32 [00:20<00:05,  1.24it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  81%|████████▏ | 26/32 [00:21<00:05,  1.16it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  84%|████████▍ | 27/32 [00:22<00:04,  1.09it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:23<00:03,  1.25it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  94%|█████████▍| 30/32 [00:25<00:01,  1.30it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}}

 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  97%|█████████▋| 31/32 [00:26<00:00,  1.18it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:26<00:00,  1.20it/s]


Mean IIA: 0.8960
Mean correct IIA: 0.8960
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 812.22 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 7492.99 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   2%|▏         | 1/54 [00:00<00:05,  9.86it/s, loss=3.52, acc=0]


Tokens to intervene:
['In 1984, there', 'In 2028, there', 'In 2043, there']
['In 2037, there', 'In 1974, there', 'In 1984, there']
Base: ['4', '8', '3', '6', '4', '3', '4', '0', '3', '4', '3', '8', '3', '9', '9', '9']
Source: ['7', '4', '4', '6', '4', '0', '3', '9', '7', '0', '4', '3', '4', '7', '9', '3']
Output: [' was', ' will', ' will', ' was', ' will', ' was', ' will', ' were', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' were']
Label     : [' will', ' was', ' was', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' will', ' was', ' was', ' was', ' will']
Base Label: [' was', ' will', ' will', ' was', ' will', ' was', ' will', ' were', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' were']

Tokens to intervene:
['In 2033, there', 'In 2016, there', 'In 2033, there']
['In 1994, there', 'In 2037, there', 'In 2016, there']
Base: ['3', '6', '3', '2', '4', '9', '3', '4', '9', '7', '4', '6', '3', '4', '6', '7']
Source: ['4', '7', '6', '9

Epoch: 0:   2%|▏         | 1/54 [00:00<00:05,  9.86it/s, loss=3.62, acc=0]


Tokens to intervene:
['In 2007, there', 'In 1956, there', 'In 1951, there']
['In 2030, there', 'In 2049, there', 'In 2026, there']
Base: ['7', '6', '1', '8', '3', '2', '7', '3', '4', '1', '7', '1', '3', '3', '4', '2']
Source: ['0', '9', '6', '3', '8', '4', '9', '3', '6', '3', '1', '3', '7', '4', '4', '1']
Output: [' were', ' was', ' was', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' were', ' will', ' were']
Label     : [' will', ' will', ' will', ' will', ' was', ' will', ' was', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' was', ' will']
Base Label: [' were', ' was', ' was', ' was', ' will', ' was', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' were', ' will', ' were']


Epoch: 0: 100%|██████████| 54/54 [00:05<00:00, 10.59it/s, loss=1.54, acc=0.6] 
Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.10s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 641.52 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6325.37 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:18,  1.69it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:24,  1.22it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']

 

Test:   9%|▉         | 3/32 [00:02<00:20,  1.41it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:23,  1.21it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:19,  1.39it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:21,  1.23it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  22%|██▏       | 7/32 [00:05<00:21,  1.16it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.32it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:18,  1.21it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  31%|███▏      | 10/32 [00:07<00:16,  1.36it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:17,  1.23it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  38%|███▊      | 12/32 [00:09<00:17,  1.17it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:14,  1.32it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:11<00:14,  1.23it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  47%|████▋     | 15/32 [00:11<00:12,  1.40it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  50%|█████     | 16/32 [00:12<00:12,  1.28it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:13<00:12,  1.19it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  56%|█████▋    | 18/32 [00:13<00:10,  1.35it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  59%|█████▉    | 19/32 [00:14<00:10,  1.24it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  62%|██████▎   | 20/32 [00:15<00:08,  1.40it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:16<00:08,  1.26it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:17<00:08,  1.20it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  72%|███████▏  | 23/32 [00:17<00:06,  1.34it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  75%|███████▌  | 24/32 [00:18<00:06,  1.24it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:19<00:05,  1.38it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  81%|████████▏ | 26/32 [00:20<00:04,  1.26it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  84%|████████▍ | 27/32 [00:21<00:04,  1.20it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  88%|████████▊ | 28/32 [00:21<00:02,  1.36it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  91%|█████████ | 29/32 [00:22<00:02,  1.27it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'accuracy': 1.0}}


Test:  94%|█████████▍| 30/32 [00:23<00:01,  1.42it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  97%|█████████▋| 31/32 [00:24<00:00,  1.31it/s]


 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:24<00:00,  1.30it/s]


Mean IIA: 0.9700
Mean correct IIA: 0.9700
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer7_7-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 825.99 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8310.25 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.82it/s, loss=3.75, acc=0.03]


Tokens to intervene:
['In 1956, there', 'In 1997, there', 'In 2028, there']
['In 2026, there', 'In 2041, there', 'In 2038, there']
Base: ['6', '7', '8', '6', '6', '4', '7', '7', '7', '1', '1', '0', '2', '3', '6', '6']
Source: ['6', '1', '8', '4', '9', '3', '1', '0', '8', '0', '6', '1', '6', '1', '3', '9']
Output: [' was', ' was', ' will', ' will', ' will', ' were', ' was', ' will', ' was', ' will', ' was', ' will', ' were', ' was', ' will', ' will']
Label     : [' will', ' will', ' was', ' was', ' was', ' will', ' will', ' was', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' was']
Base Label: [' was', ' was', ' will', ' will', ' will', ' were', ' was', ' will', ' was', ' will', ' was', ' will', ' were', ' was', ' will', ' will']

Tokens to intervene:
['In 2030, there', 'In 1993, there', 'In 2041, there']
['In 1981, there', 'In 2034, there', 'In 1978, there']
Base: ['0', '3', '1', '4', '1', '3', '0', '9', '6', '7', '8', '7', '1', '1', '8', '3']
Source: ['1', '4', '8', '9

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.82it/s, loss=3.91, acc=0.02]


Tokens to intervene:
['In 2009, there', 'In 1970, there', 'In 2013, there']
['In 2028, there', 'In 2034, there', 'In 2043, there']
Base: ['9', '0', '3', '1', '8', '1', '6', '2', '7', '0', '4', '8', '1', '3', '1', '1']
Source: ['8', '4', '3', '4', '4', '4', '4', '7', '7', '3', '7', '7', '8', '8', '6', '6']
Output: [' were', ' were', ' were', ' was', ' was', ' was', ' were', ' was', ' was', ' will', ' will', ' were', ' was', ' will', ' will', ' was']
Label     : [' will', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' will', ' was', ' was', ' was', ' will', ' was', ' was', ' will']
Base Label: [' were', ' were', ' were', ' was', ' was', ' was', ' will', ' was', ' was', ' will', ' will', ' will', ' was', ' will', ' will', ' was']


Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 13.58it/s, loss=1.98, acc=0.47]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.98s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 915.83 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8099.99 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:15,  1.97it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:22,  1.33it/s]


Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' was', ' was', ' was', ' was', ' were', ' was', ' was', ' was', ' were', ' was', ' were', ' was', ' was']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was']



Test:   9%|▉         | 3/32 [00:01<00:18,  1.55it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:02<00:21,  1.33it/s]


 'source-In 1996, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:03<00:17,  1.51it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:20,  1.28it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:05<00:20,  1.23it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:05<00:17,  1.41it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:06<00:17,  1.28it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  31%|███▏      | 10/32 [00:07<00:15,  1.46it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:16,  1.30it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  38%|███▊      | 12/32 [00:09<00:16,  1.21it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:09<00:13,  1.37it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:10<00:14,  1.25it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  47%|████▋     | 15/32 [00:11<00:12,  1.40it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  50%|█████     | 16/32 [00:12<00:12,  1.26it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  56%|█████▋    | 18/32 [00:13<00:10,  1.33it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  59%|█████▉    | 19/32 [00:14<00:10,  1.23it/s]


 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  62%|██████▎   | 20/32 [00:14<00:08,  1.39it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  66%|██████▌   | 21/32 [00:15<00:08,  1.27it/s]


 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  72%|███████▏  | 23/32 [00:17<00:06,  1.36it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  75%|███████▌  | 24/32 [00:18<00:06,  1.24it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.2}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  78%|███████▊  | 25/32 [00:18<00:04,  1.41it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  81%|████████▏ | 26/32 [00:19<00:04,  1.29it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  84%|████████▍ | 27/32 [00:20<00:04,  1.21it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:21<00:02,  1.36it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  91%|█████████ | 29/32 [00:22<00:02,  1.27it/s]


 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.7}, 'accuracy': 0.8}}


Test:  94%|█████████▍| 30/32 [00:22<00:01,  1.41it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  97%|█████████▋| 31/32 [00:23<00:00,  1.28it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


Mean IIA: 0.8720
Mean correct IIA: 0.8720
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 817.26 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8800.39 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.40it/s, loss=4.03, acc=0]


Tokens to intervene:
['In 2043, there', 'In 2004, there', 'In 1980, there']
['In 2009, there', 'In 2036, there', 'In 2036, there']
Base: ['3', '4', '0', '3', '7', '7', '3', '9', '4', '1', '9', '4', '8', '0', '1', '6']
Source: ['9', '6', '6', '7', '4', '3', '8', '3', '0', '4', '4', '7', '1', '1', '6', '7']
Output: [' will', ' were', ' were', ' will', ' was', ' were', ' will', ' will', ' was', ' was', ' will', ' will', ' will', ' were', ' was', ' will']
Label     : [' was', ' will', ' will', ' was', ' will', ' will', ' was', ' was', ' will', ' will', ' was', ' was', ' was', ' will', ' will', ' was']
Base Label: [' will', ' were', ' were', ' will', ' was', ' were', ' will', ' will', ' was', ' was', ' will', ' will', ' will', ' were', ' was', ' will']

Tokens to intervene:
['In 2044, there', 'In 2007, there', 'In 1956, there']
['In 1970, there', 'In 2041, there', 'In 2033, there']
Base: ['4', '7', '6', '6', '9', '3', '0', '4', '9', '2', '1', '6', '8', '4', '1', '4']
Source: ['0', '1', '3'

Epoch: 0:   4%|▎         | 2/54 [00:00<00:04, 11.40it/s, loss=3.89, acc=0.02]


Tokens to intervene:
['In 2043, there', 'In 1979, there', 'In 1960, there']
['In 1972, there', 'In 1951, there', 'In 2036, there']
Base: ['3', '9', '0', '4', '6', '6', '4', '6', '3', '3', '7', '1', '9', '5', '7', '0']
Source: ['2', '1', '6', '0', '9', '5', '1', '3', '8', '3', '8', '7', '8', '9', '9', '3']
Output: [' will', ' was', ' were', ' was', ' were', ' will', ' will', ' will', ' will', ' was', ' were', ' will', ' will', ' was', ' was', ' will']
Label     : [' was', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' was', ' will', ' will', ' was', ' was', ' will', ' will', ' was']
Base Label: [' will', ' was', ' were', ' was', ' were', ' will', ' will', ' will', ' will', ' was', ' were', ' will', ' will', ' was', ' was', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 11.97it/s, loss=1.59, acc=0.6] 
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 658.42 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 8381.80 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:17,  1.82it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:23,  1.26it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' was', ' w

Test:   9%|▉         | 3/32 [00:02<00:19,  1.50it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  16%|█▌        | 5/32 [00:03<00:18,  1.45it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  19%|█▉        | 6/32 [00:04<00:20,  1.28it/s]


 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  22%|██▏       | 7/32 [00:05<00:20,  1.20it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:05<00:17,  1.37it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:06<00:18,  1.25it/s]


 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  31%|███▏      | 10/32 [00:07<00:15,  1.41it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  34%|███▍      | 11/32 [00:08<00:16,  1.27it/s]


 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  38%|███▊      | 12/32 [00:09<00:16,  1.18it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:09<00:14,  1.34it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  44%|████▍     | 14/32 [00:11<00:15,  1.15it/s]


 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  47%|████▋     | 15/32 [00:11<00:13,  1.31it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  50%|█████     | 16/32 [00:12<00:13,  1.21it/s]


 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:13<00:12,  1.15it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  56%|█████▋    | 18/32 [00:13<00:10,  1.31it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:15<00:08,  1.38it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:16<00:08,  1.25it/s]


 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:17<00:08,  1.18it/s]


 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  72%|███████▏  | 23/32 [00:17<00:06,  1.33it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  78%|███████▊  | 25/32 [00:19<00:05,  1.35it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}


Test:  81%|████████▏ | 26/32 [00:20<00:04,  1.24it/s]


 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy'

Test:  88%|████████▊ | 28/32 [00:21<00:03,  1.32it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:22<00:02,  1.22it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.8}, 'accuracy': 0.9}}


Test:  94%|█████████▍| 30/32 [00:23<00:01,  1.33it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  97%|█████████▋| 31/32 [00:24<00:00,  1.20it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:24<00:00,  1.28it/s]


Mean IIA: 0.9260
Mean correct IIA: 0.9260
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer8_8-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source label: ['das']
#Training examples: 872


Map: 100%|██████████| 872/872 [00:01<00:00, 629.27 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 8648.38 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  32768


Epoch: 0:   4%|▎         | 2/54 [00:00<00:03, 13.77it/s, loss=3.99, acc=0]


Tokens to intervene:
['In 2041, there', 'In 1958, there', 'In 1978, there']
['In 1952, there', 'In 2033, there', 'In 2041, there']
Base: ['1', '8', '8', '1', '4', '1', '8', '6', '6', '3', '4', '9', '3', '3', '7', '4']
Source: ['2', '3', '1', '1', '6', '7', '6', '8', '3', '2', '1', '4', '0', '2', '3', '6']
Output: [' will', ' was', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' were', ' were', ' will', ' will', ' were']
Label     : [' was', ' will', ' will', ' was', ' will', ' will', ' was', ' will', ' was', ' was', ' was', ' will', ' will', ' was', ' was', ' will']
Base Label: [' will', ' was', ' was', ' will', ' was', ' was', ' will', ' was', ' will', ' will', ' will', ' were', ' were', ' will', ' will', ' were']

Tokens to intervene:
['In 1951, there', 'In 2016, there', 'In 2037, there']
['In 2043, there', 'In 2034, there', 'In 1984, there']
Base: ['1', '6', '7', '3', '1', '0', '4', '6', '1', '0', '3', '8', '3', '4', '4', '2']
Source: ['3', '4', '4', 

Epoch: 0: 100%|██████████| 54/54 [00:03<00:00, 14.45it/s, loss=2.07, acc=0.42]
Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.74s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 717.19 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6606.33 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:24,  1.27it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.2, 'token_accuracy': 0.2, 'class_0_accuracy': 0.2}, 'accuracy': 0.2}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' was', ' 

Test:   6%|▋         | 2/32 [00:01<00:28,  1.04it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:02<00:22,  1.28it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  12%|█▎        | 4/32 [00:03<00:24,  1.12it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  16%|█▌        | 5/32 [00:04<00:20,  1.30it/s]


 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:05<00:21,  1.20it/s]


 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:05<00:22,  1.13it/s]


 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:06<00:18,  1.29it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:07<00:19,  1.20it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:07<00:15,  1.38it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  34%|███▍      | 11/32 [00:08<00:16,  1.27it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  38%|███▊      | 12/32 [00:09<00:16,  1.21it/s]


 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:10<00:13,  1.37it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  44%|████▍     | 14/32 [00:11<00:14,  1.27it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:11<00:11,  1.43it/s]


 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:12<00:12,  1.31it/s]


 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  53%|█████▎    | 17/32 [00:13<00:12,  1.23it/s]


 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  56%|█████▋    | 18/32 [00:14<00:10,  1.38it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  59%|█████▉    | 19/32 [00:15<00:10,  1.26it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:15<00:08,  1.42it/s]


 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  66%|██████▌   | 21/32 [00:16<00:08,  1.30it/s]


 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  69%|██████▉   | 22/32 [00:17<00:08,  1.22it/s]


 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  72%|███████▏  | 23/32 [00:17<00:06,  1.38it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:18<00:06,  1.27it/s]


 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.3, 'token_accuracy': 0.3, 'class_0_accuracy': 0.3}, 'accuracy': 0.3}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  78%|███████▊  | 25/32 [00:19<00:04,  1.43it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:20<00:04,  1.30it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  84%|████████▍ | 27/32 [00:21<00:04,  1.24it/s]


 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:21<00:02,  1.40it/s]


 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:22<00:02,  1.28it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.6}, 'accuracy': 0.7}}


Test:  94%|█████████▍| 30/32 [00:23<00:01,  1.45it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:24<00:00,  1.31it/s]



 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}
Mean IIA: 0.7880
Mean correct IIA: 0.7880
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim16-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output
Training Tasks: {'das': 'match_source'}
Training tasks matching source l

Map: 100%|██████████| 872/872 [00:01<00:00, 589.52 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 6364.86 examples/s]


base model trainable parameters:  0
intervention trainable parameters:  65536


Epoch: 0:   0%|          | 0/54 [00:00<?, ?it/s, loss=3.49, acc=0]


Tokens to intervene:
['In 1965, there', 'In 2030, there', 'In 1991, there']
['In 2034, there', 'In 2007, there', 'In 2028, there']
Base: ['5', '0', '1', '1', '0', '3', '1', '0', '3', '6', '0', '7', '3', '4', '8', '6']
Source: ['4', '7', '8', '1', '1', '3', '3', '3', '1', '7', '7', '3', '1', '8', '4', '4']
Output: [' was', ' will', ' was', ' was', ' were', ' were', ' was', ' will', ' was', ' will', ' were', ' were', ' will', ' will', ' was', ' will']
Label     : [' will', ' was', ' will', ' will', ' will', ' will', ' will', ' was', ' will', ' was', ' will', ' will', ' was', ' was', ' will', ' was']
Base Label: [' was', ' will', ' was', ' was', ' were', ' were', ' was', ' will', ' was', ' will', ' were', ' were', ' will', ' will', ' was', ' will']

Tokens to intervene:
['In 1970, there', 'In 2049, there', 'In 2044, there']
['In 2037, there', 'In 1970, there', 'In 1963, there']
Base: ['0', '9', '4', '3', '6', '3', '4', '8', '6', '3', '9', '4', '0', '7', '4', '8']
Source: ['7', '0', '3', 

Epoch: 0:   7%|▋         | 4/54 [00:00<00:04, 10.71it/s, loss=3.27, acc=0.05]


Tokens to intervene:
['In 2037, there', 'In 2026, there', 'In 2034, there']
['In 2038, there', 'In 1957, there', 'In 2004, there']
Base: ['7', '6', '4', '4', '8', '6', '6', '6', '1', '8', '3', '1', '4', '4', '4', '3']
Source: ['8', '7', '4', '1', '0', '4', '6', '8', '6', '8', '2', '8', '3', '4', '6', '4']
Output: [' will', ' will', ' will', ' were', ' were', ' will', ' will', ' will', ' was', ' were', ' will', ' will', ' was', ' was', ' was', ' will']
Label     : [' was', ' was', ' was', ' will', ' was', ' was', ' was', ' was', ' will', ' was', ' was', ' was', ' will', ' will', ' will', ' was']
Base Label: [' will', ' will', ' will', ' were', ' will', ' will', ' will', ' will', ' was', ' will', ' will', ' will', ' was', ' was', ' was', ' will']


Epoch: 0: 100%|██████████| 54/54 [00:04<00:00, 12.35it/s, loss=1.64, acc=0.55]
Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.38s/it]


Model saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt


Map: 100%|██████████| 500/500 [00:00<00:00, 908.59 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 7569.39 examples/s]
Test:   3%|▎         | 1/32 [00:00<00:15,  1.99it/s]


 'source-In 2023, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

Inputs:
Base: ['In 1987, there', 'In 1978, there', 'In 1964, there']
Source: ['In 2023, there', 'In 2023, there', 'In 2023, there']
Tokens to intervene:
    Base: ['7', '8', '4', '4', '5', '1', '1', '6', '1', '9', '6', '3', '8', '0', '1', '7']
    Source: ['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '0', '0', '0', '0', '0', '0']
Outputs:
          Base Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' will', ' will', ' will', ' will', '

Test:   6%|▋         | 2/32 [00:01<00:22,  1.33it/s]


 'source-In 2019, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

Inputs:
Base: ['In 2049, there', 'In 2034, there', 'In 2026, there']
Source: ['In 2000, there', 'In 2000, there', 'In 2000, there']
Tokens to intervene:
    Base: ['9', '4', '6', '3', '6', '7', '3', '0', '9', '4', '3', '6', '4', '8', '6', '8']
    Source: ['0', '0', '0', '0', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '5', '5']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Counterfactual Output: [' was', ' was', ' were', ' 

Test:   9%|▉         | 3/32 [00:01<00:18,  1.55it/s]


Inputs:
Base: ['In 2034, there', 'In 2041, there', 'In 2044, there']
Source: ['In 1975, there', 'In 1975, there', 'In 1975, there']
Tokens to intervene:
    Base: ['4', '1', '4', '6', '3', '7', '3', '9', '3', '2', '6', '7', '0', '4', '1', '8']
    Source: ['5', '5', '5', '5', '5', '5', '5', '5', '8', '8', '8', '8', '8', '8', '8', '8']
Outputs:
          Base Output: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
Counterfactual Output: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will']
Labels:
           Base Label: [' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' were', ' was', ' were', ' was', ' were', ' were', ' was', ' was']
 Counterfactual Label: [' was', ' was', ' was', ' was', ' was', ' was', ' was', ' was', ' will', ' will', ' will', ' will', ' will', ' will', ' will', ' will

Test:  16%|█▌        | 5/32 [00:03<00:18,  1.49it/s]


 'source-In 2046, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.2}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1950, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  19%|█▉        | 6/32 [00:04<00:19,  1.32it/s]


 'source-In 2022, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1985, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  22%|██▏       | 7/32 [00:05<00:20,  1.24it/s]


 'source-In 2008, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2042, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  25%|██▌       | 8/32 [00:05<00:17,  1.39it/s]


 'source-In 1953, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.9, 'token_accuracy': 0.9, 'class_0_accuracy': 0.9}, 'accuracy': 0.9}}

 'source-In 2045, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  28%|██▊       | 9/32 [00:06<00:17,  1.28it/s]


 'source-In 2006, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  31%|███▏      | 10/32 [00:07<00:15,  1.45it/s]


 'source-In 2040, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 1976, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  34%|███▍      | 11/32 [00:08<00:15,  1.32it/s]


 'source-In 1990, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  38%|███▊      | 12/32 [00:09<00:16,  1.24it/s]


 'source-In 2005, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2039, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  41%|████      | 13/32 [00:09<00:13,  1.40it/s]


 'source-In 2025, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2021, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  47%|████▋     | 15/32 [00:11<00:12,  1.39it/s]


 'source-In 2010, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1992, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  50%|█████     | 16/32 [00:11<00:12,  1.28it/s]


 'source-In 1959, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2032, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  53%|█████▎    | 17/32 [00:12<00:12,  1.22it/s]


 'source-In 1989, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1968, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.1}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  56%|█████▋    | 18/32 [00:13<00:10,  1.40it/s]


 'source-In 2027, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}

 'source-In 2018, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  62%|██████▎   | 20/32 [00:14<00:08,  1.44it/s]


 'source-In 1982, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}

 'source-In 2029, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  66%|██████▌   | 21/32 [00:15<00:08,  1.31it/s]


 'source-In 1962, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2035, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.7}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  69%|██████▉   | 22/32 [00:16<00:08,  1.22it/s]


 'source-In 1986, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1967, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}


Test:  72%|███████▏  | 23/32 [00:17<00:06,  1.38it/s]


 'source-In 2014, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1977, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  75%|███████▌  | 24/32 [00:18<00:06,  1.27it/s]


 'source-In 2024, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test:  78%|███████▊  | 25/32 [00:18<00:04,  1.42it/s]


 'source-In 1995, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2011, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  81%|████████▏ | 26/32 [00:19<00:04,  1.30it/s]


 'source-In 1988, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2001, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  88%|████████▊ | 28/32 [00:20<00:02,  1.40it/s]


 'source-In 2012, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2015, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}


Test:  91%|█████████ | 29/32 [00:21<00:02,  1.28it/s]


 'source-In 1983, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 1955, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.9}, 'inv_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'accuracy': 0.1}, 'labels': {'base_outputs': {'accuracy': 0.1, 'token_accuracy': 0.1, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.6}, 'accuracy': 0.7}}


Test:  94%|█████████▍| 30/32 [00:22<00:01,  1.44it/s]


 'source-In 2003, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.7, 'token_accuracy': 0.7, 'class_0_accuracy': 0.7}, 'accuracy': 0.7}}

 'source-In 2047, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 0.5}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'accuracy': 1.0}}


Test: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s]


 'source-In 1999, there-correct-test': {'base_labels': {'base_outputs': {'accuracy': 1.0, 'token_accuracy': 1.0, 'class_0_accuracy': 1.0}, 'inv_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'accuracy': 0.0}, 'labels': {'base_outputs': {'accuracy': 0.0, 'token_accuracy': 0.0, 'class_0_accuracy': 0.0}, 'inv_outputs': {'accuracy': 0.8, 'token_accuracy': 0.8, 'class_0_accuracy': 0.8}, 'accuracy': 0.8}}
Mean IIA: 0.8000
Mean correct IIA: 0.8000
Mean wrong IIA: nan
Saved to /nlp/scr/suzeva/original_year_localization/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim32-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.json





In [74]:
# OLMo-2-0425-1B_stage1-step10000-tokens21B-layer9_9-dim8-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example500_block_output


# Accuracy
inv_dims = [1, 2, 4, 8, 16, 32]

print(f'inv dim: {inv_dims}')
for l in range(10):
  iia = []
  for d in inv_dims:  # OLMo-2-0425-1B_stage1-step10000-tokens21B-layer0_0-dim1-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output_evalall
    split_to_eval_metrics = json.load(open(f'/nlp/scr/suzeva/{FOLDER}/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer{l}_{l}-dim{d}-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output_evalall.json'))
    iia.append(np.mean([v['metrics']['labels']['inv_outputs']['accuracy']
                        for k, v in split_to_eval_metrics.items() if '-correct' in k]).tolist())
  print(f'layer={l}\t' + '\t'.join(map(lambda x: f'{x:.2f}', iia)))

inv dim: [1, 2, 4, 8, 16, 32]
layer=0	0.01	0.10	0.13	0.16	0.19	0.26
layer=1	0.06	0.05	0.17	0.19	0.32	0.42
layer=2	0.23	0.27	0.49	0.52	0.53	0.53
layer=3	0.36	0.51	0.53	0.53	0.56	0.67
layer=4	0.45	0.49	0.54	0.62	0.90	0.96
layer=5	0.31	0.51	0.54	0.69	0.91	0.97
layer=6	0.48	0.51	0.55	0.73	0.90	0.97
layer=7	0.38	0.50	0.54	0.75	0.90	0.97
layer=8	0.33	0.51	0.55	0.72	0.87	0.93
layer=9	0.34	0.51	0.52	0.70	0.79	0.80


# PCA

In [75]:
import collections
import gc
import json
import numpy as np
import re

import h5py
from utils.intervention_utils import remove_all_forward_hooks


def get_last_token_activations(activation_store, repr_type):
  def hook(model, input, output):
    print(f"\n=== Hook called for {repr_type} ===")
    
    if repr_type == 'attention_weight':
      # attention otuput: attention_output, present, (attention_weights)
      print(f"Attention weight output type: {type(output)}, len: {len(output) if hasattr(output, '__len__') else 'N/A'}")
      if len(output) > 2:
        print(f"output[2] shape: {output[2].shape}")
      repr = output[2].detach().cpu()[:, :, -1, :]
    elif repr_type == 'attention_output':
      # attention otuput: attention_output, present, (attention_weights)
      print(f"Attention output type: {type(output)}, len: {len(output) if hasattr(output, '__len__') else 'N/A'}")
      print(f"output[0] shape: {output[0].shape}")
      repr = output[0].detach().cpu()[:, -1, :]
    elif repr_type == 'block_output':
      print(f"Block output type: {type(output)}")
      if isinstance(output, (tuple, list)):
        print(f"Block output len: {len(output)}")
        print(f"output[0] shape: {output[0].shape}")
        print(f"output[0] dtype: {output[0].dtype}")
      else:
        print(f"Block output shape: {output.shape}")
        print(f"Block output dtype: {output.dtype}")
      
      if isinstance(output, (tuple, list)):
        repr = output[0].detach().cpu()[:, -1, :]
      else:
        repr = output.detach().cpu()[:, -1, :]
        
    elif repr_type == 'mlp_output':
      print(f"MLP output shape: {output.shape}")
      print(f"MLP output dtype: {output.dtype}")
      repr = output.detach().cpu()[:, -1, :]
    elif repr_type == 'mlp_input':
      print(f"MLP input type: {type(input)}, len: {len(input) if hasattr(input, '__len__') else 'N/A'}")
      print(f"input[0] shape: {input[0].shape}")
      repr = input[0].detach().cpu()[:, -1, :]
    elif repr_type == 'attention_c_attn_output':
      print(f"Attention c_attn output shape: {output.shape}")
      repr = output.detach().cpu()
    
    print(f"Final repr shape: {repr.shape}")
    print(f"=== End hook for {repr_type} ===\n")
    activation_store.append(repr)
  return hook


def extract_activations_batched(text_inputs, pos, layer, input_max_length, batch_size=128, subspace=None):
  remove_all_forward_hooks(model)
  # Extract residual representations at the last token.
  repr_type = 'block_output'
  
  print(f"\n=== Starting extract_activations_batched ===")
  print(f"Number of text inputs: {len(text_inputs)}")
  print(f"pos: {pos}, layer: {layer}, input_max_length: {input_max_length}, batch_size: {batch_size}")
  print(f"First few text inputs: {text_inputs[:3]}")
  
  initial_shape = tokenizer(text_inputs, padding='longest', return_tensors='pt').input_ids.shape
  print(f"Initial tokenized shape (padding='longest'): {initial_shape}")
  
  encoded_inputs = tokenizer(
          text_inputs, return_tensors='pt', padding='max_length',
          max_length=input_max_length).to(device)
  
  print(f"Encoded inputs shape before truncation: {encoded_inputs.input_ids.shape}")
  
  # Remove the suffix so the target token is the last token.
  for k in encoded_inputs:
    encoded_inputs[k] = encoded_inputs[k][:, :pos + 1]
  
  print(f"Encoded inputs shape after truncation to pos+1={pos+1}: {encoded_inputs.input_ids.shape}")
  print(f"Sample input_ids[0]: {encoded_inputs.input_ids[0]}")
  print(f"Sample attention_mask[0]: {encoded_inputs.attention_mask[0]}")

  all_features = []
  remove_all_forward_hooks(model)
  # Extract residual representations at the last token.
  layer_features = []
  if repr_type == 'attention_weight':
    hook = model.model.layers[layer].self_attn.register_forward_hook(
        get_last_token_activations(layer_features, repr_type))
  elif repr_type == 'block_output':
    print(f"Registering hook on model.model.layers[{layer}]")
    hook = model.model.layers[layer].register_forward_hook(
        get_last_token_activations(layer_features, repr_type))
  
  with torch.no_grad():
    for b_i in range(0, len(encoded_inputs.input_ids), batch_size):
      print(f"\n--- Processing batch {b_i//batch_size + 1} (indices {b_i}:{b_i+batch_size}) ---")
      current_batch_size = min(batch_size, len(encoded_inputs.input_ids) - b_i)
      print(f"Current batch size: {current_batch_size}")
      print(f"Input batch shape: {encoded_inputs.input_ids[b_i:b_i+batch_size].shape}")
      print(f"Attention mask batch shape: {encoded_inputs.attention_mask[b_i:b_i+batch_size].shape}")
      
      _ = model.generate(input_ids=encoded_inputs.input_ids[b_i:b_i+batch_size],
                         attention_mask=encoded_inputs.attention_mask[b_i:b_i+batch_size],
                         max_new_tokens=1,
                         pad_token_id=tokenizer.pad_token_id)
      
      print(f"layer_features length after generation: {len(layer_features)}")
      if layer_features:
        concatenated = torch.cat(layer_features, dim=0)
        print(f"Concatenated layer_features shape: {concatenated.shape}")
        all_features.append(concatenated.tolist())
      layer_features.clear()
      
  hook.remove()
  del hook
  remove_all_forward_hooks(model)
  gc.collect()
  torch.cuda.empty_cache()
  
  print(f"\nall_features length: {len(all_features)}")
  if all_features:
    print(f"all_features[0] shape: {np.array(all_features[0]).shape}")
  
  all_features = np.concatenate(all_features)
  print(f"Final all_features shape: {all_features.shape}")
  
  if subspace is not None:
    subspace_key = list(subspace.keys())[0]
    print(f"Subspace key: {subspace_key}")
    print(f"Subspace shape: {subspace[subspace_key].shape}")
    subspace_features = all_features @ subspace[subspace_key].T.detach().cpu().numpy()
  else:
    subspace_features = all_features
  print('Project to subspace:', subspace_features.shape)
  del encoded_inputs
  gc.collect()
  torch.cuda.empty_cache()
  return subspace_features

# highest IIA
layer, dim = 5, 32

das_subspace = torch.load(f'/nlp/scr/suzeva/{FOLDER}/OLMo-2-0425-1B_stage1-step10000-tokens21B-layer{l}_{l}-dim{d}-daslora_baseline_1tok_das_id-1_len8_pose_ep1_example50_block_output.pt')
input_max_length = 8
batch_size = 8
layer = 4
pos = input_max_length - 3

all_prompts = [prompt_template[0].format(year=y) for y in range(1000, 3000)]
subspace_features = extract_activations_batched(
          all_prompts,
          pos=pos,
          layer=layer,
          input_max_length=input_max_length,
          batch_size=batch_size,
          subspace=das_subspace)


=== Starting extract_activations_batched ===
Number of text inputs: 2000
pos: 5, layer: 4, input_max_length: 8, batch_size: 8
First few text inputs: ['In 1000, there', 'In 1001, there', 'In 1002, there']
Initial tokenized shape (padding='longest'): torch.Size([2000, 6])
Encoded inputs shape before truncation: torch.Size([2000, 8])
Encoded inputs shape after truncation to pos+1=6: torch.Size([2000, 6])
Sample input_ids[0]: tensor([100257, 100257,    644,    220,   1041,     15], device='cuda:0')
Sample attention_mask[0]: tensor([0, 0, 1, 1, 1, 1], device='cuda:0')
Registering hook on model.model.layers[4]

--- Processing batch 1 (indices 0:8) ---
Current batch size: 8
Input batch shape: torch.Size([8, 6])
Attention mask batch shape: torch.Size([8, 6])

=== Hook called for block_output ===
Block output type: <class 'torch.Tensor'>
Block output shape: torch.Size([8, 6, 2048])
Block output dtype: torch.bfloat16
Final repr shape: torch.Size([8, 2048])
=== End hook for block_output ===

lay

In [None]:


# PCA & cluster

import numpy as np
from sklearn.decomposition import PCA


Xs = subspace_features
Ys = [{'year': i + 1000,
       'frequency': all_prompts[i] in data_split['train']['correct'],
       'pred_past': sum([s[1] for s in outputs[i] if s[0] in (' was', ' were')]),
       'pred_future': sum([s[1] for s in outputs[i] if s[0] in (' is', ' are', ' will')]),
       'pred_diff': sum([s[1] for s in outputs[i] if s[0] in (' was', ' were')]) - sum([s[1] for s in outputs[i] if s[0] in (' is', ' are', ' will')])}
      for i in range(len(Xs))]
colors =  [k for k in Ys]
X = (Xs - np.mean(Xs, axis=0, keepdims=True)) / Xs.std(axis=0)
print(X.mean(), X.std())
pca = PCA(n_components=4)
pca.fit(X)

X_reduced = pca.transform(X)
print(X_reduced.shape, len(Ys))
print(pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_))

3.8546943414985435e-16 1.0000000000000002
(2000, 4) 2000
[0.29950969 0.10718914 0.08127037 0.06603004] 0.5539992376554456


In [87]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import pandas as pd


def plot_pca_2d(vectors, color_labels, hue, style, rerun_pca=False):
  global pca
  figsize = (8, 5)
  is_large_plot = figsize[0] * figsize[1] > 30
  plt.rcParams['figure.dpi'] = 300 if is_large_plot else 100
  plt.rcParams['savefig.dpi'] = 300 if is_large_plot else 100
  plt.rc('font', **{'size': 6})
  plt.figure(figsize=figsize)

  if rerun_pca:
    X = vectors
    X = (X - np.mean(X, axis=0, keepdims=True)) / X.var(axis=0)**0.5
    pca = PCA(n_components=2)
    pca.fit(X)

  X_2d = pca.transform(vectors)
  fig, ax = plt.subplots()
  pc_index = [0, 1]
  data = pd.DataFrame([dict(list({f'pc_{d}': X_2d[i][d] for d in range(len(X_2d[i]))}.items()) +
                                          list(color_labels[i].items()))
                                     for i in range(len(X_2d))])
  sns.scatterplot(data=data,
                  x=f"pc_{pc_index[0]}", y=f"pc_{pc_index[1]}", hue=hue, style=style, ax=ax,
                  #palette=sns.color_palette("mako", as_cmap=True)
                  )
  # Annotate data point
  for i in range(0, data.shape[0]):
    # Annotate outliers.
    # These values are hard-coded based on the actual plot.
    if (data[f"pc_{pc_index[0]}"][i] > 4.0 or data[f"pc_{pc_index[1]}"][i] > 3.5 or
        data[f"pc_{pc_index[0]}"][i] < -2.7 or data[f"pc_{pc_index[1]}"][i] < -2.0):
      ax.text(data[f"pc_{pc_index[0]}"][i] - 0.01, data[f"pc_{pc_index[1]}"][i]- 0.01,
              data['year'][i], fontsize=6, color='white', fontweight='bold')
      ax.text(data[f"pc_{pc_index[0]}"][i], data[f"pc_{pc_index[1]}"][i],
              data['year'][i], fontsize=6)

  plt.title('Hidden Representations')
  plt.xlabel(f'Principal Component {pc_index[0] + 1}')
  plt.ylabel(f'Principal Component {pc_index[1] + 1}')
  
  # save the plot
  safe_model_id = model_id.replace('/', '_')
  plt.savefig(f'{safe_model_id}_{revision}_pca_{hue}_{style}_layer{l}_dim{d}.png')
  plt.close()

  return X_2d

In [88]:
plot_pca_2d(X, colors, hue='pred_diff', style='frequency')

array([[-0.08051292, -4.41756373, -2.00730782, -0.62249469],
       [ 2.44598896, -5.26863018, -2.04760125, -0.21617198],
       [ 2.06247985, -4.51187507, -2.53785955,  2.03584621],
       ...,
       [-3.07513631,  1.3689811 , -0.67038972, -1.38367814],
       [-3.33637004,  1.32143246, -1.37401658, -1.1960616 ],
       [-3.90143788,  1.64189494, -0.32130079, -1.84068863]],
      shape=(2000, 4))

<Figure size 2400x1500 with 0 Axes>

In [89]:
plot_pca_2d(X, colors, hue='year', style='frequency')

array([[-0.08051292, -4.41756373, -2.00730782, -0.62249469],
       [ 2.44598896, -5.26863018, -2.04760125, -0.21617198],
       [ 2.06247985, -4.51187507, -2.53785955,  2.03584621],
       ...,
       [-3.07513631,  1.3689811 , -0.67038972, -1.38367814],
       [-3.33637004,  1.32143246, -1.37401658, -1.1960616 ],
       [-3.90143788,  1.64189494, -0.32130079, -1.84068863]],
      shape=(2000, 4))

<Figure size 2400x1500 with 0 Axes>