In [None]:
# get jsonl

import json

# Path to your JSON file
path = "lsat_ar_test_Meta-Llama-3.1-70B-Instruct_results.jsonl"

# Open and load into a Python dict
# Load all lines into a list of dicts
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]




In [None]:
import json5
import pandas as pd
import re
import ast
import numpy as np

def quick_parse(s):
    try:
        input = re.findall(r"\{.*?\}", s, flags=re.DOTALL)[0]
        my_dict = str(ast.literal_eval(input))

        return json5.loads(my_dict) # Had to combine both ast and json5 to finally get it to work right
        
    except:
        return {}


# --- Universal parser ---
def parse_response(entries= entries, fields=fields, model_type = None, stats= True): 
    data = {field: [] for field in fields}
    parsed_list = []
    #content_list = []

    for entry in entries:
        content = fix_reasoning_quotes(entry)
        #content_list.append(content)
        content_dict = quick_parse(content)
        for field in fields:
            try:
                data[field].append(content_dict[field])
            except:
                data[field].append(None)
        

    df = pd.DataFrame(data=data)
    df["coerce"] = df.notna().any(axis=1)
    df['content'] = entries

    return df


def read_json_lines(file_path):
    # Load all lines from the file
    entries = []
    skiped_lines = 0
    with open(file_path, "r") as f:
        for line in f:
            try:
                entries.append(json.loads(line))
            except:
                skiped_lines += 1
                continue

    return entries, skiped_lines


def read_json(file_path):
    # Load all lines from the file
    with open(file_path, "r") as f:
        entries = json.load(f)
    return entries, 0

def get_content(output, model_type:str) -> list:
    content = []
    qid_list = []
    for entry in output:
        try:
            if model_type == "GPT":
                qid_list.append(entry['custom_id'])  
                content.append(entry['response']['body']['choices'][0]['message']['content'])
            if model_type == "Gemini":
                qid_list.append(entry['key'])
                content.append(entry['response']['candidates'][0]['content']['parts'][0]['text'])
            if model_type == "Claude":
                qid_list.append(entry['custom_id'])
                content.append(entry['result']['message']['content'][0]['text'])
            if model_type == "Llama":
                qid_list.append(entry["qid"])
                content.append(entry["ouput"]["content"].replace('<|eot_id|>', ''))      #I misspelled 'output' just keep in mind when debugging
                # --- Get token #
                
                
        except:
            content.append("GENERATION FAILED")

    return content, qid_list

def special_parser(df, model_name):                               # --- To do 
    pass

def llama_get_token_dict(answer: str, entry: str, content:str):
    pattern = r'"(' + re.escape(answer) + r')"'
    match = re.search(pattern, content)

    answer_index = content.find(str(answer))
    if match is None: 
        return {} #
    #print(content) #


    answer_index = match.start() #+ 1

    #print(f'Answer: {answer:<10} | Answer Index: {answer_index}')
    # Find Token 
    position = 0

    # Find answer token in JSON
    position = 0
    response = ""
    while len(response) < answer_index:
        token = entry['ouput']['tokens'][position]['top_token'] 
        response += token 
        position += 1 


    if token.strip() == answer:
        token_dict = entry['ouput']['tokens'][position - 1]['top_100']
        return token_dict, position
    
    else:
        token_dict = entry['ouput']['tokens'][position ]['top_100']
        return token_dict, position




def fix_reasoning_quotes(s):
    # add quotes if missing
    text = s.replace('"Reasoning": The', '"Reasoning": "The')#.replace('. "Ans', '.", "Ans')

    pattern = r'("Reasoning"\s*:\s*)"([\s\S]*?)"(\s*[,\}])'

    def repl(m: re.Match) -> str:
        # m.group(2) = reasoning text
        return f'{m.group(1)}"""{m.group(2)}"""{m.group(3)}'

    return re.sub(pattern, repl, text, flags=re.DOTALL)


In [52]:

fields = ['Reasoning', 'Answer', 'Confidence']
model_type = 'Llama'
entries, qid = get_content(data, model_type= 'Llama')

df = parse_response(entries=entries, fields= fields, model_type= model_type)
    
df['Answer'].notna().mean()


0.9219512195121952

In [55]:
answers = df['Answer']
fields = ['A', 'B', 'C', 'D', 'E']

top_token_list = []
pos_list = []
probs = []
i = 0
for entry in data:
    try:
        answer = df['Answer'][i]
        token_dict, pos = llama_get_token_dict(answer= answer, entry = entry, content = df['content'][i])

        top_token = max(token_dict, key=token_dict.get)
        top_token_list.append(top_token)
        pos_list.append(int(pos))

        probs.append(field_probs(fields= fields, token_dict = token_dict))

    except:
        top_token_list.append(None)
        pos_list.append(None)
    i += 1

probs_df = pd.DataFrame(probs)
df['t1'] = top_token_list
df['index'] = pos_list
df['match'] = df['t1'].str.strip() == df['Answer'].str.strip()

In [60]:
df[df['t1'].notna()]['match'].mean()
df['t1'].notna().mean()
combined = pd.concat([df, probs_df], axis=1)
combined

Unnamed: 0,Reasoning,Answer,Confidence,coerce,content,t1,index,match,A_prob,B_prob,C_prob,D_prob,E_prob
0,"To determine the correct schedule, we need to ...",E,,True,"{\n""Reasoning"": ""To determine the correct sche...",E,127.0,True,0.000049,0.000102,0.000169,0.000043,0.999637
1,Given that Kyle and Lenore do not give reports...,E,,True,"{\n""Reasoning"": ""Given that Kyle and Lenore do...",E,130.0,True,0.006104,0.001921,0.000568,0.011766,0.979640
2,"To solve this question, we need to analyze the...",B,,True,"{\n""Reasoning"": ""To solve this question, we ne...",B,208.0,True,0.000133,0.999476,0.000102,0.000174,0.000114
3,"Given the conditions, we know that George can ...",D,,True,"{\n""Reasoning"": ""Given the conditions, we know...",D,145.0,True,0.000755,0.000207,0.000052,0.998763,0.000223
4,"Given the conditions, we know that Kyle gives ...",B,,True,"{\n""Reasoning"": ""Given the conditions, we know...",B,110.0,True,0.039456,0.710395,0.017791,0.216658,0.015700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,,,,False,"{\n""Reasoning"": ""To determine which option cou...",,,False,,,,,
201,"To determine the correct order, we need to con...",D,,True,"{\n""Reasoning"": ""To determine the correct orde...",D,106.0,True,,,,,
202,To determine which audition cannot be the seco...,D,,True,"{\n""Reasoning"": ""To determine which audition c...",D,115.0,True,,,,,
203,"Given the conditions, we can deduce the follow...",C,,True,"{\n""Reasoning"": ""Given the conditions, we can ...",C,174.0,True,,,,,


In [54]:
fields = ['A', 'B', 'C', 'D', 'E']

def field_probs(fields: list, token_dict: dict):
    # Step 1: collapse similar tokens (case-insensitive)
    collapsed = {}
    for token, val in token_dict.items():
        canonical = token.strip().lower()
        collapsed[canonical] = collapsed.get(canonical, 0.0) + val

    # Step 2: pick only the fields we care about
    values = {}
    for field in fields:
        canonical = field.strip().lower()
        values[field] = collapsed.get(canonical, 0.0)

    # Step 3: normalize to probabilities
    total = sum(values.values())
    if total == 0:
        return {f"{field}_prob": 0.0 for field in fields}

    return {f"{field}_prob": val / total for field, val in values.items()}

field_probs(fields= fields, token_dict= token_dict)

{'A_prob': 0.00025296283615923736,
 'B_prob': 0.00019097382824085686,
 'C_prob': 0.9990272494142038,
 'D_prob': 0.00023770400344086677,
 'E_prob': 0.0002911099179551638}