In [1]:
import pandas as pd
import os
import pickle
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import ast
import numpy as np
import math

In [2]:
# Shared notebook state; the loading cell below fills `models` and `df`.
models = []          # one entry per result file (file name without extension)
df = pd.DataFrame()  # gets '<model>_main' and '<model>_stochastic' columns
# How many 'stochastic_<i>' generations are read for every record.
number_of_stochastic_responses = 20

In [3]:
def calculate_std_deviation(lst):
    """Standard deviation of answer labels after numeric encoding.

    'A', 'B' and 'C' are encoded as 1, 2 and 3. ``None`` and every other
    label are encoded as 0. The spread is computed with ``np.std`` using its
    default arguments.

    Args:
        lst (iterable): answer labels (each entry must be hashable)

    Returns:
        (float) : standard deviation of the encoded labels
    """
    label_to_score = {'A': 1, 'B': 2, 'C': 3, None: 0}
    # Unknown labels share the score of a missing answer (0).
    scores = [label_to_score.get(label, 0) for label in lst]
    return np.std(scores)

def response_parser(s):
    """Extract the answer option from a generated text.

    Only the FIRST uppercase ASCII letter in ``s`` is inspected: it is
    returned when it is 'A', 'B' or 'C'. Any other first capital (for example
    the 'T' in "The answer is B"), or no capital at all, yields ``None``.

    Args:
        s (str): generated text

    Returns:
        (str or None) : 'A', 'B', 'C', or None when no option is recognised
    """
    first_capital = re.search(r'[A-Z]', s)
    if first_capital is None:
        return None
    letter = first_capital.group(0)
    return letter if letter in ('A', 'B', 'C') else None



def get_most_frequent_value(arr):
    """Return the most common element of ``arr``; ``None`` counts like any other value.

    Args:
        arr (iterable): hashable values, possibly containing None

    Returns:
        The element with the highest count (ties go to the element seen
        first), or None when ``arr`` is empty.
    """
    counter = Counter(arr)
    # An empty counter is the only case with no winner. Check it explicitly
    # instead of hiding every possible error behind a bare `except`.
    if not counter:
        return None
    most_frequent_value, _ = counter.most_common(1)[0]
    return most_frequent_value

def get_most_frequent_non_none_value(arr):
    """Return the most common element of ``arr``, ignoring ``None`` entries.

    Args:
        arr (iterable): hashable values, possibly containing None

    Returns:
        The most frequent non-None element (ties go to the element seen
        first), or None when ``arr`` is empty or holds only None values.
    """
    counter = Counter(x for x in arr if x is not None)
    # Nothing left after dropping the None entries, so there is no majority value.
    if not counter:
        return None
    most_common_value, _ = counter.most_common(1)[0]
    return most_common_value


def get_accuracy(pred, true):
    """
    Fraction of positions at which ``pred`` and ``true`` hold equal values.

    Args:
        pred (list): one label per item (entries may be None)
        true (iterable): labels to compare against, aligned with ``pred``

    Returns:
        (float) : number of exact matches divided by ``len(pred)``;
                  NaN when ``pred`` is empty

    Note:
        Pairs are formed with ``zip``, so comparison stops at the shorter
        input. The denominator is always ``len(pred)``.
    """
    total = len(pred)
    if total == 0:
        # No items to score: report "undefined" instead of raising ZeroDivisionError.
        return float('nan')
    matches = sum(1 for x, y in zip(pred, true) if x == y)
    return matches / total


def get_accuracy_none(pred, true):
    """
    Like ``get_accuracy``, but a pair in which BOTH values are None is not
    counted as a match.

    Args:
        pred (list): one label per item (entries may be None)
        true (iterable): labels to compare against, aligned with ``pred``

    Returns:
        (float) : number of non-None exact matches divided by ``len(pred)``;
                  NaN when ``pred`` is empty

    Note:
        Both-None pairs stay in the denominator, so they count as mismatches.
    """
    total = len(pred)
    if total == 0:
        # No items to score: report "undefined" instead of raising ZeroDivisionError.
        return float('nan')
    count = 0
    for x, y in zip(pred, true):
        if x is None and y is None:
            continue
        if x == y:
            count += 1
    return count / total


def _stochastic_row_to_list(arr):
    """Return one row of sampled responses as a list (a string row is parsed as a Python literal first)."""
    if isinstance(arr, str):
        return list(ast.literal_eval(arr))
    return list(arr)


def get_major_stochastic_response(stochastic_response, consider_nan = False):
    """Majority vote over each row of sampled responses.

    Rows may be real sequences (as built in memory) or their string
    representation; both forms are accepted in both modes.

    Args:
        stochastic_response (iterable): one row of sampled responses per item
        consider_nan (bool): when True, None takes part in the vote like any
            other value; when False, None entries are dropped before voting

    Returns:
        (list) : the winning value of every row (None if the row has no winner)
    """
    major_stochastic_response = []
    for arr in stochastic_response:
        if consider_nan:
            try:
                values = _stochastic_row_to_list(arr)
            except (ValueError, SyntaxError, TypeError):
                # Best effort, as before: a row that cannot be parsed is
                # voted on as it is.
                values = arr
            major_stochastic_response.append(get_most_frequent_value(values))
        else:
            # Rows that are already lists used to raise ValueError here,
            # because ast.literal_eval only accepts strings / AST nodes.
            major_stochastic_response.append(
                get_most_frequent_non_none_value(_stochastic_row_to_list(arr)))

    return major_stochastic_response

def get_none_value_perc(arr):
    """Share of entries in ``arr`` that are ``None``.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage. An empty ``arr`` raises ZeroDivisionError.
    """
    none_count = sum(1 for elem in arr if elem is None)
    return none_count / len(arr)

def get_none_value_from_response(stochastic_response):
    """Fraction of ``None`` entries in every row of sampled responses.

    Args:
        stochastic_response (iterable): rows of sampled responses; each row
            is either a sequence or the string representation of one

    Returns:
        (list) : one None-fraction (0..1) per row
    """
    none_perc_response = []
    for arr in stochastic_response:
        # ast.literal_eval only accepts strings / AST nodes, so a row that is
        # already a list is used directly instead of raising ValueError.
        values = ast.literal_eval(arr) if isinstance(arr, str) else arr
        none_perc_response.append(get_none_value_perc(list(values)))
    return none_perc_response

In [4]:
# NOTE(review): machine-specific absolute path; consider making it configurable.
output_directory = '/home/prasoon/snap/main/mtp/llm-science-miscommunication/results/open'

# Sorted listing: os.listdir returns entries in arbitrary order, which would
# make the order of `models` and of the df columns differ between runs.
for model_file in sorted(os.listdir(output_directory)):
    model_path = os.path.join(output_directory, model_file)
    if not os.path.isfile(model_path):
        # Sub-directories cannot be opened as result files.
        continue

    # Strip the real extension instead of assuming it is exactly 4 characters long.
    model_name = os.path.splitext(model_file)[0]
    # Keeps the cell idempotent: re-running it must not duplicate model names.
    if model_name not in models:
        models.append(model_name)

    # SECURITY: pickle.load can execute arbitrary code; only load result
    # files that come from a trusted source.
    with open(model_path, 'rb') as f:
        result = pickle.load(f)
    model_main = []
    model_stochastic = []

    # `result` is indexed by position 0..len(result)-1; every record holds a
    # 'main' generation and 'stochastic_<i>' generations.
    for index in tqdm(range(len(result))):
        response = result[index]
        main_response = response_parser(response['main']['generated_text'])
        model_main.append(main_response)
        stochastic_responses = []
        for stochastic_index in range(number_of_stochastic_responses):
            stochastic_responses.append(response_parser(response['stochastic_'+str(stochastic_index)]['generated_text']))
        model_stochastic.append(stochastic_responses)
    df[model_name + '_main'] = model_main
    df[model_name + '_stochastic'] = model_stochastic

100%|██████████| 742/742 [00:00<00:00, 19581.55it/s]


100%|██████████| 742/742 [00:00<00:00, 23994.62it/s]
100%|██████████| 742/742 [00:00<00:00, 25951.22it/s]
100%|██████████| 742/742 [00:00<00:00, 11373.12it/s]
100%|██████████| 742/742 [00:00<00:00, 15840.69it/s]
100%|██████████| 742/742 [00:00<00:00, 28258.32it/s]
100%|██████████| 742/742 [00:00<00:00, 28530.33it/s]
100%|██████████| 742/742 [00:00<00:00, 28839.92it/s]
100%|██████████| 742/742 [00:00<00:00, 32913.90it/s]
100%|██████████| 742/742 [00:00<00:00, 28502.89it/s]
100%|██████████| 742/742 [00:00<00:00, 24971.10it/s]
100%|██████████| 742/742 [00:00<00:00, 18106.66it/s]
100%|██████████| 742/742 [00:00<00:00, 17922.11it/s]


In [5]:
# Load the evaluation table; the cells below read its 'answer' column as the
# label that model responses are compared against.
# NOTE(review): machine-specific absolute path; consider making it configurable.
data = pd.read_csv('/home/prasoon/snap/main/mtp/llm-science-miscommunication/data/data.csv')

In [6]:
# Per-model metric dictionaries, filled in by the evaluation loop.
model_results = {}
# One (initially empty) result dictionary per subject.
subject_wise_results = {
    subject: {}
    for subject in ('physics', 'mathematics', 'chemistry', 'theoretical_cs')
}

# target keys <- information
keys = [
    'main_response_accuracy',
    'major_stochastic_response_accuracy',
    'mean_variance_stochastic_response',
    'open_answer_abstinence',
    'closed_main_response_accuracy',
    'closed_major_stochastic_response_accuracy',
    'main_response_stochastic_response_agreeability',
    'main_response_stochastic_response_agreeability_without_none',
]

# Rows whose answer is 'C' form the "open" subset; all other rows are "closed".
is_open_answer = data['answer'] == 'C'
open_answer_index = data.index[is_open_answer].tolist()
closed_answer_index = data.index[~is_open_answer].tolist()

In [7]:
# The reference labels and their closed/open subsets do not depend on the
# model, so they are built once here. Previously list(data['answer']) was
# rebuilt for every index of every model, which is quadratic in the row count.
true_answers = list(data['answer'])
closed_true_answers = [true_answers[index] for index in closed_answer_index]
open_true_answers = [true_answers[index] for index in open_answer_index]

for model in models:
    # collecting the main_response and replacing nan with None
    # .... to check the agreeability of the main response and major stochastic responses
    main_response = list(df[model + '_main'])
    for index, value in enumerate(main_response):
        try:
            if math.isnan(value):
                main_response[index] = None
        except TypeError:
            # strings and None are not numbers: nothing to replace
            continue

    # collecting the major stochastic responses
    #.... None takes part in the vote, so a row may have None as its majority value
    stochastic_response = list(df[model + '_stochastic'])
    major_stochastic_response = get_major_stochastic_response(stochastic_response, consider_nan = True)

    # getting 'main_response_accuracy' & 'stochastic_response_accuracy'
    main_response_accuracy = get_accuracy(main_response, true_answers)
    major_stochastic_response_accuracy = get_accuracy(major_stochastic_response, true_answers)

    # getting 'agreeability' of the main_response and stochastic_response
    main_response_stochastic_response_agreeability = get_accuracy(main_response, major_stochastic_response)
    # Here pairs where BOTH the main response and the major stochastic
    # response are None are not counted as agreement; they stay in the
    # denominator, so they are effectively treated as mismatches.
    main_response_stochastic_response_agreeability_without_none = get_accuracy_none(main_response, major_stochastic_response)

    # accuracy restricted to the closed subset
    closed_main_response = [main_response[index] for index in closed_answer_index]
    closed_major_stochastic_response = [major_stochastic_response[index] for index in closed_answer_index]
    closed_main_response_accuracy = get_accuracy(closed_main_response, closed_true_answers)
    closed_major_stochastic_response_accuracy = get_accuracy(closed_major_stochastic_response, closed_true_answers)

    # accuracy restricted to the open subset
    open_main_response = [main_response[index] for index in open_answer_index]
    open_major_stochastic_response = [major_stochastic_response[index] for index in open_answer_index]
    open_main_response_accuracy = get_accuracy(open_main_response, open_true_answers)
    open_major_stochastic_response_accuracy = get_accuracy(open_major_stochastic_response, open_true_answers)

    # Spread of the sampled answers per row. Despite the "variance" naming,
    # calculate_std_deviation returns a standard deviation.
    variance_stochastic_response = []
    for arr in stochastic_response:
        try:
            values = ast.literal_eval(arr)
        except (ValueError, SyntaxError, TypeError):
            # Rows that are already lists (or cannot be parsed) are used as they are.
            values = arr
        variance_stochastic_response.append(calculate_std_deviation(values))

    mean_variance_stochastic_response = np.mean(variance_stochastic_response)

    # Same keys and insertion order as before, so printed output is unchanged.
    model_results[model] = {
        'mean_variance_stochastic_response': mean_variance_stochastic_response,
        'main_response_accuracy': main_response_accuracy,
        'major_stochastic_response_accuracy': major_stochastic_response_accuracy,
        'main_response_stochastic_response_agreeability': main_response_stochastic_response_agreeability,
        'main_response_stochastic_response_agreeability_without_none': main_response_stochastic_response_agreeability_without_none,
        'closed_main_response_accuracy': closed_main_response_accuracy,
        'closed_major_stochastic_response_accuracy': closed_major_stochastic_response_accuracy,
        'open_main_response_accuracy': open_main_response_accuracy,
        'open_major_stochastic_response_accuracy': open_major_stochastic_response_accuracy,
    }

In [8]:
# Dump every model's metric dictionary, separated by a dashed rule.
for model, metrics in model_results.items():
    print(model)
    print(metrics)
    print('----------------')

meta-llama-3-70b-instruct
{'mean_variance_stochastic_response': 0.2947699876330266, 'main_response_accuracy': 0.628032345013477, 'major_stochastic_response_accuracy': 0.6226415094339622, 'main_response_stochastic_response_agreeability': 0.9663072776280324, 'main_response_stochastic_response_agreeability_without_none': 0.9582210242587601, 'closed_main_response_accuracy': 0.7803921568627451, 'closed_major_stochastic_response_accuracy': 0.7843137254901961, 'open_main_response_accuracy': 0.29310344827586204, 'open_major_stochastic_response_accuracy': 0.2672413793103448}
----------------
llama-2-13b-hf
{'mean_variance_stochastic_response': 0.8262193593294048, 'main_response_accuracy': 0.3274932614555256, 'major_stochastic_response_accuracy': 0.3611859838274933, 'main_response_stochastic_response_agreeability': 0.5431266846361186, 'main_response_stochastic_response_agreeability_without_none': 0.5363881401617251, 'closed_main_response_accuracy': 0.4764705882352941, 'closed_major_stochastic_re