In [None]:
'''
This notebook conducts a pairwise comparison of the QMSum summaries, using GPT-4o-Mini as a judge.

1. The first part of the notebook holds the helper functions.
2. The second part of the notebook conducts the comparison of summaries of non-cleaned transcripts (find COMPARISON_FOR_NON_CLEANED).
3. The third part of the notebook conducts the comparison of summaries from cleaned transcripts against summaries of non-cleaned transcripts (find COMPARISON_FOR_CLEANED).
4. In the end there is also a small test to check the functionality -- not needed for the assessment.

To run part 2 or 3, first run the boxes of the helper functions, and then one of the evaluations.
'''

In [None]:
'''
START OF HELPER FUNCTIONS
'''

In [1]:
import json
import os
import random
import re
import time

import numpy as np
import scipy.stats as stats
from openai import AzureOpenAI
from tqdm import tqdm

import config

In [None]:
# Root folder under which the notebook looks for its data and results.
BASE_DIR = config.BASE_PATH

# Single Azure OpenAI client shared by every judge request in this notebook;
# the credentials and endpoint are read from the local `config` module.
client = AzureOpenAI(
    azure_endpoint=config.AZURE_OPENAI_API_ENDPOINT,
    api_version=config.AZURE_OPENAI_API_VERSION,
    api_key=config.AZURE_OPENAI_API_KEY,
)

In [5]:
def prepare_message_query_summ(summ_ref, summ_1, summ_2, query):
    """Build the chat messages that ask the judge to compare two query-focused summaries.

    summ_ref is the reference summary, summ_1 / summ_2 are the two system
    summaries (shown to the judge as "System 1" and "System 2"), and query is
    the query the summaries were written for.
    Returns a two-element list: the system instruction and the user message.
    """
    # The instruction sentences are separated by a run of nine spaces, which is
    # exactly what the prompt has always contained; keep it so the judge sees
    # an unchanged prompt.
    separator = " " * 9
    instruction_parts = (
        "You will be given a query-focused reference summary of a conversation, as well as two summaries written by automatic systems. Your task is to decide which of the two system summaries is better, with respect to the reference summary and the query. If it is difficult to decide which summary has better overall quality, then you may say that there is a tie.",
        "First explain briefly the reasoning for your choice, and then provide an answer as 1, 2 or tie.",
        "The output should be in the following format:",
        "Explanation: <your reasoning>",
        "Response: <1, 2 or tie>",
    )
    system_message = {"role": "system", "content": separator.join(instruction_parts)}
    user_message = {
        "role": "user",
        "content": f"Query: {query}\nReference summary: {summ_ref}\nSystem 1 summary: {summ_1}\nSystem 2 summary: {summ_2}",
    }
    return [system_message, user_message]

def prepare_message_generic_summ(summ_ref, summ_1, summ_2):
    """Build the chat messages that ask the judge to compare two generic summaries.

    summ_ref is the reference summary; summ_1 / summ_2 are the two system
    summaries (shown to the judge as "System 1" and "System 2").
    Returns a two-element list: the system instruction and the user message.
    """
    # Nine spaces between instruction sentences reproduce the prompt text
    # exactly as it has always been sent to the judge.
    separator = " " * 9
    system_text = separator.join([
        "You will be given a generic reference summary of a conversation, as well as two summaries written by automatic systems. Your task is to decide which of the two system summaries is better, with respect to the reference summary. If it is difficult to decide which summary has better overall quality, then you may say that there is a tie.",
        "First explain briefly the reasoning for your choice, and then provide an answer as 1, 2 or tie.",
        "The output should be in the following format:",
        "Explanation: <your reasoning>",
        "Response: <1, 2 or tie>",
    ])
    user_text = f"Reference summary: {summ_ref}\nSystem 1 summary: {summ_1}\nSystem 2 summary: {summ_2}"
    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

def pairwise_comparison_gpt(summ_ref, summ_1, summ_2, query='', model_name="gpt-4o-mini"):
    """Ask the judge model which of two system summaries is better.

    summ_ref is the reference summary, summ_1 / summ_2 are the system summaries
    in the order they are presented to the judge, query is the query of a
    query-focused summary ('' for a generic summary), and model_name is the
    deployment passed to the chat-completions request.

    Returns (preference, explanation), where preference is '1', '2' or 'tie'.
    Both values are '' when the request fails or when no well-formed answer was
    obtained after three attempts.
    """
    # prepare the GPT prompts for this request:
    if query == '':  # generic summary
        prompt_messages = prepare_message_generic_summ(summ_ref, summ_1, summ_2)
    else:  # query focused summary
        prompt_messages = prepare_message_query_summ(summ_ref, summ_1, summ_2, query)

    max_tries = 3
    for num_tries in range(1, max_tries + 1):  # try up to three times to get a properly formatted answer
        try:
            response = client.chat.completions.create(
                model=model_name,  # e.g. "gpt-4o-mini", "gpt-4o"
                messages=prompt_messages
            )
        except Exception as ex:
            # A failed request is not retried; the caller records an empty preference.
            print(f'--- ERROR ---\nSomething went wrong with the request.\n{ex}\n----------')
            break

        raw_message = None  # kept for the error report below, even if the response has no choices
        try:
            raw_message = response.choices[0].message
            judged_text = raw_message.content
            if not isinstance(judged_text, str):
                raise ValueError('The response has no text content.')

            explanation_part, marker, verdict_part = judged_text.partition('Response:')
            if not marker:
                raise ValueError('Did not get a "Response:" section.')

            # Take the FIRST standalone verdict token after "Response:". Checking for
            # substrings in a fixed order would misread e.g. "2 (better than system 1)" as '1'.
            verdict = re.search(r'\b(1|2|tie)\b', verdict_part, flags=re.IGNORECASE)
            if verdict is None:
                raise ValueError('Did not get a preference.')

            explanation = explanation_part.strip()
            if explanation.startswith('Explanation:'):
                explanation = explanation[len('Explanation:'):].strip()

            return verdict.group(1).lower(), explanation
        except Exception:
            print(f'--- ERROR ---\nThe response seems to be in the wrong format.\n\n{raw_message}\n')
            if num_tries < max_tries:
                print('Retrying...\n----------\n')
            else:
                print('No more tries!\n----------\n')

    return '', ''

def compute_pairwise_overall(summ_ref, summs_sys, query='', 
                             anchored_sys_summ_name=None, existing_results=None,
                             print_responses=False):
    """Run the GPT judge on pairs of system summaries and tally a score per system.

    Scoring: a win gives the winning system +2, a tie gives each system +1, summed
    over all comparisons (method from https://aclanthology.org/2024.findings-naacl.280.pdf).

    summ_ref: the reference summary of the instance.
    summs_sys: dictionary of system_name -> system_summary to evaluate.
    query: the query the summaries were written for; '' for a generic summary.
    anchored_sys_summ_name: if given, only pairs that include this system are
        compared; if None, every pair is compared.
    existing_results: a previous results dictionary to extend. Its
        'result_details' list is appended to in place and its scores are used
        as starting values. If None, results start from scratch.
    print_responses: whether to print each preference and explanation.

    Returns a dictionary with 'scores', 'result_details', 'query',
    'summ_reference' and 'summ_systems'.
    """
    tally = dict.fromkeys(summs_sys, 0)  # preference counter per system
    details = []

    # continue from earlier results when they are provided:
    if existing_results is not None:
        details = existing_results['result_details']
        earlier_scores = existing_results['scores']
        for name in summs_sys:
            if name in earlier_scores:
                tally[name] = earlier_scores[name]

    results = {'scores': None, 'result_details': details, 'query': query, 'summ_reference': summ_ref, 'summ_systems': summs_sys}

    candidates = list(summs_sys.items())
    for pos, first in enumerate(candidates[:-1]):
        for second in candidates[pos + 1:]:

            # with an anchor, skip every pair that does not involve the anchored system:
            if anchored_sys_summ_name is not None and anchored_sys_summ_name not in (first[0], second[0]):
                continue

            # present the two summaries to the LLM in a random order:
            ordered_pair = (first, second) if random.randint(0, 1) else (second, first)
            (sys1_name, sys1_summ), (sys2_name, sys2_summ) = ordered_pair

            preference, explanation = pairwise_comparison_gpt(summ_ref, sys1_summ, sys2_summ, query)
            details.append({'sys1': sys1_name, 'sys2': sys2_name, 
                            'preference': preference, 'explanation': explanation})

            # award the points:
            if preference == 'tie':
                tally[sys1_name] += 1
                tally[sys2_name] += 1
            elif preference == '1':
                tally[sys1_name] += 2
            elif preference == '2':
                tally[sys2_name] += 2
            else:
                print(f'Error with summaries {sys1_name}/{sys2_name}')

            if print_responses:
                print(f'Summ 1 ({sys1_name}) vs. Summ 2 ({sys2_name}):')
                print(f'\tPreference: {preference}')
                print(f'\tExplanation: {explanation}')
                print('---')

    results['scores'] = tally
    return results

In [6]:
def prepare_qmsum_noisy_data(data_version_to_filepaths):
    """Collect, per QMSum test instance, the reference summary and the system summaries of every data version.

    data_version_to_filepaths: dictionary of version_name -> path of a JSON file
        that maps each datum filename to its 'generic_summaries' and
        'query_focused_summaries', each holding parallel 'references' and
        'predictions' lists.

    The queries are read from the 'specific_query_list' of every .json file in
    <BASE_DIR>/data/QMSum/test.

    Returns a list of instances:
        {'datum_id': <str>, 'type': 'generic'|'query_focused', 'summ_idx': <int>,
         'query': <str>, 'ref_summ': <str>, 'sys_summs': {version_name: <str>}}
    For each datum, all generic instances come first, then the query-focused ones.

    Raises ValueError if data_version_to_filepaths is empty.
    """
    if not data_version_to_filepaths:
        raise ValueError('data_version_to_filepaths must name at least one data version.')

    # get the queries for the QMSum dataset:
    data_source_folderpath = f'{BASE_DIR}/data/QMSum/test'
    datum_name_to_queries = {}
    for datum_filename in os.listdir(data_source_folderpath):
        if not datum_filename.endswith('.json'):
            continue
        datum_filepath = os.path.join(data_source_folderpath, datum_filename)
        with open(datum_filepath, 'r', encoding='utf-8', errors='ignore') as fIn:
            datum = json.load(fIn)
        datum_name_to_queries[datum_filename] = [query_instance['query'] 
                                                 for query_instance in datum['specific_query_list']]

    # get the reference and system summaries for the QMSum dataset:
    data_version_to_info = {}
    for version_name, version_data_path in data_version_to_filepaths.items():
        with open(version_data_path) as fIn:
            data_version_to_info[version_name] = json.load(fIn)

    # the number of summaries per datum is taken from the last version (it should be the same for all versions):
    last_version_info = list(data_version_to_info.values())[-1]

    # get a list of instances on which to get the pairwise comparisons
    summ_type_to_section = (('generic', 'generic_summaries'), ('query_focused', 'query_focused_summaries'))
    instances = []
    for datum_name, datum_queries in datum_name_to_queries.items():
        for summ_type, section_name in summ_type_to_section:
            num_summaries = len(last_version_info[datum_name][section_name]['references'])
            # a bounded loop, so a datum with no summaries of this type is skipped instead of crashing:
            for summ_idx in range(num_summaries):
                instance = {'datum_id': datum_name, 'type': summ_type, 'summ_idx': summ_idx, 
                            'query': datum_queries[summ_idx] if summ_type == 'query_focused' else '', 
                            'ref_summ': '', 'sys_summs': {}}
                for version_name, version_info in data_version_to_info.items():
                    section = version_info[datum_name][section_name]
                    instance['ref_summ'] = section['references'][summ_idx]  # should be the same for all versions
                    instance['sys_summs'][version_name] = section['predictions'][summ_idx]
                instances.append(instance)

    return instances

In [7]:
def compute_confidence_interval(values, confidence_level=0.95):
    """Return the (lower, upper) confidence interval of the mean of `values`.

    The interval is mean +/- t * SEM, with t taken from the t-distribution with
    len(values) - 1 degrees of freedom. E.g., confidence_level=0.95 corresponds
    to the (0.025, 0.975) percentiles. The values are assumed to be
    approximately normally distributed.
    """
    degrees_of_freedom = len(values) - 1
    upper_percentile = (1 + confidence_level) / 2.
    half_width = stats.t.ppf(upper_percentile, degrees_of_freedom) * stats.sem(values)
    center = np.mean(values)
    return (center - half_width, center + half_width)

def compute_overall_scores(scores_dict):
    """Summarize each list of scores with descriptive statistics.

    scores_dict is expected to map a metric name to a list of scores.
    Returns a dictionary of metric name -> {'mean', 'std', 'median',
    'quartile_1', 'quartile_3', 'confidence_interval_95', 'n'}.
    """
    def summarize(samples):
        first_quartile, third_quartile = (np.percentile(samples, q) for q in (25, 75))
        return {
            'mean': np.mean(samples),
            'std': np.std(samples),
            'median': np.median(samples),
            'quartile_1': first_quartile,
            'quartile_3': third_quartile,
            'confidence_interval_95': compute_confidence_interval(samples),
            'n': len(samples)
        }

    return {metric: summarize(samples) for metric, samples in scores_dict.items()}

def load_results_if_exist(results_json_path):
    """Load the results stored in the JSON file at results_json_path, or return {} if there is no such file."""
    if not os.path.exists(results_json_path):
        return {}
    with open(results_json_path, 'r') as fIn:
        return json.load(fIn)

def dump_results(results, results_json_path):
    """Write `results` as indented JSON to results_json_path, creating the parent folder if needed.

    The data is first written to '<results_json_path>.tmp' and then moved over
    the target, so an interruption or a serialization error mid-write does not
    leave a truncated results file behind (these files are used as checkpoints).
    """
    parent_dir = os.path.dirname(results_json_path)
    if parent_dir:  # a bare filename has no parent folder, and os.makedirs('') raises
        os.makedirs(parent_dir, exist_ok=True)

    tmp_path = f'{results_json_path}.tmp'
    try:
        with open(tmp_path, 'w') as fOut:
            json.dump(results, fOut, indent=4)
    except BaseException:
        # do not leave a partial temporary file around; the previous results file stays untouched
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, results_json_path)


def get_existing_results(all_scores_dict, datum_id, summ_type, summ_idx):
    """Return the stored results of one instance, or None if there are none.

    all_scores_dict maps datum_id -> summ_type -> list of results dictionaries,
    and summ_idx is the position of the instance in that list. None is returned
    when the datum, the summary type, or the index is not present (e.g. when the
    stored results of a datum are incomplete).
    """
    if datum_id in all_scores_dict and summ_type in all_scores_dict[datum_id]:
        try:
            return all_scores_dict[datum_id][summ_type][summ_idx]
        except IndexError:
            return None
    return None

In [None]:
'''
END OF HELPER FUNCTIONS
'''

In [None]:
'''
START OF COMPARISON_FOR_NON_CLEANED
'''

In [None]:
'''
The following conducts pairwise comparison of summaries based on non-cleaned transcripts.
For each QMSum instance (transcript+query), a comparison is made between the summaries at different noise levels.
Since there are seven levels of noise, there will be 21 pairs to compare total for each instance, and for each task-model.

First run all the boxes of the helper functions above (top of notebook), and then the following two.

The models to work on can be set in the sys_summ_filenames list.
Outputs the results to <base_path>/results/qmsum_test_pairwise_eval, one json file per task-model.
'''

In [8]:
def compute_pairwise_comparison_qmsum_noisy(data_version_to_filepaths, results_json_path, 
                                            anchored_sys_summ_name=None):
    """Run the pairwise GPT evaluation over all QMSum instances and aggregate the scores per data version.

    data_version_to_filepaths: dictionary of version_name -> path of the JSON file
        with the summaries of that version (see prepare_qmsum_noisy_data).
    results_json_path: JSON file in which the results are stored. It is rewritten
        after every finished datum, so an interrupted run can be resumed.
    anchored_sys_summ_name: if None, all pairs of versions are compared, and data
        already present in the results file is not recomputed. Otherwise only pairs
        that include this version are compared, for every instance, and the new
        comparisons are added to the ones already stored.

    Returns the full results dictionary: datum_id -> summary type -> list of
    per-instance results, plus an 'overall' entry with the statistics for
    'generic', 'query_focused' and 'all' instances.
    """
    # prepare the data from the different versions of noisy data:
    instances_to_evaluate = prepare_qmsum_noisy_data(data_version_to_filepaths)

    # see if any scores were already computed:
    all_scores_dict = load_results_if_exist(results_json_path)
    
    # if this is a full comparison over all summaries, no need to reprocess finished instances:
    if anchored_sys_summ_name is None:
        if 'overall' in all_scores_dict:
            return all_scores_dict  # all evaluations were already completed
        finished_datum_ids = set(all_scores_dict.keys())
    # if there is a specific system summary to check, then we have to go over everything to compute for the new system:
    else:
        finished_datum_ids = set()
        # since we will recompute overall scores, remove these:
        all_scores_dict.pop('overall', None)

    # Go instance by instance, and aggregate the results.
    # Each instance is a reference summary, query (empty if generic), and system summaries (different noise levels).
    last_datum_id = ''
    cur_datum_scores_dict = {}
    for instance in tqdm(instances_to_evaluate):
        datum_id = instance['datum_id']

        # if this datum was already computed, skip it (not for anchored comparison):
        if datum_id in finished_datum_ids:
            continue

        # if we've reached a new datum ID, store the results until now in a json file:
        if last_datum_id != datum_id and last_datum_id != '':
            all_scores_dict[last_datum_id] = cur_datum_scores_dict
            dump_results(all_scores_dict, results_json_path)
            cur_datum_scores_dict = {}
            time.sleep(3)  # wait a little before running another batch on GPT
        last_datum_id = datum_id

        # evaluate the summaries of this instance and append it to the results dict:
        summ_ref = instance['ref_summ']
        summ_list = instance['sys_summs']
        query = instance['query']  # this is '' if it is a generic summary
        summ_type = instance['type']  # generic or query_focused
        summ_idx = instance['summ_idx']  # index of summary within this datum (there are several qf summaries per datum)
        if anchored_sys_summ_name is None:
            existing_results = None
        else:
            existing_results = get_existing_results(all_scores_dict, datum_id, summ_type, summ_idx)
        results = compute_pairwise_overall(summ_ref, summ_list, query=query, print_responses=False,
                                           anchored_sys_summ_name=anchored_sys_summ_name,
                                           existing_results=existing_results)
        cur_datum_scores_dict.setdefault(summ_type, []).append(results)

    # Store the results of the last datum that was actually processed. This must be
    # last_datum_id and not datum_id: when the final instances of the loop were
    # skipped, datum_id names an already-finished datum, and writing there would
    # overwrite its results while losing the ones just computed.
    if last_datum_id != '':
        all_scores_dict[last_datum_id] = cur_datum_scores_dict
        dump_results(all_scores_dict, results_json_path)

    # put the scores of each noise version in a list (one list per version, for generic and qf)
    all_scores_generic = {}
    all_scores_qf = {}
    for datum_id in all_scores_dict:
        for summ_type, type_scores_dict in [('generic', all_scores_generic), ('query_focused', all_scores_qf)]:
            for res in all_scores_dict[datum_id].get(summ_type, []):
                for version_name, version_score in res['scores'].items():
                    type_scores_dict.setdefault(version_name, []).append(version_score)
    # combine both summary types; a version that appears in only one of them is still included:
    all_scores = {version: all_scores_generic.get(version, []) + all_scores_qf.get(version, []) 
                  for version in {**all_scores_generic, **all_scores_qf}}

    # compute the final score for each noise version:
    all_scores_dict['overall'] = {
        'generic': compute_overall_scores(all_scores_generic),
        'query_focused': compute_overall_scores(all_scores_qf),
        'all': compute_overall_scores(all_scores)
    }
    dump_results(all_scores_dict, results_json_path)

    return all_scores_dict

In [None]:
base_path = f'{BASE_DIR}/results'

# One results file per task-model; the same filename is looked up in the folder of every data version.
# To evaluate a single model, reduce this list, e.g. to ['results_Llama3_1Instruct_truncate.json'].
sys_summ_filenames = [
    'results_Mistral7BInstruct_recursive.json',
    'results_Llama3Instruct_recursive.json',
    'results_Llama3_1Instruct_truncate.json',
    'results_Gpt4oMini_truncate.json'
]

for sys_summ_filename in sys_summ_filenames:
    # A "cleaned" version has no counterpart among the original transcripts, so the basic file is
    # used there (no cleaning needed),
    # e.g. "results_test_Gpt4oMini_truncate_cleaned_adj.json" -> "results_test_Gpt4oMini_truncate.json"
    original_sys_summ_filename = sys_summ_filename
    if '_cleaned_' in sys_summ_filename:
        clean_ver_id_char_idx = sys_summ_filename.index('_cleaned_')
        original_sys_summ_filename = sys_summ_filename[:clean_ver_id_char_idx] + '.json'

    # version name -> summaries file of that version (the order here is the order of the comparisons):
    data_version_to_filepaths = {
        'original': os.path.join(base_path, 'qmsum_test_source', original_sys_summ_filename)
    }
    data_version_to_filepaths.update({
        version_name: os.path.join(base_path, version_folder, sys_summ_filename)
        for version_name, version_folder in [
            ('no_noise', 'qmsum_test'),
            ('noise_10', 'qmsum_test___reverb__noise_10'),
            ('noise_5', 'qmsum_test___reverb__noise_5'),
            ('noise_0', 'qmsum_test___reverb__noise_0'),
            ('noise_m5', 'qmsum_test___reverb__noise_-5'),
            ('noise_m10', 'qmsum_test___reverb__noise_-10'),
        ]
    })
    results_json_path = os.path.join(base_path, 'qmsum_test_pairwise_eval', sys_summ_filename)
    anchor_sys_summ_name = None  # set to a version name, e.g. 'original', to compare only against that version

    all_scores_dict = compute_pairwise_comparison_qmsum_noisy(
        data_version_to_filepaths,
        results_json_path,
        anchored_sys_summ_name=anchor_sys_summ_name,
    )

In [None]:
'''
END OF COMPARISON_FOR_NON_CLEANED
'''

In [None]:
'''
START OF COMPARISON_FOR_CLEANED
'''

In [None]:
'''
The following conducts pairwise comparison of summaries based on cleaned transcripts against summaries based on non-cleaned transcripts.
It shows how much a transcript-cleaning-technique was effective for improving summarization.
For each QMSum instance (transcript+query), a comparison is made between a cleaned summary and all the summaries based on non-cleaned transcripts at different noise levels.
There are seven levels of noise, and six of them for the cleaned transcripts ("new") will be compared against the seven summaries of the non-cleaned transcripts ("base"). So there will be 42 pairs to compare in total for each instance, and for each task-model.

First run all the boxes of the helper functions (top of notebook), and then the following boxes.

The models to work on can be set in the sys_summ_filenames_for_new list.
Outputs the results to <base_path>/results/qmsum_test_pairwise_eval_baseline_change, one json file per task-model.
'''

In [3]:
def prepare_qmsum_noisy_data_for_new_system(data_version_to_filepaths_base, data_version_to_filepaths_new_system):
    """Collect, per QMSum test instance, the reference summary and the summaries of the base and the new system.

    data_version_to_filepaths_base / data_version_to_filepaths_new_system:
        dictionaries of version_name -> path of a JSON file that maps each datum
        filename to its 'generic_summaries' and 'query_focused_summaries', each
        holding parallel 'references' and 'predictions' lists.

    The queries are read from the 'specific_query_list' of every .json file in
    <BASE_DIR>/data/QMSum/test. The reference summary is taken from the base files.

    Returns a list of instances:
        {'datum_id': <str>, 'type': 'generic'|'query_focused', 'summ_idx': <int>,
         'query': <str>, 'ref_summ': <str>,
         'sys_summs_base': {version_name: <str>}, 'sys_summs_new': {version_name: <str>}}
    For each datum, all generic instances come first, then the query-focused ones.

    Raises ValueError if both file mappings are empty.
    """
    if not data_version_to_filepaths_base and not data_version_to_filepaths_new_system:
        raise ValueError('At least one data version is needed (base or new system).')

    # get the queries for the QMSum dataset:
    data_source_folderpath = f'{BASE_DIR}/data/QMSum/test'
    datum_name_to_queries = {}
    for datum_filename in os.listdir(data_source_folderpath):
        if not datum_filename.endswith('.json'):
            continue
        datum_filepath = os.path.join(data_source_folderpath, datum_filename)
        with open(datum_filepath, 'r', encoding='utf-8', errors='ignore') as fIn:
            datum = json.load(fIn)
        datum_name_to_queries[datum_filename] = [query_instance['query'] 
                                                 for query_instance in datum['specific_query_list']]

    # get the reference and system summaries for the QMSum dataset:
    data_version_to_info_base = {}
    for version_name, version_data_path in data_version_to_filepaths_base.items():
        with open(version_data_path) as fIn:
            data_version_to_info_base[version_name] = json.load(fIn)
    # also get it for the new system:
    data_version_to_info_new_system = {}
    for version_name, version_data_path in data_version_to_filepaths_new_system.items():
        with open(version_data_path) as fIn:
            data_version_to_info_new_system[version_name] = json.load(fIn)

    # the number of summaries per datum is taken from the last version that was loaded
    # (the last new-system version if there is one; it should be the same for all versions):
    count_source = data_version_to_info_new_system or data_version_to_info_base
    last_version_info = list(count_source.values())[-1]

    # get a list of instances on which to get the pairwise comparisons
    summ_type_to_section = (('generic', 'generic_summaries'), ('query_focused', 'query_focused_summaries'))
    instances = []
    for datum_name, datum_queries in datum_name_to_queries.items():
        for summ_type, section_name in summ_type_to_section:
            num_summaries = len(last_version_info[datum_name][section_name]['references'])
            # a bounded loop, so a datum with no summaries of this type is skipped instead of crashing:
            for summ_idx in range(num_summaries):
                instance = {'datum_id': datum_name, 'type': summ_type, 'summ_idx': summ_idx, 
                            'query': datum_queries[summ_idx] if summ_type == 'query_focused' else '', 
                            'ref_summ': '', 'sys_summs_base': {}, 'sys_summs_new': {}}
                for version_name, version_info in data_version_to_info_base.items():
                    section = version_info[datum_name][section_name]
                    instance['ref_summ'] = section['references'][summ_idx]  # should be the same for all versions
                    instance['sys_summs_base'][version_name] = section['predictions'][summ_idx]
                for version_name, version_info in data_version_to_info_new_system.items():
                    section = version_info[datum_name][section_name]
                    instance['sys_summs_new'][version_name] = section['predictions'][summ_idx]
                instances.append(instance)

    return instances

In [4]:
def compute_pairwise_overall_for_new_system(summ_ref, summs_sys_new, summs_sys_base, query='', 
                                            anchored_sys_summ_name_new=None, anchored_sys_summ_name_base=None,
                                            existing_results=None, print_responses=False):
    """Run the GPT judge on every (new, base) pair of system summaries and tally a score per system.

    Scoring: a win gives the winning system +2, a tie gives each system +1, summed
    over all comparisons (method from https://aclanthology.org/2024.findings-naacl.280.pdf).

    summ_ref: the reference summary of the instance.
    summs_sys_new: dictionary of system_name -> system_summary, the summaries being evaluated.
    summs_sys_base: dictionary of system_name -> system_summary, the summaries to compare against.
    query: the query the summaries were written for; '' for a generic summary.
    anchored_sys_summ_name_new: if given, only this summary of summs_sys_new is compared.
    anchored_sys_summ_name_base: if given, comparisons are made only against this
        summary of summs_sys_base.
    existing_results: a previous results dictionary to extend. Its
        'result_details' list is appended to in place and its scores are used
        as starting values. If None, results start from scratch.
    print_responses: whether to print each preference and explanation.

    Returns a dictionary with 'scores', 'result_details', 'query',
    'summ_reference', 'summ_systems_new' and 'summ_systems_base'.
    """
    results = {'scores': None, 'result_details': [], 'query': query, 'summ_reference': summ_ref, 'summ_systems_new': summs_sys_new, 'summ_systems_base': summs_sys_base}
    # NOTE(review): a name that appears in both dictionaries shares a single counter -- confirm
    # that callers use distinct names for the new and the base systems.
    scores = {sys_name: 0 for sys_name in summs_sys_new|summs_sys_base}  # initialize the preference counters for each system

    # if there are existing results, we need to add result details to the existing ones, and add to existing scores:
    if existing_results is not None:
        results['result_details'] = existing_results['result_details']
        for sys_name in scores:
            if sys_name in existing_results['scores']:
                scores[sys_name] = existing_results['scores'][sys_name]

    # loop over all (new, base) pairs of system summaries to compare them with GPT:
    for summ_new_name, summ_new_summ in summs_sys_new.items():
        # if there is a specific new summary to evaluate, only process that one:
        if (anchored_sys_summ_name_new is not None) and (summ_new_name != anchored_sys_summ_name_new):
            continue

        for summ_base_name, summ_base_summ in summs_sys_base.items():
            # if there is a specific base summary to compare against, only process that one:
            if (anchored_sys_summ_name_base is not None) and (summ_base_name != anchored_sys_summ_name_base):
                continue

            # randomly choose the order of the summaries as they are presented to the LLM:
            if random.randint(0, 1):
                sys1_name, sys1_summ = summ_new_name, summ_new_summ
                sys2_name, sys2_summ = summ_base_name, summ_base_summ
            else:
                sys2_name, sys2_summ = summ_new_name, summ_new_summ
                sys1_name, sys1_summ = summ_base_name, summ_base_summ

            # compute pairwise comparison with GPT:
            preference, explanation = pairwise_comparison_gpt(summ_ref, sys1_summ, sys2_summ, query)
            results['result_details'].append({'sys1': sys1_name, 'sys2': sys2_name, 
                                              'preference': preference, 'explanation': explanation})

            # provide points for the winning system:
            if preference == '1':
                scores[sys1_name] += 2
            elif preference == '2':
                scores[sys2_name] += 2
            elif preference == 'tie':
                scores[sys1_name] += 1
                scores[sys2_name] += 1
            else:
                print(f'Error with summaries {sys1_name}/{sys2_name}')

            if print_responses:
                print(f'Summ 1 ({sys1_name}) vs. Summ 2 ({sys2_name}):')
                print(f'\tPreference: {preference}')
                print(f'\tExplanation: {explanation}')
                print('---')
    
    results['scores'] = scores

    return results

In [5]:
def compute_pairwise_comparison_qmsum_noisy_for_new_system(data_version_to_filepaths_base, 
                                                           data_version_to_filepaths_new_system, 
                                                           results_json_path, anchored_sys_summ_name_new=None, anchored_sys_summ_name_base=None):
    '''
    Run the pairwise comparison of the "new" system summaries against the "base" system summaries
    over all instances, and aggregate the scores of the new-system versions.

    The results are dumped to results_json_path every time a datum is finished, so that an interrupted
    (non-anchored) run resumes from the data that were already evaluated.

    :param data_version_to_filepaths_base: dict of version name -> filepath for the base system summaries.
    :param data_version_to_filepaths_new_system: dict of version name -> filepath for the new system summaries.
        Only version names appearing in this dict are aggregated into the 'overall' scores.
    :param results_json_path: path of the JSON file from which existing results are loaded and to which results are dumped.
    :param anchored_sys_summ_name_new: optional name of a specific new-system version; passed on to
        compute_pairwise_overall_for_new_system. When any anchor is set, all instances are reprocessed.
    :param anchored_sys_summ_name_base: optional name of a specific base-system version; handled like the above.
    :return: the results dict: datum_id -> {summ_type -> [results of each summary]}, with an additional
        'overall' entry holding the aggregated scores under 'generic', 'query_focused' and 'all'.
    '''
    is_anchored = (anchored_sys_summ_name_new is not None) or (anchored_sys_summ_name_base is not None)

    # prepare the data from the different versions of noisy data:
    instances_to_evaluate = prepare_qmsum_noisy_data_for_new_system(data_version_to_filepaths_base, data_version_to_filepaths_new_system)

    # see if any scores were already computed:
    all_scores_dict = load_results_if_exist(results_json_path)

    if not is_anchored:
        # full comparison over all summaries, so no need to reprocess finished instances:
        if 'overall' in all_scores_dict:
            return all_scores_dict  # all evaluations were already completed
        finished_datum_ids = set(all_scores_dict.keys())
    else:
        # a specific system summary is checked, so we have to go over everything to compute for it:
        finished_datum_ids = set()
        # since we will recompute overall scores, remove these:
        all_scores_dict.pop('overall', None)

    # Go instance by instance, and aggregate the results.
    # Each instance is a reference summary, query (empty if generic), and system summaries (different noise levels).
    # NOTE(review): the per-datum flushing below assumes that instances of the same datum are consecutive -- confirm against prepare_qmsum_noisy_data_for_new_system.
    last_datum_id = ''
    cur_datum_scores_dict = {}
    for instance in tqdm(instances_to_evaluate):
        datum_id = instance['datum_id']

        # if this datum was already computed, skip it (not for anchored comparison):
        if datum_id in finished_datum_ids:
            continue

        # if we've reached a new datum ID, store the results until now in a json file:
        if last_datum_id != datum_id and last_datum_id != '':
            all_scores_dict[last_datum_id] = cur_datum_scores_dict
            dump_results(all_scores_dict, results_json_path)
            cur_datum_scores_dict = {}
            time.sleep(1)  # wait a little before running another batch on GPT

        last_datum_id = datum_id

        # evaluate the summaries of this instance and append it to the results dict:
        summ_type = instance['type']  # generic or query_focused
        if is_anchored:
            # summ_idx is the index of the summary within this datum (there are several qf summaries per datum)
            existing_results = get_existing_results(all_scores_dict, datum_id, summ_type, instance['summ_idx'])
        else:
            existing_results = None
        results = compute_pairwise_overall_for_new_system(instance['ref_summ'], instance['sys_summs_new'], instance['sys_summs_base'],
                                                          query=instance['query'],  # this is '' if it is a generic summary
                                                          print_responses=False,
                                                          anchored_sys_summ_name_new=anchored_sys_summ_name_new,
                                                          anchored_sys_summ_name_base=anchored_sys_summ_name_base,
                                                          existing_results=existing_results)
        cur_datum_scores_dict.setdefault(summ_type, []).append(results)

    # store the results of the last processed datum.
    # This must be keyed by last_datum_id and not by the loop variable datum_id: when the last iterated
    # instance was skipped as already finished, datum_id points at that finished datum, and using it would
    # overwrite its stored results with the scores of another datum.
    if last_datum_id != '':
        all_scores_dict[last_datum_id] = cur_datum_scores_dict
        dump_results(all_scores_dict, results_json_path)

    # put the scores of each noise version in a list (one list per version, for generic and qf)
    all_scores_generic = {}
    all_scores_qf = {}
    for datum_results in all_scores_dict.values():
        for summ_type, type_scores_dict in [('generic', all_scores_generic), ('query_focused', all_scores_qf)]:
            # a datum may have no summaries of one of the types, in which case there is nothing to collect:
            for res in datum_results.get(summ_type, []):
                for version_name, version_score in res['scores'].items():
                    if version_name not in data_version_to_filepaths_new_system:
                        continue  # we only need the scores for the new versions (which are compared against the base versions)
                    type_scores_dict.setdefault(version_name, []).append(version_score)

    # merge the two types per version, tolerating versions that have scores for only one of the types:
    all_versions = list(all_scores_generic) + [version for version in all_scores_qf if version not in all_scores_generic]
    all_scores = {version: all_scores_generic.get(version, []) + all_scores_qf.get(version, []) for version in all_versions}

    # compute the final score for each noise version:
    all_scores_dict['overall'] = {
        'generic': compute_overall_scores(all_scores_generic),
        'query_focused': compute_overall_scores(all_scores_qf),
        'all': compute_overall_scores(all_scores)
    }
    dump_results(all_scores_dict, results_json_path)

    return all_scores_dict

In [None]:
base_path = f'{BASE_DIR}/results'

# Alternative batches of cleaned-transcript result files.
# To evaluate one of them, un-comment it in place of the active list further below.
# sys_summ_filenames_for_new = [
#     'results_Gpt4oMini_truncate_cleaned_noun.json',
#     'results_Gpt4oMini_truncate_cleaned_verb.json',
#     'results_Gpt4oMini_truncate_cleaned_noncontent.json',
#     'results_Gpt4oMini_truncate_cleaned_adj.json',
#     'results_Gpt4oMini_truncate_cleaned_adv.json',
# ]
# sys_summ_filenames_for_new = [
#     'results_Mistral7BInstruct_recursive_cleaned_noun.json',
#     'results_Mistral7BInstruct_recursive_cleaned_verb.json',
#     'results_Mistral7BInstruct_recursive_cleaned_noncontent.json',
#     'results_Mistral7BInstruct_recursive_cleaned_adj.json',
#     'results_Mistral7BInstruct_recursive_cleaned_adv.json'
# ]
# sys_summ_filenames_for_new = [
#     'results_Mistral7BInstruct_recursive_cleaned_named_entity.json',
#     'results_Gpt4oMini_truncate_cleaned_named_entity.json',
# ]
# sys_summ_filenames_for_new = [
#     'results_Llama3_1Instruct_truncate_cleaned_noun.json',
#     'results_Llama3_1Instruct_truncate_cleaned_verb.json',
#     'results_Llama3_1Instruct_truncate_cleaned_named_entity.json',
#     'results_Llama3_1Instruct_truncate_cleaned_noncontent.json',
#     'results_Llama3_1Instruct_truncate_cleaned_adj.json',
#     'results_Llama3_1Instruct_truncate_cleaned_adv.json',
# ]
# sys_summ_filenames_for_new = [
#     'results_Llama3Instruct_recursive_cleaned_noun.json',
#     'results_Llama3Instruct_recursive_cleaned_verb.json',
#     'results_Llama3Instruct_recursive_cleaned_named_entity.json',
#     'results_Llama3Instruct_recursive_cleaned_noncontent.json',
#     'results_Llama3Instruct_recursive_cleaned_adj.json',
#     'results_Llama3Instruct_recursive_cleaned_adv.json',
# ]
sys_summ_filenames_for_new = [
    'results_Gpt4oMini_truncate_cleaned_content.json',
    'results_Llama3_1Instruct_truncate_cleaned_content.json',
    'results_Llama3Instruct_recursive_cleaned_content.json',
    'results_Mistral7BInstruct_recursive_cleaned_content.json'
]

# (version name, results sub-directory) of every noised version of the "new system",
# i.e. the summaries produced after a transcript-cleaning scheme. These are compared to the base summaries.
new_system_version_dirs = [
    ('no_noise__new', 'qmsum_test'),
    ('noise_10__new', 'qmsum_test___reverb__noise_10'),
    ('noise_5__new', 'qmsum_test___reverb__noise_5'),
    ('noise_0__new', 'qmsum_test___reverb__noise_0'),
    ('noise_m5__new', 'qmsum_test___reverb__noise_-5'),
    ('noise_m10__new', 'qmsum_test___reverb__noise_-10'),
]
# (version name, results sub-directory) of every noised version of the "base system",
# against which the new summaries are compared.
base_system_version_dirs = [
    ('original__base', 'qmsum_test_source'),
    ('no_noise__base', 'qmsum_test'),
    ('noise_10__base', 'qmsum_test___reverb__noise_10'),
    ('noise_5__base', 'qmsum_test___reverb__noise_5'),
    ('noise_0__base', 'qmsum_test___reverb__noise_0'),
    ('noise_m5__base', 'qmsum_test___reverb__noise_-5'),
    ('noise_m10__base', 'qmsum_test___reverb__noise_-10'),
]
# directory that receives one results file per evaluated new-system file:
pairwise_results_dir = os.path.join(base_path, 'qmsum_test_pairwise_eval_baseline_change')

for sys_summ_filename_for_new in sys_summ_filenames_for_new:
    # the base file name is the new file name without its "_cleaned_<scheme>" suffix,
    # e.g. "results_Gpt4oMini_truncate_cleaned_adj.json" -> "results_Gpt4oMini_truncate.json"
    clean_ver_id_char_idx = sys_summ_filename_for_new.index('_cleaned_')
    base_sys_summ_filename = f'{sys_summ_filename_for_new[:clean_ver_id_char_idx]}.json'

    data_version_to_filepaths_new_system = {
        version_name: os.path.join(base_path, version_dir, sys_summ_filename_for_new)
        for version_name, version_dir in new_system_version_dirs
    }
    data_version_to_filepaths_base = {
        version_name: os.path.join(base_path, version_dir, base_sys_summ_filename)
        for version_name, version_dir in base_system_version_dirs
    }
    results_json_path = os.path.join(pairwise_results_dir, sys_summ_filename_for_new)

    compute_pairwise_comparison_qmsum_noisy_for_new_system(
        data_version_to_filepaths_base,
        data_version_to_filepaths_new_system,
        results_json_path,
        anchored_sys_summ_name_new=None,
        anchored_sys_summ_name_base=None,
    )

In [None]:
'''
END OF COMPARISON_FOR_CLEANED
'''

In [None]:
'''
START OF TEST
'''

In [None]:
# Example 1: a generic summary (empty query) with its reference and the system summaries of each version.
# NOTE(review): every variable assigned in this group (query, summ_ref, summ_sys_*) is reassigned by
# Example 2 below before anything reads it, so as written this example is never evaluated.
# To test it, run the comparison before the second group of assignments.
query = ''
summ_ref = "The group discussed the first version of the Bayes-net used to work out a user's intentions when asking for directions from a navigation device. Three intentions were identified: Vista (to view), Enter (to visit) and Tango (to approach). The structure of the belief-net comprises, firstly, a feature layer, which includes linguistic, discourse and world knowledge information that can be gleaned from the data. It is possible for these variables to form thematic clusters( eg \"entrance\", \"type of object\", \"verb\"), each one with a separate middle layer.  At this stage, all the actual probabilities are ad-hoc and hand-coded. However, there has been progress in the design and organisation of experiments, that will eventually provide data more useful and appropriate for this task."
summ_sys_none = "The meeting was about discussing a PE PowerPoint presentation related to a project involving handling Goatee cans (10 Cares) and deciding what the intermediate nodes were for a belief net. The group was trying to figure out the concept of whether someone is a tourist or running an errand, and they came up with hidden variables such as whether someone is on a tour, running an errand, or in a hurry. They also discussed the probabilities of different verbs being used and the possibility of adding more hidden nodes as they look at the data more. The meeting ended with a discussion about the Java Buy program and the need to ask someone who is more familiar with it for help."
summ_sys_m10 = "The meeting was a discussion among several individuals, possibly a group project or brainstorming session, regarding various topics such as business, technology, and personal interests. The participants discussed ideas for a project, shared their thoughts on different subjects, and provided feedback to each other. Some of the key points discussed included the importance of keeping contacts organized, the use of technology for communication and navigation, the need for a system to verify the identity of contacts, and the potential benefits of using a tango or other music in a presentation. The meeting ended with the participants expressing appreciation for each other's ideas and expressing excitement for the project moving forward."
summ_sys_m5 = "The meeting was about a project related to tourism, where the participants were discussing various features and methods to create a system that would help tourists navigate and find their destinations more easily. They were discussing the extraction of certain features from the communication of someone trying to convey an abstract idea, such as a tourist wanting to go to a specific place. They also discussed the possibility of using a binary system for certain variables, such as whether the destination is the final destination, whether they're doing business, whether they're in a hurry, whether they're tourists, etc. They also discussed the possibility of using a probabilistic approach to determine the likelihood of certain events, such as whether a person is entering a place or not. Overall, the goal was to create a system that would make it easier for tourists to navigate and find their destinations."
summ_sys_0 = "The meeting was about developing a system to analyze communication data for determining whether a person is a tourist or engaged in business. The group discussed various features to extract from the data, such as the destination, whether the person is in a hurry, the time of day, and whether they are discussing admission fees. They also discussed the concept of a \"landmark,\" which could be a tourist attraction or a building of architectural significance. The group decided to create a middle layer to abstract the features and probabilistically determine the context based on the extracted features. They also discussed the possibility of adding a hidden layer to represent the person's intention or task. The meeting ended with the group planning to continue working on the project and refining the probabilities and hidden layers."
summ_sys_5 = "The meeting was about discussing a project related to handling data and making inferences based on that data. The project involved creating a system to understand the context of a conversation, such as whether the person is a tourist or running errands, and the destination they are trying to reach. The team was working on a layered approach to this problem, with the first layer being the features extracted from the conversation, the second layer being the hidden variables, and the third layer being the output, which could be a probability of the person entering, tangoing, or visting. The team was also discussing the possibility of adding more layers and variables to improve the accuracy of the system. They also encountered some issues with the probabilities and the way the system was set up, and discussed potential solutions for these issues. Overall, the goal was to create a system that could help businesses better understand the needs and intentions of their customers."
summ_sys_10 = "The meeting was about discussing a project related to artificial intelligence and machine learning. The group was trying to develop a system that could understand and respond to human communication in a specific context, such as a tourist or a business person. They were discussing the structure of the system, including the layers and nodes, and the features they would extract from the communication. They also talked about the probabilities and the programming they would use to make the system work. They had some issues with the current setup and were brainstorming ways to improve it, such as adding more hidden layers and nodes. They also discussed the possibility of adding more features, such as the accessibility of a landmark or whether it can be entered. The meeting ended with some members agreeing to look into certain aspects of the project and continue working on it."

# Example 2: a query-focused summary. These are the values that actually reach the comparison below.
query = "What did Grad B say about the structure of the belief net?"
summ_ref = "It is necessary for the belief-net to have at least one layer of nodes between the features and the final output. This makes the structure more flexible in terms of coding feature-layer probabilities. "
summ_sys_none =  "Grad B said that Goatee cans, to his understanding, are a part of the belief net structure. They refer to the intermediate nodes that were decided upon to help determine the abstract idea being communicated. These nodes help in deciding whether someone is a tourist, running an errand, or in a hurry, among other things. They are not directly extracted from the data but are hidden variables that help in making inferences."
summ_sys_m10 = "Grad B said that the structure of the belief net is bad. He mentioned that they have some of the other things but decided to put them on the same page. He also mentioned that there's going to be a reason to comment, either the context or the time of day, and that the suggestion is correct. However, he also mentioned that they will open the suggestions."
summ_sys_m5 =  "Grad B mentioned that the structure of the belief net they are working on consists of three different methods. The first method is related to the communication of someone trying to convey an abstract idea, such as a tourist wanting to go to a specific place. The second method involves setting up features based on what they are going to do and what was said. The third method involves using this vision to make decisions. Grad B also mentioned that they have features drawn from the communication of someone, and they are trying to extract those features from the data they have without having a whole entire class of data."
summ_sys_0 = "Grad B said that the structure of the belief net they were working on is a 7-2 net, which refers to a network with 7 input nodes and 2 output nodes. They were trying to decide the inner meaning for the rules in the middle layer of this network, and they decided on features such as whether someone was on a tour, trying to care, or occurring, whether the destination is the final destination, close, whether they're doing business, in a hurry, and whether they're tourists, and whether the police are involved. They also added a build layer to set up features along the lines of where they want to go and what means they should use. The mode of the net depends on all these features."
summ_sys_5 = "Grad B said that the structure of the belief net is Goatsie Pans, to his understanding, 10 Kings. This suggests that the belief net has multiple layers or nodes, possibly with some sort of hierarchical or interconnected structure, and that there are certain features or variables associated with each node. However, the exact details of the structure and how the nodes and features are related are not fully clear from the conversation."
summ_sys_10 = "Grad B did not say anything about the structure of the belief net in the given conversation. Instead, they discussed various features and nodes in the belief net, such as the context, verb use, and the middle layer, but they did not provide a clear description of the overall structure of the belief net."

# Disabled check of a single pair (the no-noise summary against the noise_0 summary):
#preference, explanation = pairwise_comparison_gpt(summ_ref, summ_sys_none, summ_sys_0, query=query)
#print(preference)
#print(explanation)

# Map each version name to its system summary (a dict, despite the "list" in the name), and run the
# comparison over these summaries with the responses printed.
# NOTE(review): compute_pairwise_overall is defined in the helper cells; presumably it queries the GPT judge for each pair -- confirm there.
summ_list = {'no_noise': summ_sys_none, 'noise_10': summ_sys_10, 'noise_5': summ_sys_5, 'noise_0': summ_sys_0, 'noise_m5': summ_sys_m5, 'noise_m10': summ_sys_m10}
results_qf = compute_pairwise_overall(summ_ref, summ_list, query=query, print_responses=True)

In [None]:
'''
END OF TEST
'''