## Notebook: Analyse LLM Synthesis Retries

## Packages

In [100]:
import numpy as np
import json

## Constants

In [101]:
LLMS = ["Llama3_70B", "GPT-3"] # "Llama70B", "GPT-3"
FEW_SHOT_CONDITIONS = ["fixed", "random"] # "fixed", "random"
N_SPLITS = 6

## Code

### Retries

In [102]:
language_statistics = {}

In [103]:
for llm in LLMS:
    language_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        language_statistics[llm][condition] = {
            "n_examples": 0,
            "n_retries": 0,
            "more_than_25_retries": 0,
            "n_aspects_more_one_sentence": [],
            "more_than_one_retry": 0,
            "invalid_xml_schema": 0,
            "invalid_xml_tags": 0,
            "aspect_polarity_in_text_but_not_in_label": 0,
            "more_than_one_sentences": 0,
            "empty_aspect_term": 0,
            "invalid_single_word_aspect_term_pos_tag": 0,
            "no_token_in_sentence": 0,
        }

        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)

            language_statistics[llm][condition]["n_examples"] += len(
                synth_data_split)
            language_statistics[llm][condition]["n_retries"] += np.sum(
                len(example["llm_retry_statistic"]) for example in synth_data_split)
            language_statistics[llm][condition]["more_than_25_retries"] += len([ex for ex in (
                len(example["llm_retry_statistic"]) for example in synth_data_split) if ex > 25])

            for ex in synth_data_split:
                if ex["llm_more_than_one_sentences"] > 0:

                    language_statistics[llm][condition]["n_aspects_more_one_sentence"].append(
                        len(ex["llm_label"]))

            language_statistics[llm][condition]["more_than_one_retry"] += len([ex for ex in (
                len(example["llm_retry_statistic"]) for example in synth_data_split) if ex > 0])
            language_statistics[llm][condition]["invalid_xml_schema"] += sum(
                example["llm_invalid_xml_schema"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_xml_tags"] += sum(
                example["llm_invalid_xml_tags"] for example in synth_data_split)
            language_statistics[llm][condition]["aspect_polarity_in_text_but_not_in_label"] += sum(
                example["llm_aspect_polarity_in_text_but_not_in_label"] for example in synth_data_split)
            language_statistics[llm][condition]["more_than_one_sentences"] += sum(
                example["llm_more_than_one_sentences"] for example in synth_data_split)
            language_statistics[llm][condition]["empty_aspect_term"] += sum(
                example["llm_empty_aspect_term"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_single_word_aspect_term_pos_tag"] += sum(
                example["llm_invalid_single_word_aspect_term_pos_tag"] for example in synth_data_split)
            language_statistics[llm][condition]["no_token_in_sentence"] += sum(
                example["llm_no_token_in_sentence"] for example in synth_data_split)
            
        language_statistics[llm][condition]["n_aspects_more_one_sentence"] = np.mean(language_statistics[llm][condition]["n_aspects_more_one_sentence"])

  language_statistics[llm][condition]["n_retries"] += np.sum(


In [104]:
synth_data_split[0]["llm_label"]

[['AMBIENCE', 'POSITIVE']]

In [105]:
language_statistics

{'Llama3_70B': {'fixed': {'n_examples': 11850,
   'n_retries': 12656,
   'more_than_25_retries': 92,
   'n_aspects_more_one_sentence': 2.918486171761281,
   'more_than_one_retry': 2308,
   'invalid_xml_schema': 25,
   'invalid_xml_tags': 23,
   'aspect_polarity_in_text_but_not_in_label': 1147,
   'more_than_one_sentences': 12059,
   'empty_aspect_term': 1,
   'invalid_single_word_aspect_term_pos_tag': 997,
   'no_token_in_sentence': 0},
  'random': {'n_examples': 9000,
   'n_retries': 4411,
   'more_than_25_retries': 19,
   'n_aspects_more_one_sentence': 2.3988853503184715,
   'more_than_one_retry': 1664,
   'invalid_xml_schema': 27,
   'invalid_xml_tags': 14,
   'aspect_polarity_in_text_but_not_in_label': 877,
   'more_than_one_sentences': 3706,
   'empty_aspect_term': 0,
   'invalid_single_word_aspect_term_pos_tag': 359,
   'no_token_in_sentence': 0}},
 'GPT-3': {'fixed': {'n_examples': 11850,
   'n_retries': 901,
   'more_than_25_retries': 9,
   'n_aspects_more_one_sentence': 4.3391

### Duration

In [106]:
def convert_seconds_to_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    
    time_string = "{:02}:{:02}:{:.4f}".format(int(hours), int(minutes), seconds)
    return time_string

In [107]:
duration_statistics = {}

In [108]:
for llm in LLMS:
    duration_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        duration_statistics[llm][condition] = {}
        duration_statistics[llm][condition]["time_no_retries"] = []
        duration_statistics[llm][condition]["time_with_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = []
        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)
            duration_statistics[llm][condition]["time_no_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for main_example in synth_data_split for example in main_example["llm_retry_statistic"]]
  

        print(len(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["time_no_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["time_with_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_with_retries"]))

24506
13411
12751
9258


In [109]:
duration_statistics

{'Llama3_70B': {'fixed': {'time_no_retries': '73:52:55.0598',
   'time_with_retries': '230:40:55.8351',
   'avg_gen_time_no_retries': '00:00:22.4452',
   'avg_gen_time_with_retries': '00:00:33.8879'},
  'random': {'time_no_retries': '51:41:47.5828',
   'time_with_retries': '98:30:40.1811',
   'avg_gen_time_no_retries': '00:00:20.6786',
   'avg_gen_time_with_retries': '00:00:26.4440'}},
 'GPT-3': {'fixed': {'time_no_retries': '03:48:36.3313',
   'time_with_retries': '04:17:41.3398',
   'avg_gen_time_no_retries': '00:00:1.1575',
   'avg_gen_time_with_retries': '00:00:1.2126'},
  'random': {'time_no_retries': '02:41:13.0422',
   'time_with_retries': '02:49:6.3992',
   'avg_gen_time_no_retries': '00:00:1.0748',
   'avg_gen_time_with_retries': '00:00:1.0960'}}}

In [110]:
def format_duration(time_str):
    hours, minutes, seconds, milliseconds = map(float, time_str.replace('.', ':').split(':'))
    days = (hours - (hours % 24)) / 24
    hours = hours % 24
    total_seconds = (hours * 60 * 60) + (minutes * 60) + seconds
    formatted_time = ""

    if days:
        formatted_time += f"{int(days)} d, "
    if hours:
        formatted_time += f"{int(hours)} h, "
    if minutes:
        formatted_time += f"{int(minutes)} m, "
    if seconds:
        formatted_time += f"{int(seconds)} s, "
    if milliseconds:
        formatted_time += f"{int(milliseconds)} ms"

    return formatted_time.strip()


def format_dictionary_duration(dictionary):
    for model, model_data in dictionary.items():
        for mode, mode_data in model_data.items():
            for key, value in mode_data.items():
                mode_data[key] = format_duration(value)

In [111]:
format_dictionary_duration(duration_statistics)

In [112]:
duration_statistics

{'Llama3_70B': {'fixed': {'time_no_retries': '3 d, 1 h, 52 m, 55 s, 598 ms',
   'time_with_retries': '9 d, 14 h, 40 m, 55 s, 8351 ms',
   'avg_gen_time_no_retries': '22 s, 4452 ms',
   'avg_gen_time_with_retries': '33 s, 8879 ms'},
  'random': {'time_no_retries': '2 d, 3 h, 41 m, 47 s, 5828 ms',
   'time_with_retries': '4 d, 2 h, 30 m, 40 s, 1811 ms',
   'avg_gen_time_no_retries': '20 s, 6786 ms',
   'avg_gen_time_with_retries': '26 s, 4440 ms'}},
 'GPT-3': {'fixed': {'time_no_retries': '3 h, 48 m, 36 s, 3313 ms',
   'time_with_retries': '4 h, 17 m, 41 s, 3398 ms',
   'avg_gen_time_no_retries': '1 s, 1575 ms',
   'avg_gen_time_with_retries': '1 s, 2126 ms'},
  'random': {'time_no_retries': '2 h, 41 m, 13 s, 422 ms',
   'time_with_retries': '2 h, 49 m, 6 s, 3992 ms',
   'avg_gen_time_no_retries': '1 s, 748 ms',
   'avg_gen_time_with_retries': '1 s, 960 ms'}}}