## Notebook: Analyse LLM Synthesis Retries

## Packages

In [18]:
import numpy as np
import json

## Constants

In [19]:
LLMS = ["Llama70B", "GPT-3"] # "Llama70B", "GPT-3"
FEW_SHOT_CONDITIONS = ["fixed", "random"] # "fixed", "random"
N_SPLITS = 6

## Code

### Retries

In [20]:
language_statistics = {}

In [21]:
for llm in LLMS:
    language_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        language_statistics[llm][condition] = {
            "n_examples": 0,
            "n_retries": 0,
            "more_than_25_retries": 0,
            "invalid_xml_schema": 0,
            "invalid_xml_tags": 0,
            "aspect_polarity_in_text_but_not_in_label": 0,
            "more_than_one_sentences": 0,
            "empty_aspect_term": 0,
            "invalid_single_word_aspect_term_pos_tag": 0,
            "no_token_in_sentence": 0,
        }

        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)

            language_statistics[llm][condition]["n_examples"] += len(synth_data_split)
            language_statistics[llm][condition]["n_retries"] += np.sum(len(example["llm_retry_statistic"]) for example in synth_data_split)
            language_statistics[llm][condition]["more_than_25_retries"] += len([ex for ex in (len(example["llm_retry_statistic"]) for example in synth_data_split) if ex > 25])
            language_statistics[llm][condition]["invalid_xml_schema"] += sum(example["llm_invalid_xml_schema"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_xml_tags"] += sum(example["llm_invalid_xml_tags"] for example in synth_data_split)
            language_statistics[llm][condition]["aspect_polarity_in_text_but_not_in_label"] += sum(example["llm_aspect_polarity_in_text_but_not_in_label"] for example in synth_data_split)
            language_statistics[llm][condition]["more_than_one_sentences"] += sum(example["llm_more_than_one_sentences"] for example in synth_data_split)
            language_statistics[llm][condition]["empty_aspect_term"] += sum(example["llm_empty_aspect_term"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_single_word_aspect_term_pos_tag"] += sum(example["llm_invalid_single_word_aspect_term_pos_tag"] for example in synth_data_split)
            language_statistics[llm][condition]["no_token_in_sentence"] += sum(example["llm_no_token_in_sentence"] for example in synth_data_split)



  language_statistics[llm][condition]["n_retries"] += np.sum(len(example["llm_retry_statistic"]) for example in synth_data_split)


In [22]:
language_statistics

{'Llama70B': {'fixed': {'n_examples': 11850,
   'n_retries': 1355,
   'more_than_25_retries': 0,
   'invalid_xml_schema': 6,
   'invalid_xml_tags': 11,
   'aspect_polarity_in_text_but_not_in_label': 1107,
   'more_than_one_sentences': 173,
   'empty_aspect_term': 1,
   'invalid_single_word_aspect_term_pos_tag': 127,
   'no_token_in_sentence': 0},
  'random': {'n_examples': 9000,
   'n_retries': 1131,
   'more_than_25_retries': 0,
   'invalid_xml_schema': 3,
   'invalid_xml_tags': 2,
   'aspect_polarity_in_text_but_not_in_label': 965,
   'more_than_one_sentences': 117,
   'empty_aspect_term': 3,
   'invalid_single_word_aspect_term_pos_tag': 108,
   'no_token_in_sentence': 0}},
 'GPT-3': {'fixed': {'n_examples': 11850,
   'n_retries': 901,
   'more_than_25_retries': 9,
   'invalid_xml_schema': 1,
   'invalid_xml_tags': 0,
   'aspect_polarity_in_text_but_not_in_label': 93,
   'more_than_one_sentences': 674,
   'empty_aspect_term': 0,
   'invalid_single_word_aspect_term_pos_tag': 179,
   '

### Duration

In [23]:
def convert_seconds_to_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    
    time_string = "{:02}:{:02}:{:.4f}".format(int(hours), int(minutes), seconds)
    return time_string

In [24]:
duration_statistics = {}

In [25]:
for llm in LLMS:
    duration_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        duration_statistics[llm][condition] = {}
        duration_statistics[llm][condition]["time_no_retries"] = []
        duration_statistics[llm][condition]["time_with_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = []
        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)
            duration_statistics[llm][condition]["time_no_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for main_example in synth_data_split for example in main_example["llm_retry_statistic"]]
  

        print(len(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["time_no_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["time_with_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_with_retries"]))

13205
10131
12751
9258


In [26]:
duration_statistics

{'Llama70B': {'fixed': {'time_no_retries': '196:46:55.6779',
   'time_with_retries': '222:28:6.0857',
   'avg_gen_time_no_retries': '00:00:59.7819',
   'avg_gen_time_with_retries': '00:01:0.6502'},
  'random': {'time_no_retries': '143:06:27.2072',
   'time_with_retries': '164:14:50.4334',
   'avg_gen_time_no_retries': '00:00:57.2430',
   'avg_gen_time_with_retries': '00:00:58.3645'}},
 'GPT-3': {'fixed': {'time_no_retries': '03:48:36.3313',
   'time_with_retries': '04:17:41.3398',
   'avg_gen_time_no_retries': '00:00:1.1575',
   'avg_gen_time_with_retries': '00:00:1.2126'},
  'random': {'time_no_retries': '02:41:13.0422',
   'time_with_retries': '02:49:6.3992',
   'avg_gen_time_no_retries': '00:00:1.0748',
   'avg_gen_time_with_retries': '00:00:1.0960'}}}

In [27]:
def format_duration(time_str):
    hours, minutes, seconds, milliseconds = map(float, time_str.replace('.', ':').split(':'))
    days = (hours - (hours % 24)) / 24
    hours = hours % 24
    total_seconds = (hours * 60 * 60) + (minutes * 60) + seconds
    formatted_time = ""

    if days:
        formatted_time += f"{int(days)} d, "
    if hours:
        formatted_time += f"{int(hours)} h, "
    if minutes:
        formatted_time += f"{int(minutes)} m, "
    if seconds:
        formatted_time += f"{int(seconds)} s, "
    if milliseconds:
        formatted_time += f"{int(milliseconds)} ms"

    return formatted_time.strip()


def format_dictionary_duration(dictionary):
    for model, model_data in dictionary.items():
        for mode, mode_data in model_data.items():
            for key, value in mode_data.items():
                mode_data[key] = format_duration(value)

In [28]:
format_dictionary_duration(duration_statistics)

In [29]:
duration_statistics

{'Llama70B': {'fixed': {'time_no_retries': '8 d, 4 h, 46 m, 55 s, 6779 ms',
   'time_with_retries': '9 d, 6 h, 28 m, 6 s, 857 ms',
   'avg_gen_time_no_retries': '59 s, 7819 ms',
   'avg_gen_time_with_retries': '1 m, 6502 ms'},
  'random': {'time_no_retries': '5 d, 23 h, 6 m, 27 s, 2072 ms',
   'time_with_retries': '6 d, 20 h, 14 m, 50 s, 4334 ms',
   'avg_gen_time_no_retries': '57 s, 2430 ms',
   'avg_gen_time_with_retries': '58 s, 3645 ms'}},
 'GPT-3': {'fixed': {'time_no_retries': '3 h, 48 m, 36 s, 3313 ms',
   'time_with_retries': '4 h, 17 m, 41 s, 3398 ms',
   'avg_gen_time_no_retries': '1 s, 1575 ms',
   'avg_gen_time_with_retries': '1 s, 2126 ms'},
  'random': {'time_no_retries': '2 h, 41 m, 13 s, 422 ms',
   'time_with_retries': '2 h, 49 m, 6 s, 3992 ms',
   'avg_gen_time_no_retries': '1 s, 748 ms',
   'avg_gen_time_with_retries': '1 s, 960 ms'}}}

In [30]:
examples = []
for split in range(N_SPLITS):
    with open(f"../07 train models/synth/GPT-3/fixed/split_{split}.json", 'r') as file:
        synth_data_split = json.load(file)
        examples += synth_data_split

In [31]:
[("-->", example["llm_label"], len(example["llm_retry_statistic"]), example["llm_more_than_one_sentences"]) for example in examples if len(example["llm_retry_statistic"]) > 25]

[('-->',
  [['SERVICE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  33,
  33),
 ('-->',
  [['SERVICE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['AMBIENCE', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['PRICE', 'NEUTRAL']],
  34,
  34),
 ('-->',
  [['FOOD', 'POSITIVE'],
   ['PRICE', 'NEGATIVE'],
   ['FOOD', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEGATIVE']],
  27,
  27),
 ('-->',
  [['AMBIENCE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['AMBIENCE', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  50,
  50),
 ('-->',
  [['FOOD', 'NEUTRAL'],
   ['FOOD', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEUTRAL']],
  50,
  50),
 ('-->',
  [['SERVICE', 'POSITIVE'],
   ['FOOD', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  32,
  32),
 ('-->',
  [['FOOD', 'POSITIVE'],
   ['PRICE', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEGATIVE']],
  35,
  35),
 ('-->',
  [['AMB