## Notebook: Analyse LLM Synthesis Retries

## Packages

In [134]:
import numpy as np
import json

## Constants

In [135]:
LLMS = ["Llama70B", "GPT-3"] # "Llama70B", "GPT-3"
FEW_SHOT_CONDITIONS = ["fixed", "random"] # "fixed", "random"
N_SPLITS = 5

## Code

### Retries

In [136]:
language_statistics = {}

In [137]:
for llm in LLMS:
    language_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        language_statistics[llm][condition] = {
            "n_examples": 0,
            "n_retries": 0,
            "more_than_25_retries": 0,
            "invalid_xml_schema": 0,
            "invalid_xml_tags": 0,
            "aspect_polarity_in_text_but_not_in_label": 0,
            "more_than_one_sentences": 0,
            "empty_aspect_term": 0,
            "invalid_single_word_aspect_term_pos_tag": 0,
            "no_token_in_sentence": 0,
        }

        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)

            language_statistics[llm][condition]["n_examples"] += len(synth_data_split)
            language_statistics[llm][condition]["n_retries"] += np.sum(len(example["llm_retry_statistic"]) for example in synth_data_split)
            language_statistics[llm][condition]["more_than_25_retries"] += len([ex for ex in (len(example["llm_retry_statistic"]) for example in synth_data_split) if ex > 25])
            language_statistics[llm][condition]["invalid_xml_schema"] += sum(example["llm_invalid_xml_schema"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_xml_tags"] += sum(example["llm_invalid_xml_tags"] for example in synth_data_split)
            language_statistics[llm][condition]["aspect_polarity_in_text_but_not_in_label"] += sum(example["llm_aspect_polarity_in_text_but_not_in_label"] for example in synth_data_split)
            language_statistics[llm][condition]["more_than_one_sentences"] += sum(example["llm_more_than_one_sentences"] for example in synth_data_split)
            language_statistics[llm][condition]["empty_aspect_term"] += sum(example["llm_empty_aspect_term"] for example in synth_data_split)
            language_statistics[llm][condition]["invalid_single_word_aspect_term_pos_tag"] += sum(example["llm_invalid_single_word_aspect_term_pos_tag"] for example in synth_data_split)
            language_statistics[llm][condition]["no_token_in_sentence"] += sum(example["llm_no_token_in_sentence"] for example in synth_data_split)



  language_statistics[llm][condition]["n_retries"] += np.sum(len(example["llm_retry_statistic"]) for example in synth_data_split)


In [138]:
language_statistics

{'Llama70B': {'fixed': {'n_examples': 9875,
   'n_retries': 1176,
   'more_than_25_retries': 0,
   'invalid_xml_schema': 5,
   'invalid_xml_tags': 11,
   'aspect_polarity_in_text_but_not_in_label': 1001,
   'more_than_one_sentences': 154,
   'empty_aspect_term': 1,
   'invalid_single_word_aspect_term_pos_tag': 63,
   'no_token_in_sentence': 0},
  'random': {'n_examples': 7500,
   'n_retries': 906,
   'more_than_25_retries': 0,
   'invalid_xml_schema': 3,
   'invalid_xml_tags': 1,
   'aspect_polarity_in_text_but_not_in_label': 818,
   'more_than_one_sentences': 85,
   'empty_aspect_term': 2,
   'invalid_single_word_aspect_term_pos_tag': 49,
   'no_token_in_sentence': 0}},
 'GPT-3': {'fixed': {'n_examples': 9875,
   'n_retries': 810,
   'more_than_25_retries': 9,
   'invalid_xml_schema': 1,
   'invalid_xml_tags': 0,
   'aspect_polarity_in_text_but_not_in_label': 51,
   'more_than_one_sentences': 644,
   'empty_aspect_term': 0,
   'invalid_single_word_aspect_term_pos_tag': 155,
   'no_tok

### Duration

In [139]:
def convert_seconds_to_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    
    time_string = "{:02}:{:02}:{:.4f}".format(int(hours), int(minutes), seconds)
    return time_string

In [140]:
duration_statistics = {}

In [141]:
for llm in LLMS:
    duration_statistics[llm] = {}
    for condition in FEW_SHOT_CONDITIONS:
        duration_statistics[llm][condition] = {}
        duration_statistics[llm][condition]["time_no_retries"] = []
        duration_statistics[llm][condition]["time_with_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = []
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = []
        for split in range(N_SPLITS):
            with open(f"../07 train models/synth/{llm}/{condition}/split_{split}.json", 'r') as file:
                synth_data_split = json.load(file)
            duration_statistics[llm][condition]["time_no_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for example in synth_data_split]
            duration_statistics[llm][condition]["time_with_retries"] += [example["llm_prediction_duration"] for main_example in synth_data_split for example in main_example["llm_retry_statistic"]]
  

        print(len(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_no_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["avg_gen_time_with_retries"] = convert_seconds_to_time(np.mean(duration_statistics[llm][condition]["time_with_retries"]))
        duration_statistics[llm][condition]["time_no_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_no_retries"]))
        duration_statistics[llm][condition]["time_with_retries"] = convert_seconds_to_time(np.sum(duration_statistics[llm][condition]["time_with_retries"]))

11051
8406
10685
7728


In [142]:
duration_statistics

{'Llama70B': {'fixed': {'time_no_retries': '166:30:37.4865',
   'time_with_retries': '189:04:30.2025',
   'avg_gen_time_no_retries': '00:01:0.7025',
   'avg_gen_time_with_retries': '00:01:1.5935'},
  'random': {'time_no_retries': '120:22:12.3716',
   'time_with_retries': '137:42:16.8850',
   'avg_gen_time_no_retries': '00:00:57.7776',
   'avg_gen_time_with_retries': '00:00:58.9742'}},
 'GPT-3': {'fixed': {'time_no_retries': '03:12:3.1241',
   'time_with_retries': '03:38:25.7390',
   'avg_gen_time_no_retries': '00:00:1.1669',
   'avg_gen_time_with_retries': '00:00:1.2266'},
  'random': {'time_no_retries': '02:14:19.3520',
   'time_with_retries': '02:21:22.4018',
   'avg_gen_time_no_retries': '00:00:1.0746',
   'avg_gen_time_with_retries': '00:00:1.0976'}}}

In [143]:
def format_duration(time_str):
    hours, minutes, seconds, milliseconds = map(float, time_str.replace('.', ':').split(':'))
    days = (hours - (hours % 24)) / 24
    hours = hours % 24
    total_seconds = (hours * 60 * 60) + (minutes * 60) + seconds
    formatted_time = ""

    if days:
        formatted_time += f"{int(days)}d "
    if hours:
        formatted_time += f"{int(hours)}h "
    if minutes:
        formatted_time += f"{int(minutes)}m "
    if seconds:
        formatted_time += f"{int(seconds)}s "
    if milliseconds:
        formatted_time += f"{int(milliseconds)}ms"

    return formatted_time.strip()


def format_dictionary_duration(dictionary):
    for model, model_data in dictionary.items():
        for mode, mode_data in model_data.items():
            for key, value in mode_data.items():
                mode_data[key] = format_duration(value)

In [144]:
format_dictionary_duration(duration_statistics)

In [145]:
duration_statistics

{'Llama70B': {'fixed': {'time_no_retries': '6d 22h 30m 37s 4865ms',
   'time_with_retries': '7d 21h 4m 30s 2025ms',
   'avg_gen_time_no_retries': '1m 7025ms',
   'avg_gen_time_with_retries': '1m 1s 5935ms'},
  'random': {'time_no_retries': '5d 22m 12s 3716ms',
   'time_with_retries': '5d 17h 42m 16s 8850ms',
   'avg_gen_time_no_retries': '57s 7776ms',
   'avg_gen_time_with_retries': '58s 9742ms'}},
 'GPT-3': {'fixed': {'time_no_retries': '3h 12m 3s 1241ms',
   'time_with_retries': '3h 38m 25s 7390ms',
   'avg_gen_time_no_retries': '1s 1669ms',
   'avg_gen_time_with_retries': '1s 2266ms'},
  'random': {'time_no_retries': '2h 14m 19s 3520ms',
   'time_with_retries': '2h 21m 22s 4018ms',
   'avg_gen_time_no_retries': '1s 746ms',
   'avg_gen_time_with_retries': '1s 976ms'}}}

In [146]:
examples = []
for split in range(N_SPLITS):
    with open(f"../07 train models/synth/GPT-3/fixed/split_{split}.json", 'r') as file:
        synth_data_split = json.load(file)
        examples += synth_data_split

In [147]:
[("-->", example["llm_label"], len(example["llm_retry_statistic"]), example["llm_more_than_one_sentences"]) for example in examples if len(example["llm_retry_statistic"]) > 25]

[('-->',
  [['SERVICE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  33,
  33),
 ('-->',
  [['SERVICE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['AMBIENCE', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['PRICE', 'NEUTRAL']],
  34,
  34),
 ('-->',
  [['FOOD', 'POSITIVE'],
   ['PRICE', 'NEGATIVE'],
   ['FOOD', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEGATIVE']],
  27,
  27),
 ('-->',
  [['AMBIENCE', 'NEUTRAL'],
   ['PRICE', 'NEGATIVE'],
   ['AMBIENCE', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  50,
  50),
 ('-->',
  [['FOOD', 'NEUTRAL'],
   ['FOOD', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEUTRAL']],
  50,
  50),
 ('-->',
  [['SERVICE', 'POSITIVE'],
   ['FOOD', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE']],
  32,
  32),
 ('-->',
  [['FOOD', 'POSITIVE'],
   ['PRICE', 'NEGATIVE'],
   ['GENERAL-IMPRESSION', 'POSITIVE'],
   ['GENERAL-IMPRESSION', 'NEGATIVE']],
  35,
  35),
 ('-->',
  [['AMB