<a href="https://colab.research.google.com/github/mille-s/GEM24_EvalLLM/blob/main/GEM24_EvalLLM_OpenAI_SM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install OpenAI
from IPython.display import clear_output

! pip install openai==0.28
# !pip install --upgrade openai

clear_output()

In [8]:
#@title Download and Load human-eval-packaged json, and format contents (triples, text, id)
import json
import codecs
from bs4 import BeautifulSoup
import os
import time

language = "English"

def format_json(json_path):
  # Open en_regular and parse json
  en_regular_json = json.load(codecs.open(json_path, 'r', 'utf-8'))
  # Print first entry
  # print(json.dumps(en_regular_json[0], indent=4))

  triples_text_pairs = []

  x = 0
  while x < len(en_regular_json):
    # if x < 10:
    # Parse html found in the "input" key
    html = en_regular_json[x]['input']
    soup = BeautifulSoup(html, 'html.parser')
    # Print raw table
    # print(soup.prettify())
    table = soup.find('table')
    # headers = [header.text.strip() for header in table.find_all('th')]
    rows = []
    for row in table.find_all('tr'):
      columns = row.find_all(['td', 'th'])  # Get both <td> and <th>
      row_data = ' '.join([col.text.strip() for col in columns])
      rows.append(row_data)
    triples_formatted = '; '.join(rows[1:]) # exclude header
    # print("Headers:", rows[0])
    # print(rows[1:])
    triples_text_pairs.append({'id':en_regular_json[x]['id'], 'triples': '"""'+triples_formatted+'"""', 'text': en_regular_json[x]['output']})
    # else:
    #   break
    x += 1
  return triples_text_pairs

In [9]:
#@title Load Custom json file
import json
import codecs
from bs4 import BeautifulSoup
import os

language = "English" #@param["English", "Spanish", "Swahili"]

custom_filepath = 'llm_as_judge_samples.json'
triples_text_pairs = format_json(custom_filepath)
print(f'{len(triples_text_pairs)} datapoints found!')
print(triples_text_pairs[0])

720 datapoints found!
{'id': 'en_D2T-1-FA_1437_3_struct_D2T', 'triples': '"""McVeagh_of_the_South_Seas director Cyril_Bruce; McVeagh_of_the_South_Seas director Harry_Carey_(actor_born_1878); McVeagh_of_the_South_Seas writer Harry_Carey_(actor_born_1878); McVeagh_of_the_South_Seas producer The_Progressive_Motion_Picture_Company; McVeagh_of_the_South_Seas distributor Alliance_Films_Corporation"""', 'text': 'The film McVeagh of the South Seas was directed by Cyril Bruce and Harry Carey, and distributed by Alliance Films Corporation.'}


In [13]:
# Get already processed IDs
path_out = 'GPT_results'
existing_files = os.listdir(path_out)
processed_ids = set()
for fname in existing_files:
    if fname.startswith('GPT_results_'):
        processed_ids.add(fname[len('GPT_results_'):])


# existing_files

In [15]:
#@title Run evaluation (needs OpenAI API key in Parameters)

import os
import json
import csv
import pandas as pd
import openai
import pickle
import time

# PARAMETERS OpenAI
#==========================
# openai.api_key = "insertYourKeyHere"
model="o3" #@param["gpt-4o-mini-2024-07-18", "o3-mini-2025-01-31"]
path_out = 'GPT_results'
#==========================

if not os.path.exists(path_out):
  os.makedirs(path_out)

def dumpResults(annotations, path_out):
  results_file = open(os.path.join(path_out, 'All_GPT_results'), 'ab')
  pickle.dump(annotations, results_file)
  results_file.close()

def callGPT(prompt, Triples, Nice_Text, model):
    response =  openai.ChatCompletion.create(
    model=model,
    messages=[
        {"role": "system",
         "content": prompt},
    ],
    temperature=1)

    return response

def runEval(triples_text_pairs, model):
  # EN regular splits: range(0, 2750), range(2750, 5500), range(5500, 8240)
  x = 0
  # To get all evaluations
  for x in range(0, len(triples_text_pairs)):
  # To test on a few inputs only
  # while x < 3:
    Triples = triples_text_pairs[x]['triples']
    Nice_Text = triples_text_pairs[x]['text']
    id = triples_text_pairs[x]['id']
    if id in processed_ids:
      # print(f"Skipping text #{x} (ID={id}) -- already processed.")
      continue

    #Prompt (Do not change unless discussed with the GEM-HumEval group)
    prompt = '''
In this task, you will evaluate the quality of the Text in relation to the given Triple Set. How well does the Text represent the Triple Set?  You will be given four specific Dimensions to evaluate against:

Dimensions:"""
No-Omissions: ALL the information in the Triple Set is present in the Text.
No-Additions: ONLY information from the Triple Set is present in the Text.
Grammaticality: The Text is free of grammatical and spelling errors.
Fluency: The Text flows well and is easy to read; its parts are connected in a natural way."""

Important note on No-Omissions and No-Additions: some Triple Set/Text pairs contain non-factual information and even fictional names for people, places, dates, etc. Whether there are omissions and/or additions in a Text is NOT related to factual truth, but instead is strictly related to the contents of the input Triple Set.
Important note on Grammaticality and Fluency: for Grammaticality and Fluency you do not need to consider the input Triple Set; only the intrinsic quality of the Text needs to be assessed.

You need to provide the scores ranging from 1 (indicating the lowest score) to 7 (indicating the highest score) for each of the dimensions and a short justification for each score in the following JSON format:  {"No-Omissions": {"Justification": "", "Score": ""}, "No-Additions": {"Justification": "", "Score": ""}, "Grammaticality": {"Justification": "", "Score": ""}, "Fluency": {"Justification": "", "Score": ""} }.

Make sure to read thoroughly the Triple Set and the '''+str(language)+''' Text below, and assess the four Dimensions using the instructions and template above.

Triple Set: ''' + str(Triples) + "\n" + '''Text: '''+ str(Nice_Text) + "\n\n" + '''
'''
    print(f'Evaluating text #{x}...')
    print(f'ID: {id}')
    # print(prompt)
    # print(Triples)
    # print(Nice_Text)
    responseGPT = callGPT(prompt, Triples, Nice_Text, model)
    # print(responseGPT.choices[0].message.content)
    # print('\n')
    # print(response['choices'][0]['message']['content'])
    # print('\n')
    triples_text_pairs[x]['scores_GPT'] = responseGPT['choices'][0]['message']['content']

    # Save individual files as backup
    # with open(os.path.join('GPT_results', 'GPT_results_'+str(id)), 'ab') as f:
    with open(os.path.join('GPT_results', 'GPT_results_' + id), 'ab') as f:
      pickle.dump(triples_text_pairs[x], f)

    time.sleep(10)
    x += 1

  return triples_text_pairs

In [17]:
annotations = runEval(triples_text_pairs, model)

# dumpResults(annotations, path_out)

## Results analysis

In [None]:
#@title Load unzipped files
import pickle
import glob
import json
import re
import codecs
import os
import ast

def auto_close_json(s):
    # Counts how many more } are needed
    open_braces = s.count('{')
    close_braces = s.count('}')
    needed = open_braces - close_braces
    if needed > 0:
        s += '}' * needed
    return s

update_params_unzip = True #@param {type:"boolean"}
if update_params_unzip:
  zip_language = 'EN' #@param['EN', 'ES', 'SW']
  zip_model = 'o3' #@param['GPT-4o-mini', 'GPT-o3-mini', 'Gemini-1dot5-flash']
  zip_data = 'regular' #@param['regular', 'iaa']

model_scores = [1, 2, 3, 4, 5, 6, 7]
# load_gemini_folder = True #@param {type:"boolean"}
# load_gpt_folder = False #@param {type:"boolean"}
path_dir_unzipped = ''
model_prefix = ''
if zip_model.startswith('Gemini'):
  path_dir_unzipped = os.path.join('/content', zip_language+'_'+zip_data, zip_model, 'content', 'Gemini_results')
  model_prefix = 'Gemini'
# elif zip_model.startswith('GPT'):
else:
  path_dir_unzipped = os.path.join('GPT_results')
  model_prefix = 'GPT'

def separateJustification(LLMoutString, criterion):
  """
  The Justifications returned by the models often break the json format, so I extract them
  """
  search_expression = '("'+criterion+'":[^\{]+\{[^\}]*"Justification":)([^\}]+)("Score":[^\}]+\})'
  if re.search(search_expression, LLMoutString):
    justificationRemoved = re.sub(search_expression, '\g<1> "", \g<3>',  LLMoutString)
    justification = re.sub('^.*'+search_expression+'.*$', '\g<2>',  LLMoutString)
  else:
    justificationRemoved = LLMoutString
    justification = ''
  return justificationRemoved, justification

def loadDataPoint(dbfile_x, model):
  eval_missing = None
  wrong_score = None
  dico_key = 'scores_'+str(model)
  formatted_scores = {}
  # load data with pickle
  dp = pickle.load(dbfile_x)
  if dico_key in dp:
    print(dp['id'])
    # print(dp['triples'])
    # print(dp['text'])
    justifications = []
    # pickle.load uses single quotes, whereas json.load expects double quotes
    # Gemini adds a node "query" in the json, unlike OpenAI's models
    LLMout_string = str(dp[dico_key]).replace("'query'", '"query"')
    LLMout_string = LLMout_string.replace("```json", "")
    LLMout_string = LLMout_string.replace("```", "")
    LLMout_string = LLMout_string.replace("'No-Omissions'", '"No-Omissions"')
    # There's a typo in one of the Gemini outputs
    LLMout_string = LLMout_string.replace("'No-Omissons'", '"No-Omissions"')
    LLMout_string = LLMout_string.replace('"No-Omissons"', '"No-Omissions"')
    LLMout_string = LLMout_string.replace("'No-Additions'", '"No-Additions"')
    LLMout_string = LLMout_string.replace("'Grammaticality'", '"Grammaticality"')
    LLMout_string = LLMout_string.replace("'Fluency'", '"Fluency"')
    # Sometimes justifications are followed by single quotes, sometimes by double quotes
    LLMout_string = LLMout_string.replace("'Justification': '", '"Justification": "').replace("'Justification'", '"Justification"')
    LLMout_string = LLMout_string.replace("', 'Score'", '", "Score"').replace("'Score'", '"Score"')
    if LLMout_string == 'None':
      eval_missing = dp['id']
    else:
      # print(LLMout_string)
      # LLMout_string = re.sub('("No-Omissions":[^\{]+\{"Justification":)([^\}]+)("Score":[^\}]+\})', '\g<1> "", \g<3>',  LLMout_string)
      LLMout_string, justifNoOm = separateJustification(LLMout_string, 'No-Omissions')
      justifications.append(justifNoOm)
      LLMout_string, justifNoAdd = separateJustification(LLMout_string, 'No-Additions')
      justifications.append(justifNoAdd)
      LLMout_string, justifGram = separateJustification(LLMout_string, 'Grammaticality')
      justifications.append(justifGram)
      LLMout_string, justifFlu = separateJustification(LLMout_string, 'Fluency')
      justifications.append(justifFlu)
      # print(justifications)
      LLMout_string = LLMout_string.replace("'1'", '"1"')
      LLMout_string = LLMout_string.replace("'2'", '"2"')
      LLMout_string = LLMout_string.replace("'3'", '"3"')
      LLMout_string = LLMout_string.replace("'4'", '"4"')
      LLMout_string = LLMout_string.replace("'5'", '"5"')
      LLMout_string = LLMout_string.replace("'6'", '"6"')
      LLMout_string = LLMout_string.replace("'7'", '"7"')
      LLMout_string = auto_close_json(LLMout_string)
      # scores_json = json.loads(LLMout_string)
      try:
        scores_json = json.loads(LLMout_string)
      except Exception:
          try:
              scores_json = ast.literal_eval(LLMout_string)
          except Exception as e:
              print("\n=== JSON LOAD FAIL ===")
              print("Offending string:\n", LLMout_string)
              print("=====================\n")
              raise e
      clean_scores_json = None
      # Gemini adds a node "query" in the json, unlike OpenAI's models
      if 'query' in scores_json:
        clean_scores_json = scores_json['query']
      else:
        clean_scores_json = scores_json

      gram_score = int(clean_scores_json['Grammaticality']['Score'])
      flu_score = int(clean_scores_json['Fluency']['Score'])
      no_om_score = int(clean_scores_json['No-Omissions']['Score'])
      no_ad_score = int(clean_scores_json['No-Additions']['Score'])

      if (gram_score not in model_scores) or (flu_score not in model_scores) or (no_om_score not in model_scores) or (no_ad_score not in model_scores):
        wrong_score = dp['id']

      formatted_scores["eid"] = dp['id']
      formatted_scores["annotator_id"] = str(zip_model)
      formatted_scores["no-omissions"] = no_om_score
      formatted_scores["no-additions"] = no_ad_score
      formatted_scores["grammaticality"] = gram_score
      formatted_scores["fluency"] = flu_score

      # print(f"Gram: {gram_score}; Flu: {flu_score}; NoOm: {no_om_score}; NoAd: {no_ad_score}.")
      # print('')

  return formatted_scores, eval_missing, wrong_score

# print(path_dir_unzipped)
# print(model_prefix)
eval_files = glob.glob(os.path.join(path_dir_unzipped, '*'))
evals_missing = []
wrong_scores = []
all_scores = []
for filepath in eval_files:
  # print(filepath)
  dbfile_x = open(filepath, 'rb')
  formatted_scores, eval_missing, wrong_score = loadDataPoint(dbfile_x, model_prefix)
  if eval_missing != None:
    evals_missing.append(eval_missing)
  if wrong_score != None:
    wrong_scores.append(wrong_score)
  dbfile_x.close()
  all_scores.append(formatted_scores)
# print(f'Missing evaluations: {evals_missing}')
# print(f'Wrong scores: {wrong_scores}')

# Save all scores into a json file
path_json_out = zip_language+'_'+zip_model+'_scores.json'
with codecs.open(path_json_out, 'w', 'utf-8') as outfile:
  json.dump(all_scores, outfile)

en_D2T-1-FA_1237_1_agent_D2T
en_D2T-1-FA_0559_4_human_D2T
en_D2T-1-FA_0701_2_e2e_D2T
en_D2T-1-FA_1610_2_e2e_D2T
en_D2T-1-FA_0083_4_human_D2T
en_D2T-1-FA_1430_4_human_D2T
en_D2T-1-FA_1655_2_e2e_D2T
en_D2T-1-FA_1638_4_human_D2T
en_D2T-1-FA_1655_3_struct_D2T
en_D2T-1-FA_0745_1_agent_D2T
en_D2T-1-FA_0667_3_struct_D2T
en_D2T-1-FA_1588_2_e2e_D2T
en_D2T-1-FA_1661_2_e2e_D2T
en_D2T-1-FA_0604_3_struct_D2T
en_D2T-1-FA_1437_3_struct_D2T
en_D2T-1-FA_0360_1_agent_D2T
en_D2T-1-FA_0585_3_struct_D2T
en_D2T-1-FA_0212_3_struct_D2T
en_D2T-1-FA_1722_4_human_D2T
en_D2T-1-FA_0585_1_agent_D2T
en_D2T-1-FA_1454_4_human_D2T
en_D2T-1-FA_0935_4_human_D2T
en_D2T-1-FA_1025_4_human_D2T

=== JSON LOAD FAIL ===
Offending string:
 {
"No-Omissions": {
"Justification": "", "Score": "7"
},
"No-Additions": {
"Justification": "", "Score": "7"
},
"Grammaticality": {
"Justification": "", "Score": "7"
},
"Fluency": {
"Justification": "", "Score": "6"
}



  search_expression = '("'+criterion+'":[^\{]+\{[^\}]*"Justification":)([^\}]+)("Score":[^\}]+\})'
  justificationRemoved = re.sub(search_expression, '\g<1> "", \g<3>',  LLMoutString)
  justification = re.sub('^.*'+search_expression+'.*$', '\g<2>',  LLMoutString)


SyntaxError: '{' was never closed (<unknown>, line 1)