In [None]:
%pip install openai

In [1]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from openai import OpenAI
from tqdm import tqdm
# Read the API key from the local `API_key` file. The context manager closes the
# file handle, and strip() removes surrounding whitespace such as a trailing
# newline so that only the key itself is passed to the client.
with open('API_key') as key_file:
	client = OpenAI(api_key=key_file.read().strip())

In [2]:
# Names of the test sets. Each name is used by predict_gpt as the stem of a
# `.jsonl` file under ../../datasets/ and as part of the submission file name.
tests = [
	'piqa-en',
	'piqa-mk',
	'piqa-bg',
	'piqa-sl',
	'piqa-sr_cyrl',
	'piqa-hr',
	'piqa-sr_latn',
	'piqa-bs',
	'piqa-sl-cer',
	'piqa-hr-ckm',
]

In [3]:
def predict_gpt(df_test_name, gpt_model):
	"""Run zero-shot predictions with an OpenAI chat model on one test set and save them.

	Reads `../../datasets/{df_test_name}.jsonl`, where every non-blank line is a
	JSON object with the keys `prompt`, `solution0` and `solution1`. For each
	instance the model is asked which option is more plausible and to reply with
	a JSON dictionary holding an `answer` key. The collected answers are written
	to `submissions/submission-{gpt_model}-{df_test_name}.json`.

	Args:
		df_test_name: Stem of the dataset file, e.g. "piqa-en".
		gpt_model: Model identifier passed to the chat completions endpoint.
			Models whose name contains "gpt-5" are called without `temperature`;
			all others are called with `temperature=0`.

	Returns:
		None. Results are written to disk and progress is printed.

	Raises:
		FileNotFoundError: If the dataset file or the `submissions` directory
			does not exist.
		json.JSONDecodeError: If a dataset line is not valid JSON. This is
			raised before any API request is sent.
		KeyError: If a dataset entry lacks one of the expected keys.

	Notes:
		Uses the module-level `client`. A reply from which no `answer` can be
		extracted is reported on stdout and stored verbatim in place of the label.
	"""

	df_path = f"../../datasets/{df_test_name}.jsonl"

	start_time = time.time()

	# Parse the whole file up front: the handle is closed as soon as reading is
	# done, and a malformed line fails before any API call has been paid for.
	# Blank lines (e.g. a trailing newline at the end of the file) are skipped.
	# The encoding is explicit so reading does not depend on the platform default.
	with open(df_path, encoding="utf-8") as dataset_file:
		entries = [json.loads(line) for line in dataset_file if line.strip()]

	# Request arguments that are identical for every instance.
	request_options = {
		"model": gpt_model,
		"response_format": {"type": "json_object"},
	}
	# the "v5" models do not have the "temperature" parameter
	if "gpt-5" not in gpt_model:
		request_options["temperature"] = 0

	responses = []
	for entry in entries:
		# The prompt text, including its whitespace, is kept exactly as it was so
		# that new runs stay comparable with previously saved submissions.
		prompt= f"""
		### Task
			Given the following situation, which option is more likely to be correct?

			Situation: {entry['prompt']}

			Option 0: {entry['solution0']}

			Option 1: {entry['solution1']}
				
		### Output format
			Return a valid JSON dictionary with the following key: 'answer' and a value should be either 0 (if option 0 is more plausible) or 1 (if option 1 is more plausible).
		"""

		completion = client.chat.completions.create(
			messages=[{"role": "user", "content": prompt}],
			**request_options,
		)
		initial_response = completion.choices[0].message.content

		# Get out a label
		try:
			# Drop newlines and tabs, then convert the string into a dictionary
			cleaned = initial_response.replace("\n", "").replace("\t", "")
			predicted = json.loads(cleaned)["answer"]
		# Only parsing problems are handled here (missing content, invalid JSON,
		# JSON that is not a dictionary, missing key); anything else, including
		# a keyboard interrupt, propagates to the caller.
		except (AttributeError, KeyError, TypeError, ValueError):
			print("error with extracting a label:")
			print(initial_response)
			# Keep the raw reply so the position in the predictions list is preserved
			predicted = initial_response
		responses.append(predicted)

	elapsed_seconds = time.time() - start_time
	instance_number = len(entries)
	# Guard against an empty dataset, which would otherwise divide by zero
	seconds_per_instance = elapsed_seconds / instance_number if instance_number else 0.0

	print(f"Prediction finished. It took {elapsed_seconds/60} min for {instance_number} instances - {seconds_per_instance} s per instance.")

	# Create a json with results
	current_results = {
		"system": gpt_model,
		"predictions": [
			{
			"train": "NA (zero-shot)",
			"test": "{}".format(df_test_name),
			"predictions": responses,
			},
		]
		}

	# Save the results as a new json
	with open("submissions/submission-{}-{}.json".format(gpt_model, df_test_name), "w", encoding="utf-8") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format(gpt_model, df_test_name))

In [4]:
# Same ten test-set names as declared earlier in the notebook, rebuilt here from
# their suffixes so that this cell can be run on its own.
tests = [
	f"piqa-{suffix}"
	for suffix in (
		"en", "mk", "bg", "sl", "sr_cyrl",
		"hr", "sr_latn", "bs", "sl-cer", "hr-ckm",
	)
]

In [5]:
# Sanity check: run every model on the English test set before moving on to
# the other test sets.
english_check_models = ("gpt-4o-2024-08-06", "gpt-3.5-turbo-0125", "gpt-5-2025-08-07")
for model in english_check_models:
	print(model)
	predict_gpt("piqa-en", model)

gpt-4o-2024-08-06
Prediction finished. It took 1.332702386379242 min for 100 instances - 0.7996214318275452 s per instance.
Classification with gpt-4o-2024-08-06 on piqa-en finished.
gpt-3.5-turbo-0125
Prediction finished. It took 1.1147172888120016 min for 100 instances - 0.668830373287201 s per instance.
Classification with gpt-3.5-turbo-0125 on piqa-en finished.
gpt-5-2025-08-07
Prediction finished. It took 8.298507142066956 min for 100 instances - 4.979104285240173 s per instance.
Classification with gpt-5-2025-08-07 on piqa-en finished.


In [6]:
# Run the two quicker models on every test set except the first one (English);
# GPT-5 takes much longer and is evaluated in a separate cell.
fast_models = ("gpt-4o-2024-08-06", "gpt-3.5-turbo-0125")
remaining_tests = tests[1:]
for test in remaining_tests:
	print(test)
	for model in fast_models:
		print(model)
		predict_gpt(test, model)

piqa-mk
gpt-4o-2024-08-06
Prediction finished. It took 1.2247767806053163 min for 100 instances - 0.7348660683631897 s per instance.
Classification with gpt-4o-2024-08-06 on piqa-mk finished.
gpt-3.5-turbo-0125
Prediction finished. It took 0.9944889426231385 min for 100 instances - 0.5966933655738831 s per instance.
Classification with gpt-3.5-turbo-0125 on piqa-mk finished.
piqa-bg
gpt-4o-2024-08-06
Prediction finished. It took 1.3237185835838319 min for 100 instances - 0.7942311501502991 s per instance.
Classification with gpt-4o-2024-08-06 on piqa-bg finished.
gpt-3.5-turbo-0125
Prediction finished. It took 1.0784034331639607 min for 100 instances - 0.6470420598983765 s per instance.
Classification with gpt-3.5-turbo-0125 on piqa-bg finished.
piqa-sl
gpt-4o-2024-08-06
Prediction finished. It took 1.3038632035255433 min for 100 instances - 0.7823179221153259 s per instance.
Classification with gpt-4o-2024-08-06 on piqa-sl finished.
gpt-3.5-turbo-0125
Prediction finished. It took 1.10

In [7]:
# Add GPT-5: run it on every test set except the first one (English).
gpt5_model = "gpt-5-2025-08-07"
for test in tests[1:]:
	print(test)
	predict_gpt(test, gpt5_model)



piqa-mk
Prediction finished. It took 7.916777757803599 min for 100 instances - 4.7500666546821595 s per instance.
Classification with gpt-5-2025-08-07 on piqa-mk finished.
piqa-bg
Prediction finished. It took 10.71853571732839 min for 100 instances - 6.431121430397034 s per instance.
Classification with gpt-5-2025-08-07 on piqa-bg finished.
piqa-sl
Prediction finished. It took 12.195564572016398 min for 100 instances - 7.317338743209839 s per instance.
Classification with gpt-5-2025-08-07 on piqa-sl finished.
piqa-sr_cyrl
Prediction finished. It took 13.426584800084433 min for 100 instances - 8.05595088005066 s per instance.
Classification with gpt-5-2025-08-07 on piqa-sr_cyrl finished.
piqa-hr
Prediction finished. It took 8.12641879717509 min for 100 instances - 4.875851278305054 s per instance.
Classification with gpt-5-2025-08-07 on piqa-hr finished.
piqa-sr_latn
Prediction finished. It took 11.984904567400614 min for 100 instances - 7.190942740440368 s per instance.
Classification 