In [None]:
%pip install openai

In [1]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from openai import OpenAI
from tqdm import tqdm
# Read the API key from the local `API_key` file. The `with` block closes the
# file handle, and `.strip()` drops a trailing newline/whitespace that would
# otherwise become part of the credential.
with open('API_key') as key_file:
	client = OpenAI(api_key=key_file.read().strip())

In [2]:
# Read every COPA test split (JSON Lines: one instance per row) into its own DataFrame.
def _read_test_split(split_name):
	"""Load ``../../datasets/<split_name>-test.jsonl`` as a pandas DataFrame."""
	return pd.read_json(f"../../datasets/{split_name}-test.jsonl", lines=True)

test_en = _read_test_split("copa-en")
test_sl = _read_test_split("copa-sl")
test_hr_ckm = _read_test_split("copa-hr-ckm")
test_hr = _read_test_split("copa-hr")
test_mk = _read_test_split("copa-mk")
test_sl_cer = _read_test_split("copa-sl-cer")
test_sr = _read_test_split("copa-sr")
test_sr_tor = _read_test_split("copa-sr-tor")

# Print the (rows, columns) shape of each split as a quick sanity check.
loaded_splits = (test_en, test_sl, test_hr, test_hr_ckm, test_mk, test_sl_cer, test_sr, test_sr_tor)
print(*(split.shape for split in loaded_splits))

(500, 5) (500, 5) (500, 7) (500, 5) (500, 7) (500, 5) (500, 7) (500, 5)


In [3]:
# Shared tail of every prompt. The irregular tabs/newlines are reproduced exactly
# as they were sent in earlier runs, so that new predictions stay comparable
# with submissions that were already generated.
_PROMPT_OUTPUT_FORMAT = (
	"\n\t\t\t\t\t\n"
	"\t\t\t### Output format\n"
	"\t\t\t\tReturn a valid JSON dictionary with the following key: 'answer' and a value should be an integer -- either 1 (if hypothesis 1 is more plausible) or 2 (if hypothesis 2 is more plausible).\n"
	"\t\t\t"
)


def _build_copa_prompt(entry, task_in_english):
	"""Build the zero-shot prompt for one COPA instance.

	Args:
		entry: dict with the keys 'premise', 'question', 'choice1' and 'choice2'.
		task_in_english: True for the English test set; selects the intro
			sentence that tells the model which language the task is in.

	Returns:
		The prompt string sent as the single user message.
	"""
	if task_in_english:
		intro = 'You will be given a task. The task definition is in English, as is the task itself. Here is the task!\nGiven the premise "'
		result_clause = ' and that we are looking for the result of this premise,'
	else:
		intro = 'You will be given a task. The task definition is in English, but the task itself is in another language. Here is the task!\nGiven the premise "'
		# Only this variant has a trailing space; kept as-is for comparability.
		result_clause = ' and that we are looking for the result of this premise, '

	if entry['question'] == 'cause':
		direction_clause = ' and that we are looking for the cause of this premise,'
	else:
		direction_clause = result_clause

	# NOTE(review): in most variants there is no space between "premise," and
	# "which". Fixing it would change the prompt (and possibly the scores), so
	# the text is deliberately left untouched here.
	hypotheses = f"""which hypothesis is more plausible?\nHypothesis 1: "{entry['choice1']}".\nHypothesis 2: "{entry['choice2']}"."""

	return intro + entry['premise'] + '",' + direction_clause + hypotheses + _PROMPT_OUTPUT_FORMAT


def _parse_answer(raw_response):
	"""Turn the model's JSON reply into a 0/1 label, or None if that is impossible.

	The model is asked for {"answer": 1} or {"answer": 2}; the true labels are
	0 or 1, so 1 is subtracted from the answer. Anything that is not valid JSON,
	lacks the 'answer' key, or carries a value other than 1 or 2 yields None.
	"""
	cleaned = raw_response.replace("\n", "").replace("\t", "")
	try:
		answer = json.loads(cleaned)["answer"]
	except (json.JSONDecodeError, KeyError, TypeError):
		return None
	# bool is a subclass of int (True == 1), so reject it explicitly.
	if isinstance(answer, bool) or not isinstance(answer, (int, float)):
		return None
	if answer not in (1, 2):
		return None
	return int(answer) - 1


def predict_gpt(df_test_name, gpt_model):
	"""Run zero-shot COPA prediction with a GPT model and save a submission file.

	Reads ``../../datasets/<df_test_name>-test.jsonl`` line by line, sends one
	chat-completion request per instance through the notebook-level ``client``,
	and writes the collected predictions to
	``submissions/submission-<gpt_model>-<df_test_name>.json``.

	Args:
		df_test_name: dataset identifier, e.g. "copa-en" or "copa-sl". The value
			"copa-en" selects the English-task wording of the prompt.
		gpt_model: model name passed to the API. Models whose name contains
			"gpt-5" are called without the ``temperature`` parameter.

	Returns:
		None. Each prediction is 0 or 1; when no label can be extracted, the raw
		model reply is stored in its place and also printed.

	Note:
		Predictions are only written after the whole loop has finished, so an
		exception raised by an API call aborts the run without saving anything.
	"""
	import os  # local import: only needed to make sure the output folder exists

	df_path = f"../../datasets/{df_test_name}-test.jsonl"
	task_in_english = df_test_name == "copa-en"

	request_options = {
		"model": gpt_model,
		"response_format": {"type": "json_object"},
	}
	# the "v5" models do not have the "temperature" parameter
	if "gpt-5" not in gpt_model:
		request_options["temperature"] = 0

	responses = []

	start_time = time.time()
	# Explicit UTF-8: the datasets contain non-ASCII text and the platform
	# default encoding is not guaranteed to be UTF-8.
	with open(df_path, encoding="utf-8") as dataset_file:
		for line in dataset_file:
			if not line.strip():
				# Tolerate blank lines (e.g. a trailing newline at end of file).
				continue
			entry = json.loads(line)
			prompt = _build_copa_prompt(entry, task_in_english)

			completion = client.chat.completions.create(
				messages=[{"role": "user", "content": prompt}],
				**request_options,
			)

			# The content can be None; treat that like any other unparsable reply.
			initial_response = completion.choices[0].message.content or ""

			predicted = _parse_answer(initial_response)
			if predicted is None:
				print("error with extracting a label:")
				print(initial_response)
				responses.append(initial_response)
			else:
				responses.append(predicted)

	elapsed_seconds = time.time() - start_time
	instance_count = len(responses)
	seconds_per_instance = elapsed_seconds / instance_count if instance_count else 0.0

	print(f"Prediction finished. It took {elapsed_seconds/60} min for {instance_count} instances - {seconds_per_instance} s per instance.")

	# Create a json with results
	current_results = {
		"system": gpt_model,
		"predictions": [
			{
			"train": "NA (zero-shot)",
			"test": "{}".format(df_test_name),
			"predictions": responses,
			},
		]
		}

	# Save the results as a new json; create the folder first so a missing
	# directory cannot discard the predictions after all API calls were made.
	os.makedirs("submissions", exist_ok=True)
	with open("submissions/submission-{}-{}.json".format(gpt_model, df_test_name), "w", encoding="utf-8") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format(gpt_model, df_test_name))

In [4]:
# Identifiers of the COPA test sets. Order matters: later cells slice this
# list by position (e.g. tests[1:] and tests[3:]).
tests = [
	"copa-en",
	"copa-sl",
	"copa-hr",
	"copa-hr-ckm",
	"copa-mk",
	"copa-sl-cer",
	"copa-sr",
	"copa-sr-tor",
]

In [6]:
# Sanity check: run every model on the English test set before moving on
# to the other languages.
english_check_models = ("gpt-4o-2024-08-06", "gpt-3.5-turbo-0125", "gpt-5-2025-08-07")
for model in english_check_models:
	print(model)
	predict_gpt("copa-en", model)

gpt-4o-2024-08-06
Prediction finished. It took 6.09971798658371 min for 500 instances - 0.7319661583900452 s per instance.
Classification with gpt-4o-2024-08-06 on copa-en finished.
gpt-3.5-turbo-0125
Prediction finished. It took 5.382102092107137 min for 500 instances - 0.6458522510528565 s per instance.
Classification with gpt-3.5-turbo-0125 on copa-en finished.
gpt-5-2025-08-07
Prediction finished. It took 26.666247244675954 min for 500 instances - 3.1999496693611147 s per instance.
Classification with gpt-5-2025-08-07 on copa-en finished.


In [7]:
# Run the two faster models on every non-English test set first;
# GPT-5 takes much longer and is handled separately.
fast_models = ["gpt-4o-2024-08-06", "gpt-3.5-turbo-0125"]
non_english_tests = tests[1:]
for test in non_english_tests:
	print(test)
	for model in fast_models:
		print(model)
		predict_gpt(test, model)

copa-sl
gpt-4o-2024-08-06
Prediction finished. It took 5.678991957505544 min for 500 instances - 0.6814790349006653 s per instance.
Classification with gpt-4o-2024-08-06 on copa-sl finished.
gpt-3.5-turbo-0125
Prediction finished. It took 5.451760907967885 min for 500 instances - 0.6542113089561462 s per instance.
Classification with gpt-3.5-turbo-0125 on copa-sl finished.
copa-hr
gpt-4o-2024-08-06
Prediction finished. It took 5.586604948838552 min for 500 instances - 0.6703925938606262 s per instance.
Classification with gpt-4o-2024-08-06 on copa-hr finished.
gpt-3.5-turbo-0125
Prediction finished. It took 5.42175254424413 min for 500 instances - 0.6506103053092956 s per instance.
Classification with gpt-3.5-turbo-0125 on copa-hr finished.
copa-hr-ckm
gpt-4o-2024-08-06
Prediction finished. It took 6.307056427001953 min for 500 instances - 0.7568467712402344 s per instance.
Classification with gpt-4o-2024-08-06 on copa-hr-ckm finished.
gpt-3.5-turbo-0125
Prediction finished. It took 5.

In [6]:
# Preview the subset of test sets that the GPT-5 loop in the next cell iterates over.
tests[3:]

['copa-hr-ckm', 'copa-mk', 'copa-sl-cer', 'copa-sr', 'copa-sr-tor']

In [None]:
# Add GPT-5 predictions for the test sets from index 3 onwards.
# NOTE(review): this slice skips "copa-sl" and "copa-hr" (indices 1-2);
# presumably they were already processed with GPT-5 in an earlier session -- confirm.
gpt5_model = "gpt-5-2025-08-07"
for test in tests[3:]:
	print(test)
	predict_gpt(test, gpt5_model)



copa-hr-ckm
Prediction finished. It took 87.900279601415 min for 500 instances - 10.5480335521698 s per instance.
Classification with gpt-5-2025-08-07 on copa-hr-ckm finished.
copa-mk
Prediction finished. It took 34.84431236584981 min for 500 instances - 4.181317483901977 s per instance.
Classification with gpt-5-2025-08-07 on copa-mk finished.
copa-sl-cer
