In [None]:
# Install the `openai` package into the kernel's environment; the import cell
# below needs it for `from openai import OpenAI`.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install openai

In [1]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from openai import OpenAI
from tqdm import tqdm

# Build the API client that `predict_gpt` uses, pointed at the OpenRouter endpoint.
# The key is read from the local file `API_key`. The `with` block closes the
# file handle, and `.strip()` removes the trailing newline/whitespace that most
# editors append — passing it through would corrupt the Authorization header.
with open("API_key", "r") as key_file:
  client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=key_file.read().strip(),
  )

In [2]:
# Names of the PIQA test sets to evaluate; `predict_gpt` turns each name into
# the dataset path `../../datasets/<name>.jsonl`.
tests = [
	"piqa-en",
	"piqa-mk",
	"piqa-bg",
	"piqa-sl",
	"piqa-sr_cyrl",
	"piqa-hr",
	"piqa-sr_latn",
	"piqa-bs",
	"piqa-sl-cer",
	"piqa-hr-ckm",
]

In [3]:
def _parse_answer(raw_response):
	"""Return the value stored under the 'answer' key of a model reply.

	Newlines and tabs are removed before parsing. If the text is not valid JSON
	as-is, the span starting at the first "{" is retried: it is cut at the last
	"}" when one follows, otherwise a closing "}" is appended. The second case
	covers replies produced with `stop="}"`, where the API returns the text
	without the stop sequence itself.

	Args:
		raw_response: Text content of the model reply.

	Returns:
		Whatever value the parsed dictionary holds under 'answer'.

	Raises:
		ValueError: The reply contains no parseable JSON object.
		KeyError: The parsed dictionary has no 'answer' key.
		TypeError: The parsed JSON value is not subscriptable by a string key.
	"""
	cleaned = raw_response.replace("\n", "").replace("\t", "")
	try:
		parsed = json.loads(cleaned)
	except ValueError:
		start = cleaned.find("{")
		if start == -1:
			raise
		end = cleaned.rfind("}")
		if end > start:
			candidate = cleaned[start:end + 1]
		else:
			# The closing brace was swallowed by the stop sequence; restore it.
			candidate = cleaned[start:] + "}"
		parsed = json.loads(candidate)
	return parsed["answer"]


def predict_gpt(df_test_name, gpt_model):
	"""Run zero-shot prediction with one model on one test set and save the results.

	Reads `../../datasets/{df_test_name}.jsonl` (one JSON object per line with the
	keys 'prompt', 'solution0' and 'solution1'), sends one chat-completion request
	per instance through the module-level `client`, and extracts the 'answer'
	value from each reply. When a reply cannot be parsed, the raw reply text is
	stored in place of the label and printed, so the predictions list stays
	aligned with the dataset.

	The results are written to
	`submissions/submission-{model name}-{df_test_name}.json`; the `submissions`
	directory is created if missing so a finished run is never lost at save time.
	Progress and timing information is printed.

	Args:
		df_test_name: Name of the test set (file stem of the .jsonl dataset).
		gpt_model: Model identifier passed to the API, e.g. "provider/model".

	Returns:
		None.
	"""
	# Local import keeps this cell self-contained; `os` is only needed here.
	import os

	df_path = f"../../datasets/{df_test_name}.jsonl"

	# This model gets an extra instruction and a stop sequence (see below).
	needs_strict_json_hint = gpt_model == "anthropic/claude-haiku-4.5"

	# Read the dataset up front so the file handle is closed before the (long)
	# request loop starts; blank lines are skipped instead of crashing json.loads.
	with open(df_path, encoding="utf-8") as dataset_file:
		dataset_lines = [line for line in dataset_file if line.strip()]

	responses = []
	instance_number = 0

	start_time = time.time()
	for line in dataset_lines:
		instance_number += 1
		entry=json.loads(line)

		prompt= f"""
		### Task
			Given the following situation, which option is more likely to be correct?

			Situation: {entry['prompt']}

			Option 0: {entry['solution0']}

			Option 1: {entry['solution1']}
				
		### Output format
			Return a valid JSON dictionary with the following key: 'answer' and a value should be either 0 (if option 0 is more plausible) or 1 (if option 1 is more plausible).
		"""

		extra_request_args = {}
		if needs_strict_json_hint:
			# The original notebook notes that the Anthropic model does not honour
			# JSON output formatting, so the prompt is made stricter and generation
			# is stopped at the first closing brace. The labels are 0/1, matching
			# the task description above.
			prompt+= "Answer ONLY with the JSON dictionary, do NOT provide your reasoning or the explanation. Provide only the dictionary with the integer 0 or 1 as the answer!"
			extra_request_args["stop"] = "}"

		completion = client.chat.completions.create(
			model=gpt_model,
			response_format={"type": "json_object"},
			messages=[{"role": "user", "content": prompt}],
			temperature=0,
			**extra_request_args,
		)

		# `content` can be None (e.g. an empty reply); treat it as empty text.
		initial_response = completion.choices[0].message.content or ""

		# Get out a label
		try:
			predicted = _parse_answer(initial_response)
		except (ValueError, KeyError, TypeError):
			# Keep the raw reply so the problem is visible in the saved results;
			# unlike a bare `except`, this lets KeyboardInterrupt stop the run.
			print("error with extracting a label:")
			print(initial_response)
			predicted = initial_response
		responses.append(predicted)

	elapsed_seconds = time.time() - start_time
	# Guard against an empty dataset to avoid a ZeroDivisionError.
	seconds_per_instance = elapsed_seconds / instance_number if instance_number else 0.0

	print(f"Prediction finished. It took {elapsed_seconds/60} min for {instance_number} instances - {seconds_per_instance} s per instance.")

	# Create a json with results
	current_results = {
		"system": gpt_model,
		"predictions": [
			{
			"train": "NA (zero-shot)",
			"test": "{}".format(df_test_name),
			"predictions": responses,
			}
		]
		}

	# Model ids look like "<provider>/<model>"; only the model part goes into the
	# file name. Ids without a provider prefix are used as they are.
	model_parts = gpt_model.split("/")
	gpt_model_name = model_parts[1] if len(model_parts) > 1 else model_parts[0]

	# Save the results as a new json
	os.makedirs("submissions", exist_ok=True)
	with open("submissions/submission-{}-{}.json".format(gpt_model_name, df_test_name), "w", encoding="utf-8") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format(gpt_model_name, df_test_name))

In [4]:
# NOTE(review): `tests` is already assigned in an earlier cell with the same
# ten names; consider keeping a single definition.
tests = [
	f"piqa-{variant}"
	for variant in ("en", "mk", "bg", "sl", "sr_cyrl", "hr", "sr_latn", "bs", "sl-cer", "hr-ckm")
]

In [5]:
# Start with the models that were also evaluated in the other tasks.
models = [
	"google/gemini-2.5-flash",
	"mistralai/mistral-medium-3.1",
]

In [None]:
# Run every model in `models` on every test set in `tests`. The test-set and
# model names are printed as progress markers before each run.
for test_name in tests:
	print(test_name)
	for model_name in models:
		print(model_name)
		predict_gpt(df_test_name=test_name, gpt_model=model_name)

piqa-en
google/gemini-2.5-flash


In [None]:
# Also evaluate the Anthropic model on every test set in `tests`.
for test_name in tests:
	print(test_name)
	predict_gpt(df_test_name=test_name, gpt_model="anthropic/claude-haiku-4.5")

In [None]:
# Add the Gemini Pro model: run it on every test set in `tests`.
for test_name in tests:
	print(test_name)
	predict_gpt(df_test_name=test_name, gpt_model="google/gemini-2.5-pro")