In [None]:
%pip install openai

In [2]:
import json
import os
import time
from time import sleep

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
# Read the key from the local "API_key" file. The context manager closes the
# file handle, and strip() drops a trailing newline so it is not sent as part
# of the key.
with open('API_key') as key_file:
	client = OpenAI(api_key=key_file.read().strip())

In [3]:
# Read the two ParlaSent test sets; each file is JSON Lines (one record per line)
test_en = pd.read_json(
	"../../datasets/ParlaSent-EN-test/ParlaSent_EN_test.jsonl",
	lines=True,
)
test_bcs = pd.read_json(
	"../../datasets/ParlaSent-BSC-test/ParlaSent_BCS_test.jsonl",
	lines=True,
)

# Sanity check: show (rows, columns) of both frames
print(test_en.shape, test_bcs.shape)

(2600, 14) (2600, 14)


In [4]:
def _build_sentiment_prompt(text, lang, sentiment_description):
	"""Render the zero-shot classification prompt for one speech excerpt.

	The whitespace inside the template is part of the string sent to the
	model, so it is reproduced exactly as in the earlier runs.
	"""
	return f"""
				### Task
					Your task is to classify the provided parliamentary text into a sentiment label, meaning that you need to recognize whether the speaker's sentiment towards the topic is negative, neutral, positive or somewhere in between. You will be provided with an excerpt from a parliamentary speech in {lang} language, delimited by single quotation marks. Always provide a label, even if you are not sure.


				### Output format
					Return a valid JSON dictionary with the following key: 'sentiment' and a value should be an integer which represents one of the labels according to the following dictionary: {sentiment_description}.

					Text: '{text}'
			"""


def _extract_label(raw_response, labels_dict, fallback):
	"""Turn the raw reply of the model into a label name.

	Returns `fallback` (and prints a notice) when the reply is missing, is not
	valid JSON, has no 'sentiment' key, or carries a value that is not a key
	of `labels_dict`.
	"""
	try:
		cleaned = raw_response.replace("\n", "").replace("\t", "")
		# Convert the string into a dictionary and look the label up
		return labels_dict[json.loads(cleaned)["sentiment"]]
	# AttributeError: reply is None; ValueError: invalid JSON;
	# KeyError/TypeError: unexpected structure or unknown/unhashable value
	except (AttributeError, ValueError, KeyError, TypeError):
		print("error with extracting a label")
		return fallback


def predict_gpt(df_test_name, gpt_model):
	"""Run zero-shot sentiment classification with a GPT model and save the results.

	Every row of the selected test set is sent to the chat completions API
	(one request per row, via the module-level `client`). The predicted label
	names are written to
	"submissions/submission-<gpt_model>-<df_test_name>.json".

	Args:
		df_test_name: "ParlaSent-EN-test" or "ParlaSent-BCS-test"; selects the
			module-level frame `test_en` or `test_bcs`. The frame's "text" and
			"lang" columns are read.
		gpt_model: model name passed to the API. Names containing "gpt-5" are
			called without the "temperature" parameter; all others use
			temperature 0.

	Raises:
		KeyError: if `df_test_name` is not one of the two known test sets.

	Replies from which no label can be extracted are recorded as "Mix"
	instead of aborting the run.
	"""
	dfs = {
		"ParlaSent-EN-test": test_en,
		"ParlaSent-BCS-test": test_bcs
	}

	df = dfs[df_test_name]

	texts = df["text"].to_list()
	langs = df["lang"].to_list()

	labels_dict = {0: "Negative", 1: "Neutral", 2: "Positive"}

	sentiment_description = {
		"Negative - text that is entirely or predominantly negative": 0,
		"Neutral - text that only contains non-sentiment-related statements": 1,
		"Positive - text that is entirely or predominantly positive": 2
	}

	# Placeholder stored when no label can be extracted from a reply
	fallback_label = "Mix"

	request_options = {
		"model": gpt_model,
		"response_format": {"type": "json_object"},
	}
	# the "v5" models do not have the "temperature" parameter
	if "gpt-5" not in gpt_model:
		request_options["temperature"] = 0

	# Make sure the output directory exists before any API call is made, so a
	# long run cannot fail at the very end when saving.
	os.makedirs("submissions", exist_ok=True)

	responses = []
	start_time = time.time()

	progress = tqdm(zip(texts, langs), total=len(texts), desc=f"{gpt_model} on {df_test_name}")
	for text, lang in progress:
		completion = client.chat.completions.create(
			messages=[
				{
					"role": "user",
					"content": _build_sentiment_prompt(text, lang, sentiment_description),
				}
			],
			**request_options,
		)
		reply = completion.choices[0].message.content
		responses.append(_extract_label(reply, labels_dict, fallback_label))

	elapsed_s = time.time() - start_time
	n_instances = df.shape[0]
	# Guard against an empty test set when reporting the per-instance time
	s_per_instance = elapsed_s / n_instances if n_instances else 0.0

	print(f"Prediction finished. It took {elapsed_s/60} min for {n_instances} instances - {s_per_instance} s per instance.")

	# Create a json with results
	current_results = {
		"system": gpt_model,
		"predictions": [
			{
			"train": "NA (zero-shot)",
			"test": "{}".format(df_test_name),
			"predictions": responses,
			}
		]
		}

	# Save the results as a new json
	with open("submissions/submission-{}-{}.json".format(gpt_model, df_test_name), "w") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format(gpt_model, df_test_name))

In [None]:
# Evaluate every (test set, model) combination; the test set varies slowest
for test, model in [
	(test_name, model_name)
	for test_name in ["ParlaSent-EN-test", "ParlaSent-BCS-test"]
	for model_name in ["gpt-4o-2024-08-06", "gpt-3.5-turbo-0125", "gpt-4o-mini-2024-07-18"]
]:
	print(model)
	predict_gpt(test, model)

gpt-4o-2024-08-06
Prediction finished. It took 27.62819653749466 min for 2600 instances - 0.6375737662498767 s per instance.
Classification with gpt-4o-2024-08-06 on ParlaSent-EN-test finished.
gpt-3.5-turbo-0125
Prediction finished. It took 27.254899839560192 min for 2600 instances - 0.6289592270667737 s per instance.
Classification with gpt-3.5-turbo-0125 on ParlaSent-EN-test finished.
gpt-4o-mini-2024-07-18
Prediction finished. It took 35.473263736565904 min for 2600 instances - 0.8186137785361364 s per instance.
Classification with gpt-4o-mini-2024-07-18 on ParlaSent-EN-test finished.
gpt-4o-2024-08-06
Prediction finished. It took 28.581721182664236 min for 2600 instances - 0.6595781811384054 s per instance.
Classification with gpt-4o-2024-08-06 on ParlaSent-BCS-test finished.
gpt-3.5-turbo-0125
Prediction finished. It took 28.31333527962367 min for 2600 instances - 0.6533846602990077 s per instance.
Classification with gpt-3.5-turbo-0125 on ParlaSent-BCS-test finished.
gpt-4o-mini

In [4]:
# Single run: gpt-5 on the English test set
test = "ParlaSent-EN-test"
model = "gpt-5"
print(model)
predict_gpt(test, model)

gpt-5
Prediction finished. It took 136.49830847581228 min for 2600 instances - 3.149960964826437 s per instance.
Classification with gpt-5 on ParlaSent-EN-test finished.


In [5]:
# Evaluate the new v5 models on the BCS test set, one model after another
test = "ParlaSent-BCS-test"
for model in ["gpt-5-nano-2025-08-07", "gpt-5-mini-2025-08-07","gpt-5"]:
	print(model)
	predict_gpt(test, model)

gpt-5-nano-2025-08-07
Prediction finished. It took 139.0791892608007 min for 2600 instances - 3.2095197521723233 s per instance.
Classification with gpt-5-nano-2025-08-07 on ParlaSent-BCS-test finished.
gpt-5-mini-2025-08-07


PermissionDeniedError: Error code: 403

In [5]:
# Evaluate the new v5 models on the BCS test set (mini and full model only)
test = "ParlaSent-BCS-test"
for model in ["gpt-5-mini-2025-08-07","gpt-5"]:
	print(model)
	predict_gpt(test, model)

gpt-5-mini-2025-08-07
Prediction finished. It took 158.3352254152298 min for 2600 instances - 3.6538898172745338 s per instance.
Classification with gpt-5-mini-2025-08-07 on ParlaSent-BCS-test finished.
gpt-5
Prediction finished. It took 143.04790797630946 min for 2600 instances - 3.3011055686840645 s per instance.
Classification with gpt-5 on ParlaSent-BCS-test finished.
