In [1]:
import pandas as pd
import json
import numpy as np
from time import sleep
import time
from tqdm import tqdm
import requests
from pydantic import BaseModel
# Read the generation endpoint URL from a local text file.
# The context manager closes the file handle promptly, and strip() removes
# surrounding whitespace (such as a trailing newline) that must not be part of the URL.
with open("local_models_path.txt", "r") as url_file:
	url = url_file.read().strip()

In [2]:
# Read each COPA test split (JSON Lines) into its own DataFrame
(
	test_en,
	test_sl,
	test_hr_ckm,
	test_hr,
	test_mk,
	test_sl_cer,
	test_sr,
	test_sr_tor,
) = (
	pd.read_json(jsonl_path, lines=True)
	for jsonl_path in (
		"../../datasets/copa-en-test.jsonl",
		"../../datasets/copa-sl-test.jsonl",
		"../../datasets/copa-hr-ckm-test.jsonl",
		"../../datasets/copa-hr-test.jsonl",
		"../../datasets/copa-mk-test.jsonl",
		"../../datasets/copa-sl-cer-test.jsonl",
		"../../datasets/copa-sr-test.jsonl",
		"../../datasets/copa-sr-tor-test.jsonl",
	)
)

# Print the (rows, columns) shape of every split on one line
print(*(
	frame.shape
	for frame in (test_en, test_sl, test_hr, test_hr_ckm, test_mk, test_sl_cer, test_sr, test_sr_tor)
))

(500, 5) (500, 5) (500, 7) (500, 5) (500, 7) (500, 5) (500, 7) (500, 5)


In [3]:
# Dataset identifiers; each one maps to "../../datasets/<name>-test.jsonl" in predict_gpt
tests = [
	"copa-en",
	"copa-sl",
	"copa-hr",
	"copa-hr-ckm",
	"copa-mk",
	"copa-sl-cer",
	"copa-sr",
	"copa-sr-tor",
]

In [4]:
def run_local_model(model, prompt, url=url):
	"""Send one prompt to the local generation endpoint and return the raw answer text.

	The request asks for a single, non-streamed completion whose output is
	constrained to a JSON object of the form {"answer": <int>}.

	Args:
		model: model identifier passed to the server in the "model" field.
		prompt: full prompt text.
		url: endpoint URL; the default is the module-level `url` as it was
			when this function was defined.

	Returns:
		The "response" field of the server's JSON reply.

	Raises:
		requests.HTTPError: if the server answers with a 4xx/5xx status.
	"""

	# The (misspelled) class name is kept on purpose: pydantic uses the class name
	# as the "title" of the generated JSON schema, which is sent to the server.
	class ReponseStructure(BaseModel):
		answer: int

	data = {
		"model": model,
		"prompt": prompt,
		"stream": False,
		# The endpoint looks like Ollama's /api/generate, where sampling parameters
		# are read from "options"; a top-level "temperature" field is not part of
		# the request schema, so it previously had no effect.
		"options": {"temperature": 0},
		"format": ReponseStructure.model_json_schema(),
	}

	headers = {"Content-Type": "application/json"}
	response = requests.post(url, json=data, headers=headers)
	# Surface HTTP failures directly instead of as a KeyError on "response" below.
	response.raise_for_status()

	return response.json()["response"]

In [5]:
# Model identifiers that are passed to run_local_model via predict_gpt
models = [
	"gemma3:27b",
	"llama3.3:latest",
	"qwen3:32b",
	"deepseek-r1:14b",
]

In [9]:
# Tail of every prompt. The whitespace (tabs and newlines) reproduces the original
# indented triple-quoted literal exactly, so the prompts sent to the models are unchanged.
_OUTPUT_FORMAT_SUFFIX = (
	"\n\t\t\t\t\t\n"
	"\t\t\t### Output format\n"
	"\t\t\t\tReturn a valid JSON dictionary with the following key: 'answer' and a value should be an integer -- either 1 (if hypothesis 1 is more plausible) or 2 (if hypothesis 2 is more plausible).\n"
	"\t\t\t"
)


def _build_copa_prompt(entry, df_test_name):
	"""Compose the zero-shot prompt for one dataset entry (reads 'premise', 'question', 'choice1', 'choice2')."""
	is_english = df_test_name == "copa-en"

	if is_english:
		language_note = "as is the task itself"
	else:
		language_note = "but the task itself is in another language"

	prompt = 'You will be given a task. The task definition is in English, ' + language_note + '. Here is the task!\nGiven the premise "' + entry['premise'] + '",'

	# NOTE(review): only the non-English "result" variant ends with a space, so the
	# other three variants read "...premise,which hypothesis...". The text is kept
	# as-is so that new predictions stay comparable with earlier runs.
	if entry['question'] == 'cause':
		prompt += ' and that we are looking for the cause of this premise,'
	elif is_english:
		prompt += ' and that we are looking for the result of this premise,'
	else:
		prompt += ' and that we are looking for the result of this premise, '

	prompt += f"which hypothesis is more plausible?\nHypothesis 1: \"{entry['choice1']}\".\nHypothesis 2: \"{entry['choice2']}\"."
	return prompt + _OUTPUT_FORMAT_SUFFIX


def predict_gpt(df_test_name, gpt_model):
	"""Run zero-shot prediction with one model on one test set and save a submission file.

	Every line of "../../datasets/<df_test_name>-test.jsonl" is turned into a
	prompt, sent to the model through run_local_model, and the returned JSON is
	parsed into a label. The labels are written to
	"submissions/submission-<gpt_model>-<df_test_name>.json".

	Args:
		df_test_name: dataset identifier, e.g. "copa-en" or "copa-sl".
		gpt_model: model identifier; the alias "GaMS-27B" is expanded to the
			full model path before the request is sent.

	Returns:
		None. Progress and timing are printed; the results are written to disk.

	Notes:
		If a reply cannot be parsed into a label, the raw reply is stored in
		place of the label and the run continues with the next instance.
	"""
	import os  # local import: only needed to make sure the output directory exists

	df_path = f"../../datasets/{df_test_name}-test.jsonl"

	# The alias does not change between instances, so resolve it once.
	if gpt_model == "GaMS-27B":
		gpt_model_path = "hf.co/mradermacher/GaMS-27B-Instruct-i1-GGUF:i1-Q4_K_M"
	else:
		gpt_model_path = gpt_model

	responses = []

	start_time = time.time()
	# Explicit UTF-8: the datasets contain non-ASCII text and the platform default
	# encoding is not guaranteed to be UTF-8. The `with` block closes the file.
	with open(df_path, encoding="utf-8") as dataset_file:
		for line in dataset_file:
			# Tolerate blank lines (e.g. a trailing newline at the end of the file).
			if not line.strip():
				continue

			entry = json.loads(line)
			prompt = _build_copa_prompt(entry, df_test_name)

			# `url` is the module-level variable, looked up at call time.
			initial_response = run_local_model(gpt_model_path, prompt, url=url)

			try:
				# Convert the string into a dictionary
				response = json.loads(initial_response.replace("\n", "").replace("\t", ""))
				# The true labels are 0 or 1, so the answer (1 or 2) is shifted down by one.
				predicted = response["answer"] - 1
			except (ValueError, KeyError, TypeError, AttributeError):
				# Malformed JSON, missing key, or a non-numeric answer: keep the raw
				# reply so the problem is visible in the submission file.
				print("error with extracting a label:")
				print(initial_response)
				responses.append(initial_response)
			else:
				responses.append(predicted)

	elapsed_time_s = time.time() - start_time
	n_instances = len(responses)
	# Guard against an empty dataset file.
	s_per_instance = elapsed_time_s / n_instances if n_instances else 0.0

	print(f"Prediction finished. It took {elapsed_time_s/60} min for {n_instances} instances - {s_per_instance} s per instance.")

	# Create a json with results
	current_results = {
		"system": gpt_model,
		"predictions": [
			{
				"train": "NA (zero-shot)",
				"test": "{}".format(df_test_name),
				"predictions": responses,
			},
		],
	}

	# Save the results as a new json
	os.makedirs("submissions", exist_ok=True)
	with open("submissions/submission-{}-{}.json".format(gpt_model, df_test_name), "w") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format(gpt_model, df_test_name))

In [None]:
# Sanity check: run every model in `models` on the English test set first
for model in models:
	print(model)
	predict_gpt(df_test_name="copa-en", gpt_model=model)

In [17]:
# Dataset identifiers (this cell assigns `tests` again with the full list)
tests = [
	"copa-en",
	"copa-sl",
	"copa-hr",
	"copa-hr-ckm",
	"copa-mk",
	"copa-sl-cer",
	"copa-sr",
	"copa-sr-tor",
]

In [18]:
# Slovenian test set, for the three models listed explicitly below
for model in ("gemma3:27b", "llama3.3:latest", "qwen3:32b"):
	print(model)
	predict_gpt(df_test_name="copa-sl", gpt_model=model)

gemma3:27b
Prediction finished. It took 5.792300760746002 min for 500 instances - 0.6950760912895203 s per instance.
Classification with gemma3:27b on copa-sl finished.
llama3.3:latest
Prediction finished. It took 5.281671710809072 min for 500 instances - 0.6338006052970886 s per instance.
Classification with llama3.3:latest on copa-sl finished.
qwen3:32b
Prediction finished. It took 3.2755377888679504 min for 500 instances - 0.39306453466415403 s per instance.
Classification with qwen3:32b on copa-sl finished.


In [None]:
# Every dataset in `tests` after the first entry, for the three models listed below
for test in tests[1:]:
	for model in ("gemma3:27b", "llama3.3:latest", "qwen3:32b"):
		print(model)
		predict_gpt(df_test_name=test, gpt_model=model)


gemma3:27b
Prediction finished. It took 5.592840758959452 min for 500 instances - 0.6711408910751343 s per instance.
Classification with gemma3:27b on copa-hr finished.
llama3.3:latest
Prediction finished. It took 19.373022063573202 min for 500 instances - 2.324762647628784 s per instance.
Classification with llama3.3:latest on copa-hr finished.
qwen3:32b
Prediction finished. It took 3.1452640771865843 min for 500 instances - 0.3774316892623901 s per instance.
Classification with qwen3:32b on copa-hr finished.
gemma3:27b
Prediction finished. It took 5.484848415851593 min for 500 instances - 0.6581818099021912 s per instance.
Classification with gemma3:27b on copa-hr-ckm finished.
llama3.3:latest
Prediction finished. It took 5.325845523675283 min for 500 instances - 0.639101462841034 s per instance.
Classification with llama3.3:latest on copa-hr-ckm finished.
qwen3:32b
Prediction finished. It took 3.115610110759735 min for 500 instances - 0.37387321329116824 s per instance.
Classificati

In [14]:
# DeepSeek: run on every dataset in `tests`
for test in tests:
	predict_gpt(df_test_name=test, gpt_model="deepseek-r1:14b")

Prediction finished. It took 4.298616341749827 min for 500 instances - 0.5158339610099792 s per instance.
Classification with deepseek-r1:14b on copa-en finished.
Prediction finished. It took 4.2313922842343645 min for 500 instances - 0.5077670741081238 s per instance.
Classification with deepseek-r1:14b on copa-hr finished.
Prediction finished. It took 4.1268409808476765 min for 500 instances - 0.49522091770172116 s per instance.
Classification with deepseek-r1:14b on copa-hr-ckm finished.
Prediction finished. It took 4.145345358053843 min for 500 instances - 0.4974414429664612 s per instance.
Classification with deepseek-r1:14b on copa-mk finished.
Prediction finished. It took 4.042402517795563 min for 500 instances - 0.48508830213546755 s per instance.
Classification with deepseek-r1:14b on copa-sl-cer finished.
Prediction finished. It took 4.092441606521606 min for 500 instances - 0.4910929927825928 s per instance.
Classification with deepseek-r1:14b on copa-sr finished.
Prediction

In [11]:
# Display the current contents of `tests`
tests

['copa-en',
 'copa-sl',
 'copa-hr',
 'copa-hr-ckm',
 'copa-mk',
 'copa-sl-cer',
 'copa-sr',
 'copa-sr-tor']

In [12]:
# DeepSeek on the Slovenian test set only
predict_gpt(df_test_name="copa-sl", gpt_model="deepseek-r1:14b")

Prediction finished. It took 4.256381364663442 min for 500 instances - 0.510765763759613 s per instance.
Classification with deepseek-r1:14b on copa-sl finished.


In [7]:
import requests


# Smoke test: send one free-form prompt to the GaMS model and show the raw reply.
# NOTE: this cell rebinds the module-level `url` (as well as `data`, `headers`,
# `response` and `result`); predict_gpt reads `url` at call time, so calls made
# after this cell use the address assigned here.
url = 'http://kt-gpu5.ijs.si:11435/api/generate'
headers = {"Content-Type": "application/json"}
data = {
	"model": "hf.co/mradermacher/GaMS-27B-Instruct-i1-GGUF:i1-Q4_K_M",
	"prompt": "Explain what machine learning is.",
	"stream": False,
}

response = requests.post(url, headers=headers, json=data)
result = response.json()

result

{'model': 'hf.co/mradermacher/GaMS-27B-Instruct-i1-GGUF:i1-Q4_K_M',
 'created_at': '2025-10-22T12:37:29.447197783Z',
 'response': 'Machine learning is a subset of artificial intelligence that involves training algorithms to learn patterns in data and make predictions or decisions based on new, unseen data.',
 'done': True,
 'done_reason': 'stop',
 'context': [106,
  1645,
  108,
  74198,
  1212,
  6479,
  6044,
  603,
  235265,
  107,
  108,
  106,
  2516,
  108,
  24911,
  6044,
  603,
  476,
  38397,
  576,
  18225,
  17273,
  674,
  18348,
  4770,
  28514,
  577,
  3918,
  12136,
  575,
  1423,
  578,
  1501,
  32794,
  689,
  12013,
  3482,
  611,
  888,
  235269,
  76926,
  1423,
  235265],
 'total_duration': 6886242363,
 'load_duration': 6099193403,
 'prompt_eval_count': 15,
 'prompt_eval_duration': 156567271,
 'eval_count': 30,
 'eval_duration': 629435991}

In [10]:
# GaMS: run on every dataset in `tests` (predict_gpt expands the "GaMS-27B" alias)
for test in tests:
	predict_gpt(df_test_name=test, gpt_model="GaMS-27B")

Prediction finished. It took 2.9194748163223267 min for 500 instances - 0.3503369779586792 s per instance.
Classification with GaMS-27B on copa-en finished.
Prediction finished. It took 2.9133307933807373 min for 500 instances - 0.34959969520568845 s per instance.
Classification with GaMS-27B on copa-sl finished.
Prediction finished. It took 2.9452760418256125 min for 500 instances - 0.3534331250190735 s per instance.
Classification with GaMS-27B on copa-hr finished.
Prediction finished. It took 2.948777413368225 min for 500 instances - 0.353853289604187 s per instance.
Classification with GaMS-27B on copa-hr-ckm finished.
Prediction finished. It took 2.9747700611750285 min for 500 instances - 0.35697240734100344 s per instance.
Classification with GaMS-27B on copa-mk finished.
Prediction finished. It took 2.9812559445699054 min for 500 instances - 0.35775071334838865 s per instance.
Classification with GaMS-27B on copa-sl-cer finished.
Prediction finished. It took 2.9419235825538634 m