## 1. Importing Libraries

In [None]:
import sklearn
import pandas as pd
import numpy as np

## 2. Select A Model to Evaluate
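- Set `MODEL` to one of these identifiers, spelled exactly as shown:
    - `gemma_3_NOCONTEXT` — Gemma 3, no in-context learning
    - `gemma_3_CONTEXT` — Gemma 3, with in-context learning
    - `gemma_3_LANGUAGE_TEACHER` — Gemma 3, with the "language teacher" prompt
    - `gemma_3_RULES` — Gemma 3, with the rules prompt
    - `helsinki_AUG` — Helsinki, with the augmented dataset
    - `helsinky_NOAUG` — Helsinki, without the augmented dataset
    - `Llama8b`
    - `Flan_t5_XL`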
- To save the evaluation as a JSON file, set the variable `save_as_json` to `True`.

In [None]:
# Identifier of the model whose translation results are evaluated below.
MODEL: str = "Flan_t5_XL"

# When True, the evaluation is meant to be exported as a JSON file.
save_as_json: bool = True

## 3. Models

In [None]:
# Maps every accepted MODEL identifier to the base name (no extension) of its
# results CSV. Each model is reachable through its short identifier and through
# a longer descriptive alias, so either spelling selects the same file.
_RESULT_FILE_BY_MODEL = {
    # Gemma family
    "gemma_3_NOCONTEXT": "gemma_3_1b_nocontext_res",
    "gemma_3_No-In-Context-Learning": "gemma_3_1b_nocontext_res",
    "gemma_3_CONTEXT": "gemma_3_1b_context_res",
    "gemma_3_With-In-Context-Learning": "gemma_3_1b_context_res",
    "gemma_3_LANGUAGE_TEACHER": "gemma_3_1b_language_teacher_res",
    "gemma_3_With-Prompt-Language_Teacher": "gemma_3_1b_language_teacher_res",
    "gemma_3_RULES": "gemma_3_1b_with_rules_res",
    "gemma_3_With-Prompt-Rules": "gemma_3_1b_with_rules_res",
    # Helsinki family ("helsinky" is kept as a legacy spelling)
    "helsinki_AUG": "Helsinki_res",
    "helsinki_with-Augmented-Dataset": "Helsinki_res",
    "helsinky_NOAUG": "Helsinki_noaug_res",
    "helsinki_NOAUG": "Helsinki_noaug_res",
    "helsinky_With-No-Augmented-Dataset": "Helsinki_noaug_res",
    # Llama
    "Llama8b": "LLama8b_res",
    # Flan T5
    "Flan_t5_XL": "flan_t5_XL_res",
}

# Fail loudly on an unknown model instead of silently building the path
# "./model_translation_result/.csv", which only errors later when it is read.
if MODEL not in _RESULT_FILE_BY_MODEL:
    raise ValueError(
        f"Unknown MODEL {MODEL!r}. Valid values: {sorted(_RESULT_FILE_BY_MODEL)}"
    )

file_name = _RESULT_FILE_BY_MODEL[MODEL]
result_path = f"./model_translation_result/{file_name}.csv"

## 4. Loading Results

In [None]:
# The result files are semicolon-separated rather than comma-separated.
df = pd.read_csv(
    result_path,
    delimiter=";",
)

In [None]:
# Pull the three rating columns out of the results frame as NumPy arrays.
gpt_score, user_score, prometheus_score = (
    df[column].to_numpy()
    for column in ("GPT_score", "user_score", "prometheus_score")
)

print(gpt_score)

[1 4 5 2 2 4 2 3 4 3]


## 5. Correlation between scores

In [None]:
print("--------Compute Mean for each score--------")

# Labels are left-aligned in a 16-character field so the arrows line up.
for label, scores in (
    ("GPT score", gpt_score),
    ("Prometheus score", prometheus_score),
    ("User score", user_score),
):
    print(f"{label:<16} -> {np.mean(scores)}")


--------Compute Mean for each score--------
GPT score        -> 3.0
Prometheus score -> 2.2
User score       -> 1.6


1. GPT score correlation

In [None]:
from scipy.stats import kendalltau, pearsonr, spearmanr

# Correlate the user ratings with the GPT ratings. Only the first element of
# each result (the coefficient) is kept; the second element is discarded.
pearson_corr = pearsonr(user_score, gpt_score)[0]
spearman_corr = spearmanr(user_score, gpt_score)[0]
kendall_corr = kendalltau(user_score, gpt_score)[0]

print(
    f"Pearson:  {pearson_corr:.2f}",
    f"Spearman: {spearman_corr:.2f}",
    f"Kendall:  {kendall_corr:.2f}",
    sep="\n",
)

# NOTE(review): despite its name, this dict holds the user-vs-GPT coefficients
# computed in this cell. The name is kept because the save cell reads it.
Prometheus_score = dict(
    zip(
        ("Pearson", "spearman_corr", "kendall_corr"),
        (pearson_corr, spearman_corr, kendall_corr),
    )
)

Pearson:  0.53
Spearman: 0.54
Kendall:  0.43


2. Prometheus score correlation

In [None]:
from scipy.stats import kendalltau, pearsonr, spearmanr

# Correlate the user ratings with the Prometheus ratings. Only the first element
# of each result (the coefficient) is kept; the second element is discarded.
pearson_corr = pearsonr(user_score, prometheus_score)[0]
spearman_corr = spearmanr(user_score, prometheus_score)[0]
kendall_corr = kendalltau(user_score, prometheus_score)[0]

print(
    f"Pearson:  {pearson_corr:.2f}",
    f"Spearman: {spearman_corr:.2f}",
    f"Kendall:  {kendall_corr:.2f}",
    sep="\n",
)

# NOTE(review): despite its name, this dict holds the user-vs-Prometheus
# coefficients computed in this cell. The name is kept because the save cell reads it.
GPT_scores = dict(
    zip(
        ("Pearson", "spearman_corr", "kendall_corr"),
        (pearson_corr, spearman_corr, kendall_corr),
    )
)

Pearson:  0.64
Spearman: 0.56
Kendall:  0.53


## 6. Save Json

In [None]:
import json
import os

# The correlation cells bind their results to swapped names: `Prometheus_score`
# is assigned from the (user_score, gpt_score) correlations, and `GPT_scores`
# from the (user_score, prometheus_score) correlations. Rebind them here so each
# JSON key is paired with the coefficients of the judge it names.
gpt_correlations = Prometheus_score
prometheus_correlations = GPT_scores

# Only export when the user asked for it in the configuration cell.
if save_as_json:
    # Build the required JSON structure: the raw per-sample records followed by
    # one single-key object per automatic judge.
    output = [
        df.to_dict(orient='records'),
        {'GPT_score': gpt_correlations},
        {'Prometheus_score': prometheus_correlations},
    ]

    path = f"./model_translation_result_json/{file_name}.json"
    # Create the output directory on first use.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Write to file; ensure_ascii=False keeps non-ASCII text readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)