In [1]:
import json
import os
import sys
import re
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Suffixes that distinguish the two experiment types in the result file names.
translation_suffix = "translation"
paraphrase_suffix = "paraphrase"

# Language codes covered by the paraphrase experiment.
languages = ["de", "en", "fa", "id", "ja", "zh", "ar"]

# One translation result file per seed.
translation_files = [
    f"../outputs/llm-as-a-judge/gpt_bias_results_seed_{seed}_{translation_suffix}.jsonl"
    for seed in [42, 123]
]

# One paraphrase result file per (language, seed); the paths of a language are
# listed in seed order.
paraphrase_files = {
    language: [
        f"../outputs/llm-as-a-judge/{language}_gpt_bias_results_seed_{seed}_{paraphrase_suffix}.jsonl"
        for seed in [42, 123]
    ]
    for language in languages
}

In [None]:
# Display the generated path containers so the file names can be checked by eye.
translation_files, paraphrase_files

In [4]:
# Container for the loaded judge outputs: one entry per translation file, and
# for the paraphrase experiment one list of loaded files per language code.
data = {
    "translation": [],
    "paraphrase": {language: [] for language in languages},
}

# NOTE: the paths end in ".jsonl", but every file is read with json.load, i.e.
# as one single JSON document.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default (the judged texts presumably contain non-ASCII
# characters -- verify against the result files).
for file in translation_files:
    with open(file, "r", encoding="utf-8") as f:
        data["translation"].append(json.load(f))

for language in languages:
    for file in paraphrase_files[language]:
        with open(file, "r", encoding="utf-8") as f:
            data["paraphrase"][language].append(json.load(f))

# Check if the data is loaded correctly
# (the first number counts loaded translation files; the second counts the
# language keys of data["paraphrase"], not the files loaded per language)
print(f"Translation data: {len(data['translation'])} entries")
print(f"Paraphrase data: {len(data['paraphrase'])} entries")

Translation data: 2 entries
Paraphrase data: 7 entries


In [None]:
# Pattern for the translation judgements. Every match yields seven groups:
#   1   the capitalised word that precedes " Text" inside a bold (**...**)
#       heading (presumably the language name -- confirm against the raw output)
#   2-7 the integer scores written as "- <criterion>: <n>", in this fixed order:
#       depth of detail, clarity of writing, coherence and logical flow,
#       originality and insight, use of specific examples, accuracy of information
# re.DOTALL lets the lazy ".*?" gaps run across line breaks.
scores_pattern = re.compile(
    r"\*\*([A-Z][a-z]+) Text.*?\*\*.*?- Depth of detail: (\d+).*?- Clarity of writing: (\d+).*?- Coherence and logical flow: (\d+).*?- Originality and insight: (\d+).*?- Use of specific examples: (\d+).*?- Accuracy of information: (\d+)",
    re.DOTALL
)


# Pattern for the paraphrase judgements. Every match yields seven groups:
#   1   the label "Text A" or "Text B", taken from a bold heading of the form
#       "**Text A:**" or "**Text A Evaluations:**"
#   2-7 the integer scores of the numbered criteria "1." to "6." (same criteria
#       and order as above); the criterion name may be wrapped in asterisks
# For criteria 1-5 the score must be followed by " -"; the sixth score has no
# such requirement. re.DOTALL lets the lazy ".*?" gaps run across line breaks.
scores_pattern_paraphrase = re.compile(
    r"\*\*(Text [AB])(?: Evaluations)?:\*\*\s*"
    r"1\. ?\**Depth of detail\**?: (\d+) -.*?"
    r"2\. ?\**Clarity of writing\**?: (\d+) -.*?"
    r"3\. ?\**Coherence and logical flow\**?: (\d+) -.*?"
    r"4\. ?\**Originality and insight\**?: (\d+) -.*?"
    r"5\. ?\**Use of specific examples\**?: (\d+) -.*?"
    r"6\. ?\**Accuracy of information\**?: (\d+)",
    re.DOTALL
)


In [6]:
# Parse every translation run into one record per judged example. A record
# keeps the example's position in the run, the language order stored with it,
# the integer scores of every regex match (the first captured group, the
# heading word, is dropped) and the judge's verdict.
gpt_judge_data_translation = [
    [
        {
            "index": example_position,
            "language-order": example["shuffled_languages"],
            "scores": [
                [int(digits) for digits in groups[1:]]
                for groups in scores_pattern.findall(example["result"])
            ],
            "verdict": example["verdict"],
        }
        for example_position, example in enumerate(judged_run)
    ]
    for judged_run in data["translation"]
]

In [None]:
def _collect_paraphrase_records(judged_examples):
    """Convert the raw judge entries of one run into index/order/scores/verdict records."""
    collected = []
    for position, raw_entry in enumerate(judged_examples):
        order = raw_entry["shuffled_languages"]
        # Keep only the numeric groups of each match; group 0 is the text label.
        score_rows = [
            [int(digits) for digits in groups[1:]]
            for groups in scores_pattern_paraphrase.findall(raw_entry["result"])
        ]
        collected.append(
            {"index": position, "language-order": order, "scores": score_rows, "verdict": raw_entry["verdict"]}
        )
    return collected


# For every language: one list of parsed records per loaded paraphrase run.
gpt_judge_data_paraphrase = {
    lang_code: [_collect_paraphrase_records(judged_run) for judged_run in data["paraphrase"][lang_code]]
    for lang_code in languages
}

In [16]:
# Number of parsed records in the first English paraphrase run.
len(gpt_judge_data_paraphrase['en'][0])

100

In [17]:
from collections import Counter

In [18]:
from statistics import mean, variance, stdev

# Share of each verdict (in percent) within every individual translation run.
run_results = []
for judged_run in gpt_judge_data_translation:
    tally = Counter(record['verdict'] for record in judged_run)
    judged_total = sum(tally.values())
    run_results.append({label: hits / judged_total * 100 for label, hits in tally.items()})

# Verdict shares over all runs pooled together; the key order of this dict
# fixes the order of the summary dictionaries below.
all_verdicts = [record['verdict'] for judged_run in gpt_judge_data_translation for record in judged_run]
verdict_counts_all = Counter(all_verdicts)
total_count_all = sum(verdict_counts_all.values())
verdict_percentages_all = {label: hits / total_count_all * 100 for label, hits in verdict_counts_all.items()}

# Per-verdict series of run percentages (0 when a verdict never occurs in a
# run), summarised by mean, sample variance and sample standard deviation.
share_series = {
    label: [shares.get(label, 0) for shares in run_results]
    for label in verdict_percentages_all
}
averages = {label: mean(series) for label, series in share_series.items()}
variances = {label: variance(series) for label, series in share_series.items()}
std_devs = {label: stdev(series) for label, series in share_series.items()}


In [19]:
# Report the verdict shares of each run, then the across-run summary.
print("Individual Run Results:")
for run_number, shares in enumerate(run_results, start=1):
    print(f"Run {run_number}:")
    for verdict, percentage in shares.items():
        print(f" {verdict}: {percentage:.2f}%")

print("\nOverall Statistics:")
for heading, summary in (
    ("Averages:", averages),
    ("Variances:", variances),
    ("Standard Deviations:", std_devs),
):
    print(heading, summary)

# verdict_counter_first_last = Counter(
#     [entry['verdict'] for element in gpt_judge_data_translation for entry in element if entry['language-order'].index(entry['verdict'].lower()) in [0, len(entry['language-order']) - 1]]
# )
# verdict_counts_translation = Counter([entry['verdict'] for element in gpt_judge_data_translation for entry in element])
# # Calculate the percentage for each language
# total_count_translation = sum(verdict_counts_translation.values())
# verdict_percentages_translation = {k: v / total_count_translation * 100 for k, v in verdict_counts_translation.items()}
# verdict_percentages_first_last = {k: v / total_count_translation * 100 for k, v in verdict_counter_first_last.items()}
# print("Translation Verdict Percentages:")
# print("Language: Percentage - Count_first_last")
# for verdict, percentage in verdict_percentages_translation.items():
#     print(f"{verdict}: {percentage:.2f}% - {verdict_percentages_first_last.get(verdict, 0):.2f}%")

Individual Run Results:
Run 1:
 EN: 23.00%
 JA: 17.00%
 FA: 2.00%
 AR: 19.00%
 ZH: 8.00%
 DE: 28.00%
 ID: 3.00%
Run 2:
 EN: 16.00%
 JA: 12.00%
 DE: 33.00%
 AR: 27.00%
 ID: 4.00%
 ZH: 6.00%
 FA: 2.00%

Overall Statistics:
Averages: {'EN': 19.5, 'JA': 14.5, 'FA': 2.0, 'AR': 23.0, 'ZH': 7.0, 'DE': 30.5, 'ID': 3.5}
Variances: {'EN': 24.5, 'JA': 12.5, 'FA': 0.0, 'AR': 32.0, 'ZH': 2.0, 'DE': 12.499999999999982, 'ID': 0.5}
Standard Deviations: {'EN': 4.949747468305833, 'JA': 3.5355339059327378, 'FA': 0.0, 'AR': 5.656854249492381, 'ZH': 1.4142135623730951, 'DE': 3.535533905932735, 'ID': 0.7071067811865476}


In [21]:
# Calculate counts for individual runs.
# Every available run is tallied (no hard-coded [0] / [1] indices), so adding
# a seed does not silently drop data.
individual_run_counts_paraphrase = {
    language: [
        Counter(entry['verdict'] for entry in run)
        for run in gpt_judge_data_paraphrase[language]
    ]
    for language in languages
}

# Calculate statistics for each language: mean, sample variance and sample
# standard deviation of the per-run verdict percentages.
# statistics.variance / statistics.stdev need at least two runs per language.
# The summary dicts are stored directly in statistics_paraphrase; the global
# names averages / variances / std_devs are left alone because the translation
# cells define and print them.
statistics_paraphrase = {}
for language in languages:
    run_percentages = []
    for counts in individual_run_counts_paraphrase[language]:
        run_total = sum(counts.values())
        run_percentages.append({k: v / run_total * 100 for k, v in counts.items()})

    # Sorted so that the key order (and therefore the printed output) is the
    # same in every kernel session; plain set iteration order is not.
    verdict_labels = sorted(set().union(*run_percentages))
    # A verdict that never occurs in a run counts as 0 % for that run.
    share_series = {k: [shares.get(k, 0) for shares in run_percentages] for k in verdict_labels}

    statistics_paraphrase[language] = {
        "averages": {k: mean(series) for k, series in share_series.items()},
        "variances": {k: variance(series) for k, series in share_series.items()},
        "std_devs": {k: stdev(series) for k, series in share_series.items()},
    }

print("Individual Run Results:")
for language in languages:
    print(f"Language: {language}")
    for i, run in enumerate(individual_run_counts_paraphrase[language]):
        print(f" Run {i + 1}:")
        for verdict, count in run.items():
            print(f"  {verdict}: {count}")

# Print statistics
for language, stats in statistics_paraphrase.items():
    print(f"Language: {language}")
    print(" Averages:", stats["averages"])
    print(" Variances:", stats["variances"])
    print(" Standard Deviations:", stats["std_devs"])

Individual Run Results:
Language: de
 Run 1:
  perturbed_text: 52
  TIE: 17
  natural_text: 30
  Model Failure: 1
 Run 2:
  perturbed_text: 43
  natural_text: 27
  TIE: 29
  Model Failure: 1
Language: en
 Run 1:
  perturbed_text: 82
  natural_text: 17
  TIE: 1
 Run 2:
  perturbed_text: 69
  natural_text: 31
Language: fa
 Run 1:
  natural_text: 23
  TIE: 40
  perturbed_text: 37
 Run 2:
  perturbed_text: 42
  TIE: 41
  natural_text: 17
Language: id
 Run 1:
  TIE: 19
  natural_text: 36
  perturbed_text: 45
 Run 2:
  TIE: 24
  perturbed_text: 51
  natural_text: 25
Language: ja
 Run 1:
  TIE: 33
  natural_text: 28
  perturbed_text: 39
 Run 2:
  natural_text: 29
  TIE: 42
  perturbed_text: 29
Language: zh
 Run 1:
  perturbed_text: 31
  TIE: 43
  natural_text: 26
 Run 2:
  perturbed_text: 29
  TIE: 45
  natural_text: 26
Language: ar
 Run 1:
  natural_text: 35
  TIE: 28
  perturbed_text: 37
 Run 2:
  perturbed_text: 44
  natural_text: 26
  TIE: 30
Language: de
 Averages: {'TIE': 23.0, 'perturb

Overall Statistics:
Averages: {'EN': 19.5, 'JA': 14.5, 'FA': 2.0, 'AR': 23.0, 'ZH': 7.0, 'DE': 30.5, 'ID': 3.5}
Variances: {'EN': 24.5, 'JA': 12.5, 'FA': 0.0, 'AR': 32.0, 'ZH': 2.0, 'DE': 12.499999999999982, 'ID': 0.5}
Standard Deviations: {'EN': 4.949747468305833, 'JA': 3.5355339059327378, 'FA': 0.0, 'AR': 5.656854249492381, 'ZH': 1.4142135623730951, 'DE': 3.535533905932735, 'ID': 0.7071067811865476}

### Translation Verdict Percentages
| Language | Percentage (%) | Count_first_last (%) |
|----------|:-------------:|:--------------------:|
| EN       | 19.50 ± 4.95  | 6.50         |
| JA       | 14.50 ± 3.54  | 13.00        |
| FA       | 2.00 ± 0.00   | 2.00         |
| AR       | 23.00 ± 5.66  | 13.00        |
| ZH       | 7.00 ± 1.41   | 6.50         |
| DE       | 30.50 ± 3.54  | 11.00        |
| ID       | 3.50 ± 0.71   | 3.50         |


### Paraphrase Verdict Percentages

| Language | perturbed_text (%) | natural_text (%) | TIE (%) | Model Failure (%) |
|----------|:-----------------:|:---------------:|:-------:|:-----------------:|
| zh       | 30.00             | 26.00           | 44.00   |                   |
| fa       | 39.50             | 20.00           | 40.50   |                   |
| ja       | 34.00             | 28.50           | 37.50   |                   |
| ar       | 40.50             | 30.50           | 29.00   |                   |
| de       | 47.50             | 28.50           | 23.00   | 1.00              |
| id       | 48.00             | 30.50           | 21.50   |                   |
| en       | 75.50             | 24.00           | 0.50    |                   |

### Individual Runs Paraphrase:

### Individual Run Results:

| Language | Run | Perturbed Text (%) | Natural Text (%) | TIE (%) | Model Failure (%) |
|----------|-----|--------------------|------------------|---------|-------------------|
| de       | 1   | 52                 | 30               | 17      | 1                 |
| de       | 2   | 43                 | 27               | 29      | 1                 |
| en       | 1   | 82                 | 17               | 1       |                   |
| en       | 2   | 69                 | 31               | 0       |                   |
| fa       | 1   | 37                 | 23               | 40      |                   |
| fa       | 2   | 42                 | 17               | 41      |                   |
| id       | 1   | 45                 | 36               | 19      |                   |
| id       | 2   | 51                 | 25               | 24      |                   |
| ja       | 1   | 39                 | 28               | 33      |                   |
| ja       | 2   | 29                 | 29               | 42      |                   |
| zh       | 1   | 31                 | 26               | 43      |                   |
| zh       | 2   | 29                 | 26               | 45      |                   |
| ar       | 1   | 37                 | 35               | 28      |                   |
| ar       | 2   | 44                 | 26               | 30      |                   |


### Paraphrase Statistics by Language
| Language | TIE (%)          | Perturbed Text (%)   | Natural Text (%)     | Model Failure (%) |
|----------|------------------|---------------------|---------------------|-------------------|
| de       | 23.0 ± 8.49      | 47.5 ± 6.36         | 28.5 ± 2.12         | 1.0               |
| en       | 0.5 ± 0.71       | 75.5 ± 9.19         | 24.0 ± 9.90         |                   |
| fa       | 40.5 ± 0.71      | 39.5 ± 3.54         | 20.0 ± 4.24         |                   |
| id       | 21.5 ± 3.54      | 48.0 ± 4.24         | 30.5 ± 7.78         |                   |
| ja       | 37.5 ± 6.36      | 34.0 ± 7.07         | 28.5 ± 0.71         |                   |
| zh       | 44.0 ± 1.41      | 30.0 ± 1.41         | 26.0 ± 0.0          |                   |
| ar       | 29.0 ± 1.41      | 40.5 ± 4.95         | 30.5 ± 6.36         |                   |


## Observations
From the translation results, we see that Persian, Chinese, and Indonesian are judged best least often (i.e., they receive the lowest verdict percentages), even though the texts in all languages are translations of the original English texts. Previous studies [1-5] have reported biases in LLM-as-a-judge methods. To obtain judgements that are as unbiased as possible, we randomized the order in which the texts are presented to the judge, and we ran the experiment twice with different seeds on 100 examples. Notice that there are no TIEs in the translation results, which indicates high confidence in choosing the language of the winning text. German, English, and Arabic seem to be the ones favored most by GPT-4o-mini.

Studies also indicate [4] that LLMs can favor texts generated by a variant of the judging model. We used GPT-3.5-turbo to translate from English into all other languages, and GPT-4o-mini for the judgements. The fact that Arabic was chosen more often than other international languages such as Persian (which is written in a closely related script but received lower scores) may be attributed to intricacies of the translation that made the text easier for GPT-4o-mini to judge. To take a closer look at this type of bias and understand it better, we also ran experiments in which the judge compares two versions of the same text. Here we want to make sure that both texts being judged have the same meaning and similar lengths. For this purpose, the second table reports the results of judging two texts that are the same except that one of them is a paraphrase of the other.

In the paraphrase experiments, we interpret a larger number of TIEs as a sign that the model is less confident that one text is better than the other. Comparing the paraphrase results with the translation results, we notice that the languages that received lower scores in the translation table are at the top of the paraphrase tables in terms of TIE percentage, except for Indonesian. Indonesian uses the same alphabet as English, which might indicate a bias called token bias, as described in [1]. Another observation is that the perturbed text is favored more often by the LLM judge. This is consistent with studies indicating that LLMs favor texts generated by themselves or by a variant model.


References

[1] Zheng, C., Zhou, H., Meng, F., Zhou, J., & Huang, M. Large language models are not robust multiple choice selectors, 2024. URL https://arxiv.org/abs/2309.03882. (token bias)

[2] Koo, R., Lee, M., Raheja, V., Park, J. I., Kim, Z. M., & Kang, D. (2023). Benchmarking cognitive biases in large language models as evaluators. arXiv preprint arXiv:2309.17012. (benchmark biases)

[3] https://www.simonpcouch.com/blog/2025-01-30-llm-biases/ (blog)

[4] Ye, J., Wang, Y., Huang, Y., Chen, D., Zhang, Q., Moniz, N., ... & Zhang, X. (2024). Justice or prejudice? quantifying biases in llm-as-a-judge. arXiv preprint arXiv:2410.02736. (biases including LLM favoring own generations)

[5] Pezeshkpour, P., & Hruschka, E. (2023). Large language models sensitivity to the order of options in multiple-choice questions. arXiv preprint arXiv:2308.11483. (position bias)