In [1]:
import pandas as pd

# Loads the Excel file
file_path = "TranslationLog_Current.xlsx"
excel_file = pd.ExcelFile(file_path)

# Preview sheet names to see what we're working with
print("Available sheets:", excel_file.sheet_names)

Available sheets: ['EN-DE', 'EN-SL', 'PromptTemp']


In [2]:
# Load the 'EN-DE' sheet into a DataFrame
sheet_name = "EN-DE"
df_de = pd.read_excel(file_path, sheet_name=sheet_name)

# Show column names to understand the structure
print("Column names:")
print(df_de.columns.tolist())

# Show a preview of the first 5 rows
df_de.head()

Column names:
['Unnamed: 0', 'Source sentence', 'Label', 'Human translation', 'GPT-4o Type (Explicit) 1', 'GPT-4o Prompt Text 1', 'GPT-4o Output 1', 'GPT-4o BLEU 1', 'GPT-4o Human Eval 1', 'GPT-4o Type (Implicit) 2', 'GPT-4o Prompt Text 2', 'GPT-4o Output 2', 'GPT-4o BLEU 2', 'GPT-4o Human Eval 2', 'DS Type (Explicit) 1', 'DS Prompt Text 1', 'DS Output 1', 'DS BLEU 1', 'DS Human eval 1', 'DS Type (Implicit) 2', 'DS Prompt Text 2', 'DS Output 2', 'DS BLEU 2', 'DS Human eval 2', 'Gemma Type (Explicit) 1', 'Gemma Prompt Text 1', 'Gemma Output 1', 'Gemma BLEU 1', 'Gemma Human eval 1', 'Gemma Type (Implicit) 2', 'Gemma Prompt Text 2', 'Gemma Output 2', 'Gemma BLEU 2', 'Gemma Human eval 2', 'Llama Type (Explicit) 1', 'Llama Prompt Text 1', 'Llama Output 1', 'Llama BLEU 1', 'Llama Human eval 1', 'Llama Type (Implicit) 2', 'Llama Prompt Text 2', 'Llama Output 2', 'Llama BLEU 2', 'Llama Human eval 2', 'Unnamed: 44']


Unnamed: 0.1,Unnamed: 0,Source sentence,Label,Human translation,GPT-4o Type (Explicit) 1,GPT-4o Prompt Text 1,GPT-4o Output 1,GPT-4o BLEU 1,GPT-4o Human Eval 1,GPT-4o Type (Implicit) 2,...,Llama Prompt Text 1,Llama Output 1,Llama BLEU 1,Llama Human eval 1,Llama Type (Implicit) 2,Llama Prompt Text 2,Llama Output 2,Llama BLEU 2,Llama Human eval 2,Unnamed: 44
0,1,"The form contains errors, please correct the f...",Formal,"Das Formular enthält Fehler, bitte korrigieren...",Explicit A,"You are a certified translator, specializing i...","Das Formular enthält Fehler, bitte korrigieren...",,,Implicit A,...,"You are a certified translator, specializing i...",,,,Implicit A,You are a certified translator specializing in...,,,,
1,2,It is discreteely personal though it has inevi...,Formal,"Sie ist eigenständig persönlich, obwohl sie un...",Explicit A,"You are a certified translator, specializing i...","Es ist dezent persönlich, hat jedoch unvermeid...",,,Implicit A,...,"You are a certified translator, specializing i...",,,,Implicit A,You are a certified translator specializing in...,,,,
2,3,The details of your order and the terms and co...,Formal,Die Details Ihrer Bestellung und die AGB werde...,Explicit A,"You are a certified translator, specializing i...",Die Details Ihrer Bestellung sowie die Allgeme...,,,Implicit A,...,"You are a certified translator, specializing i...",,,,Implicit A,You are a certified translator specializing in...,,,,
3,4,"Create a few posts that are well-researched, c...",Formal,"Erstellen Sie einige Beiträge, die gut recherc...",Explicit A,"You are a certified translator, specializing i...","Erstellen Sie ein paar Beiträge, die gut reche...",,,Implicit A,...,"You are a certified translator, specializing i...",,,,Implicit A,You are a certified translator specializing in...,,,,DeepSeek gave 4 different contexts to choose f...
4,5,We assist you in creating a suitable design in...,Formal,Wir helfen Ihnen ein passendes Layout zu entwi...,Explicit A,"You are a certified translator, specializing i...",Wir unterstützen Sie bei der Erstellung eines ...,,,Implicit A,...,"You are a certified translator, specializing i...",,,,Implicit A,You are a certified translator specializing in...,,,,


In [3]:
import sacrebleu

# Reference translations in German
references = df_de["Human translation"].astype(str).tolist()

# Defining the model outputs to evaluate
model_outputs = {
    "GPT-4o Output 1": df_de["GPT-4o Output 1"].astype(str).tolist(),
    "GPT-4o Output 2": df_de["GPT-4o Output 2"].astype(str).tolist(),
    "DS Output 1": df_de["DS Output 1"].astype(str).tolist(),
    "DS Output 2": df_de["DS Output 2"].astype(str).tolist(),
    "Gemma Output 1": df_de["Gemma Output 1"].astype(str).tolist(),
    "Gemma Output 2": df_de["Gemma Output 2"].astype(str).tolist(),
    "Llama Output 1": df_de["Llama Output 1"].astype(str).tolist(),
    "Llama Output 2": df_de["Llama Output 2"].astype(str).tolist(),
}

# Compute and collect BLEU scores on corpus-level
bleu_scores = {}

for label, system_output in model_outputs.items():
    bleu = sacrebleu.corpus_bleu(system_output, [references])
    bleu_scores[label] = bleu.score
    print(f"{label}: {bleu.score:.2f} BLEU")
    
bleu_summary = pd.DataFrame.from_dict(bleu_scores, orient='index', columns=["BLEU Score"])
bleu_summary = bleu_summary.sort_values(by="BLEU Score", ascending=False)
bleu_summary

GPT-4o Output 1: 36.87 BLEU
GPT-4o Output 2: 35.36 BLEU
DS Output 1: 31.65 BLEU
DS Output 2: 30.47 BLEU
Gemma Output 1: 26.70 BLEU
Gemma Output 2: 28.56 BLEU
Llama Output 1: 0.00 BLEU
Llama Output 2: 0.00 BLEU


Unnamed: 0,BLEU Score
GPT-4o Output 1,36.87301
GPT-4o Output 2,35.361362
DS Output 1,31.647601
DS Output 2,30.472965
Gemma Output 2,28.561826
Gemma Output 1,26.697578
Llama Output 1,0.0
Llama Output 2,0.0


In [9]:
from sacrebleu.metrics import BLEU

# Creating a BLEU scorer
bleu = BLEU(effective_order=True)

# Defining a helper function
def compute_sentence_bleu(output, reference):
    return bleu.sentence_score(output, [reference]).score

# List of column pairs
output_columns = [
    ("GPT-4o Output 1", "BLEU_sent_GPT4o_1"),
    ("GPT-4o Output 2", "BLEU_sent_GPT4o_2"),
    ("DS Output 1", "BLEU_sent_DS_1"),
    ("DS Output 2", "BLEU_sent_DS_2"),
    ("Gemma Output 1", "BLEU_sent_Gemma_1"),
    ("Gemma Output 2", "BLEU_sent_Gemma_2"),
    ("Llama Output 1", "BLEU_sent_Llama_1"),
    ("Llama Output 2", "BLEU_sent_Llama_2"),
]

# Applying the BLEU score for each column
for output_col, bleu_col in output_columns:
    df_de[bleu_col] = [
        compute_sentence_bleu(str(output), str(ref))
        for output, ref in zip(df_de[output_col], df_de["Human translation"])
    ]

# Preview the new BLEU columns
df_de[[col for _, col in output_columns]].head()

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 

Unnamed: 0,BLEU_sent_GPT4o_1,BLEU_sent_GPT4o_2,BLEU_sent_DS_1,BLEU_sent_DS_2,BLEU_sent_Gemma_1,BLEU_sent_Gemma_2,BLEU_sent_Llama_1,BLEU_sent_Llama_2
0,68.896568,46.991522,44.815017,18.207053,44.815017,44.815017,0.0,0.0
1,34.47206,24.941747,26.340674,34.47206,34.315019,34.315019,0.0,0.0
2,40.896015,37.596635,37.596635,23.397626,54.451788,37.596635,0.0,0.0
3,40.787688,51.411814,39.094431,1.559298,35.153841,9.279771,0.0,0.0
4,28.914467,20.750199,39.169311,29.951336,30.674732,14.543207,0.0,0.0


In [None]:
df_de.to_excel("TranslationLog_DE_with_sentenceBLEU.xlsx", index=False)