In [9]:
from bert_score import BERTScorer


### Disclaimer
The human reference of each test case has been translated by me, and reviewed by English teacher.

In [10]:
# Source sentence in Dutch
dutch_source = [
    "Doordat u de recreatiewoning oneigenlijk laat gebruiken handelt u in strijd met de op grond van artikel 12.1, onder a, van het omgevingsplan op de betreffende gronden rustende bestemming."
]

# Machine translations in English
machine_translations = [
    "By allowing the recreational property to be used improperly, you are acting in violation of the designated use for the relevant grounds as specified under Article 12.1(a) of the environmental plan.",  # Machine translation ChatGPT
    "By allowing the recreational home to be used improperly, you are acting in violation of the zoning under Article 12.1(a) of the environmental plan on the land in question.",  # Machine translation DeepL
    "By allowing the holiday home to be used improperly, you are acting in violation of the zoning plan applicable to the land in question pursuant to Article 12.1, sub a, of the environmental plan.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "Improper use of the holiday home violates the zoning designation outlined in article 12.1, subsection a, of the zoning plan for this area."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: By allowing the recreational property to be used improperly, you are acting in violation of the designated use for the relevant grounds as specified under Article 12.1(a) of the environmental plan.
Precision: 0.7663
Recall: 0.7851
F1 Score: 0.7756

Machine Translation 2:
Translation: By allowing the recreational home to be used improperly, you are acting in violation of the zoning under Article 12.1(a) of the environmental plan on the land in question.
Precision: 0.7815
Recall: 0.8116
F1 Score: 0.7963

Machine Translation 3:
Translation: By allowing the holiday home to be used improperly, you are acting in violation of the zoning plan applicable to the land in question pursuant to Article 12.1, sub a, of the environmental plan.
Precision: 0.7905
Recall: 0.8450
F1 Score: 0.8168



In [11]:
# Source sentence in Dutch
dutch_source = [
    "Aan de last kunt u voldoen door het oneigenlijk laten gebruiken van de recreatiewoning te beëindigen en/of beëindigd te houden."
]

# Machine translations in English
machine_translations = [
    "You can comply with the order by stopping and/or continuing to prevent the improper use of the recreational property.",  # Machine translation ChatGPT
    "You can comply with the charge by terminating and/or keeping terminated the improper use of the recreational home.",  # Machine translation DeepL
    "You can comply with the order by terminating and/or having terminated the improper use of the holiday home.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "You can comply with the order by ceasing and/or ensuring the improper use of the recreational property remains ceased."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: You can comply with the order by stopping and/or continuing to prevent the improper use of the recreational property.
Precision: 0.9274
Recall: 0.9135
F1 Score: 0.9204

Machine Translation 2:
Translation: You can comply with the charge by terminating and/or keeping terminated the improper use of the recreational home.
Precision: 0.9038
Recall: 0.8945
F1 Score: 0.8991

Machine Translation 3:
Translation: You can comply with the order by terminating and/or having terminated the improper use of the holiday home.
Precision: 0.9031
Recall: 0.8899
F1 Score: 0.8965



In [12]:
# Source sentence in Dutch
dutch_source = [
    "Wij zijn voornemens om aan de last onder dwangsom een bedrag te verbinden van € 25.000,00 ineens."
]

# Machine translations in English
machine_translations = [
    "We intend to attach a lump sum of €25,000.00 to the penalty payment order.",  # Machine translation ChatGPT
    "We intend to impose a penalty of €25,000.00 as a single amount under the enforcement order.",  # Machine translation DeepL
    "We intend to attach a lump sum of €25,000.00 to the order penalty payment.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "We intend to impose a penalty of €25,000.00 for non-compliance."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: We intend to attach a lump sum of €25,000.00 to the penalty payment order.
Precision: 0.8713
Recall: 0.8857
F1 Score: 0.8784

Machine Translation 2:
Translation: We intend to impose a penalty of €25,000.00 as a single amount under the enforcement order.
Precision: 0.8913
Recall: 0.9139
F1 Score: 0.9024

Machine Translation 3:
Translation: We intend to attach a lump sum of €25,000.00 to the order penalty payment.
Precision: 0.8686
Recall: 0.8832
F1 Score: 0.8758



In [13]:
# Source sentence in Dutch
dutch_source = [
    "Hij heeft het bij het rechte eind. Hij had dit al voorspeld. Daarom vertrouwt iedereen op zijn oordeel."
]

# Machine translations in English
machine_translations = [
    "He is right. He had already predicted this, which is why everyone trusts his judgment.",  # Machine translation ChatGPT
    "He is correct. He had already predicted this. That's why everyone trusts His judgment.",  # Machine translation DeepL
    "He is right. He predicted this already. That is why everyone trusts his judgment.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "He is right. He had already predicted this, which is why everyone trusts his judgement."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: He is right. He had already predicted this, which is why everyone trusts his judgment.
Precision: 0.9875
Recall: 0.9792
F1 Score: 0.9833

Machine Translation 2:
Translation: He is correct. He had already predicted this. That's why everyone trusts His judgment.
Precision: 0.9138
Recall: 0.9096
F1 Score: 0.9117

Machine Translation 3:
Translation: He is right. He predicted this already. That is why everyone trusts his judgment.
Precision: 0.9292
Recall: 0.9093
F1 Score: 0.9192



In [14]:
# Source sentence in Dutch
dutch_source = [
    "Ik zal het in de gaten houden. Ik zorg ervoor dat er niets misgaat. Jij kunt intussen op andere dingen focussen."
]

# Machine translations in English
machine_translations = [
    "I will keep an eye on it. I’ll make sure nothing goes wrong. In the meantime, you can focus on other things.",  # Machine translation ChatGPT
    "I will keep an eye on it. I'll make sure nothing goes wrong. You can focus on other things in the meantime.",  # Machine translation DeepL
    "I'll keep an eye on it. I'll make sure nothing goes wrong. You can focus on other things in the meantime.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "I will keep an eye on it. I'll amke sure nothing goes wrong so you can focus on other things."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: I will keep an eye on it. I’ll make sure nothing goes wrong. In the meantime, you can focus on other things.
Precision: 0.8797
Recall: 0.9052
F1 Score: 0.8923

Machine Translation 2:
Translation: I will keep an eye on it. I'll make sure nothing goes wrong. You can focus on other things in the meantime.
Precision: 0.9041
Recall: 0.9227
F1 Score: 0.9133

Machine Translation 3:
Translation: I'll keep an eye on it. I'll make sure nothing goes wrong. You can focus on other things in the meantime.
Precision: 0.8931
Recall: 0.9073
F1 Score: 0.9002



In [15]:
# Source sentence in Dutch
dutch_source = [
    "Hij ziet door de bomen het bos niet meer. Alles lijkt hem te veel te worden. Hij heeft hulp nodig om alles op een rijtje te krijgen."
]

# Machine translations in English
machine_translations = [
    "He can't see the forest for the trees anymore. Everything seems to be too much for him. He needs help to get everything sorted out.",  # Machine translation ChatGPT
    "He can no longer see the forest for the trees. Everything seems to be too much for him. He needs help to get it all together.",  # Machine translation DeepL
    "He can't see the forest for the trees. Everything seems to be getting too much for him. He needs help to get everything in order.",  # Machine translation Google Translate
]

# Human reference translation in English
human_reference = [
    "He can't see the bigger picture anymore, everything seems overwhelming, and he need help to sort things out."
]

bert_scorer = BERTScorer(lang='en', model_type="bert-base-multilingual-cased")

# Evaluate each machine translation against the human reference
print("BERTScore Evaluation:\n")
for i, translation in enumerate(machine_translations):
    P, R, F1 = bert_scorer.score([translation], human_reference)  
    print(f"Machine Translation {i + 1}:")
    print(f"Translation: {translation}")
    print(f"Precision: {P[0].item():.4f}")  
    print(f"Recall: {R[0].item():.4f}")
    print(f"F1 Score: {F1[0].item():.4f}")
    print()


BERTScore Evaluation:

Machine Translation 1:
Translation: He can't see the forest for the trees anymore. Everything seems to be too much for him. He needs help to get everything sorted out.
Precision: 0.8232
Recall: 0.8381
F1 Score: 0.8306

Machine Translation 2:
Translation: He can no longer see the forest for the trees. Everything seems to be too much for him. He needs help to get it all together.
Precision: 0.7942
Recall: 0.7879
F1 Score: 0.7910

Machine Translation 3:
Translation: He can't see the forest for the trees. Everything seems to be getting too much for him. He needs help to get everything in order.
Precision: 0.8016
Recall: 0.8034
F1 Score: 0.8025

