In [1]:
import copy
import json
import os
import numpy
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

from openai import BadRequestError

import autogen
from autogen import UserProxyAgent, AssistantAgent, ConversableAgent
from autogen import config_list_from_json
from autogen.agentchat import Agent
from autogen.agentchat.contrib.agent_optimizer import AgentOptimizer
from autogen.agentchat.contrib.math_user_proxy_agent import MathUserProxyAgent
from autogen.code_utils import extract_code
from autogen.math_utils import get_answer

## Data

In [2]:
def read_file(url):
    file = open(url, "r")
    data = file.read().split('\n')
    return data

In [3]:
phoMT_dev_en = read_file("data/PhoMT/detokenization/dev/dev.en")
phoMT_dev_vi = read_file("data/PhoMT/detokenization/dev/dev.vi")
phoMT_test_en = read_file("data/PhoMT/detokenization/test/test.en")
phoMT_test_vi = read_file("data/PhoMT/detokenization/test/test.vi")
phoMT_train_en = read_file("data/PhoMT/detokenization/train/train.en")
phoMT_train_vi = read_file("data/PhoMT/detokenization/train/train.vi")

In [4]:
phoMT_dev_en[0] = phoMT_dev_en[0][1:]

In [5]:
phoMT_dev_envi = [];
for index in range(len(phoMT_dev_en)):
    phoMT_dev_envi.append({"question":phoMT_dev_en[index], "answer": phoMT_dev_vi[index]})
phoMT_test_envi = [];
for index in range(len(phoMT_test_en)):
    phoMT_test_envi.append({"question": phoMT_test_en[index], "answer": phoMT_test_vi[index]})
phoMT_train_envi = [];
for index in range(len(phoMT_train_en)):
    phoMT_train_envi.append({"question": phoMT_train_en[index], "answer": phoMT_train_vi[index]})
# phoMT_dev_envi

In [6]:
phoMT_dev_envi[0]["question"]

'Hurricane Dorian, one of the most powerful storms ever recorded in the Atlantic Ocean, made landfall as a Category 5 storm on Great Abaco Island in the northern Bahamas on Sunday morning, September 1, 2019.'

## Benchmark

### Comet

In [7]:
from comet import download_model, load_from_checkpoint

# Choose your model from Hugging Face Hub
# model_path = download_model("Unbabel/XCOMET-XL")
# or for example:
# model_path = download_model("Unbabel/wmt22-comet-da")

# Load the model checkpoint:
model = load_from_checkpoint('./XCOMET-XL/checkpoints/model.ckpt')

Encoder model frozen.
/home/kaylous/workspace/ics/llms/.venv/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [8]:
# data = [
#     {
#         "src": "Boris Johnson teeters on edge of favour with Tory MPs", 
#         "mt": "Boris Johnson ist bei Tory-Abgeordneten völlig in der Gunst", 
#         "ref": "Boris Johnsons Beliebtheit bei Tory-MPs steht auf der Kippe"
#     }
# ]
# model_output = model.predict(data, batch_size=8, gpus=1)
# Segment-level scores
# System-level score
# Score explanation (error spans)

In [9]:
# model_output.scores

In [10]:
# model_output.system_score

In [11]:
# model_output.metadata.error_spans

In [6]:
def get_score(src, ans, res):
    data = [
        {
            "src": src,
            "mt" : res,
            "ref": ans
        }
    ]
    return model.predict(data, batch_size=8, gpus=1).system_score

### SacreBleu

In [7]:
from sacrebleu.metrics import BLEU, CHRF, TER
bleu = BLEU()

def get_score(src, ans, res):
     return bleu.corpus_score([ans], [[res]]).score

## Agent init

### Agents declarations

In [8]:
llm_config = {
    "config_list": [
        {
            "model": "llama3",
            "base_url": "http://localhost:11434/v1",
            "api_key": "ollama",
        }
    ]
}

PromptGenerator = autogen.UserProxyAgent(
    name="PromptGenerator",
    system_message="You are a prompt engineer",
    human_input_mode = "NEVER",
    code_execution_config=False,
    llm_config=llm_config,
)
LLM = autogen.AssistantAgent(
    name="LLM",
    system_message="You are a helpful assistant",
    code_execution_config=False,
    llm_config=llm_config,
)

# user_proxy = autogen.UserProxyAgent(
#     name="Userproxyagent",
#     human_input_mode="NEVER",
#     code_execution_config={"work_dir": "_output", "use_docker": False},
# )

In [10]:
# output = PromptGenerator.initiate_chat(
#     recipient = LLM,
#     max_turns = 1,
#     message = "Translate this sentence to Vietnamese, say no more than necessary, and no notes: " + phoMT_dev_envi[0]["question"],
#     answer = phoMT_dev_envi[0]["answer"]
# )

In [11]:
# output.summary

In [12]:
# get_score(phoMT_dev_envi[0]["question"], phoMT_dev_envi[0]["answer"], output.summary)

### Custom UserProxyAgent

In [17]:
def is_termination_msg_mathchat(message):
    """Check if a message is a termination message."""
    if isinstance(message, dict):
        message = message.get("content")
        if message is None:
            return False
    if message.rstrip().find("TERMINATE") >= 0:
        return True
    return False


class JudgeProxyAgent(UserProxyAgent):
    MAX_CONSECUTIVE_AUTO_REPLY = 10
    DEFAULT_REPLY_TEMPLATE = "Generate a response more closely resembling the style, detail, and tone of the provided answer. Focus on specifying key elements to capture the nuances of this answer effectively. The answer: "
    PROMPTS = """Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    """

    def __init__(
        self,
        name: Optional[str] = "JudgeChatAgent",
        # is_termination_msg: Optional[Callable[[Dict], bool]] = is_termination_msg_mathchat,
        human_input_mode: Literal["ALWAYS", "NEVER", "TERMINATE"] = "NEVER",
        # default_auto_reply: Optional[Union[str, Dict, None]] = DEFAULT_REPLY,
        # max_invalid_q_per_step=3,
        **kwargs,
    ):
        super().__init__(
            name=name,
            # is_termination_msg=is_termination_msg,
            human_input_mode=human_input_mode,
            # default_auto_reply=default_auto_reply,
            # max_invalid_q_per_step=max_invalid_q_per_step,
            **kwargs,
        )
        self.register_reply(
            trigger=ConversableAgent, reply_func=JudgeProxyAgent.generate_feedback, position=0
        )
        self.register_reply(
            trigger=ConversableAgent, reply_func=JudgeProxyAgent._check_final_result, position=0
        )
        self.max_function_call_trial = 3
        self.query = None
        self._answer = None
        self.is_correct = None
        # self.history = []

    def initiate_chat(
        self,
        recipient,
        history: Optional[Dict[Agent, List[Dict]]] = None,
        answer=None,
        silent: Optional[bool] = False,
        max_turns = 3,
        **context,
    ):
        self.query = context["message"]
        self._answer = answer
        self.max_turns = max_turns
        
        # self.llm_config["context"] = {"question": self.query, "answer": answer}
        # self.DEFAULT_REPLY = self.DEFAULT_REPLY_TEMPLATE + answer
        self.is_correct = None
        self.max_function_call_trial = max_turns

        self._prepare_chat(recipient, True)
        print(self._oai_messages)
        # self._append_oai_message(history)
        if history is not None:
            for agent in history:
                for msg in history[agent]:
                    self._append_oai_message(message=msg, conversation_id=agent) 
        # print(history)
        # self._oai_messages = history
        error_message = None
        try:
            prompt = self.PROMPTS + context['message']
            self.send(prompt, recipient, silent=silent)
        except BadRequestError as e:
            error_message = str(e)
            self.is_correct = 0
            print("error information: {}".format(error_message))

        recipient.reset()
        # print("Check self.is_correct")
        self.is_correct = copy.deepcopy(self.is_correct)
        result = self.is_correct
        # print(self.is_correct)
        # print(result)
        # print(str(result))
        print("End check")
        self._reset()
        return result

    def receive(
        self,
        message: Union[Dict, str],
        sender: Agent,
        request_reply: Optional[bool] = None,
        silent: Optional[bool] = False,
    ):
        self._process_received_message(message, sender, silent)
        if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False:
            return

        self.is_correct = self.chat_messages[sender][-1].get("content")

        if (get_score(self.query, self.is_correct, self._answer) >= 90):
            return

        self.max_function_call_trial = self.max_function_call_trial - 1
        if (self.max_function_call_trial <= 0):
            self.max_function_call_trial = 0
            return

        reply = f"Review the generated translation for the sentence: {self.query}. Ensure it accurately conveys the meaning, tone, and context of the original text in natural Vietnamese. If any errors or unnatural phrasing are present, correct them. Respond with only the revised translation, without any additional explanations or commentary."
        if self._answer is not None:
            # full_history = self.history + self.chat_messages[sender]
            # reply = self.generate_reply(messages=full_history, sender=sender)
            reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)
            
        if reply is not None:
            self.send(reply, sender, silent=silent)

    def generate_feedback(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ):
        # self.max_function_call_trial = self.max_function_call_trial - 1
        # if (self.max_function_call_trial <= 0):
        #     self.max_function_call_trial = 0
        #     return False, None
        messages = messages[-1]
        if isinstance(messages, dict):
            # messages = messages.get("content")
            if messages.get("content") is None:
                return False, None
        reply_prompt = f"Review the generated translation for the sentence: {self.query}. Ensure it accurately conveys the meaning, tone, and context of the original text in natural Vietnamese. If any errors or unnatural phrasing are present, correct them. Respond with only the revised translation, without any additional explanations or commentary."
        if self._answer is not None:
            reply_prompt = f'Analyze the original sentence: {self.query}, the expected Vietnamese translation: {self._answer}, and the generated translation: {messages.get("content")}. Identify the differences between {messages.get("content")} and {self._answer}, and provide guidance to improve the translation so it aligns more closely with {self._answer}. Focus on preserving meaning, tone, style, and naturalness in Vietnamese while addressing any discrepancies.'

        toned_message = messages
        default_prompt_for_trimming = "Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task."
        if self._answer is not None:
            toned_message["content"] = reply_prompt + ". " + default_prompt_for_trimming
        else:
            toned_message["content"] = reply_prompt
        state, oai_reply = self.generate_oai_reply(
            messages = [toned_message],
            sender = sender,
            config = config
        )
        if state:
            return True, oai_reply
        return False, None

    def _check_final_result(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ):
        messages = messages[-1]
        if isinstance(messages, dict):
            messages = messages.get("content")
            if messages is None:
                return False, None
            if (messages.find("\n") >= 0):
                print("Response longer than expected?\n" + messages)
                # messages = messages.split("\n")[0]

        self.is_correct = messages
        temp_score = get_score(self.query, messages, self._answer)
        print("Score: " + str(temp_score))
        if (temp_score >= 90):
            return True, "The result is passable. Please reply me with the same answer as before."
        return False, None

    def _reset(self):
        # super()._reset()
        self.max_function_call_trial = 0
        self.is_correct = None
        self.query = None
        self._answer = None

In [18]:
# llm_config_judge = {
#     "config_list": [
#         {
#             "model": "llama3",
#             "base_url": "http://localhost:11434/v1",
#             "api_key": "ollama",
#         }
#     ],
#     # "prompt": 'Analyze the original sentence {question}, the expected translation {answer}, and the generated response. Provide advice on how to guide the model toward producing translations more closely aligned with {answer}. Focus on enhancing clarity, specificity, and context awareness in the prompt instructions. ',
# }

Judge = JudgeProxyAgent(
    name="Judge",
    system_message="You are an advisor",
    code_execution_config=False,
    llm_config=llm_config,
)

# groupchat = autogen.GroupChat(
#     agents=[Judge, PromptGenerator, LLM],
#     messages=[],
#     max_round=50,
#     speaker_selection_method="round_robin"
# )

In [19]:
result = Judge.initiate_chat(
    recipient = LLM,
    max_turns = 2,
    message = phoMT_dev_envi[4]["question"],
    answer = phoMT_dev_envi[4]["answer"]
)

defaultdict(<class 'list'>, {<autogen.agentchat.assistant_agent.AssistantAgent object at 0x76193c742c10>: []})
[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Tới giờ, chưa có báo cáo về nạn nhân thương tật nào trong số 46 nhà xuất bản tại hai giáo xứ trên đảo Abaco Đại.

-----------------------------------------------------------------------------

In [20]:
Judge.chat_messages_for_summary(LLM)

[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.\n    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.\n    The text:\n    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.',
  'role': 'assistant'},
 {'content': 'Analyze the original sentence: At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island., the expected Vietnamese translation: Theo báo cáo đến thời điểm hiện tại, trong 46 người công bố thuộc hai hội thánh ở đảo Great Abaco thì không có anh chị nào bị thương., and the generated translation: Tới giờ, chưa có báo 

In [17]:
# print(result)

In [18]:
# judge_result = "Theo báo cáo đến thời điểm hiện tại, trong 46 người công bố thuộc hai hội thánh ở đảo Great Abaco thì không có nạn nhân bị thương."
# get_score(phoMT_dev_envi[4]["question"], judge_result, phoMT_dev_envi[4]["answer"])

In [23]:
def score_translate(message, answer, turns = 1):
    result = Judge.initiate_chat(
        recipient = LLM,
        max_turns = turns + 1,
        message = message,
        history = history,
        answer = answer
    )
    return get_score(message, str(result), answer)

def score_translate_test(message, answer, history = [], turns = 1):
    result = Judge.initiate_chat(
        recipient = LLM,
        max_turns = turns + 1,
        message = message,
        history = history,
        answer = None
    )
    return get_score(message, str(result), answer)

### Agent pairing

In [12]:
# score_dev = []
# for i in range(10):
#     x = phoMT_dev_envi[i]
#     score_dev.append(score_translate(x['question'], x['answer'], 0))
score_test = []
for i in range(20):
    x = phoMT_test_envi[i]
    score_test.append(score_translate(x['question'], x['answer'], 0))

[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Chịu Brother Albert Barnett và vợ ông, Chị Susan Barnett, từ Đình Tây ở Tuscaloosa, Alabama.

--------------------------------------------------------------------------------
End check
[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that 

In [13]:
# numpy.average(score_dev)

In [14]:
# score_dev

In [15]:
numpy.average(score_test)

17.17580865337162

In [16]:
score_test

[22.15201577762451,
 22.48797176771867,
 15.901757826207133,
 10.208876940631033,
 9.027070761435276,
 36.857838224116975,
 7.620256222296563,
 5.571546612513714,
 11.935715928626076,
 6.041241050487109,
 27.380820067313575,
 12.654686441167149,
 21.163412130853043,
 13.801887742929258,
 21.687179482056585,
 8.640609739997757,
 4.474076510776484,
 20.73358371309331,
 44.80304273880272,
 20.372583388785444]

In [29]:
# score_dev = []
# for i in range(10):
#     x = phoMT_dev_envi[i]
#     score_dev.append(score_translate(x['question'], x['answer'], 1))
# score_test = []
# for i in range(10):
#     x = phoMT_test_envi[i]
#     score_test.append(score_translate(x['question'], x['answer'], 1))

In [30]:
# numpy.average(score_dev)

In [31]:
# score_dev

In [32]:
# numpy.average(score_test)

In [33]:
# score_test

## Improve

In [34]:
llm_config

{'config_list': [{'model': 'llama3',
   'base_url': 'http://localhost:11434/v1',
   'api_key': 'ollama'}]}

In [35]:
len(phoMT_train_envi)

2978000

In [24]:
EPOCH = 1
batch = 2
batch_num = 2
turns = 1
# optimizer_model = "gpt-4-1106-preview"
optimizer = AgentOptimizer(max_actions_per_step=3, llm_config=llm_config, optimizer_model="llama3")
history_recorder = []
for i in range(EPOCH):
    # for index, query in enumerate(train_data):
    for j in range(batch_num):
        for index in range(batch):
            query = phoMT_train_envi[j * batch + index]
            # is_correct = user_proxy.initiate_chat(assistant, answer=query["answer"], problem=query["question"])
            result = score_translate(query['question'], query['answer'], turns)
            history = Judge.chat_messages(LLM)
            print(f"Test: {i}|{j * batch + index}")
            history_recorder.extend(history)
            print(history_recorder)
            # optimizer.record_one_conversation(history, is_satisfied=True)
        # register_for_llm, register_for_exector = optimizer.step()
        # for item in register_for_llm:
        #     LLM.update_function_signature(**item)
        # if len(register_for_exector.keys()) > 0:
        #     Judge.register_function(function_map=register_for_exector)

TypeError: list indices must be integers or slices, not AssistantAgent

## Compare

In [37]:
score_test = []
for i in range(10):
    x = phoMT_test_envi[i]
    score_test.append(score_translate_test(x['question'], x['answer'], 0))

[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Chịu Brother Albert Barnett và vợ ông, Chị Susan Barnett, từ Đình Tây ở Tuscaloosa, Alabama.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.48s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Severe storms ripped through parts of the southern and midwestern United States on January 11 and 12, 2020.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Cơn bão cường độ cao đã qua những khu vực phía nam và miền trung Hoa Kỳ vào ngày 11 và 12 tháng 1 năm 2020.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.79s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Two days of heavy rain, high winds, and numerous tornadoes caused major damage across multiple states.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Hai ngày mưa nặng, gió mạnh và nhiều cơn xoáy trời đã gây thiệt hại lớn trên nhiều bang.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.89s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Sadly, Brother Albert Barnett and his wife, Sister Susan Barnett, 85 and 75 years old respectively, were killed when a tornado struck their mobile home.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Cùng với điều buồn là anh trai Albert Barnett và vợ anh, chị Susan Barnett, đã qua đời do cơn bão đổ bộ lên nhà di động của hai người khi họ vừa tròn 85 và 75 tuổi.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.84s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    The United States branch also reports that at least four of our brothers' homes sustained minor damage, along with two Kingdom Halls.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Cục Amerika cũng báo cáo rằng ít nhất có bốn ngôi nhà của anh em chúng ta đã chịu tổn hại nhẹ, cùng với hai Đền thờ Vương quốc.

(Note: "Anh em" is used to refer to brothers or brethren in a familiar or affectionate manner.)

-----------------------------------------------------------------

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.55s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Additionally, the storms caused major damage to a brother's business property.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Ngoài ra, cơn bão còn gây ra thiệt hại nặng nề cho tài sản kinh doanh của một anh em.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.32s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Local elders and the circuit overseer are offering practical and spiritual support to those affected by this disaster.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Giám mục lưu vực và các ông già trong làng đang cung cấp giúp đỡ thực tế và tâm linh cho những người bị ảnh hưởng bởi sự kiện này.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.56s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    We know that our heavenly Father, Jehovah, is providing comfort to our brothers and sisters who are grieving because of this tragedy.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Chúng ta biết rằng cha thiên thánh chúng ta, Giê-hô-vá, đang mang lại an ủi cho anh chị em chúng ta đang chịu đau đớn vì sự kiện này.

(Note: "Giê-hô-vá" is the Vietnamese name for God, Jehovah.)

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.65s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    International government agencies and officials have responded to Russia's Supreme Court decision that criminalizes the worship of Jehovah's Witnesses in Russia.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Các cơ quan và chính quyền quốc tế đã phản ứng đối với quyết định của Tòa án Tối cao Nga rằng hành xử thờ cúng Jehovah trong Nga là tội phạm.

--------------------------------------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.88s/it]


[33mJudge[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    These statements have criticized Russia's unjust and harsh judicial action against a minority religious group known for peaceful religious activity.

--------------------------------------------------------------------------------
[33mLLM[0m (to Judge):

Các tuyên bố này đã phê phán hành động pháp lý không công bằng và khắc nghiệt của Nga đối với một nhóm tôn giáo thiểu số được biết đến với hoạt động tôn giáo hòa bình.

(Note: Please keep in mind that translation quality may vary depending on the context and nuances of the or

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.82s/it]


In [38]:
numpy.average(score_test)

0.4235638022422791

In [39]:
score_test

[0.38328737020492554,
 0.422347754240036,
 0.6424505710601807,
 0.5040085315704346,
 0.32535678148269653,
 0.4432627856731415,
 0.28456687927246094,
 0.41430363059043884,
 0.3621178865432739,
 0.453935831785202]