In [1]:
import copy
import json
import os
import numpy
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

from openai import BadRequestError

import autogen
from autogen import UserProxyAgent, AssistantAgent, ConversableAgent
from autogen import config_list_from_json
from autogen.agentchat import Agent
from autogen.agentchat.contrib.agent_optimizer import AgentOptimizer
from autogen.agentchat.contrib.math_user_proxy_agent import MathUserProxyAgent
from autogen.code_utils import extract_code
from autogen.math_utils import get_answer

## Data

In [2]:
def read_file(url):
    """Read a UTF-8 text file and return its content split on newline characters.

    Args:
        url: Path of the file to read.

    Returns:
        list[str]: The file content split on ``"\\n"``. A file that ends with a
        newline therefore yields a trailing empty string, and a leading
        byte-order mark, if present, stays attached to the first entry.
    """
    # The context manager closes the handle even when reading fails. The
    # explicit encoding keeps Vietnamese text from depending on the platform's
    # default locale.
    with open(url, "r", encoding="utf-8") as file:
        return file.read().split('\n')

In [3]:
# Load the detokenized PhoMT files. Each split is read as an (English, Vietnamese)
# pair of line lists, in the order dev, test, train.
phoMT_dev_en, phoMT_dev_vi = (
    read_file("data/PhoMT/detokenization/dev/dev.en"),
    read_file("data/PhoMT/detokenization/dev/dev.vi"),
)
phoMT_test_en, phoMT_test_vi = (
    read_file("data/PhoMT/detokenization/test/test.en"),
    read_file("data/PhoMT/detokenization/test/test.vi"),
)
phoMT_train_en, phoMT_train_vi = (
    read_file("data/PhoMT/detokenization/train/train.en"),
    read_file("data/PhoMT/detokenization/train/train.vi"),
)

In [4]:
# Remove the stray leading character from the first dev.en line.
# NOTE(review): presumably a UTF-8 byte-order mark -- confirm against the file.
# Stripping only that character keeps the cell idempotent: the previous
# unconditional `[1:]` slice removed one more real letter on every re-run.
phoMT_dev_en[0] = phoMT_dev_en[0].lstrip("\ufeff")

In [5]:
# Build one {"question": English, "answer": Vietnamese} record per line of each
# split. Lines are paired by position; the English list drives the length.
phoMT_dev_envi = [
    {"question": phoMT_dev_en[index], "answer": phoMT_dev_vi[index]}
    for index in range(len(phoMT_dev_en))
]
phoMT_test_envi = [
    {"question": phoMT_test_en[index], "answer": phoMT_test_vi[index]}
    for index in range(len(phoMT_test_en))
]
phoMT_train_envi = [
    {"question": phoMT_train_en[index], "answer": phoMT_train_vi[index]}
    for index in range(len(phoMT_train_en))
]

In [6]:
# Spot-check: English side of the first dev pair (after the leading-character fix above).
phoMT_dev_envi[0]["question"]

'Hurricane Dorian, one of the most powerful storms ever recorded in the Atlantic Ocean, made landfall as a Category 5 storm on Great Abaco Island in the northern Bahamas on Sunday morning, September 1, 2019.'

## Benchmark

### SacreBLEU

In [7]:
from sacrebleu.metrics import BLEU, CHRF, TER
bleu = BLEU()

def get_score(src, ans, res):
    """Score one candidate text against one reference with the module-level ``bleu`` metric.

    Args:
        src: Source sentence. Not used in the computation.
        ans: Candidate text, passed to ``corpus_score`` as the only hypothesis.
        res: Reference text, passed to ``corpus_score`` as the only reference.

    Returns:
        The ``score`` attribute of the object returned by ``bleu.corpus_score``.
    """
    hypotheses = [ans]
    references = [[res]]
    return bleu.corpus_score(hypotheses, references).score

## Agent init

### Agent declarations

In [8]:
# LLM configuration shared by the LLM-backed agents: one OpenAI-compatible endpoint.
# NOTE(review): the URL/port and key suggest a local Ollama server -- confirm.
llm_config = dict(
    config_list=[
        dict(
            model="llama3",
            base_url="http://localhost:11434/v1",
            api_key="ollama",
        ),
    ],
)

# user_proxy = autogen.UserProxyAgent(
#     name="Userproxyagent",
#     human_input_mode="NEVER",
#     code_execution_config={"work_dir": "_output", "use_docker": False},
# )

### Custom UserProxyAgent

In [9]:
class JudgeProxyAgent(UserProxyAgent):
    """Proxy agent that requests an English-to-Vietnamese translation and judges each reply.

    The judge sends ``PROMPTS`` plus the source sentence to a recipient, scores every
    reply with the notebook-level ``get_score`` against the reference answer (when one
    was supplied) and, while the score stays below ``PASS_SCORE`` and turns remain,
    sends feedback asking for an improved translation.
    """

    MAX_CONSECUTIVE_AUTO_REPLY = 10
    # Minimum get_score() value at which a reply is accepted without further feedback.
    PASS_SCORE = 90
    # Not referenced by the methods of this class (its only use is commented out).
    DEFAULT_REPLY_TEMPLATE = "Generate a response more closely resembling the style, detail, and tone of the provided answer. Focus on specifying key elements to capture the nuances of this answer effectively. The answer: "
    PROMPTS = """Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    """

    def __init__(
        self,
        name: Optional[str] = "JudgeChatAgent",
        human_input_mode: Literal["ALWAYS", "NEVER", "TERMINATE"] = "NEVER",
        **kwargs,
    ):
        """Create the judge and register its final-result check as a reply function.

        Args:
            name: Agent name forwarded to ``UserProxyAgent``.
            human_input_mode: Forwarded to ``UserProxyAgent``.
            **kwargs: Any further ``UserProxyAgent`` keyword arguments.
        """
        super().__init__(
            name=name,
            human_input_mode=human_input_mode,
            **kwargs,
        )
        # NOTE(review): `receive` below never calls `generate_reply`, so this reply
        # function may never be invoked in the current flow -- confirm.
        self.register_reply(
            trigger=ConversableAgent, reply_func=JudgeProxyAgent._check_final_result, position=0
        )
        self.max_function_call_trial = 3  # remaining replies the judge will still evaluate
        self.query = None                 # source sentence of the running chat
        self._answer = None               # reference translation, if any
        self.is_correct = None            # latest reply content (0 after a BadRequestError)

    def initiate_chat(
        self,
        recipient,
        history: Optional[List[Dict]] = None,
        answer=None,
        silent: Optional[bool] = False,
        max_turns = 3,
        **context,
    ):
        """Run one judged translation chat and return the last reply that was received.

        Args:
            recipient: Agent that receives the translation prompt.
            history: Optional messages (dicts with ``'content'`` and ``'role'``) appended
                to this agent's message log before the prompt is sent.
            answer: Reference translation used for scoring; ``None`` disables scoring.
            silent: Forwarded to ``send``.
            max_turns: Maximum number of recipient replies the judge evaluates.
            **context: Must contain ``message``, the sentence to translate.

        Returns:
            The content of the last reply received, ``0`` if the request raised
            ``BadRequestError``, or ``None`` if no reply was recorded.

        Raises:
            KeyError: If ``message`` is missing from ``context``.
        """
        self.query = context["message"]
        self._answer = answer
        self.is_correct = None
        self.max_function_call_trial = max_turns

        self._prepare_chat(recipient, True)
        # `history or []` also covers the default None; the previous mutable-list
        # default was shared between calls.
        for msg in history or []:
            self._append_oai_message(
                message=msg['content'],
                role=msg['role'],
                conversation_id=recipient if msg['role'] == "user" else self,
            )

        try:
            prompt = self.PROMPTS + context['message']
            self.send(prompt, recipient, silent=silent)
        except BadRequestError as e:
            error_message = str(e)
            self.is_correct = 0
            print("error information: {}".format(error_message))

        # NOTE(review): only the direct recipient is reset here; agents the recipient
        # talks to (e.g. a PromptAssistant's `after`) are not -- confirm that state
        # carried over between samples is intended.
        recipient.reset()
        result = copy.deepcopy(self.is_correct)
        self._reset()
        return result

    def receive(
        self,
        message: Union[Dict, str],
        sender: Agent,
        request_reply: Optional[bool] = None,
        silent: Optional[bool] = False,
    ):
        """Record a reply, stop if it is good enough or turns ran out, else send feedback.

        Args:
            message: Incoming message.
            sender: Agent that sent it.
            request_reply: When ``False`` (or ``None`` with replies disabled for the
                sender) the message is only recorded.
            silent: Forwarded to message processing and to ``send``.
        """
        self._process_received_message(message, sender, silent)
        if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False:
            return

        self.is_correct = self.chat_messages[sender][-1].get("content")
        # Scoring needs a reference; without one the reply can never "pass", and the
        # generic review prompt below is used instead.
        if self._answer is not None and get_score(self.query, self.is_correct, self._answer) >= self.PASS_SCORE:
            return

        self.max_function_call_trial = self.max_function_call_trial - 1
        if self.max_function_call_trial <= 0:
            self.max_function_call_trial = 0
            return

        if self._answer is not None:
            reply = f'Analyze the original sentence: {self.query}, the expected Vietnamese translation: {self._answer}, and the generated translation: {self.is_correct}. Identify the differences between {self.is_correct} and {self._answer}, and provide guidance to improve the translation so it aligns more closely with {self._answer}. Focus on preserving meaning, tone, style, and naturalness in Vietnamese while addressing any discrepancies.'
        else:
            reply = f"Review the generated translation for the sentence: {self.query}. Ensure it accurately conveys the meaning, tone, and context of the original text in natural Vietnamese. If any errors or unnatural phrasing are present, correct them. Respond with only the revised translation, without any additional explanations or commentary."
        self.send(reply, sender, silent=silent)

    def _check_final_result(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ):
        """Reply function: accept the latest message when it reaches ``PASS_SCORE``.

        Args:
            messages: Conversation so far; only the last entry is inspected.
            sender: Unused.
            config: Unused.

        Returns:
            ``(True, <fixed acknowledgement text>)`` when the score reaches the
            threshold, otherwise ``(False, None)``.
        """
        messages = messages[-1]
        if isinstance(messages, dict):
            messages = messages.get("content")
            if messages is None:
                return False, None
            if messages.find("\n") >= 0:
                print("Response longer than expected?\n" + messages)

        self.is_correct = messages
        if self._answer is None:
            # No reference to score against, so there is nothing to accept here.
            return False, None
        temp_score = get_score(self.query, messages, self._answer)
        print("Score: " + str(temp_score))
        if temp_score >= self.PASS_SCORE:
            return True, "The result is passable. Please reply me with the same answer as before."
        return False, None

    def _reset(self):
        """Clear the per-chat state held by this judge (the parent ``_reset`` is not called)."""
        self.max_function_call_trial = 0
        self.is_correct = None
        self.query = None
        self._answer = None

In [10]:
class PromptAssistant(AssistantAgent):
    """Assistant that sits between an upstream agent (``before``) and a downstream one (``after``).

    Messages received from ``before`` are turned into a prompt for ``after``; the last
    message of the resulting conversation with ``after`` is then sent back to
    ``before``. The agent can also start a chat itself through ``initiate_chat``.
    """

    PROMPTS = """Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    """

    def __init__(
        self,
        after: Agent,
        before: Optional[Agent] = None,
        name: Optional[str] = "PromptAssistantAgent",
        human_input_mode: Literal["ALWAYS", "NEVER", "TERMINATE"] = "NEVER",
        **kwargs,
    ):
        """Create the relay agent.

        Args:
            after: Downstream agent that receives the prompts produced here.
            before: Upstream agent whose messages trigger the relay; may be ``None``.
            name: Agent name forwarded to ``AssistantAgent``.
            human_input_mode: Forwarded to ``AssistantAgent``.
            **kwargs: Any further ``AssistantAgent`` keyword arguments.
        """
        super().__init__(
            name=name,
            human_input_mode=human_input_mode,
            **kwargs,
        )
        self.before = before
        self.after = after
        self.history = []      # message log with `after` from the latest relayed exchange
        self.is_start = False  # True only while a chat started by `initiate_chat` is running

    def initiate_chat(
        self,
        recipient,
        history: List[Dict] = None,
        silent: Optional[bool] = False,
        max_turns = 3,
        **kwargs
    ):
        """Send the translation prompt for ``kwargs['message']`` directly to ``recipient``.

        Args:
            recipient: Agent that receives the prompt.
            history: Optional messages (dicts with ``'content'`` and ``'role'``) appended
                to this agent's message log for ``recipient`` before the prompt is sent.
                NOTE(review): the recorded averages of the runs with and without history
                (In [27] and In [31]) are identical, which suggests the recipient never
                sees these messages -- confirm.
            silent: Forwarded to ``send``.
            max_turns: Accepted for signature parity with the judge; not used here.
            **kwargs: Must contain ``message``. Other keys (for example the
                ``clear_history`` passed by callers in this notebook) are ignored; the
                chat is always prepared with history clearing enabled.

        Returns:
            The content of the last message in this agent's log for ``recipient``.

        Raises:
            KeyError: If ``message`` is missing from ``kwargs``.
        """
        self.is_start = True
        try:
            self._prepare_chat(recipient, True)
            # `history` defaults to None; iterating it directly raised TypeError.
            for msg in history or []:
                self._append_oai_message(
                    message=msg['content'],
                    role=msg['role'],
                    conversation_id=recipient,
                )
            print(self.chat_messages_for_summary(recipient))
            try:
                prompt = self.PROMPTS + kwargs['message']
                self.send(prompt, recipient, silent=silent)
            except BadRequestError as e:
                error_message = str(e)
                print("error information: {}".format(error_message))
        finally:
            # Always leave "direct chat" mode, so that an unexpected error cannot make
            # later relayed messages take the `is_start` branch of `receive`.
            self.is_start = False
        recipient.reset()
        print(self.chat_messages_for_summary(recipient))
        return self.chat_messages_for_summary(recipient)[-1]['content']

    def receive(
        self,
        message: Union[Dict, str],
        sender: Agent,
        request_reply: Optional[bool] = None,
        silent: Optional[bool] = False,
    ):
        """Record a message and either answer it directly or relay it downstream.

        Args:
            message: Incoming message.
            sender: Agent that sent it.
            request_reply: When ``False`` (or ``None`` with replies disabled for the
                sender) the message is only recorded.
            silent: Forwarded to message processing and to ``send``.
        """
        self._process_received_message(message, sender, silent)
        if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False:
            return

        if self.is_start:
            # Chat started by this agent: behave like a regular assistant.
            reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)
            if reply is not None:
                self.send(reply, sender, silent=silent)
            return

        if sender == self.before:
            upstream_log = sender.chat_messages_for_summary(self)
            # The first upstream message is forwarded verbatim; later ones (feedback)
            # are rewritten by this agent's own reply generation.
            reply = upstream_log[0]['content']
            if len(upstream_log) > 1:
                reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)
            if reply is not None:
                self.send(reply, self.after, silent=silent, request_reply=True)
                self.history = self.chat_messages_for_summary(self.after)
                # Hand the latest message of the downstream conversation back upstream.
                self.send(self.history[-1]['content'], self.before, silent=silent)


In [11]:
# Translator: a plain assistant backed by the configured model.
LLM = autogen.AssistantAgent(
    name="LLM",
    llm_config=llm_config,
    code_execution_config=False,
    system_message="You are a helpful assistant",
)

# Judge: scores translations against the reference; it is given no llm_config.
Judge = JudgeProxyAgent(
    name="Judge",
    code_execution_config=False,
    system_message="You are an advisor",
)

# Relay between Judge (upstream) and LLM (downstream).
PromptGenerator = PromptAssistant(
    name="PromptGenerator",
    before=Judge,
    after=LLM,
    llm_config=llm_config,
    human_input_mode="NEVER",
    code_execution_config=False,
    system_message="You are a prompt engineer",
)

In [12]:
# Smoke test on dev pair #4: the judge evaluates at most two replies from the relay.
result = Judge.initiate_chat(
    PromptGenerator,
    answer=phoMT_dev_envi[4]["answer"],
    message=phoMT_dev_envi[4]["question"],
    max_turns=2,
)

[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.

--------------------------------------------------------------------------------
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do n

In [13]:
# Message log between PromptGenerator and LLM that PromptAssistant.receive stored
# in `history` during the Judge-driven chat above.
PromptGenerator.history

[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.\n    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.\n    The text:\n    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.',
  'role': 'assistant'},
 {'content': 'Tới giờ, chưa có báo cáo về nạn nhân thương tật nào trong số 46 nhà xuất bản tại hai giáo xứ trên đảo Abaco Đại.',
  'role': 'user'},
 {'content': 'Analysis:\n\n1. Main difference: Tới giờ (At this time) is translated differently between the two sentences.\n2. Contextual differences: The expected translation (Theo báo cáo đến thời điểm hiện tại...) sets the context by specifyi

In [14]:
# PromptGenerator's current message log for LLM; the recorded output is an empty list.
# NOTE(review): presumably because Judge.initiate_chat resets its recipient -- confirm.
PromptGenerator.chat_messages_for_summary(LLM)

[]

In [15]:
# Value returned by Judge.initiate_chat above (the last reply the judge received).
print(result)

Căn cứ vào báo cáo, trong số 46 nhà xuất bản thuộc hai giáo xứ trên đảo Great Abaco thì chưa có nạn nhân thương tật nào được báo cáo.


In [16]:
# NOTE(review): `judge_result` is assigned here but not read anywhere in the visible
# notebook; the score below is computed on `result`. Confirm which translation was
# meant to be scored.
judge_result = "Tới giờ, chưa có báo cáo về nạn nhân thương tật nào trong số 46 nhà xuất bản tại hai giáo xứ trên đảo Abaco Đại."
get_score(phoMT_dev_envi[4]["question"], result, phoMT_dev_envi[4]["answer"])

12.017396628208415

In [17]:
# Direct PromptGenerator -> LLM chat for the same dev sentence, seeded with the
# history captured above.
result = PromptGenerator.initiate_chat(
    LLM,
    history=PromptGenerator.history,
    message=phoMT_dev_envi[4]["question"],
    max_turns=2,
    clear_history=False,
)

[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.\n    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.\n    The text:\n    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.', 'role': 'assistant'}, {'content': 'Tới giờ, chưa có báo cáo về nạn nhân thương tật nào trong số 46 nhà xuất bản tại hai giáo xứ trên đảo Abaco Đại.', 'role': 'user'}, {'content': 'Analysis:\n\n1. Main difference: Tới giờ (At this time) is translated differently between the two sentences.\n2. Contextual differences: The expected translation (Theo báo cáo đến thời điểm hiện tại...) sets the context by specifying tha

In [18]:
def score_translate(message, answer, turns = 1):
    """Translate ``message`` through the Judge -> PromptGenerator chain and score the result.

    Args:
        message: English sentence to translate.
        answer: Reference Vietnamese translation, used for feedback and for scoring.
        turns: Number of feedback rounds; the judge is given ``turns + 1`` turns.

    Returns:
        ``get_score`` of the stringified chat result against ``answer``.
    """
    translation = Judge.initiate_chat(
        PromptGenerator,
        answer=answer,
        message=message,
        max_turns=turns + 1,
    )
    # The chat result is stringified before scoring (it is not always a string,
    # e.g. 0 after a BadRequestError).
    return get_score(message, str(translation), answer)

def score_translate_test(message, answer, history, turns = 1):
    """Translate ``message`` with a direct PromptGenerator -> LLM chat and score the result.

    Args:
        message: English sentence to translate.
        answer: Reference Vietnamese translation, used only for scoring.
        history: Messages handed to ``PromptGenerator.initiate_chat`` as ``history``.
        turns: Forwarded as ``max_turns = turns + 1``.

    Returns:
        ``get_score`` of the returned translation against ``answer``.
    """
    translation = PromptGenerator.initiate_chat(
        LLM,
        history=history,
        message=message,
        max_turns=turns + 1,
        clear_history=False,
    )
    return get_score(message, translation, answer)

### Agent pairing

In [19]:
# score_dev = []
# for i in range(10):
#     x = phoMT_dev_envi[i]
#     score_dev.append(score_translate(x['question'], x['answer'], 0))
# Baseline: score the first 20 test pairs through the Judge chain with no feedback
# rounds (turns=0).
score_test = list()
for i in range(20):
    x = phoMT_test_envi[i]
    score_test += [score_translate(x['question'], x['answer'], 0)]

[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any

In [20]:
# numpy.average(score_dev)

In [21]:
# score_dev

In [22]:
# Mean score over the 20 baseline test sentences scored above.
numpy.average(score_test)

16.181075241611182

In [23]:
# Per-sentence baseline scores.
score_test

[22.15201577762451,
 4.791697208649826,
 5.122366272740833,
 5.3976467790265215,
 36.055133232914045,
 14.296145628396559,
 16.18274764491393,
 10.168586985453928,
 9.784168213672302,
 12.496315176496601,
 28.453277380748098,
 5.542900348964859,
 10.629480219240392,
 4.4782388238006465,
 25.825152355482132,
 15.822637961675994,
 8.319071183342182,
 19.720824137924055,
 46.06894414936015,
 22.314155351796114]

## Improve

In [24]:
# Number of training records. read_file splits on "\n", so a final newline in the
# source file adds one empty record to this count.
len(phoMT_train_envi)

2978000

In [25]:
# Settings for the feedback-collection run. EPOCH is defined but not read by the
# loop below.
EPOCH, batch, turns = 1, 20, 1
# optimizer_model = "gpt-4-1106-preview"
# optimizer = AgentOptimizer(max_actions_per_step=3, llm_config=llm_config, optimizer_model="llama3")

# Run the judged chat on the first `batch` training pairs and accumulate the
# PromptGenerator <-> LLM messages of every sample into one flat list.
history_recorder = list()
for index in range(batch):
    query = phoMT_train_envi[index]
    result = score_translate(query['question'], query['answer'], turns)
    history = PromptGenerator.history
    print(f"Test: {index}")
    history_recorder += history
    print(len(history_recorder))

[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    It begins with a countdown.

--------------------------------------------------------------------------------
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide 

## Compare

In [26]:
# Re-score the same 20 test pairs with a direct PromptGenerator -> LLM chat that is
# seeded with the collected feedback history.
score_test = list()
for i in range(20):
    x = phoMT_test_envi[i]
    score_test += [score_translate_test(x['question'], x['answer'], history_recorder, 0)]

[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.\n    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.\n    The text:\n    It begins with a countdown.', 'role': 'assistant'}, {'content': 'Đó là với một bảng đếm ngược.', 'role': 'user'}, {'content': 'The original sentence is: "It begins with a countdown."\n\nThe expected translation provided is: "Câu chuyện bắt đầu với buổi lễ đếm ngược."\n\nThe generated translation is: "Đó là với một bảng đếm ngược."\n\nComparison of the two translations reveals:\n\n1. The phrase "It begins with" is not translated accurately in the generated output. It should be translated as "Câu chuyện bắt đầu" to convey the idea that th

In [27]:
# Mean score of the history-seeded run above.
numpy.average(score_test)

17.17580865337162

In [28]:
# Per-sentence scores of the history-seeded run.
score_test

[22.15201577762451,
 22.48797176771867,
 15.901757826207133,
 10.208876940631033,
 9.027070761435276,
 36.857838224116975,
 7.620256222296563,
 5.571546612513714,
 11.935715928626076,
 6.041241050487109,
 27.380820067313575,
 12.654686441167149,
 21.163412130853043,
 13.801887742929258,
 21.687179482056585,
 8.640609739997757,
 4.474076510776484,
 20.73358371309331,
 44.80304273880272,
 20.372583388785444]

In [30]:
# Control run: same direct PromptGenerator -> LLM chat, but with an empty history.
score_test = list()
for i in range(20):
    x = phoMT_test_envi[i]
    score_test += [score_translate_test(x['question'], x['answer'], [], 0)]

[]
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
[33mLLM[0m (to PromptGenerator):

Chịu Brother Albert Barnett và vợ ông, Chị Susan Barnett, từ Đình Tây ở Tuscaloosa, Alabama.

--------------------------------------------------------------------------------
[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that

In [31]:
# Mean score of the empty-history control run.
# NOTE(review): the recorded value equals the history-seeded mean above exactly --
# confirm whether the injected history reaches the LLM at all.
numpy.average(score_test)

17.17580865337162