In [1]:
import copy
import json
import os
import numpy
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

from openai import BadRequestError

import autogen
from autogen import UserProxyAgent, AssistantAgent, ConversableAgent
from autogen import config_list_from_json
from autogen.agentchat import Agent
from autogen.agentchat.contrib.agent_optimizer import AgentOptimizer
from autogen.agentchat.contrib.math_user_proxy_agent import MathUserProxyAgent
from autogen.code_utils import extract_code
from autogen.math_utils import get_answer

## Data

In [2]:
def read_file(url):
    """Read a UTF-8 text file and return its content split on newlines.

    Args:
        url: Path of the file to read.

    Returns:
        list[str]: One entry per newline-separated segment. A file whose
        content ends with a newline therefore yields a trailing empty string,
        and a leading byte-order mark (if present) stays in the first entry.
    """
    # The context manager closes the handle even when read() raises, and the
    # pinned encoding makes Vietnamese text decode the same on every platform.
    with open(url, "r", encoding="utf-8") as file:
        return file.read().split('\n')

In [3]:
# Load the detokenized PhoMT files as lists of lines. The English (.en) and
# Vietnamese (.vi) files of each split are read separately here and are
# matched up by list position in a later cell.
phoMT_dev_en = read_file("data/PhoMT/detokenization/dev/dev.en")
phoMT_dev_vi = read_file("data/PhoMT/detokenization/dev/dev.vi")
phoMT_test_en = read_file("data/PhoMT/detokenization/test/test.en")
phoMT_test_vi = read_file("data/PhoMT/detokenization/test/test.vi")
phoMT_train_en = read_file("data/PhoMT/detokenization/train/train.en")
phoMT_train_vi = read_file("data/PhoMT/detokenization/train/train.vi")

In [4]:
# Remove the stray leading character of the first dev sentence.
# NOTE(review): this assumes that character is a UTF-8 byte-order mark
# (U+FEFF) -- confirm against dev.en. Unlike slicing with [1:], stripping the
# BOM explicitly is idempotent, so re-running this cell does not eat real text.
phoMT_dev_en[0] = phoMT_dev_en[0].lstrip("\ufeff")

In [5]:
# Build one {"question", "answer"} record per English line, taking the
# Vietnamese line found at the same position in the matching list.
phoMT_dev_envi = [
    {"question": en_line, "answer": phoMT_dev_vi[pos]}
    for pos, en_line in enumerate(phoMT_dev_en)
]
phoMT_test_envi = [
    {"question": en_line, "answer": phoMT_test_vi[pos]}
    for pos, en_line in enumerate(phoMT_test_en)
]
phoMT_train_envi = [
    {"question": en_line, "answer": phoMT_train_vi[pos]}
    for pos, en_line in enumerate(phoMT_train_en)
]

In [6]:
phoMT_dev_envi[0]["question"]

'Hurricane Dorian, one of the most powerful storms ever recorded in the Atlantic Ocean, made landfall as a Category 5 storm on Great Abaco Island in the northern Bahamas on Sunday morning, September 1, 2019.'

## Benchmark

### SacreBLEU

In [7]:
from sacrebleu.metrics import BLEU, CHRF, TER
bleu = BLEU()

def get_score(src, ans, res):
    """Score one translation against one reference with the shared BLEU metric.

    Args:
        src: Source sentence; it is not used by the computation.
        ans: Translation passed to ``bleu.corpus_score`` as the only hypothesis.
        res: Text passed to ``bleu.corpus_score`` as the only reference.

    Returns:
        The ``score`` attribute of the object returned by ``bleu.corpus_score``.
    """
    hypotheses = [ans]
    reference_streams = [[res]]
    outcome = bleu.corpus_score(hypotheses, reference_streams)
    return outcome.score

## Agent init

### Agent declarations

In [8]:
# LLM configuration shared by the LLM-backed agents below: a single "llama3"
# model served from an endpoint on localhost.
# NOTE(review): port 11434 and the "ollama" key suggest a local Ollama server
# where the API key is only a placeholder -- confirm before reusing elsewhere.
llm_config = {
    "config_list": [
        {
            "model": "llama3",
            "base_url": "http://localhost:11434/v1",
            "api_key": "ollama",
        }
    ]
}

# user_proxy = autogen.UserProxyAgent(
#     name="Userproxyagent",
#     human_input_mode="NEVER",
#     code_execution_config={"work_dir": "_output", "use_docker": False},
# )

### Custom UserProxyAgent

In [83]:
class JudgeProxyAgent(UserProxyAgent):
    """User-proxy agent that starts a translation chat and judges the replies.

    ``initiate_chat`` sends the translation prompt to a recipient agent.
    ``receive`` stores every reply as the current result and, while judge
    turns remain and the reply has not reached ``PASS_SCORE``, sends the
    recipient a request for guidance on improving the translation.
    """

    MAX_CONSECUTIVE_AUTO_REPLY = 10
    DEFAULT_REPLY_TEMPLATE = "Generate a response more closely resembling the style, detail, and tone of the provided answer. Focus on specifying key elements to capture the nuances of this answer effectively. The answer: "
    PROMPTS = """Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    """
    # Score (as returned by get_score) from which a translation is accepted.
    PASS_SCORE = 90

    def __init__(
        self,
        name: Optional[str] = "JudgeChatAgent",
        human_input_mode: Literal["ALWAYS", "NEVER", "TERMINATE"] = "NEVER",
        **kwargs,
    ):
        """Create the judge and register its result check as a reply function.

        Args:
            name: Agent name forwarded to ``UserProxyAgent``.
            human_input_mode: Forwarded to ``UserProxyAgent``.
            **kwargs: Any further ``UserProxyAgent`` keyword arguments.
        """
        super().__init__(
            name=name,
            human_input_mode=human_input_mode,
            **kwargs,
        )
        # Registered at position 0 for ConversableAgent senders. Note that
        # receive() in this class does not call generate_reply itself.
        self.register_reply(
            trigger=ConversableAgent, reply_func=JudgeProxyAgent._check_final_result, position=0
        )
        self.max_function_call_trial = 3  # remaining judge turns
        self.query = None                 # sentence being translated
        self._answer = None               # reference translation, if any
        self.is_correct = None            # latest translation received

    def initiate_chat(
        self,
        recipient,
        history: Optional[List[Dict]] = None,
        answer=None,
        silent: Optional[bool] = False,
        max_turns = 3,
        **context,
    ):
        """Run one translation chat with ``recipient`` and return its result.

        Args:
            recipient: Agent that receives the prompt. Its ``history``
                attribute is overwritten and ``reset()`` is called on it once
                the chat has finished.
            history: Messages stored on ``recipient.history`` before the chat
                starts. ``None`` (the default) stores a fresh empty list.
            answer: Optional reference translation used for scoring.
            silent: Forwarded to ``send``.
            max_turns: Number of judge turns allowed for this chat.
            **context: Must contain ``message``, the sentence to translate.

        Returns:
            The content of the last reply seen by ``receive``, ``0`` if the
            request failed with ``BadRequestError``, or ``None`` if no reply
            was recorded.

        Raises:
            KeyError: If ``message`` is missing from ``context``.
        """
        self.query = context["message"]
        self._answer = answer

        self.is_correct = None
        self.max_function_call_trial = max_turns
        # A None default replaces the former shared ``[]`` default, so chats
        # never hand out the same default list object to the recipient.
        recipient.history = [] if history is None else history
        print("Recipient.history: " + str(len(recipient.history)))

        self._prepare_chat(recipient, True)

        try:
            prompt = self.PROMPTS + context['message']
            self.send(prompt, recipient, silent=silent)
        except BadRequestError as e:
            self.is_correct = 0
            print("error information: {}".format(str(e)))

        recipient.reset()
        # Read the result before _reset() clears the per-chat fields.
        result = self.is_correct
        self._reset()
        return result

    def receive(
        self,
        message: Union[Dict, str],
        sender: Agent,
        request_reply: Optional[bool] = None,
        silent: Optional[bool] = False,
    ):
        """Record an incoming reply and, if allowed, ask for improvement guidance.

        The chat stops (nothing is sent back) when no reply was requested,
        when the reply reaches ``PASS_SCORE`` against the reference, or when
        the judge turns are used up.
        """
        self._process_received_message(message, sender, silent)
        if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False:
            return

        self.is_correct = self.chat_messages[sender][-1].get("content")
        score = self._score(self.is_correct)
        if score is not None and score >= self.PASS_SCORE:
            return

        self.max_function_call_trial = self.max_function_call_trial - 1
        if (self.max_function_call_trial <= 0):
            self.max_function_call_trial = 0
            return

        if self._answer is not None:
            reply = f'Analyze the original sentence: {self.query}, the expected Vietnamese translation: {self._answer}, and the generated translation: {self.is_correct}. Identify the differences between {self.is_correct} and {self._answer}, and provide guidance to improve the translation so it aligns more closely with {self._answer}. Focus on preserving meaning, tone, style, and naturalness in Vietnamese while addressing any discrepancies.'
        else:
            reply = f"Using the original sentence: {self.query} and the given Vietnamese translation: {self.is_correct}, provide clear and concise guidance to another Large Language Model on how to produce a translation that aligns more closely with {self.is_correct}. The guidance should address tone, accuracy, and naturalness in Vietnamese"
        self.send(reply, sender, silent=silent)

    def _score(self, translation) -> Optional[float]:
        """Return the score of ``translation`` against the reference.

        Returns ``None`` when there is nothing to compare: no reference
        translation was supplied or ``translation`` is not a string.
        """
        if self._answer is None or not isinstance(translation, str):
            return None
        return get_score(self.query, translation, self._answer)

    def _check_final_result(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ) -> Tuple[bool, Optional[str]]:
        """Reply function that accepts a translation reaching ``PASS_SCORE``.

        Args:
            messages: Conversation so far; when ``None`` the messages stored
                for ``sender`` are used.
            sender: Agent whose message is being checked.
            config: Unused.

        Returns:
            ``(True, text)`` with a fixed acknowledgement when the last
            message passes, otherwise ``(False, None)``.
        """
        if messages is None:
            messages = self.chat_messages[sender]
        if not messages:
            return False, None

        messages = messages[-1]
        if isinstance(messages, dict):
            messages = messages.get("content")
            if messages is None:
                return False, None
            if (messages.find("\n") >= 0):
                print("Response longer than expected?\n" + messages)

        self.is_correct = messages
        # Without a reference there is nothing to score; previously this path
        # passed None to get_score as the reference.
        temp_score = self._score(messages)
        if temp_score is None:
            return False, None
        print("Score: " + str(temp_score))
        if (temp_score >= self.PASS_SCORE):
            return True, "The result is passable. Please reply me with the same answer as before."
        return False, None

    def _reset(self):
        """Clear the per-chat fields of this judge (the base class is not reset)."""
        self.max_function_call_trial = 0
        self.is_correct = None
        self.query = None
        self._answer = None

In [84]:
class PromptAssistant(AssistantAgent):
    """Assistant agent that relays messages between a judge and a translator.

    Messages received from ``before`` are forwarded to ``after`` (the very
    first one verbatim, later ones through ``generate_reply``), and the last
    message of the conversation with ``after`` is sent back to ``before``.
    """

    PROMPTS = """Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    """

    def __init__(
        self,
        after: Agent,
        before: Optional[Agent] = None,
        name: Optional[str] = "PromptAssistantAgent",
        human_input_mode: Literal["ALWAYS", "NEVER", "TERMINATE"] = "NEVER",
        **kwargs,
    ):
        """Create the relay agent.

        Args:
            after: Agent that messages from ``before`` are forwarded to.
            before: Agent whose messages are forwarded and that receives the
                relayed answer.
            name: Agent name forwarded to ``AssistantAgent``.
            human_input_mode: Forwarded to ``AssistantAgent``.
            **kwargs: Any further ``AssistantAgent`` keyword arguments.
        """
        super().__init__(
            name=name,
            human_input_mode=human_input_mode,
            **kwargs,
        )
        self.before = before
        self.after = after
        self.history = []      # messages replayed into the chat with `after`
        self.is_start = False  # True while initiate_chat of this agent runs

    def initiate_chat(
        self,
        recipient,
        history: List[Dict] = None,
        silent: Optional[bool] = False,
        max_turns = 3,
        **kwargs
    ):
        """Send the translation prompt directly to ``recipient``.

        Args:
            recipient: Agent that receives the prompt; ``reset()`` is called
                on it afterwards.
            history: Messages (dicts with ``content`` and ``role``) appended
                to the conversation before the prompt. ``None`` means none.
            silent: Forwarded to ``send``.
            max_turns: Accepted for signature compatibility; not used here.
            **kwargs: Must contain ``message``, the sentence to translate.

        Returns:
            The content of the last message of the conversation with
            ``recipient``.
        """
        self.is_start = True
        self._prepare_chat(recipient, True)
        # ``history`` defaults to None; iterating it directly raised TypeError.
        for msg in history or []:
            self._append_oai_message(
                message=msg['content'],
                role=msg['role'],
                conversation_id = recipient
            )
        print(len(self.chat_messages_for_summary(recipient)))
        try:
            prompt = self.PROMPTS + kwargs['message']
            self.send(prompt, recipient, silent=silent)
        except BadRequestError as e:
            print("error information: {}".format(str(e)))
        finally:
            # Always leave start mode, otherwise receive() would keep
            # returning early after an unexpected error.
            self.is_start = False
        recipient.reset()
        print(self.chat_messages_for_summary(recipient))
        return self.chat_messages_for_summary(recipient)[-1]['content']

    def receive(
        self,
        message: Union[Dict, str],
        sender: Agent,
        request_reply: Optional[bool] = None,
        silent: Optional[bool] = False,
    ):
        """Store an incoming message and relay it when it comes from ``before``."""
        self._process_received_message(message, sender, silent)
        if request_reply is False or request_reply is None and self.reply_at_receive[sender] is False:
            return
        if self.is_start:
            return
        if sender != self.before:
            return

        judge_messages = sender.chat_messages_for_summary(self)
        if len(judge_messages) > 1:
            # Follow-up message: let this agent produce the next prompt.
            reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)
        else:
            # First message: forward it verbatim, after replaying the stored
            # history into the conversation with `after`.
            reply = judge_messages[0]['content']
            print("Self.history: " + str(len(self.history)))
            for msg in self.history:
                self._append_oai_message(
                    message=msg['content'],
                    role=msg['role'],
                    conversation_id = self.after
                )
        if reply is None:
            return

        self.send(reply, self.after, silent=silent, request_reply=True)

        # Cut the last message of the conversation with `after` at the first
        # occurrence of 'Note'; non-string content is left untouched.
        last_message = self._oai_messages[self.after][-1]
        content = last_message['content']
        note_at = content.find('Note') if isinstance(content, str) else -1
        if note_at != -1:
            last_message['content'] = content[:note_at].strip()

        self.history = self.chat_messages_for_summary(self.after)
        self.send(self.chat_messages_for_summary(self.after)[-1]['content'], self.before, silent=silent)


In [85]:
# The three agents of the pipeline. Judge and LLM never talk to each other
# directly: PromptGenerator is constructed with Judge as `before` and LLM as
# `after`.

# Plain assistant backed by llm_config.
LLM = autogen.AssistantAgent(
    name="LLM",
    system_message="You are a helpful assistant",
    code_execution_config=False,
    llm_config=llm_config,
)

# Judge is created without an llm_config.
Judge = JudgeProxyAgent(
    name="Judge",
    system_message="You are an advisor",
    code_execution_config=False,
)

# Relay between Judge and LLM, backed by the same llm_config as LLM.
PromptGenerator = PromptAssistant(
    before=Judge,
    after=LLM,
    name="PromptGenerator",
    system_message="You are a prompt engineer",
    human_input_mode = "NEVER",
    code_execution_config=False,
    llm_config=llm_config,
)

In [86]:
result = Judge.initiate_chat(
    recipient = PromptGenerator,
    max_turns = 2,
    message = phoMT_dev_envi[4]["question"],
    # answer = phoMT_dev_envi[4]["answer"]
)

Recipient.history: 0
[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.

--------------------------------------------------------------------------------
Self.history: 0
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond

In [55]:
PromptGenerator.history
    # print(Judge.chat_messages(PromptGenerator)[x])

[{'content': 'Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.\n    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.\n    The text:\n    At this time, there have been no reported injuries among the 46 publishers in the two congregations on Great Abaco Island.',
  'role': 'assistant'},
 {'content': 'Tới giờ, chưa có báo cáo về nạn nhân thương tật nào trong số 46 nhà xuất bản tại hai giáo xứ trên đảo Abaco Đại.',
  'role': 'user'},
 {'content': 'Analysis:\n\n1. Main difference: Tới giờ (At this time) is translated differently between the two sentences.\n2. Contextual differences: The expected translation (Theo báo cáo đến thời điểm hiện tại...) sets the context by specifyi

In [56]:
PromptGenerator.chat_messages_for_summary(LLM)

[]

In [15]:
print(result)

Căn cứ vào báo cáo, trong số 46 nhà xuất bản thuộc hai giáo xứ trên đảo Great Abaco thì chưa có nạn nhân thương tật nào được báo cáo.


In [82]:
# Ad-hoc check of a single dev example.
# NOTE(review): judge_result is assigned but not used in this cell; the score
# below is computed for `result`, which must come from an earlier cell.
judge_result = "Tới thời điểm hiện nay, không có tin tức về người bị thương trong 46 Nhà xuất bản thuộc hai giáo hội trên đảo Abaco lớn."
get_score(phoMT_dev_envi[4]["question"], result, phoMT_dev_envi[4]["answer"])

9.067805671339572

In [17]:
# result = PromptGenerator.initiate_chat(
#     recipient = LLM,
#     max_turns = 2,
#     message = phoMT_dev_envi[4]["question"],
#     history = PromptGenerator.history,
#     clear_history = False
# )

In [65]:
def score_translate(message, answer, turns = 1, history = None):
    """Translate ``message`` with the reference visible to the judge and score it.

    Args:
        message: Sentence to translate.
        answer: Reference translation; given to the judge and used for scoring.
        turns: The chat is started with ``max_turns = turns + 1``.
        history: Optional messages forwarded as ``history``; when ``None`` the
            keyword is not passed at all.

    Returns:
        The value of ``get_score`` for the string form of the chat result.
    """
    chat_kwargs = {
        "recipient": PromptGenerator,
        "max_turns": turns + 1,
        "message": message,
        "answer": answer,
    }
    if history is not None:
        chat_kwargs["history"] = history
    translation = Judge.initiate_chat(**chat_kwargs)
    return get_score(message, str(translation), answer)

def score_translate_test(message, answer, turns = 1, history = None):
    """Translate ``message`` without showing the reference to the judge, then score it.

    Args:
        message: Sentence to translate.
        answer: Reference translation; used only for the final score.
        turns: The chat is started with ``max_turns = turns + 1``.
        history: Optional messages forwarded as ``history``; when ``None`` the
            keyword is not passed at all.

    Returns:
        The value of ``get_score`` for the string form of the chat result.
    """
    chat_kwargs = {
        "recipient": PromptGenerator,
        "max_turns": turns + 1,
        "message": message,
    }
    if history is not None:
        chat_kwargs["history"] = history
    translation = Judge.initiate_chat(**chat_kwargs)
    return get_score(message, str(translation), answer)

### Agent pairing

In [59]:
# score_dev = []
# for i in range(10):
#     x = phoMT_dev_envi[i]
#     score_dev.append(score_translate(x['question'], x['answer'], 0))
# Score the first 20 test pairs with score_translate (turns=0, no history).
score_test = []
for position in range(20):
    sample = phoMT_test_envi[position]
    sample_score = score_translate(sample['question'], sample['answer'], 0)
    score_test.append(sample_score)

Recipient.history: 0
[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
Self.history: 0
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the

In [20]:
# numpy.average(score_dev)

In [21]:
# score_dev

In [60]:
numpy.average(score_test)

19.641160944550357

In [61]:
score_test

[7.439820585622744,
 14.252530659248798,
 20.781018774386045,
 5.429471835711659,
 38.62047189132909,
 31.27670021100431,
 20.057642372408086,
 5.643841729252477,
 14.568396616958381,
 18.096701954268017,
 29.352202619333035,
 10.336477939542148,
 9.301004337806368,
 7.67066493467495,
 37.64306315047608,
 12.6254971485354,
 11.005096745083616,
 22.621905400530046,
 46.06894414936015,
 30.03176583547575]

## Improve

In [24]:
len(phoMT_train_envi)

2978000

In [66]:
# Run the judge-guided translation on the first `batch` training pairs and
# collect the messages PromptGenerator exchanged in each run.
# NOTE(review): EPOCH and `result` are assigned but not read in this cell.
EPOCH = 1
batch = 500
turns = 1
# optimizer_model = "gpt-4-1106-preview"
# optimizer = AgentOptimizer(max_actions_per_step=3, llm_config=llm_config, optimizer_model="llama3")
history_recorder = []
for index in range(batch):
    query = phoMT_train_envi[index]
    # is_correct = user_proxy.initiate_chat(assistant, answer=query["answer"], problem=query["question"])
    result = score_translate(query['question'], query['answer'], turns)
    # PromptGenerator.history is read right after the chat for this pair.
    history = PromptGenerator.history
    print(f"Test: {index}")
    # All per-pair histories are concatenated into one flat list of messages.
    history_recorder.extend(history)
    print(len(history_recorder))

Recipient.history: 0
[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    It begins with a countdown.

--------------------------------------------------------------------------------
Self.history: 0
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up rema

In [67]:
len(history_recorder)

1985

## Compare

In [68]:
# Score the first 20 test pairs again, this time passing the collected
# history and hiding the reference from the judge (turns=0).
score_test = []
for position in range(20):
    sample = phoMT_test_envi[position]
    sample_score = score_translate_test(
        sample['question'], sample['answer'], 0, history_recorder
    )
    score_test.append(sample_score)

Recipient.history: 1985
[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
Self.history: 1985
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only wi

In [69]:
numpy.average(score_test)

16.49081157391174

In [70]:
score_test

[21.01215804453097,
 4.716217301670693,
 27.337878709888287,
 4.948724401040942,
 30.338500722781674,
 10.048426812212309,
 9.646519637281893,
 8.8186209989258,
 12.787395553510192,
 12.48740514218606,
 25.58082003102854,
 5.860903545110117,
 0.5260879113815321,
 5.173183782924148,
 34.03600247443673,
 17.509131039045975,
 6.898021911491362,
 16.428949863171198,
 46.14734996195437,
 29.51393363366204]

In [71]:
# Same evaluation as the previous run with the collected history, but with
# turns=1.
score_test = []
for position in range(20):
    sample = phoMT_test_envi[position]
    sample_score = score_translate_test(
        sample['question'], sample['answer'], 1, history_recorder
    )
    score_test.append(sample_score)

Recipient.history: 1985
[33mJudge[0m (to PromptGenerator):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only with the requested output. Do not include any explanations, introductions, follow-up remarks, or additional feedback. Provide exactly and only what is specified in the task.
    The text:
    Brother Albert Barnett and his wife, Sister Susan Barnett, from the West Congregation in Tuscaloosa, Alabama

--------------------------------------------------------------------------------
Self.history: 1985
[33mPromptGenerator[0m (to LLM):

Translate a sentence from English to Vietnamese. Produce an accurate, context-sensitive translation that maintains the tone and meaning of the original sentence. Ensure that the output sounds natural for native Vietnamese speakers.
    Respond only wi

In [72]:
numpy.average(score_test)

12.531904147475226

In [73]:
score_test

[2.8842948234904697,
 1.488426036027701,
 2.6188245697609496,
 3.32694901571206,
 9.126696715633766,
 22.500095738124404,
 9.48633252679611,
 8.488986195284015,
 16.918917678913147,
 18.834313314190524,
 12.909540549433984,
 4.1202784939919095,
 8.78497559398328,
 10.8043996762779,
 19.963051283534003,
 27.19326877457978,
 5.874867257930802,
 21.955627460695332,
 17.817371610898217,
 25.540865634246146]