In [2]:
import os
import openai_async
import pandas as pd
import asyncio
import random

In [3]:
# The API key is read from the environment; a missing variable raises KeyError here.
api_key = os.environ["OPENAI_API_KEY"]
# Candidate judge models; run_evaluation picks one of them at random per row.
available_models = [
    "gpt-3.5-turbo-16k-0613",
    "gpt-3.5-turbo-16k",
]

In [11]:
def format_prompt(original: str, translations: list[str]) -> str:
    """Build the user prompt asking which of three translations is best.

    The prompt consists of a fixed instruction line, the original sentence,
    and the candidate translations numbered from 1, each on its own line.

    Args:
        original: The source sentence.
        translations: Exactly three candidate translations, in display order.

    Returns:
        The prompt as a single newline-joined string.

    Raises:
        AssertionError: If ``translations`` does not contain exactly three items.
    """
    assert len(translations) == 3
    instruction = "You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Print 1, 2 or 3."
    # Options are numbered from 1 to match the "Print 1, 2 or 3" instruction.
    numbered = [
        f"{position}) {candidate}"
        for position, candidate in enumerate(translations, 1)
    ]
    return "\n".join([instruction, original, *numbered])


async def select(model: str, original: str, translations: list[str]) -> int | None:
    """Ask ``model`` which translation is best and return its 0-based index.

    Sends a chat-completion request whose user message is built by
    ``format_prompt`` and interprets the first non-blank character of the
    reply as the 1-based number of the chosen translation.

    Args:
        model: Model name placed in the request payload.
        original: The source sentence.
        translations: Candidate translations, in the order shown to the model.

    Returns:
        The 0-based index into ``translations`` of the chosen candidate, or
        ``None`` if the response could not be interpreted as a valid choice
        (a diagnostic line is printed in that case).

    Note:
        Only response parsing is guarded; exceptions raised by the request
        itself propagate to the caller.
    """
    completion = await openai_async.chat_complete(
        api_key=api_key,
        timeout=60,
        payload={
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a native Russian speaker who is also completely fluent in English.",
                },
                {"role": "user", "content": format_prompt(original, translations)},
            ],
        },
    )
    try:
        content = completion.json()["choices"][0]["message"]["content"]
        # Only the first non-blank character is inspected so that replies which
        # repeat the whole option (e.g. "2) <translation>") are still accepted,
        # and leading whitespace/newlines do not cause a spurious failure.
        result = int(content.lstrip()[:1])
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let out-of-range indices escape.
        if not 1 <= result <= len(translations):
            raise ValueError(f"choice {result} is outside 1..{len(translations)}")
        return result - 1
    except Exception as e:
        # Best-effort by design: an unusable reply must not abort the batch.
        print(f"select: no valid choice from {model!r}: {e!r}")
        return None

In [12]:
# Sanity check: render the prompt for a toy example to inspect its layout.
print(format_prompt("Hello, world!", ["Привет, мир!", "Здравствуй, мир.", "Hi!"]))

You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Print 1, 2 or 3.
Hello, world!
1) Привет, мир!
2) Здравствуй, мир.
3) Hi!


In [25]:
async def run_evaluation(
    dataset: list[tuple[object, pd.Series]],
) -> tuple[list[int | None], list[list[str]], list[str]]:
    """Run ``select`` concurrently over every row of ``dataset``.

    For each row a judge model is drawn at random from ``available_models``
    and the three translation columns are presented in a random order.

    Args:
        dataset: ``(index, row)`` pairs, e.g. ``list(df.iterrows())``. Each
            row must provide the keys ``"sentence"``, ``"google"``,
            ``"deepl"`` and ``"yandex"``.

    Returns:
        Three lists aligned with ``dataset``:
        the value returned by ``select`` for the row (0-based position within
        that row's shuffled labels, or ``None`` on failure), the shuffled
        label order shown to the model, and the model name used.
    """

    async def task(idx: int, row: pd.Series) -> tuple[int | None, list[str], str]:
        # Each task returns its own result instead of writing into shared,
        # sentinel-filled lists.
        chosen_model = random.choice(available_models)
        order = ["google", "deepl", "yandex"]
        # Shuffle the presentation order independently for every row.
        random.shuffle(order)
        original = row["sentence"]
        translations = [row[label] for label in order]
        print("starting select", idx)
        picked = await select(chosen_model, original, translations)
        print("done with select", idx)
        return picked, order, chosen_model

    # gather() returns results in the order the coroutines were passed in,
    # so the output lists stay aligned with `dataset`.
    results = await asyncio.gather(
        *[task(idx, row) for idx, (_df_idx, row) in enumerate(dataset)]
    )
    choice = [picked for picked, _order, _model in results]
    labels = [order for _picked, order, _model in results]
    model = [chosen_model for _picked, _order, chosen_model in results]
    return choice, labels, model

In [28]:
# Materialise the CSV as a list of (index, row) pairs, the shape run_evaluation unpacks.
dataset = list(pd.read_csv("data/dataset.csv").iterrows())
# Evaluate only the first 10 rows (top-level await, as supported in notebook cells).
# NOTE(review): the recorded output below shows only tasks 0-2 — confirm whether the
# CSV has just three rows or the saved output is stale.
choice, labels, model = await run_evaluation(dataset[:10])

starting select 0
starting select 1
starting select 2
done with select 0
done with select 1
done with select 2


In [29]:
# Show the three parallel result lists returned by run_evaluation.
print(choice)
print(labels)
print(model)

[0, 0, 0]
[['deepl', 'yandex', 'google'], ['deepl', 'google', 'yandex'], ['google', 'deepl', 'yandex']]
['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-16k-0613', 'gpt-3.5-turbo-16k']
