In [66]:
import os
import openai_async
import pandas as pd
import asyncio
import random
from typing import Any

In [3]:
# OpenAI credential, taken from the environment so it never lands in the
# notebook; raises KeyError right here if OPENAI_API_KEY is not set.
api_key = os.environ["OPENAI_API_KEY"]
# Candidate judge models; run_evaluation picks one of these at random per row.
available_models = [
    "gpt-3.5-turbo-16k-0613",
    "gpt-3.5-turbo-16k",
]

In [38]:
def format_prompt(original: str, translations: list[str]) -> str:
    """Build the scoring prompt for one sentence and its three candidates.

    The prompt consists of a fixed instruction line, the original sentence,
    and then each translation on its own line, prefixed with its 1-based
    position (``"1) ..."``, ``"2) ..."``, ``"3) ..."``).

    Args:
        original: The source sentence.
        translations: Exactly three candidate translations.

    Returns:
        The newline-joined prompt text.

    Raises:
        AssertionError: If ``translations`` does not hold exactly three items.
    """
    assert len(translations) == 3
    instruction = "You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Score each one from 1 to 10, print only these three numbers, separated by spaces."
    lines = [instruction, original]
    for position, candidate in enumerate(translations, 1):
        lines.append(f"{position}) {candidate}")
    return "\n".join(lines)


async def select(model: str, original: str, translations: list[str]) -> list[int] | None:
    """Ask ``model`` to score three candidate translations of ``original``.

    Sends a single chat-completion request and interprets the first three
    whitespace-separated tokens of the reply as integer scores.

    Args:
        model: Model name placed in the request payload.
        original: The source sentence.
        translations: Exactly three candidates, in the order shown to the model.

    Returns:
        Three integers in the range 1..10 as produced by the model (expected
        to follow the order of ``translations``), or ``None`` when the request
        fails or the reply cannot be parsed. In that case the error is printed.

    Raises:
        AssertionError: If ``translations`` does not hold exactly three items
            (raised by ``format_prompt`` before any request is made).
    """
    # Built outside the ``try`` so that a caller bug (wrong number of
    # candidates) still fails loudly instead of being reported as a bad reply.
    prompt = format_prompt(original, translations)
    try:
        # The request itself is inside the ``try``: a timeout or connection
        # error must yield ``None`` like any other failure, not propagate and
        # abort every sibling request awaited alongside this one.
        completion = await openai_async.chat_complete(
            api_key=api_key,
            timeout=60,
            payload={
                "model": model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a native Russian speaker who is also completely fluent in English.",
                    },
                    {"role": "user", "content": prompt},
                ],
            },
        )
        content = completion.json()["choices"][0]["message"]["content"]
        pieces = content.split()[:3]
        # Explicit checks instead of ``assert``: they survive ``python -O``
        # and produce a readable message when printed below.
        if len(pieces) != 3:
            raise ValueError(f"expected 3 scores, got {len(pieces)}: {content!r}")
        scores = [int(piece) for piece in pieces]
        if not all(1 <= score <= 10 for score in scores):
            raise ValueError(f"scores outside 1..10: {scores}")
        return scores
    except Exception as e:
        print(e)
        return None

In [36]:
# Sanity check: show the exact prompt text the model will receive.
print(
    format_prompt(
        original="Hello, world!",
        translations=["Привет, мир!", "Здравствуй, мир.", "Hi!"],
    )
)

You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Score each one from 1 to 10, print only these three numbers, separated by spaces.
Hello, world!
1) Привет, мир!
2) Здравствуй, мир.
3) Hi!


In [42]:
async def run_evaluation(
    dataset: list[tuple[Any, pd.Series]],
) -> tuple[list[list[int] | None], list[list[str]], list[str]]:
    """Score every row of ``dataset`` concurrently with a randomly chosen model.

    For each row a model is drawn from ``available_models`` and the three
    translation columns ("google", "deepl", "yandex") are shuffled before
    being shown to the model, so position carries no information about the
    translation engine.

    Args:
        dataset: ``(index, row)`` pairs; each row must provide the keys
            "sentence", "google", "deepl" and "yandex".

    Returns:
        Three lists aligned with ``dataset``:
        the scores for each row (``None`` if scoring that row failed),
        the engine labels in the order they were presented,
        and the model name used.
    """
    n = len(dataset)
    scores: list[list[int] | None] = [[] for _ in range(n)]
    labels: list[list[str]] = [[] for _ in range(n)]
    model = ["" for _ in range(n)]

    async def task(idx: int, row: pd.Series) -> None:
        model[idx] = random.choice(available_models)
        labels[idx] = ["google", "deepl", "yandex"]
        random.shuffle(labels[idx])
        original = row["sentence"]
        translations = [row[label] for label in labels[idx]]
        print("starting select", idx)
        try:
            scores[idx] = await select(model[idx], original, translations)
        except Exception as e:
            # One failed request must not abort gather() and throw away the
            # results of every other row; record the failure and move on.
            print(f"select failed for row {idx}: {e!r}")
            scores[idx] = None
        print("done with select", idx)

    await asyncio.gather(*(task(idx, row) for idx, (_df_idx, row) in enumerate(dataset)))
    return scores, labels, model

In [None]:
dataset = list(pd.read_csv("data/dataset.csv").iterrows())
scores, labels, model = await run_evaluation(dataset[200:300])

In [None]:
# Emit one CSV line per evaluated row: the scores, then the engine labels in
# the order they were shown to the model, then the model name.
for row_scores, row_labels, row_model in zip(scores, labels, model):
    if row_scores is None:
        # Scoring failed for this row: emit an all-empty 7-column line so the
        # output stays aligned with the dataset rows.
        print(",,,,,,")
    else:
        print(*row_scores, *row_labels, row_model, sep=",")