In [2]:
import os
import openai_async
import pandas as pd
import asyncio
import random

In [3]:
# OpenAI API key, read from the environment; raises KeyError if OPENAI_API_KEY is unset.
api_key = os.environ["OPENAI_API_KEY"]
# Candidate chat models; run_evaluation picks one of them at random for each request.
available_models = [
    "gpt-3.5-turbo-16k-0613",
    "gpt-3.5-turbo-16k",
]

In [38]:
def format_prompt(original: str, translations: list[str]) -> str:
    """Build the scoring prompt that is sent to the model.

    The prompt is the fixed instruction text, followed by the original
    sentence, followed by the candidate translations numbered ``1)``, ``2)``,
    ``3)`` -- each element on its own line.

    Args:
        original: The original sentence.
        translations: Exactly three candidate translations, in display order.

    Returns:
        The newline-joined prompt text.

    Raises:
        AssertionError: If ``translations`` does not hold exactly three items.
    """
    assert len(translations) == 3
    instruction = "You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Score each one from 1 to 10, print only these three numbers, separated by spaces."
    # Candidates are numbered from 1 so the model can refer to them by position.
    numbered = [f"{number}) {text}" for number, text in enumerate(translations, 1)]
    return "\n".join([instruction, original, *numbered])


async def select(model: str, original: str, translations: list[str]) -> list[int] | None:
    completion = await openai_async.chat_complete(
        api_key=api_key,
        timeout=60,
        payload={
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a native Russian speaker who is also completely fluent in English.",
                },
                {"role": "user", "content": format_prompt(original, translations)},
            ],
        },
    )
    try:
        content = completion.json()["choices"][0]["message"]["content"]
        pieces = content.split()[:3]
        assert len(pieces) == 3
        scores = list(map(int, pieces))
        assert 1 <= min(scores) and max(scores) <= 10
        return scores
    except Exception as e:
        print(e)
        return None

In [36]:
# Sanity check: print the prompt generated for a sample sentence and three candidates.
print(format_prompt("Hello, world!", ["Привет, мир!", "Здравствуй, мир.", "Hi!"]))

You are given an original sentence in English and three candidate translations. Which of them conveys the original meaning in the most accurate and fluent way? Score each one from 1 to 10, print only these three numbers, separated by spaces.
Hello, world!
1) Привет, мир!
2) Здравствуй, мир.
3) Hi!


In [42]:
async def run_evaluation(dataset: list[pd.Series]) -> tuple[list[int], list[list[str]], list[str]]:
    n = len(dataset)
    scores = [[] for i in range(n)]
    labels = [[] for i in range(n)]
    model = ["" for i in range(n)]

    async def task(idx: int, row: pd.Series) -> None:
        model[idx] = random.choice(available_models)
        labels[idx] = ["google", "deepl", "yandex"]
        random.shuffle(labels[idx])
        original = row["sentence"]
        translations = [row[label] for label in labels[idx]]
        print("starting select", idx)
        scores[idx] = await select(model[idx], original, translations)
        print("done with select", idx)

    await asyncio.gather(*[task(idx, row) for idx, (_df_idx, row) in enumerate(dataset)])
    return scores, labels, model

In [49]:
# Read the CSV and materialise DataFrame.iterrows() as a list of (index, row) pairs.
dataset = list(pd.read_csv("data/dataset.csv").iterrows())
# Evaluate only the slice [50:100] of those pairs. The top-level `await` is valid
# here only because IPython/Jupyter supports awaiting directly in a cell.
scores, labels, model = await run_evaluation(dataset[50:100])

starting select 0
starting select 1
starting select 2
starting select 3
starting select 4
starting select 5
starting select 6
starting select 7
starting select 8
starting select 9
starting select 10
starting select 11
starting select 12
starting select 13
starting select 14
starting select 15
starting select 16
starting select 17
starting select 18
starting select 19
starting select 20
starting select 21
starting select 22
starting select 23
starting select 24
starting select 25
starting select 26
starting select 27
starting select 28
starting select 29
starting select 30
starting select 31
starting select 32
starting select 33
starting select 34
starting select 35
starting select 36
starting select 37
starting select 38
starting select 39
starting select 40
starting select 41
starting select 42
starting select 43
starting select 44
done with select 1
done with select 23
done with select 8
done with select 33
done with select 0
done with select 43
done with select 2
done with select 40

In [51]:
# Emit one comma-separated line per evaluated row: the scores, then the
# provider labels in the order their translations were shown to the model,
# then the model name.
for c, l, m in zip(scores, labels, model):
    if c is None:
        # select() produced no scores for this row. Print an all-empty
        # seven-field placeholder so the line count still matches the input.
        # This explicit check replaces a bare `except:`, which would also have
        # hidden KeyboardInterrupt and genuine bugs.
        print(",,,,,,")
        continue
    print(*c, *l, m, sep=",")

7,6,9,deepl,google,yandex,gpt-3.5-turbo-16k-0613
9,8,8,deepl,google,yandex,gpt-3.5-turbo-16k-0613
9,7,8,yandex,google,deepl,gpt-3.5-turbo-16k
6,9,7,google,deepl,yandex,gpt-3.5-turbo-16k
9,8,7,google,deepl,yandex,gpt-3.5-turbo-16k-0613
7,8,9,yandex,deepl,google,gpt-3.5-turbo-16k
10,8,10,yandex,google,deepl,gpt-3.5-turbo-16k-0613
8,9,7,yandex,deepl,google,gpt-3.5-turbo-16k
7,9,8,google,yandex,deepl,gpt-3.5-turbo-16k
7,8,5,deepl,google,yandex,gpt-3.5-turbo-16k
8,6,9,google,yandex,deepl,gpt-3.5-turbo-16k-0613
8,10,9,yandex,deepl,google,gpt-3.5-turbo-16k
6,9,8,deepl,google,yandex,gpt-3.5-turbo-16k
7,9,8,yandex,google,deepl,gpt-3.5-turbo-16k
10,9,8,yandex,google,deepl,gpt-3.5-turbo-16k
10,9,8,google,yandex,deepl,gpt-3.5-turbo-16k-0613
9,8,7,deepl,yandex,google,gpt-3.5-turbo-16k-0613
9,7,8,yandex,deepl,google,gpt-3.5-turbo-16k
,,,,,,
9,8,10,deepl,yandex,google,gpt-3.5-turbo-16k-0613
10,8,5,yandex,google,deepl,gpt-3.5-turbo-16k
7,8,9,google,yandex,deepl,gpt-3.5-turbo-16k-0613
5,7,8,google,yand