## Import libraries

In [1]:
import os
import json
import asyncio
from typing import List, Union, Dict, Optional
from openai import OpenAI, AsyncOpenAI

## Pull translated datasets

In [2]:
# Root directory that the pulled outputs are written under; created here if
# it does not already exist.
OUTPUT_DIR = "../data/vietnamese"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def get_structured_output(response: Union[str, dict], key: Optional[str] = None) -> dict:
    """Decode the JSON message content carried by one batch response record.

    Args:
        response: A single response record, either already parsed into a dict
            or still serialized as a JSON string.
        key: Optional top-level key to extract from the decoded content. If it
            is falsy, or not present in the content, the whole decoded content
            is returned instead.

    Returns:
        The decoded content of the first choice's message, or the value stored
        under ``key`` when that key exists.
    """
    record = json.loads(response) if not isinstance(response, dict) else response

    # The message content is itself a JSON document, so it is decoded a second time.
    first_message = record["response"]["body"]["choices"][0]["message"]
    payload = json.loads(first_message["content"])

    if not key:
        return payload
    return payload.get(key, payload)


async def pull_processed_batches(ids: List[str], client: Optional["AsyncOpenAI"] = None, key: Optional[str] = None):
    """Look up a set of batches and download the output of the completed ones.

    Args:
        ids: Batch ids to look up.
        client: Async OpenAI client to use. When omitted, a client is created
            on each call (rather than once when the function is defined).
        key: Optional key forwarded to ``get_structured_output`` for every
            downloaded response.

    Returns:
        A dict with four entries:
            ``failed``: ids of batches that ended without usable output
                (``failed``, ``expired``, ``cancelled``, or ``completed``
                with no output file).
            ``in_progress_ids``: ids of batches that have not reached one of
                the terminal states above, so the caller can keep polling.
            ``success_ids``: ids of completed batches that have an output file.
            ``success_output``: one structured output per response line found
                in the downloaded output files.
    """
    if client is None:
        client = AsyncOpenAI()

    batches = await asyncio.gather(*(client.batches.retrieve(batch_id) for batch_id in ids))

    terminal_failures = {"failed", "expired", "cancelled"}
    failed_ids, success_ids, in_progress_ids = [], [], []
    success_file_ids = []
    for batch in batches:
        if batch.status == "completed" and batch.output_file_id:
            success_ids.append(batch.id)
            success_file_ids.append(batch.output_file_id)
        elif batch.status == "completed" or batch.status in terminal_failures:
            # A completed batch without an output file has nothing to download.
            failed_ids.append(batch.id)
        else:
            # Any non-terminal status (e.g. validating, finalizing) stays in the
            # pending bucket so the id is not silently dropped by the caller.
            in_progress_ids.append(batch.id)

    output_files = await asyncio.gather(*(client.files.content(file_id) for file_id in success_file_ids))

    # Output files are JSONL: parse every non-empty line as its own response
    # instead of feeding the whole file to a single json.loads call.
    success_output = [
        get_structured_output(line, key=key)
        for output_file in output_files
        for line in output_file.text.splitlines()
        if line.strip()
    ]

    return {
        "failed": failed_ids,
        "in_progress_ids": in_progress_ids,
        "success_ids": success_ids,
        "success_output": success_output
    }

In [4]:
def update_json_file(file_path: str, new_data: Union[dict, List[dict]]):
    """
    Append records to the JSON array stored in a file, starting from an empty array when the file is missing.

    Args:
        file_path: Location of the JSON file to read and then rewrite.
        new_data: Either one object (appended as a single element) or a list of objects (each appended in order).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as src:
            records = json.load(src)
    except FileNotFoundError:
        # First write for this path: begin with an empty array.
        records = []

    additions = new_data if isinstance(new_data, list) else [new_data]
    records.extend(additions)

    # The whole array is rewritten; non-ASCII text is kept as-is rather than escaped.
    with open(file_path, "w", encoding="utf-8") as dst:
        json.dump(records, dst, ensure_ascii=False, indent=4)


In [5]:
batch_id_path = "../data/batch_ids.json"
with open(batch_id_path, "r", encoding="utf-8") as f:
    batch_ids = json.load(f)


failed = []
in_progress = []
updated_batch_ids = []
for item in batch_ids:
    dataset_name: str = item["dataset"]
    results = await pull_processed_batches(item["batch_ids"])

    # delete ids
    updated_batch_ids.append({
        "dataset": dataset_name,
        "batch_ids": results["in_progress_ids"]
    })

    # write output into file
    ds_parent, ds_file = dataset_name.split("/")
    os.makedirs(os.path.join(OUTPUT_DIR, ds_parent), exist_ok=True)
    update_json_file(os.path.join(OUTPUT_DIR, dataset_name), results["success_output"])