### JSONL dataset format
```jsonl
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```

In [406]:
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict
from IPython.display import Markdown
from tqdm import tqdm
from copy import deepcopy
from openai import OpenAI

In [407]:
# data prep: load the message table and normalise the id / rank columns
df = pd.read_excel("train.xlsx")
df["message_id"] = df["message_id"].astype(str)
# Missing parent ids become the sentinel string "None"; those rows are the roots below.
df["parent_id"] = df["parent_id"].fillna("None").astype(str)
df["rank"] = df["rank"].fillna(-1).astype(int)
len(df)

# Split rows into roots and replies, then index every reply row by its parent id.
has_parent = df["parent_id"] != "None"
roots = df[~has_parent]
children_map = defaultdict(list)
for _, reply in df[has_parent].iterrows():
    children_map[reply["parent_id"]].append(reply.to_dict())

def build_conversation(node, history=None):
    """Return every root-to-leaf message path that passes through ``node``.

    Args:
        node: Mapping with at least ``"message_id"``, ``"role"`` and ``"text"``.
        history: Messages that precede ``node``; treated as empty when ``None``.
            It is never mutated.

    Returns:
        A list of conversations, one per leaf reachable from ``node``. Each
        conversation is a list of ``{"role", "text"}`` dicts that starts with
        ``history`` and ends at the leaf. Leaves are visited depth-first, in
        the order children appear in ``children_map``.
    """
    prefix = [] if history is None else history
    conversations = []
    # Explicit stack instead of recursion; each entry is (message, path before it).
    pending = [(node, prefix)]
    while pending:
        current, path_before = pending.pop()
        # role = "user" if current["role"] == "prompter" else "assistant"
        path = path_before + [{
            "role": current["role"],
            "text": current["text"],
            # "rank": int(current["rank"]),
        }]

        replies = children_map.get(current["message_id"], [])
        if not replies:  # leaf: the path is a complete conversation
            conversations.append(path)
            continue

        # Push in reverse so the first child is popped (and finished) first.
        for reply in reversed(replies):
            pending.append((reply, path))
    return conversations

# Flatten each root's reply tree into one conversation per root-to-leaf path.
examples = [
    convo
    for _, root in tqdm(roots.iterrows(), total=len(roots))
    for convo in build_conversation(root.to_dict())
]

# One {"messages": [...]} JSON object per line; non-ASCII text is written as-is.
with open("oasst2.jsonl", "w", encoding="utf-8") as out:
    out.writelines(
        json.dumps({"messages": convo}, ensure_ascii=False) + "\n"
        for convo in examples
    )

100%|██████████| 5126/5126 [00:00<00:00, 8705.61it/s] 


In [408]:
# Reload the exported conversations: one parsed JSON record per line.
with open("oasst2.jsonl", "r", encoding="utf-8") as src:
    data: list = list(map(json.loads, src))
# data_copy = deepcopy(data)
len(data)

29234

In [409]:
# x: message lists of conversations whose last turn is not from the assistant.
# y: "req_<n>" -> index of that conversation in `data` (n counts entries of x).
x = []
y = {}
i = 0
for idx, convo in tqdm(enumerate(data)):
    messages = convo['messages']
    if messages[-1]['role'] == 'assistant':
        continue
    y[f"req_{i}"] = idx
    x.append(messages)
    i += 1

29234it [00:00, 2066473.69it/s]


In [410]:
client = OpenAI(api_key=os.getenv("API_KEY"))
# Single source of truth for the model name. Previously this variable held
# 'gpt-4.1-mini' but was never used, while the request body hard-coded
# "gpt-5-mini"; the value below keeps the generated requests unchanged.
model = "gpt-5-mini"
# `{x}` is filled with the message list of one conversation; the doubled
# braces are literal braces in the rendered prompt.
prompt = """This is a multi-turn conversation which is missing the assistant's response at the end. 
Generate an appropriate assistant response STRICTLY in the JSON schema {{'role':'assistant', 'text':'<your response here>'}}
Output only that dictionary/json. DO NOT output any other wordings, greetings etc. Do not generate more than 600 tokens. Here is the convo:
{x} """

# Write one batch request per conversation in `x`. The custom_id "req_<idx>"
# uses the position in `x`, which is how the keys of `y` were numbered.
with open("batch_in.jsonl", "w", encoding="utf-8") as f:
    for idx, miss in enumerate(x):
        line = {
            "custom_id": f"req_{idx}",
            "method": "POST",
            "url": "/v1/responses",
            "body": {
                "model": model,
                "reasoning": {"effort": "minimal"},
                "input": prompt.format(x=miss),
            },
        }

        f.write(json.dumps(line, ensure_ascii=False) + "\n")

# file = client.files.create(file=open("batch_in.jsonl", "rb"), purpose="batch")
# batch = client.batches.create(input_file_id=file.id, endpoint="/v1/responses", completion_window="24h")

# batch_id= batch.id
# batch = client.batches.retrieve(batch_id)
# print(batch.status)
# if batch.status=='completed':
#     out_fid = batch.output_file_id
#     out = client.files.content(out_fid)
#     out.write_to_file("batch_out.jsonl")

In [411]:
# Load the batch results: one parsed JSON response record per line.
with open(r"batch_out.jsonl", 'r', encoding='utf-8') as results_file:
    resp = list(map(json.loads, results_file))

In [412]:
import ast

# Used when a result cannot be parsed. Uses the same "role"/"text" keys as
# every other message in `data` (the previous fallback used "content").
_FALLBACK_REPLY = {"role": "assistant", "text": "I am not sure how to help with that."}


def _parse_assistant_reply(res):
    """Extract the generated assistant message from one batch result record.

    Args:
        res: One parsed line of ``batch_out.jsonl``.

    Returns:
        ``{"role": "assistant", "text": <str>}`` built from the model output.

    Raises:
        KeyError, IndexError, TypeError, AttributeError: the record does not
            have the expected response structure (e.g. a failed request).
        ValueError, SyntaxError: the model text is not a dict literal with a
            string ``"text"`` field.
    """
    output_items = res["response"]["body"]["output"]
    # Locate the message item by type instead of assuming it sits at index 1
    # (after a reasoning item).
    message = next(
        (item for item in output_items if item.get("type") == "message"), None
    )
    if message is None:
        raise ValueError("no message item in response output")
    raw_text = message["content"][0]["text"]

    # The prompt asks for a single-quoted dict, so the model may answer with
    # either JSON or a Python literal. Both are parsed WITHOUT eval(): the
    # text is model-generated and must never be executed.
    try:
        parsed = json.loads(raw_text)
    except ValueError:
        parsed = ast.literal_eval(raw_text)

    if not isinstance(parsed, dict) or not isinstance(parsed.get("text"), str):
        raise ValueError("reply is not a dict with a string 'text' field")
    # Normalise to the two keys used by every other message in the dataset.
    return {"role": "assistant", "text": parsed["text"]}


adds = {}  # req_id -> assistant message to append
failed = []  # req_ids whose result could not be parsed (fallback reply used)
for res in resp:
    req_id = res['custom_id']
    try:
        adds[req_id] = _parse_assistant_reply(res)
    except (AttributeError, IndexError, KeyError, RecursionError,
            SyntaxError, TypeError, ValueError):
        failed.append(req_id)
        # Fresh copy per request so later edits to one message do not alias.
        adds[req_id] = dict(_FALLBACK_REPLY)

In [413]:
# Append each generated reply to the conversation it was requested for.
# Iterating `y` (request id -> index into `data`) replaces the hard-coded
# request count, so this stays correct when the dataset size changes.
for req, idx in y.items():
    messages = data[idx]['messages']
    # Every conversation in `y` originally ended on a non-assistant turn; if
    # it now ends with an assistant turn this cell has already run, so skip
    # it rather than appending the reply a second time.
    if messages[-1]['role'] == 'assistant':
        continue
    messages.append(adds[req])

In [420]:
# Persist the completed conversations, one JSON record per line.
with open("data.jsonl", 'w', encoding='utf-8') as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in data
    )