In [1]:
import base64
import json
import mimetypes
import os

import jinja2
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load variables from a local .env file (if present) into the process
# environment before the API client is constructed.
load_dotenv()

# Single client instance shared by every API call in this notebook. It is
# built with no explicit arguments — presumably the API key comes from the
# environment populated above (confirm against your .env).
openai_client = OpenAI()

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def call_4o(prompt, image_path=None):
    """Send one user message (text, optionally with an image) and return the reply.

    Note that despite the function name, the request is made against the
    ``gpt-4o-mini`` model.

    Args:
        prompt: Text part of the user message.
        image_path: Optional path to a local image file. When truthy, the
            file is base64-encoded and attached to the message as a
            ``data:`` URL in an ``image_url`` content part.

    Returns:
        The ``message.content`` of the first choice of the completion.
    """
    content = [{"type": "text", "text": prompt}]
    if image_path:
        # Label the data URL with the MIME type implied by the file extension
        # (e.g. image/png for .png) instead of always claiming JPEG. Unknown
        # or non-image extensions keep the previous "image/jpeg" label.
        mime_type, _ = mimetypes.guess_type(str(image_path))
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        base64_image = encode_image(image_path)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{base64_image}",
            },
        })
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": content
        }]
    )
    return completion.choices[0].message.content

def call_4o_text_only(prompt):
    """Send ``prompt`` as a single text-only user message and return the reply.

    The request goes to the ``gpt-4o-mini`` model through the shared
    ``openai_client``; the return value is the ``message.content`` of the
    first choice.
    """
    user_message = {"role": "user", "content": prompt}
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [2]:
import json

eval_json_path = "valid_contents.json"
train_json_path = "train_contents.json"


def _load_caption_dataset(json_path, split_dir):
    """Load one ``{custom_id: caption}`` JSON file and build its record list.

    Args:
        json_path: Path of the JSON file to read.
        split_dir: Directory name of the split under ``data/crowdai/``
            (``"train"`` or ``"val"``), used to build each image path.

    Returns:
        A ``(contents, records)`` tuple: the raw mapping loaded from the
        file, and one ``{"custom_id", "image_path", "caption"}`` dict per
        entry, in the mapping's iteration order.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(json_path, "r", encoding="utf-8") as f:
        contents = json.load(f)

    records = [
        {
            "custom_id": custom_id,
            "image_path": f"data/crowdai/{split_dir}/images/{custom_id}.jpg",
            "caption": caption,
        }
        for custom_id, caption in contents.items()
    ]
    return contents, records


eval_contents, valid_dataset = _load_caption_dataset(eval_json_path, "val")
train_contents, train_dataset = _load_caption_dataset(train_json_path, "train")

In [3]:
import re
from tqdm import tqdm
from typing import List, Dict, Any

def parse_caption(dataset: List[Dict[str, Any]]):
    """Split ``dataset`` by whether indexed descriptions can be parsed from the caption.

    Every ``<digits><colons/whitespace><text up to end of line>`` occurrence
    in ``item["caption"]`` is collected into a ``{int index: stripped text}``
    dict. If the same index occurs more than once, the last occurrence wins.

    NOTE(review): the pattern is not anchored to the start of a line, so a
    number in the middle of a sentence (followed by a space or colon) is
    also picked up as an index.

    Args:
        dataset: Records that each have a string under the ``"caption"`` key.

    Returns:
        A ``(good_dataset, dirty_dataset)`` tuple. ``good_dataset`` holds a
        shallow copy of each record that produced at least one match, with
        the parsed dict added under ``"descriptions"``. ``dirty_dataset``
        holds the original records that produced no match.
    """
    # Loop-invariant, so build it once instead of once per item.
    pattern = re.compile(r"(\d+)[:\s]+([^\n]+)")

    good_dataset = []
    dirty_dataset = []
    for item in tqdm(dataset):
        descriptions = {
            int(idx): desc.strip()
            for idx, desc in pattern.findall(item["caption"])
        }
        if not descriptions:
            dirty_dataset.append(item)
            continue
        # Copy rather than write into the caller's record: mutating the
        # shared input would silently leak a "descriptions" key into the
        # source dataset that later cells still read.
        good_dataset.append({**item, "descriptions": descriptions})

    return good_dataset, dirty_dataset

# Run the regex-based parser over the training records; records whose caption
# yields no indexed description are returned separately as dirty_dataset.
good_dataset, dirty_dataset = parse_caption(train_dataset)

100%|██████████| 8366/8366 [00:00<00:00, 225657.70it/s]


In [None]:
# Peek at the first unparsed record and the total count. The guard keeps the
# cell from raising IndexError when every caption was parsed successfully.
(dirty_dataset[0] if dirty_dataset else None), len(dirty_dataset)

In [None]:
import jinja2
from tqdm import tqdm
import re

# Prompt text for the caption clean-up step. `{{input_prompt}}` is a Jinja2
# placeholder that is filled with one raw caption per request via
# `template.render(input_prompt=...)`.
# NOTE(review): despite the name, the rendered text is sent as the *user*
# message (see call_4o_text_only), not as a system message; "SYSTEMP" also
# looks like a typo for "SYSTEM".
# NOTE(review): the ```json fence opened in the "Output Format" section is
# never closed, and the example object contains a literal "..." line, so the
# example itself is not valid JSON.
# NOTE(review): the same constant is defined again, verbatim, in the batch cell.
SYSTEMP_PROMPT = """You are a description collator. Please process the input according to the following rules:

# Task
Extract instance indexes and their corresponding descriptions from the input text.  
The instance index is an integer, and it can appear anywhere near the description, possibly surrounded by brackets, parentheses, or separated by spaces, dashes, colons, etc.  
You must accurately associate each instance index with its corresponding description, regardless of formatting inconsistencies.

# Input
{{input_prompt}}

# Processing Rules
1. For each instance, extract:
   - The instance index (an integer).
   - The associated description text.
2. Rephrase each description into a **noun phrase** that starts with "**a building**" or "**the building**".
   - If the original description already starts with "a building" or "the building", keep it.
   - If it does not, rephrase it naturally so that it does.
3. Focus strictly on spatial location, structure, or appearance.  
   Do not add new information, actions, or interpretations.
4. Ignore all irrelevant symbols, formatting inconsistencies, and line breaks in the input.

# Output Format
Return the results in **strict JSON format**, where:
- Each **key** is the instance index (as a string).
- Each **value** is the cleaned and corrected noun phrase.

The JSON structure must look like:
```json
{
    "0": "[Noun phrase for instance 0]",
    "1": "[Noun phrase for instance 1]",
    "2": "[Noun phrase for instance 2]",
    ...
}
"""

# Compile the prompt once; each request renders it with its own caption.
template = jinja2.Template(SYSTEMP_PROMPT)

def _extract_json_object(response):
    """Return the JSON object embedded in a model reply, or ``{}`` if none parses.

    The span from the first ``{`` to the last ``}`` in ``response`` is taken
    as the candidate JSON text, which tolerates surrounding prose or code
    fences in the reply.
    """
    # The reply content may be None; treat that the same as "no JSON found".
    json_match = re.search(r'\{.*\}', response or "", re.DOTALL)
    if not json_match:
        return {}
    try:
        return json.loads(json_match.group())
    except json.JSONDecodeError:
        # One malformed reply must not abort a run of thousands of API calls;
        # record the item with empty descriptions instead.
        return {}


def _clean_dataset(dataset, output_path):
    """Clean every caption in ``dataset`` with one model call each and save the result.

    Args:
        dataset: Records that each have a ``"caption"`` key.
        output_path: Path of the JSON file the cleaned records are written to.

    Returns:
        A list with one copy of each input record, with the parsed model
        reply stored under ``"descriptions"`` (``{}`` when no JSON object
        could be extracted).
    """
    cleaned = []
    for item in tqdm(dataset):
        prompt = template.render(input_prompt=item["caption"])
        response = call_4o_text_only(prompt)
        cleaned.append({
            **item,
            "descriptions": _extract_json_object(response)
        })

    with open(output_path, "w") as f:
        json.dump(cleaned, f)
    return cleaned


new_train_dataset = _clean_dataset(train_dataset, "new_train_dataset.json")
new_valid_dataset = _clean_dataset(valid_dataset, "new_valid_dataset.json")

# Build and submit Batch API requests

In [6]:
# Skeleton of one request line for the batch input file. "custom_id" and the
# user message "content" are left empty here and are filled in per dataset
# item when the JSONL files are written.
BATCH_REQUEST = dict(
    custom_id="",
    method="POST",
    url="/v1/chat/completions",
    body=dict(
        model="gpt-4o-mini",
        messages=[dict(role="user", content="")],
        max_tokens=1000,
    ),
)

# Prompt text used for every batch request. `{{input_prompt}}` is a Jinja2
# placeholder filled with one raw caption via `template.render(input_prompt=...)`.
# NOTE(review): this rebinds SYSTEMP_PROMPT (and `template` below) with the
# same text an earlier cell already defines; keep the two copies in sync or
# define the prompt once.
# NOTE(review): despite the name, the rendered text is placed in the "user"
# message of BATCH_REQUEST; "SYSTEMP" also looks like a typo for "SYSTEM".
# NOTE(review): the ```json fence opened in the "Output Format" section is
# never closed, and the example object contains a literal "..." line.
SYSTEMP_PROMPT = """You are a description collator. Please process the input according to the following rules:

# Task
Extract instance indexes and their corresponding descriptions from the input text.  
The instance index is an integer, and it can appear anywhere near the description, possibly surrounded by brackets, parentheses, or separated by spaces, dashes, colons, etc.  
You must accurately associate each instance index with its corresponding description, regardless of formatting inconsistencies.

# Input
{{input_prompt}}

# Processing Rules
1. For each instance, extract:
   - The instance index (an integer).
   - The associated description text.
2. Rephrase each description into a **noun phrase** that starts with "**a building**" or "**the building**".
   - If the original description already starts with "a building" or "the building", keep it.
   - If it does not, rephrase it naturally so that it does.
3. Focus strictly on spatial location, structure, or appearance.  
   Do not add new information, actions, or interpretations.
4. Ignore all irrelevant symbols, formatting inconsistencies, and line breaks in the input.

# Output Format
Return the results in **strict JSON format**, where:
- Each **key** is the instance index (as a string).
- Each **value** is the cleaned and corrected noun phrase.

The JSON structure must look like:
```json
{
    "0": "[Noun phrase for instance 0]",
    "1": "[Noun phrase for instance 1]",
    "2": "[Noun phrase for instance 2]",
    ...
}
"""

# Compile the prompt once; each batch request renders it with its own caption.
template = jinja2.Template(SYSTEMP_PROMPT)

def _write_batch_requests(dataset, output_path):
    """Write one batch request line per record of ``dataset`` to ``output_path``.

    Each line is ``BATCH_REQUEST`` serialised as JSON, with ``custom_id`` set
    to the record's ``"custom_id"`` and the user message content set to the
    prompt rendered from the record's ``"caption"``.

    Args:
        dataset: Records that each have ``"custom_id"`` and ``"caption"`` keys.
        output_path: Path of the JSONL file to (re)create.
    """
    base_body = BATCH_REQUEST["body"]
    base_message = base_body["messages"][0]

    # Open once in "w" mode: with "a", re-running the cell appended a second
    # copy of every request, producing duplicate custom_ids in the input file.
    with open(output_path, "w") as f:
        for item in tqdm(dataset):
            prompt = template.render(input_prompt=item["caption"])
            # Build a fresh request per item instead of mutating the shared
            # BATCH_REQUEST skeleton; key order matches the skeleton's.
            request = {
                **BATCH_REQUEST,
                "custom_id": item["custom_id"],
                "body": {
                    **base_body,
                    "messages": [{**base_message, "content": prompt}],
                },
            }
            f.write(json.dumps(request) + "\n")


_write_batch_requests(train_dataset, "train_clean_batch_request.jsonl")
_write_batch_requests(valid_dataset, "valid_clean_batch_request.jsonl")

100%|██████████| 8366/8366 [00:00<00:00, 26943.52it/s]
100%|██████████| 1820/1820 [00:00<00:00, 29171.41it/s]


In [7]:
# Upload the train request file for use by the batch endpoint. The context
# manager closes the file handle once the upload call returns; previously the
# handle opened inline was never closed.
with open("train_clean_batch_request.jsonl", "rb") as request_file:
    train_batch_request_file = openai_client.files.create(
        file=request_file,
        purpose="batch"
    )
print(train_batch_request_file.id)

# Create the batch job over the uploaded file.
# NOTE(review): the metadata description does not describe this job (it is a
# caption clean-up, not a nightly eval) — consider updating it.
train_batch_job = openai_client.batches.create(
    input_file_id=train_batch_request_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "nightly eval job"
    }
)
print(f"train batch job: {train_batch_job.id}")

file-1YRsLzckdtPMSTv6NCQA1d
train batch job: batch_6810a854237481909beda6fab3554932


In [8]:
# Upload the validation request file for use by the batch endpoint. The
# context manager closes the file handle once the upload call returns;
# previously the handle opened inline was never closed.
with open("valid_clean_batch_request.jsonl", "rb") as request_file:
    valid_batch_request_file = openai_client.files.create(
        file=request_file,
        purpose="batch"
    )
print(valid_batch_request_file.id)

# Create the batch job over the uploaded file.
# NOTE(review): the metadata description does not describe this job (it is a
# caption clean-up, not a nightly eval) — consider updating it.
valid_batch_job = openai_client.batches.create(
    input_file_id=valid_batch_request_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "nightly eval job"
    }
)
print(f"valid batch job: {valid_batch_job.id}")

file-QJX3gtEbngUDK34d6dAq1K
valid batch job: batch_6810a859f2ac81909497d9acaddaea49


# Check batch job status

In [10]:
# Look both batch jobs up again by id (the ids printed when the jobs were
# submitted) and print the current status of each, train first.
train_clean_batch_job = openai_client.batches.retrieve(
    "batch_6810a854237481909beda6fab3554932"
)
valid_clean_batch_job = openai_client.batches.retrieve(
    "batch_6810a859f2ac81909497d9acaddaea49"
)

for batch_job in (train_clean_batch_job, valid_clean_batch_job):
    print(batch_job.status)

in_progress
in_progress


# Download cleaned batch outputs

In [None]:
def _fetch_batch_output(batch_job):
    """Download the output file of ``batch_job``.

    Raises:
        RuntimeError: If the job object has no ``output_file_id`` yet, e.g.
            because it was retrieved while the batch was still in progress.
            Re-run the status cell to refresh the job object, then retry.
    """
    if batch_job.output_file_id is None:
        raise RuntimeError(
            f"batch {batch_job.id} has no output file yet "
            f"(status: {batch_job.status})"
        )
    return openai_client.files.content(batch_job.output_file_id)


# The leftover `pdb.set_trace()` was removed: it halted the cell (and any
# Restart & Run All) waiting for debugger input.
train_clean_output = _fetch_batch_output(train_clean_batch_job)
valid_clean_output = _fetch_batch_output(valid_clean_batch_job)

train_clean_output_text = train_clean_output.text
valid_clean_output_text = valid_clean_output.text

output_dir = "output_jsonl"
os.makedirs(output_dir, exist_ok=True)

# Write with an explicit encoding so non-ASCII text in the model replies does
# not depend on the platform's default text encoding.
with open(f"{output_dir}/train_clean_output.jsonl", "w", encoding="utf-8") as f:
    f.write(train_clean_output_text)

with open(f"{output_dir}/valid_clean_output.jsonl", "w", encoding="utf-8") as f:
    f.write(valid_clean_output_text)