In [None]:
!pip install -q openai pandas
# Dependencies for the OpenAI API

import os
import json
import random
from pathlib import Path

from openai import OpenAI
import pandas as pd


# API key
# Paste a key between the quotes only for a quick local run. When this is left
# empty, an OPENAI_API_KEY already exported in the environment is kept rather
# than being overwritten with an empty string.
_INLINE_API_KEY = ""
if _INLINE_API_KEY:
    os.environ["OPENAI_API_KEY"] = _INLINE_API_KEY


# Config
CATEGORIES_CSV = Path("categories.synset.csv")
OUT_PATH       = Path("sentences_roles_gpt_v1.jsonl")

# OpenAI model
MODEL_NAME = "gpt-4.1-mini"


# 218 categories * 7 intents * 6 jobs ≈ 9k examples
JOBS_PER_CATEGORY_PER_INTENT = 6

# Number of scenarios sent to the model in a single request
JOBS_PER_BATCH = 20

# Spatial relations that may link a moved object to its reference object
RELATIONS = [
    "near",
    "on_top_of",
    "behind",
    "in_front_of",
    "inside",
    "next_to",
    "between",
    "under",
]

# Intents
INTENTS = [
    "CreateObject",
    "PlaceObject",
    "RotateObject",
    "ResizeObject",
    "DeleteObject",
    "SetRoom",
    "SetMaterial",
]

# Intent descriptions (shown to the model next to each scenario's intent)
INTENT_DESCRIPTION = {
    "CreateObject": "create a new object in the 3D room",
    "PlaceObject": "place or move an existing object relative to another object",
    "RotateObject": "rotate an existing object",
    "ResizeObject": "resize an existing object",
    "DeleteObject": "delete or remove an existing object",
    "SetRoom": "change room-level properties such as size or style",
    "SetMaterial": "change the material or color of an object",
}


# Load the category names from the CSV file

def load_categories_from_csv(csv_path: Path):
    """Return the sorted, de-duplicated category names found in the CSV.

    The fourth column of the file is read; each cell is stripped and
    lower-cased, and only the text before its first comma is kept. Empty
    results and missing cells are ignored.

    Args:
        csv_path: Location of the CSV file (first row is the header).

    Returns:
        Alphabetically sorted list of unique category names.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    table = pd.read_csv(csv_path, header=0)
    words_column = table[table.columns[3]]
    cleaned = words_column.dropna().astype(str).str.strip().str.lower()

    # Only the first comma-separated word of each cell names the category.
    leading_words = (cell.split(",")[0].strip() for cell in cleaned)
    return sorted({word for word in leading_words if word})


def build_generation_jobs(categories):
    """Expand every (category, intent) pair into a shuffled list of job dicts.

    Each pair yields JOBS_PER_CATEGORY_PER_INTENT jobs. CreateObject and
    PlaceObject jobs additionally get a random reference category (different
    from the moved one) and a random relation; for every other intent both
    stay ``None``.

    Args:
        categories: Category names to use as the moved object.

    Returns:
        Shuffled list of dicts with the keys ``intent``, ``moved_category``,
        ``reference_category`` and ``relation``.
    """
    intents_with_reference = ("CreateObject", "PlaceObject")
    jobs = []

    for moved in categories:
        # Candidate reference objects: every category except the moved one.
        other_categories = [name for name in categories if name != moved]

        for intent in INTENTS:
            for _ in range(JOBS_PER_CATEGORY_PER_INTENT):
                reference = None
                relation = None
                if intent in intents_with_reference:
                    reference = random.choice(other_categories)
                    relation = random.choice(RELATIONS)

                jobs.append(
                    {
                        "intent": intent,
                        "moved_category": moved,
                        "reference_category": reference,
                        "relation": relation,
                    }
                )

    random.shuffle(jobs)
    return jobs


# System prompt sent with every batch request. It fixes the JSON schema of the
# records the model must emit (one object per line) and the style rules for the
# generated command text. call_gpt_for_batch parses the reply line by line, so
# the "one JSON object on its own line" instruction matters for parsing.
# NOTE(review): the "other intents" bullet names Rotate/Resize/Delete/SetMaterial
# but not SetRoom, although SetRoom is in INTENTS — confirm this is intended.
SYSTEM_PROMPT = """
You are generating high-quality training data for a text-to-3D command parser.
The assistant controls a 3D room (Unity-like) and understands natural language commands.

For each scenario, you MUST output exactly ONE JSON object on its own line (JSONL).
Do NOT include any explanations or extra text, only raw JSON per line.

JSON SCHEMA (all keys must be present):
{
  "text": str,                      // user command in natural English
  "gold_command": {
    "intent": str,                  // one of: CreateObject, PlaceObject, RotateObject, ResizeObject, DeleteObject, SetRoom, SetMaterial
    "args": {
      "moved_category": str | null, // semantic category of the main object, or null if not applicable
      "reference_category": str | null, // category of the reference object if there is one, else null
      "relation": str | null        // one of: near, on_top_of, behind, in_front_of, inside, next_to, between, under, or null
    }
  },
  "moved_span": str | null,        // the exact noun phrase in the sentence referring to the moved/target object, or null
  "reference_span": str | null     // the exact noun phrase in the sentence referring to the reference object, or null
}

Requirements:
- The "text" should sound like a user talking to a 3D assistant: short, imperative or descriptive commands.
- Respect the requested intent: the meaning of the sentence must match the intent.
- For CreateObject / PlaceObject, almost always use TWO objects and a spatial relation (when relation is not null).
- For other intents (Rotate/Resize/Delete/SetMaterial), usually only one main object, no reference object, relation = null.
- "moved_span" and "reference_span" must appear verbatim inside "text".
- Use natural variety in language: verbs like "place", "put", "move", "drop", "set", "create", "spawn", etc.
- Use different position structures, for example:
    - "X on Y"
    - "on top of Y is X"
    - "X is placed behind Y"
    - "move X away from Y"
    - "put X next to Y"
    - "place X under Y"
- The "relation" field must be consistent with the sentence meaning.
- In about 20% of the sentences, introduce a SMALL typo in either the moved_span or reference_span (e.g., 'cardl', 'lampp').
  The typo should be minor (extra letter, missing letter, swapped letters) but still recognizable.
"""


def build_user_prompt(batch_jobs):
    """Render a batch of jobs as the numbered scenario list sent to the model.

    Args:
        batch_jobs: Job dicts with the keys ``intent``, ``moved_category``,
            ``reference_category`` and ``relation``.

    Returns:
        A header followed by one line per job, numbered from 1. A reference
        category or relation of ``None`` is written as the bare word ``null``.
    """

    def quoted_or_null(value):
        # None becomes a bare null; everything else is single-quoted.
        return "null" if value is None else f"'{value}'"

    header = (
        "Generate one JSON object per scenario below. Use the schema described above.\n"
        "Scenarios:"
    )
    scenario_lines = [
        f"{number}. intent={job['intent']} ({INTENT_DESCRIPTION[job['intent']]}); "
        f"moved_category='{job['moved_category']}'"
        f"; reference_category={quoted_or_null(job['reference_category'])}"
        f"; relation={quoted_or_null(job['relation'])}"
        for number, job in enumerate(batch_jobs, start=1)
    ]
    return "\n".join([header, *scenario_lines])


def call_gpt_for_batch(client, batch_jobs):
    """Ask the model for one JSONL record per job and parse its reply.

    Args:
        client: OpenAI client exposing ``responses.create``.
        batch_jobs: Job dicts that are rendered into the user prompt.

    Returns:
        List of dicts, one per reply line that decoded to a JSON object.
        Lines that cannot be decoded, or that decode to something other than
        an object, are reported on stdout and skipped.
    """
    user_prompt = build_user_prompt(batch_jobs)

    resp = client.responses.create(
        model=MODEL_NAME,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_output_tokens=4096,
    )

    # output_text concatenates every text part of the reply, so it does not
    # depend on the first output item being a plain text message.
    content = resp.output_text

    records = []
    for line in content.splitlines():
        line = line.strip()
        # Skip blank lines and Markdown code fences wrapped around the JSONL.
        if not line or line.startswith("```"):
            continue

        # Drop a leading list number such as "7." or "12)". The prefix may have
        # any number of digits: batches hold more than nine scenarios.
        digits = len(line) - len(line.lstrip("0123456789"))
        if digits and line[digits:digits + 1] in (".", ")"):
            brace = line.find("{")
            if brace != -1:
                line = line[brace:]

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print("WARNING: could not parse line as JSON:", line[:120])
            continue

        # Bare numbers, strings or arrays are valid JSON but not records;
        # writing them out would break consumers that expect a dict per line.
        if not isinstance(obj, dict):
            print("WARNING: unexpected schema:", line[:120])
            continue

        records.append(obj)

    return records


# Main

def main():
    """Generate the dataset batch by batch and stream it to OUT_PATH as JSONL.

    Loads the categories, builds the jobs, sends them to the model in groups
    of JOBS_PER_BATCH and writes every returned record as one JSON line.

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY empty please write your key.")

    client = OpenAI(api_key=api_key)

    print(f"Loading categories from {CATEGORIES_CSV} ...")
    categories = load_categories_from_csv(CATEGORIES_CSV)
    print(f"Total unique categories: {len(categories)}")

    print("Building generation jobs ...")
    jobs = build_generation_jobs(categories)
    print(f"Total jobs (scenarios): {len(jobs)}")

    # Ceiling division, so an exact multiple of JOBS_PER_BATCH is not reported
    # as one batch more than will actually run.
    total_batches = (len(jobs) + JOBS_PER_BATCH - 1) // JOBS_PER_BATCH
    total_written = 0

    # Output file
    with open(OUT_PATH, "w", encoding="utf-8") as fout:
        batch_starts = range(0, len(jobs), JOBS_PER_BATCH)
        for batch_no, start in enumerate(batch_starts, start=1):
            batch_jobs = jobs[start: start + JOBS_PER_BATCH]
            print(
                f"Batch {batch_no} / {total_batches} "
                f"(jobs {start}..{start + len(batch_jobs) - 1})"
            )

            records = call_gpt_for_batch(client, batch_jobs)

            for rec in records:
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total_written += len(records)

            # Push each finished batch to disk so an interrupted run keeps it.
            fout.flush()

    print(f"Done. Total written examples: {total_written}")
    print(f"Output file: {OUT_PATH}")


if __name__ == "__main__":
    main()


Loading categories from categories.synset.csv ...
Total unique categories: 218
Building generation jobs ...
Total jobs (scenarios): 9156
Batch 1 / ~458 (jobs 0..19)
Batch 2 / ~458 (jobs 20..39)
Batch 3 / ~458 (jobs 40..59)
Batch 4 / ~458 (jobs 60..79)
Batch 5 / ~458 (jobs 80..99)
Batch 6 / ~458 (jobs 100..119)
Batch 7 / ~458 (jobs 120..139)
Batch 8 / ~458 (jobs 140..159)
Batch 9 / ~458 (jobs 160..179)
Batch 10 / ~458 (jobs 180..199)
Batch 11 / ~458 (jobs 200..219)
Batch 12 / ~458 (jobs 220..239)
Batch 13 / ~458 (jobs 240..259)
Batch 14 / ~458 (jobs 260..279)
Batch 15 / ~458 (jobs 280..299)
Batch 16 / ~458 (jobs 300..319)
Batch 17 / ~458 (jobs 320..339)
Batch 18 / ~458 (jobs 340..359)
Batch 19 / ~458 (jobs 360..379)
Batch 20 / ~458 (jobs 380..399)
Batch 21 / ~458 (jobs 400..419)
Batch 22 / ~458 (jobs 420..439)
Batch 23 / ~458 (jobs 440..459)
Batch 24 / ~458 (jobs 460..479)
Batch 25 / ~458 (jobs 480..499)
Batch 26 / ~458 (jobs 500..519)
Batch 27 / ~458 (jobs 520..539)
Batch 28 / ~458 (jo

In [None]:
!pip install -q openai

import os
import json
import random
from pathlib import Path

from openai import OpenAI

# API key
# Paste a key between the quotes only for a quick local run. When this is left
# empty, an OPENAI_API_KEY already exported in the environment is kept rather
# than being overwritten with an empty string.
_INLINE_API_KEY = ""
if _INLINE_API_KEY:
    os.environ["OPENAI_API_KEY"] = _INLINE_API_KEY

# Configuration

OUT_PATH = Path("intent_variations_v1.jsonl")

MODEL_NAME = "gpt-4.1-mini"

# 7 intents × 600 = 4200 samples in total
TARGET_PER_INTENT = 600

# Number of scenarios sent to the model in a single request
JOBS_PER_BATCH = 30

# Intents
INTENTS = [
    "CreateObject",
    "PlaceObject",
    "RotateObject",
    "ResizeObject",
    "DeleteObject",
    "SetRoom",
    "SetMaterial",
]

# Intent descriptions (shown to the model next to each scenario's intent)
INTENT_DESCRIPTION = {
    "CreateObject": "create a new object in the 3D room",
    "PlaceObject": "place or move an existing object relative to another object",
    "RotateObject": "rotate an existing object",
    "ResizeObject": "resize an existing object",
    "DeleteObject": "delete or remove an existing object",
    "SetRoom": "change room-level properties such as size, type or style of the room",
    "SetMaterial": "change the color, texture or material of an object",
}

# Sample object names
OBJECT_CATEGORIES = [
    "chair", "desk", "table", "lamp", "bed", "sofa", "bookshelf", "cabinet",
    "monitor", "tv", "cup", "bottle", "plant", "laptop", "keyboard", "mouse",
    "carpet", "picture frame", "door", "window",
]

# Sample room types
ROOM_TYPES = [
    "living room", "bedroom", "office", "kitchen", "dining room",
    "hallway", "studio", "meeting room",
]

# Sample materials, colors and finishes
MATERIAL_TYPES = [
    "wood", "glass", "metal", "plastic", "marble", "fabric",
    "red", "blue", "white", "black", "matte", "shiny",
]

# Sentence styles
STYLE_HINTS = [
    "short_imperative",       # "Create a chair."
    "polite_request",         # "Could you please create a chair near the desk?"
    "question_form",          # "Can you place the lamp on the desk?"
    "multi_clause",           # "First create a chair and then move it next to the desk."
    "verbose_description",    # "I would like you to gently place the lamp right on top of the table."
    "very_short",             # "place chair"
]


def build_intent_jobs():
    """Create TARGET_PER_INTENT randomized jobs for every intent, shuffled.

    Returns:
        List of dicts with the keys ``intent``, ``moved_object``, ``room``,
        ``material`` and ``style_hint``; everything except the intent is drawn
        at random from the module-level sample lists.
    """
    jobs = [
        {
            "intent": intent,
            "moved_object": random.choice(OBJECT_CATEGORIES),
            "room": random.choice(ROOM_TYPES),
            "material": random.choice(MATERIAL_TYPES),
            "style_hint": random.choice(STYLE_HINTS),
        }
        for intent in INTENTS
        for _ in range(TARGET_PER_INTENT)
    ]

    # Mix the intents so that each batch covers several of them.
    random.shuffle(jobs)
    return jobs


# System prompt for the intent-variation generator. It pins the JSONL output
# contract (one object per line with "text" and "gold_command.intent") that
# call_gpt_for_batch checks for, plus the language-variety rules for the text.
SYSTEM_PROMPT = """
You are generating training data for an intent classification model
for a text-to-3D room assistant.

The assistant has 7 possible intents:

- CreateObject
- PlaceObject
- RotateObject
- ResizeObject
- DeleteObject
- SetRoom
- SetMaterial

For each scenario, you MUST output exactly ONE JSON object on its own line (JSONL).
Do NOT include any explanations or extra text, only raw JSON per line.

JSON SCHEMA (all keys must be present):
{
  "text": str,
  "gold_command": {
    "intent": str   // one of: CreateObject, PlaceObject, RotateObject, ResizeObject, DeleteObject, SetRoom, SetMaterial
  }
}

Requirements:
- The "text" must clearly express the given intent and not any other intent.
- Use a wide variety of natural language:
  - different verbs: "create", "spawn", "generate", "place", "put", "move", "drop",
    "rotate", "turn", "spin", "resize", "scale", "enlarge", "shrink", "delete",
    "remove", "clear", "set", "change", "make the room", etc.
  - different sentence styles:
    - short commands: "create a chair"
    - polite requests: "could you please place the lamp on the desk?"
    - questions: "can you rotate the chair by 30 degrees?"
    - multi-clause: "first create a chair and then move it next to the table"
    - very short: "resize the sofa", "set room to office"
- The sentences should mention objects, rooms, and materials when useful, but the schema
  only requires the intent label.
- Make sure that:
  - CreateObject refers to creating/spawning new objects.
  - PlaceObject refers to moving/placing an object (possibly relative to another object).
  - RotateObject refers to changing an object's rotation/orientation.
  - ResizeObject refers to changing an object's size.
  - DeleteObject refers to removing/deleting objects.
  - SetRoom refers to changing room-level properties (size, style, type).
  - SetMaterial refers to changing the material, color or texture of an object.
- In around 10-15% of the sentences, introduce a SMALL typo in either the main verb or the object name,
  e.g., "cretae", "plaec", "lampp", "deskk". The typo should be minor but the sentence should remain understandable.
"""


def build_user_prompt(batch_jobs):
    """Render a batch of intent jobs as the numbered scenario list for the model.

    Args:
        batch_jobs: Job dicts with the keys ``intent``, ``moved_object``,
            ``room``, ``material`` and ``style_hint``.

    Returns:
        A header followed by one line per job, numbered from 1.
    """
    header = (
        "Generate one JSON object per scenario below. Use the schema described above.\n"
        "Scenarios:"
    )

    def describe(number, job):
        # One scenario line: the intent with its description, then the hints.
        intent = job["intent"]
        return (
            f"{number}. intent={intent} ({INTENT_DESCRIPTION[intent]}); "
            f"object='{job['moved_object']}'; room='{job['room']}'; "
            f"material='{job['material']}'; style='{job['style_hint']}'"
        )

    scenarios = [describe(number, job) for number, job in enumerate(batch_jobs, start=1)]
    return "\n".join([header, *scenarios])


def call_gpt_for_batch(client, batch_jobs):
    """Ask the model for one JSONL record per job and keep the valid ones.

    Args:
        client: OpenAI client exposing ``responses.create``.
        batch_jobs: Job dicts that are rendered into the user prompt.

    Returns:
        List of dicts that contain ``text`` and a ``gold_command`` dict with
        an ``intent`` key. Lines that cannot be decoded or that miss those
        keys are reported on stdout and skipped.
    """
    user_prompt = build_user_prompt(batch_jobs)

    resp = client.responses.create(
        model=MODEL_NAME,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_output_tokens=4096,
    )

    # output_text concatenates every text part of the reply, so it does not
    # depend on the first output item being a plain text message.
    content = resp.output_text

    records = []
    for line in content.splitlines():
        line = line.strip()
        # Skip blank lines and Markdown code fences wrapped around the JSONL.
        if not line or line.startswith("```"):
            continue

        # Drop a leading list number such as "7." or "12)". The prefix may have
        # any number of digits: batches hold more than nine scenarios.
        digits = len(line) - len(line.lstrip("0123456789"))
        if digits and line[digits:digits + 1] in (".", ")"):
            brace = line.find("{")
            if brace != -1:
                line = line[brace:]

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print("WARNING: could not parse JSON line:", line[:120])
            continue

        gold = obj.get("gold_command") if isinstance(obj, dict) else None
        if isinstance(gold, dict) and "text" in obj and "intent" in gold:
            records.append(obj)
        else:
            print("WARNING: unexpected schema:", line[:120])

    return records



#Main
def main():
    """Generate the intent-variation dataset and stream it to OUT_PATH as JSONL.

    Builds the jobs, sends them to the model in groups of JOBS_PER_BATCH and
    writes every returned record as one JSON line.

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Empty Apikey")

    client = OpenAI(api_key=api_key)

    print("Building intent variation jobs ...")
    jobs = build_intent_jobs()
    print(f"Total jobs (scenarios): {len(jobs)}")

    # Ceiling division, so an exact multiple of JOBS_PER_BATCH (e.g. 4200 jobs
    # in batches of 30) is not reported as one batch more than will run.
    total_batches = (len(jobs) + JOBS_PER_BATCH - 1) // JOBS_PER_BATCH
    total_written = 0

    with open(OUT_PATH, "w", encoding="utf-8") as fout:
        batch_starts = range(0, len(jobs), JOBS_PER_BATCH)
        for batch_no, start in enumerate(batch_starts, start=1):
            batch_jobs = jobs[start: start + JOBS_PER_BATCH]
            print(
                f"Batch {batch_no} / {total_batches} "
                f"(jobs {start}..{start + len(batch_jobs) - 1})"
            )

            records = call_gpt_for_batch(client, batch_jobs)

            for rec in records:
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total_written += len(records)

            # Push each finished batch to disk so an interrupted run keeps it.
            fout.flush()

    print(f"Done. Total written examples: {total_written}")
    print(f"Output file: {OUT_PATH}")


# Run the generator when the cell/script is executed directly
if __name__ == "__main__":
    main()


Building intent variation jobs ...
Total jobs (scenarios): 4200
Batch 1 / ~141 (jobs 0..29)
Batch 2 / ~141 (jobs 30..59)
Batch 3 / ~141 (jobs 60..89)
Batch 4 / ~141 (jobs 90..119)
Batch 5 / ~141 (jobs 120..149)
Batch 6 / ~141 (jobs 150..179)
Batch 7 / ~141 (jobs 180..209)
Batch 8 / ~141 (jobs 210..239)
Batch 9 / ~141 (jobs 240..269)
Batch 10 / ~141 (jobs 270..299)
Batch 11 / ~141 (jobs 300..329)
Batch 12 / ~141 (jobs 330..359)
Batch 13 / ~141 (jobs 360..389)
Batch 14 / ~141 (jobs 390..419)
Batch 15 / ~141 (jobs 420..449)
Batch 16 / ~141 (jobs 450..479)
Batch 17 / ~141 (jobs 480..509)
Batch 18 / ~141 (jobs 510..539)
Batch 19 / ~141 (jobs 540..569)
Batch 20 / ~141 (jobs 570..599)
Batch 21 / ~141 (jobs 600..629)
Batch 22 / ~141 (jobs 630..659)
Batch 23 / ~141 (jobs 660..689)
Batch 24 / ~141 (jobs 690..719)
Batch 25 / ~141 (jobs 720..749)
Batch 26 / ~141 (jobs 750..779)
Batch 27 / ~141 (jobs 780..809)
Batch 28 / ~141 (jobs 810..839)
Batch 29 / ~141 (jobs 840..869)
Batch 30 / ~141 (jobs 870

In [None]:
!pip install -q openai pandas

import os
import json
import random
from pathlib import Path

from openai import OpenAI
import pandas as pd

# API key
# Paste a key between the quotes only for a quick local run. When this is left
# empty, an OPENAI_API_KEY already exported in the environment is kept rather
# than being overwritten with an empty string.
_INLINE_API_KEY = ""
if _INLINE_API_KEY:
    os.environ["OPENAI_API_KEY"] = _INLINE_API_KEY


CATEGORIES_CSV = Path("categories.synset.csv")   # ShapeNet categories for reading
OUT_PATH       = Path("slot_roletagger_v1.jsonl")

MODEL_NAME = "gpt-4.1-mini"

# 20k samples in total:
# 8 relations * 2500 ≈ 20,000
TARGET_PER_RELATION = 2500

# Number of scenarios sent to the model in a single request
JOBS_PER_BATCH = 30

# Relations
RELATIONS = [
    "near",
    "on_top_of",
    "behind",
    "in_front_of",
    "inside",
    "next_to",
    "between",
    "under",
]

# Intents
INTENTS_FOR_SLOTS = [
    "CreateObject",
    "PlaceObject",
]

# Intent descriptions (shown to the model next to each scenario's intent)
INTENT_DESCRIPTION = {
    "CreateObject": "create or spawn a new object in the 3D room",
    "PlaceObject": "place or move an existing object relative to another object",
}

# Sentence style
STYLE_HINTS = [
    "simple_two_objects",
    "with_third_object",
    "with_pronoun",
    "long_instruction",
    "natural_chatty",
    "compact_command",
]

# Noise degree; repeated entries weight the uniform random draw
# (3/6 clean, 2/6 light_typo, 1/6 heavy_typo)
NOISE_HINTS = [
    "clean",
    "clean",
    "clean",
    "light_typo",
    "light_typo",
    "heavy_typo",
]



def load_categories_from_csv(csv_path: Path):
    """Read the unique category names from the fourth column of a CSV file.

    Every non-missing cell is stripped and lower-cased; the text before its
    first comma is taken as the category name, and empty names are dropped.

    Args:
        csv_path: Location of the CSV file (first row is the header).

    Returns:
        Alphabetically sorted list of unique category names.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    frame = pd.read_csv(csv_path, header=0)
    column_name = frame.columns[3]
    normalized = frame[column_name].dropna().astype(str).str.strip().str.lower()

    unique_names = set()
    for cell in normalized:
        # A cell may list several comma-separated words; keep the first one.
        name, _, _ = cell.partition(",")
        name = name.strip()
        if name:
            unique_names.add(name)

    return sorted(unique_names)


def build_generation_jobs(categories):
    """Create TARGET_PER_RELATION randomized jobs for every relation, shuffled.

    Args:
        categories: Category names to draw moved and reference objects from.

    Returns:
        Shuffled list of dicts with the keys ``intent``, ``moved_category``,
        ``reference_category``, ``relation``, ``style_hint`` and
        ``noise_hint``. The reference category always differs from the moved
        category.
    """

    def new_job(relation):
        # Draw the moved object first, then a reference from the remaining ones.
        moved = random.choice(categories)
        reference = random.choice([name for name in categories if name != moved])
        return {
            "intent": random.choice(INTENTS_FOR_SLOTS),
            "moved_category": moved,
            "reference_category": reference,
            "relation": relation,
            "style_hint": random.choice(STYLE_HINTS),
            "noise_hint": random.choice(NOISE_HINTS),
        }

    jobs = [
        new_job(relation)
        for relation in RELATIONS
        for _ in range(TARGET_PER_RELATION)
    ]

    random.shuffle(jobs)
    return jobs


# System prompt for the Slot / RoleTagger generator. It pins the JSONL output
# contract that call_gpt_for_batch checks for and explains how to honour the
# per-scenario hints. The noise rules are keyed on the noise value that
# build_user_prompt writes into every scenario line (clean / light_typo /
# heavy_typo), so the system prompt and the scenario lines do not contradict
# each other.
SYSTEM_PROMPT = """
You are generating advanced training data for a Slot / RoleTagger model
for a text-to-3D room assistant.

The main goal is to teach the model to:
- identify the moved/target object span in the sentence,
- identify the reference object span in the sentence,
- identify the spatial relation between them.

We focus on two intents:
- CreateObject  (create/spawn a new object)
- PlaceObject   (place or move an existing object)

For each scenario, you MUST output exactly ONE JSON object on its own line (JSONL).
Do NOT include any explanations or extra text, only raw JSON per line.

JSON SCHEMA (all keys must be present):
{
  "text": str,                      // user command in natural English
  "gold_command": {
    "intent": str,                  // one of: CreateObject, PlaceObject
    "args": {
      "moved_category": str,        // semantic category of the main object (e.g., 'chair')
      "reference_category": str,    // category of the reference object (e.g., 'desk')
      "relation": str               // one of: near, on_top_of, behind, in_front_of, inside, next_to, between, under
    }
  },
  "moved_span": str,                // EXACT noun phrase in 'text' that refers to the moved/target object
  "reference_span": str            // EXACT noun phrase in 'text' that refers to the reference object
}

Global Requirements:
- The "text" must clearly describe a spatial relation between TWO main objects:
  a moved object and a reference object.
- The "moved_span" and "reference_span" MUST appear verbatim inside "text".
- The "moved_span" should correspond to the object being created or moved.
- The "reference_span" should correspond to the object used as spatial reference.
- The "relation" value must be consistent with the sentence meaning:
  - near        → close to / near / close by / right beside
  - on_top_of   → on / on top of / resting on / sitting on
  - behind      → behind / at the back of
  - in_front_of → in front of / in front / before
  - inside      → inside / in / within (as in "inside the box")
  - next_to     → next to / beside / alongside
  - between     → between two objects (the reference object should be the main one used)
  - under       → under / underneath / below

Language and Style requirements:
- Use a wide variety of natural language and sentence structures.
- Include:
  - simple commands: "put the chair on the desk"
  - sentences with a third object: "between the sofa and the lamp, place the chair next to the desk"
  - pronouns: "create a chair and move it behind the desk"
  - longer multi-clause instructions, but still with ONE primary moved/reference pair in the annotation
  - short, compact commands: "chair on desk", "lamp under table"

Noise / Typos (follow the noise value given for each scenario):
- noise='clean': do NOT introduce any typo.
- noise='light_typo': introduce ONE small typo in either a main object name
  or the main verb, e.g., "chaor", "dsek", "lampp", "plaec", "moev".
- noise='heavy_typo': introduce two or three such typos, spread over the object names and the verb.
- In every case the sentence should remain understandable.
- Even when there is a typo, the "moved_span" and "reference_span" MUST match the exact,
  possibly misspelled phrase used in the text.

Complexity:
- For "with_third_object" style, you may mention an extra object, but the primary
  moved_span and reference_span should refer to the pair encoded in the args.
- For "with_pronoun", you may use "it" or "them" in addition to the explicit mention,
  but the spans must still mark the explicit noun phrases, not the pronouns.
- For "long_instruction", you can use multi-step sentences, but there must be one clear
  main moved/reference pair for the annotation.

IMPORTANT:
- Always fill moved_span and reference_span (never null).
- Always keep exactly one primary moved object and one primary reference object in the schema.
"""


def build_user_prompt(batch_jobs):
    """Render a batch of slot jobs as the numbered scenario list for the model.

    Args:
        batch_jobs: Job dicts with the keys ``intent``, ``moved_category``,
            ``reference_category``, ``relation``, ``style_hint`` and
            ``noise_hint``.

    Returns:
        A header followed by one line per job, numbered from 1.
    """
    header = (
        "Generate one JSON object per scenario below. Use the schema described above.\n"
        "Scenarios:"
    )

    def describe(number, job):
        # One scenario line: intent with description, the object pair, the hints.
        intent = job["intent"]
        return (
            f"{number}. intent={intent} ({INTENT_DESCRIPTION[intent]}); "
            f"moved_category='{job['moved_category']}'; "
            f"reference_category='{job['reference_category']}'; "
            f"relation='{job['relation']}'; style='{job['style_hint']}'; "
            f"noise='{job['noise_hint']}'"
        )

    scenarios = [describe(number, job) for number, job in enumerate(batch_jobs, start=1)]
    return "\n".join([header, *scenarios])


def call_gpt_for_batch(client, batch_jobs):
    """Ask the model for one JSONL record per job and keep the valid ones.

    Args:
        client: OpenAI client exposing ``responses.create``.
        batch_jobs: Job dicts that are rendered into the user prompt.

    Returns:
        List of dicts that carry ``text``, ``moved_span``, ``reference_span``
        and a ``gold_command`` dict holding ``intent`` plus an ``args`` dict
        with ``moved_category``, ``reference_category`` and ``relation``.
        Lines that cannot be decoded or that miss any of those keys are
        reported on stdout and skipped.
    """
    user_prompt = build_user_prompt(batch_jobs)

    resp = client.responses.create(
        model=MODEL_NAME,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_output_tokens=4096,
    )

    # output_text concatenates every text part of the reply, so it does not
    # depend on the first output item being a plain text message.
    content = resp.output_text

    required_top_keys = ("text", "gold_command", "moved_span", "reference_span")
    required_arg_keys = ("moved_category", "reference_category", "relation")

    def has_expected_schema(obj):
        # Check containers before keys so malformed nesting never raises.
        if not isinstance(obj, dict) or any(key not in obj for key in required_top_keys):
            return False
        gold = obj["gold_command"]
        if not isinstance(gold, dict) or "intent" not in gold:
            return False
        args = gold.get("args")
        return isinstance(args, dict) and all(key in args for key in required_arg_keys)

    records = []
    for line in content.splitlines():
        line = line.strip()
        # Skip blank lines and Markdown code fences wrapped around the JSONL.
        if not line or line.startswith("```"):
            continue

        # Drop a leading list number such as "7." or "12)". The prefix may have
        # any number of digits: batches hold more than nine scenarios.
        digits = len(line) - len(line.lstrip("0123456789"))
        if digits and line[digits:digits + 1] in (".", ")"):
            brace = line.find("{")
            if brace != -1:
                line = line[brace:]

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print("WARNING: could not parse JSON line:", line[:120])
            continue

        if has_expected_schema(obj):
            records.append(obj)
        else:
            print("WARNING: unexpected schema:", line[:120])

    return records


#Main
def main():
    """Generate the Slot/RoleTagger dataset and stream it to OUT_PATH as JSONL.

    Loads the categories, builds the jobs, sends them to the model in groups
    of JOBS_PER_BATCH and writes every returned record as one JSON line.

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY empty please enter it.")

    client = OpenAI(api_key=api_key)

    print(f"Loading categories from {CATEGORIES_CSV} ...")
    categories = load_categories_from_csv(CATEGORIES_CSV)
    print(f"Total unique categories: {len(categories)}")

    print("Building Slot/RoleTagger generation jobs ...")
    jobs = build_generation_jobs(categories)
    print(f"Total jobs (scenarios): {len(jobs)}")  # ≈ 20,000

    # Ceiling division, so an exact multiple of JOBS_PER_BATCH is not reported
    # as one batch more than will actually run.
    total_batches = (len(jobs) + JOBS_PER_BATCH - 1) // JOBS_PER_BATCH
    total_written = 0

    with open(OUT_PATH, "w", encoding="utf-8") as fout:
        batch_starts = range(0, len(jobs), JOBS_PER_BATCH)
        for batch_no, start in enumerate(batch_starts, start=1):
            batch_jobs = jobs[start: start + JOBS_PER_BATCH]
            print(
                f"Batch {batch_no} / {total_batches} "
                f"(jobs {start}..{start + len(batch_jobs) - 1})"
            )

            records = call_gpt_for_batch(client, batch_jobs)

            for rec in records:
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total_written += len(records)

            # Push each finished batch to disk so an interrupted run keeps it.
            fout.flush()

    print(f"Done. Total written examples: {total_written}")
    print(f"Output file: {OUT_PATH}")


if __name__ == "__main__":
    main()


Loading categories from categories.synset.csv ...
Total unique categories: 218
Building Slot/RoleTagger generation jobs ...
Total jobs (scenarios): 20000
Batch 1 / ~667 (jobs 0..29)
Batch 2 / ~667 (jobs 30..59)
Batch 3 / ~667 (jobs 60..89)
Batch 4 / ~667 (jobs 90..119)
Batch 5 / ~667 (jobs 120..149)
Batch 6 / ~667 (jobs 150..179)
Batch 7 / ~667 (jobs 180..209)
Batch 8 / ~667 (jobs 210..239)
Batch 9 / ~667 (jobs 240..269)
Batch 10 / ~667 (jobs 270..299)
Batch 11 / ~667 (jobs 300..329)
Batch 12 / ~667 (jobs 330..359)
Batch 13 / ~667 (jobs 360..389)
Batch 14 / ~667 (jobs 390..419)
Batch 15 / ~667 (jobs 420..449)
Batch 16 / ~667 (jobs 450..479)
Batch 17 / ~667 (jobs 480..509)
Batch 18 / ~667 (jobs 510..539)
Batch 19 / ~667 (jobs 540..569)
Batch 20 / ~667 (jobs 570..599)
Batch 21 / ~667 (jobs 600..629)
Batch 22 / ~667 (jobs 630..659)
Batch 23 / ~667 (jobs 660..689)
Batch 24 / ~667 (jobs 690..719)
Batch 25 / ~667 (jobs 720..749)
Batch 26 / ~667 (jobs 750..779)
Batch 27 / ~667 (jobs 780..809)

In [None]:
# Colab-only helper: hands the generated slot/role-tagger dataset to the
# browser as a download. The file name matches OUT_PATH of the generator cell.
from google.colab import files
files.download("slot_roletagger_v1.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# train_intent_v2.py
# Training the intent model

import json
import random
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

# JSONL datasets merged into one training set; main() skips any that are
# missing and prints a warning.
DATA_PATHS = [
    Path(filename)
    for filename in (
        "generated_intents_v1.jsonl",
        "sentences_roles_gpt_v1.jsonl",
        "intent_variations_v1.jsonl",
    )
]

# Sentence encoder that main() loads through SentenceTransformer
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

SAVE_PATH = Path("intent_head_v2.pt")

# Intent list; the position of a name is its class id
INTENTS = [
    "SetRoom",
    "CreateObject",
    "PlaceObject",
    "RotateObject",
    "ResizeObject",
    "DeleteObject",
    "SetMaterial",
]

INTENT2ID = dict(zip(INTENTS, range(len(INTENTS))))
NUM_INTENTS = len(INTENTS)

# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-3
VAL_SPLIT = 0.2   # fraction of the samples held out for validation
SEED = 42


def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and PyTorch's CPU and CUDA generators."""
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)


def load_jsonl(path: Path):
    """Parse a JSON Lines file into a list, ignoring blank lines.

    Args:
        path: UTF-8 encoded file with one JSON document per line.

    Returns:
        The decoded documents in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_rows = (raw.strip() for raw in handle)
        return [json.loads(row) for row in stripped_rows if row]

# Load Dataset

class IntentDataset(Dataset):
    """Dataset pairing precomputed embeddings with their intent labels."""

    def __init__(self, embeddings: torch.Tensor, labels: torch.Tensor):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # One sample per row of the embedding tensor.
        return self.embeddings.shape[0]

    def __getitem__(self, idx):
        sample = self.embeddings[idx]
        target = self.labels[idx]
        return sample, target

# Model


class IntentHead(nn.Module):
    """Small MLP that maps an embedding vector to one logit per intent.

    Layout: Linear(emb_dim, 256) -> ReLU -> Dropout(0.2) -> Linear(256, num_intents).
    """

    def __init__(self, emb_dim: int, num_intents: int):
        super().__init__()
        hidden_dim = 256
        layers = [
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_intents),
        ]
        # The attribute name and layer order determine the state_dict keys.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits


# Train / Eval
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module producing logits for a batch of inputs.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen; each batch loss is
        weighted by its batch size.
    """
    model.train()
    loss_sum, hits, seen = 0.0, 0, 0

    for features, targets in loader:
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        scores = model(features)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        count = targets.size(0)
        seen += count
        # Weight by batch size so a short final batch does not skew the mean.
        loss_sum += batch_loss.item() * count
        hits += (scores.argmax(dim=1) == targets).sum().item()

    return loss_sum / seen, hits / seen


def eval_one_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: Module producing logits for a batch of inputs.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen; each batch loss is
        weighted by its batch size.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)

            scores = model(features)
            batch_loss = criterion(scores, targets)

            count = targets.size(0)
            seen += count
            # Weight by batch size so a short final batch does not skew the mean.
            loss_sum += batch_loss.item() * count
            hits += (scores.argmax(dim=1) == targets).sum().item()

    return loss_sum / seen, hits / seen


# Main
def main():
    """Train the intent classification head on precomputed SBERT embeddings.

    Merges every JSONL file in ``DATA_PATHS`` that exists, keeps the records
    that have a ``text`` field and a ``gold_command.intent`` present in
    ``INTENT2ID``, encodes the texts once with the sentence encoder named by
    ``MODEL_NAME``, and fits an ``IntentHead`` on a shuffled train/validation
    split. The checkpoint at ``SAVE_PATH`` is rewritten each time validation
    accuracy improves.

    Raises:
        RuntimeError: if no usable sample is found in any dataset file.
    """
    set_seed(SEED)
    # 1) Merge datasets: collect (text, intent id) pairs from every file.
    all_texts = []
    all_labels = []

    for path in DATA_PATHS:
        # Missing files are reported and skipped rather than aborting the run.
        if not path.exists():
            print(f"[WARN] Dataset file not found, skipping: {path}")
            continue

        print(f"Loading data from {path} ...")
        items = load_jsonl(path)
        used_count = 0

        for it in items:
            if "text" not in it:
                continue
            gc = it.get("gold_command", {})
            intent_name = gc.get("intent", None)
            # Records with a missing or unknown intent are silently dropped.
            if intent_name not in INTENT2ID:
                continue

            all_texts.append(it["text"])
            all_labels.append(INTENT2ID[intent_name])
            used_count += 1

        print(f"  -> Used samples from this file: {used_count}")

    if not all_texts:
        raise RuntimeError("No samples found from any DATA_PATHS. Check your JSONL files and paths.")

    labels = torch.tensor(all_labels, dtype=torch.long)
    print(f"Total usable samples (ALL FILES MERGED): {len(all_texts)}")

    # Report how many samples each intent has, to spot class imbalance.
    from collections import Counter
    cnt = Counter(all_labels)
    print("Class distribution (intent_id: count):")
    for intent_name, idx in INTENT2ID.items():
        print(f"  {idx} ({intent_name}): {cnt.get(idx, 0)}")

    # 2) SBERT embeddings: every sentence is encoded once, up front; only the
    #    small head is trained afterwards.
    print(f"Loading SBERT encoder: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)

    print("Encoding sentences with SBERT (this may take a bit)...")
    embeddings = encoder.encode(
        all_texts,
        convert_to_tensor=True,
        batch_size=64,
        show_progress_bar=True
    )
    emb_dim = embeddings.size(1)
    print(f"Embedding shape: {embeddings.shape}")

    # 3) Train/validation split: shuffle indices (seeded by set_seed above)
    #    and hold out the last VAL_SPLIT fraction for validation.
    indices = list(range(len(all_texts)))
    random.shuffle(indices)

    split = int(len(indices) * (1 - VAL_SPLIT))
    train_idx = indices[:split]
    val_idx = indices[split:]

    train_emb = embeddings[train_idx]
    train_labels = labels[train_idx]

    val_emb = embeddings[val_idx]
    val_labels = labels[val_idx]

    train_ds = IntentDataset(train_emb, train_labels)
    val_ds = IntentDataset(val_emb, val_labels)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # 4) Model, loss, optimiser
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = IntentHead(emb_dim=emb_dim, num_intents=NUM_INTENTS).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # Starts at 0.0, so a checkpoint is only written once accuracy exceeds zero.
    best_val_acc = 0.0


    # 5) Training loop with best-checkpoint saving
    for epoch in range(1, EPOCHS + 1):
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = eval_one_epoch(model, val_loader, criterion, device)

        print(
            f"[Epoch {epoch:02d}/{EPOCHS}] "
            f"TrainLoss={train_loss:.4f} ValLoss={val_loss:.4f} "
            f"TrainAcc={train_acc:.3f} ValAcc={val_acc:.3f}"
        )

        # Strict improvement only: ties keep the earlier checkpoint.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Store the label list and encoder name next to the weights so the
            # checkpoint carries what is needed to rebuild the head.
            save_obj = {
                "state_dict": model.state_dict(),
                "intent_list": INTENTS,
                "embedding_model_name": MODEL_NAME,
                "emb_dim": emb_dim,
            }
            torch.save(save_obj, SAVE_PATH)
            print(f"  -> New best IntentHead saved to {SAVE_PATH} (ValAcc={val_acc:.3f})")

    print("Training finished.")


# Start training only when run as a script/cell, not when imported.
if __name__ == "__main__":
    main()


[WARN] Dataset file not found, skipping: generated_intents_v1.jsonl
[WARN] Dataset file not found, skipping: sentences_roles_gpt_v1.jsonl
Loading data from intent_variations_v1.jsonl ...
  -> Used samples from this file: 4198
Total usable samples (ALL FILES MERGED): 4198
Class distribution (intent_id: count):
  0 (SetRoom): 600
  1 (CreateObject): 598
  2 (PlaceObject): 596
  3 (RotateObject): 603
  4 (ResizeObject): 601
  5 (DeleteObject): 601
  6 (SetMaterial): 599
Loading SBERT encoder: sentence-transformers/all-mpnet-base-v2
Encoding sentences with SBERT (this may take a bit)...


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

Embedding shape: torch.Size([4198, 768])
Using device: cpu
[Epoch 01/30] TrainLoss=1.0914 ValLoss=0.3963 TrainAcc=0.838 ValAcc=0.936
  -> New best IntentHead saved to intent_head_v2.pt (ValAcc=0.936)
[Epoch 02/30] TrainLoss=0.2634 ValLoss=0.1963 TrainAcc=0.945 ValAcc=0.950
  -> New best IntentHead saved to intent_head_v2.pt (ValAcc=0.950)
[Epoch 03/30] TrainLoss=0.1662 ValLoss=0.1615 TrainAcc=0.957 ValAcc=0.957
  -> New best IntentHead saved to intent_head_v2.pt (ValAcc=0.957)
[Epoch 04/30] TrainLoss=0.1302 ValLoss=0.1373 TrainAcc=0.968 ValAcc=0.960
  -> New best IntentHead saved to intent_head_v2.pt (ValAcc=0.960)
[Epoch 05/30] TrainLoss=0.1120 ValLoss=0.1292 TrainAcc=0.970 ValAcc=0.960
[Epoch 06/30] TrainLoss=0.0973 ValLoss=0.1228 TrainAcc=0.976 ValAcc=0.968
  -> New best IntentHead saved to intent_head_v2.pt (ValAcc=0.968)
[Epoch 07/30] TrainLoss=0.0882 ValLoss=0.1217 TrainAcc=0.976 ValAcc=0.967
[Epoch 08/30] TrainLoss=0.0795 ValLoss=0.1184 TrainAcc=0.975 ValAcc=0.967
[Epoch 09/30] 

In [None]:
# train_slots_v3.py
# Trains the model for all slots

import json
import random
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

# Data paths: JSONL files merged into one training set; missing files are
# skipped with a warning by main().
DATA_PATHS = [
    Path("sentences_roles_gpt_v1.jsonl"),
    Path("slot_roletagger_v1.jsonl"),
]

# Sentence encoder used to embed the texts, and where the best head is saved.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
SAVE_PATH  = Path("slot_head_v3.pt")

# Training hyper-parameters for the slot head.
BATCH_SIZE = 32
EPOCHS = 60
LR = 1e-3
VAL_SPLIT = 0.2  # fraction of samples held out for validation
SEED = 42

# Utils
def set_seed(seed: int = 42):
    """Seed Python's ``random`` and PyTorch (CPU and every CUDA device)."""
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)


def load_jsonl(path: Path):
    """Read a UTF-8 JSON Lines file into a list of decoded objects.

    Lines that are empty after stripping whitespace are ignored; every other
    line must be valid JSON.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(record) for record in stripped_lines if record]

def extract_intent_args(it):
    """Pull the intent and slot values out of one dataset record.

    Args:
        it: record dict. Slot values are read from ``it["gold_command"]["args"]``;
            ``moved_category`` / ``reference_category`` are taken from the top
            level of the record first and fall back to ``args``.

    Returns:
        Tuple ``(intent, moved_cat, ref_cat, relation, side, qty, dist)``.
        Defaults when a value is absent: ``intent=""``, ``moved_cat=None``,
        ``ref_cat="none"``, ``relation=None``, ``side="none"``, ``qty=1``,
        ``dist=0.0``. ``qty`` and ``dist`` are coerced to ``int`` / ``float``
        and fall back to their defaults when coercion fails.
    """
    # A null or malformed gold_command / args is treated as "no command"
    # instead of crashing on .get().
    cmd = it.get("gold_command", {})
    if not isinstance(cmd, dict):
        cmd = {}
    args = cmd.get("args", {})
    if not isinstance(args, dict):
        args = {}

    intent = cmd.get("intent", "")

    # Top-level category fields win over the ones nested in args.
    moved_cat = it.get("moved_category", None)
    if moved_cat is None:
        moved_cat = args.get("moved_category", None)

    ref_cat = it.get("reference_category", None)
    if ref_cat is None:
        ref_cat = args.get("reference_category", "none")

    relation = args.get("relation", None)
    side     = args.get("side", "none")
    qty      = args.get("quantity", 1)
    dist     = args.get("distance_m", 0.0)

    # An explicit null side means the same as an absent one.
    if side is None:
        side = "none"

    try:
        qty = int(qty)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers non-finite floats such as infinity.
        qty = 1

    try:
        dist = float(dist)
    except (TypeError, ValueError):
        dist = 0.0

    return intent, moved_cat, ref_cat, relation, side, qty, dist


def build_slot_vocabs(all_items):
    """Collect the label vocabularies for every categorical slot.

    Only records whose intent is CreateObject, PlaceObject or MoveObject
    contribute. The category vocabulary is shared by the moved and reference
    slots and always contains ``"none"``.

    Returns:
        Dict with the sorted value lists (``category_list``, ``relation_list``,
        ``side_list``, ``quantity_list``) and the matching value -> index maps
        (``cat2id``, ``rel2id``, ``side2id``, ``qty2id``).
    """
    slot_intents = ("CreateObject", "PlaceObject", "MoveObject")

    categories = {"none"}
    relations = set()
    sides = set()
    quantities = set()

    for record in all_items:
        intent, moved_cat, ref_cat, relation, side, qty, _dist = extract_intent_args(record)
        if intent not in slot_intents:
            continue

        # Empty / missing categories are not added to the vocabulary.
        for category in (moved_cat, ref_cat):
            if category:
                categories.add(category)

        if relation is not None:
            relations.add(relation)

        sides.add("none" if side is None else side)
        quantities.add(qty)

    def _index(values):
        return {value: position for position, value in enumerate(values)}

    category_list = sorted(categories)
    relation_list = sorted(relations)
    side_list = sorted(sides)
    quantity_list = sorted(quantities)

    return {
        "category_list": category_list,
        "relation_list": relation_list,
        "side_list":     side_list,
        "quantity_list": quantity_list,
        "cat2id":  _index(category_list),
        "rel2id":  _index(relation_list),
        "side2id": _index(side_list),
        "qty2id":  _index(quantity_list),
    }

class SlotDataset(Dataset):
    """Sentence embeddings paired with one label tensor per slot."""

    def __init__(self, embeddings, slot_labels):
        """
        embeddings: [N, emb_dim]
        slot_labels: dict of tensors, each indexed by sample:
          {
            "moved_category":     LongTensor [N],
            "reference_category": LongTensor [N],
            "relation":           LongTensor [N],
            "side":               LongTensor [N],
            "quantity":           LongTensor [N],
            "distance":           FloatTensor [N],
          }
        """
        self.embeddings = embeddings
        self.slot_labels = slot_labels

    def __len__(self):
        return self.embeddings.shape[0]

    def __getitem__(self, idx):
        features = self.embeddings[idx]
        # Pick the idx-th entry of every slot tensor, keeping the slot names.
        targets = {}
        for slot_name, column in self.slot_labels.items():
            targets[slot_name] = column[idx]
        return features, targets


# Model
class SlotHeadV3(nn.Module):
    """Shared MLP trunk with one output layer per slot.

    Trunk: emb_dim -> 512 -> 256, each Linear followed by ReLU and
    Dropout(0.3). Five linear classifiers (moved category, reference
    category, relation, side, quantity) and one scalar distance regressor
    read the 256-dimensional trunk output.
    """

    def __init__(self, emb_dim, num_cat, num_rel, num_side, num_qty):
        super().__init__()

        # Build the trunk layer by layer; creation order is kept so seeded
        # initialisation and the ``backbone.<index>`` names stay the same.
        trunk_layers = []
        in_features = emb_dim
        for out_features in (512, 256):
            trunk_layers.append(nn.Linear(in_features, out_features))
            trunk_layers.append(nn.ReLU())
            trunk_layers.append(nn.Dropout(0.3))
            in_features = out_features
        self.backbone = nn.Sequential(*trunk_layers)

        trunk_width = in_features
        self.moved_category_head     = nn.Linear(trunk_width, num_cat)
        self.reference_category_head = nn.Linear(trunk_width, num_cat)
        self.relation_head = nn.Linear(trunk_width, num_rel)
        self.side_head     = nn.Linear(trunk_width, num_side)
        self.quantity_head = nn.Linear(trunk_width, num_qty)
        self.distance_head = nn.Linear(trunk_width, 1)

    def forward(self, x):
        hidden = self.backbone(x)
        return {
            "moved_category_logits":     self.moved_category_head(hidden),
            "reference_category_logits": self.reference_category_head(hidden),
            "relation_logits":           self.relation_head(hidden),
            "side_logits":               self.side_head(hidden),
            "quantity_logits":           self.quantity_head(hidden),
            # Drop the trailing size-1 dimension of the regressor output.
            "distance":                  self.distance_head(hidden).squeeze(-1),
        }

# Train

def compute_loss(outputs, labels, ce_loss, mse_loss,
                 w_moved_cat=1.0, w_ref_cat=1.0,
                 w_rel=1.0, w_side=0.5, w_qty=0.5, w_dist=1.0):
    """Combine the per-slot losses into one weighted training loss.

    The five categorical slots are scored with ``ce_loss`` and the distance
    with ``mse_loss``; each term is scaled by its ``w_*`` weight and summed.

    Returns:
        ``(total, loss_dict)`` where ``total`` is the differentiable sum and
        ``loss_dict`` maps ``"total"`` to that same tensor and each slot name
        to its unweighted loss as a Python float.
    """
    # (slot name, key of its logits in ``outputs``, weight), in summation order.
    categorical_slots = (
        ("moved_category",     "moved_category_logits",     w_moved_cat),
        ("reference_category", "reference_category_logits", w_ref_cat),
        ("relation",           "relation_logits",           w_rel),
        ("side",               "side_logits",               w_side),
        ("quantity",           "quantity_logits",           w_qty),
    )

    weighted_terms = []
    raw_losses = {}
    for slot, logits_key, weight in categorical_slots:
        slot_loss = ce_loss(outputs[logits_key], labels[slot])
        raw_losses[slot] = slot_loss
        weighted_terms.append((weight, slot_loss))

    distance_loss = mse_loss(outputs["distance"], labels["distance"])
    raw_losses["distance"] = distance_loss
    weighted_terms.append((w_dist, distance_loss))

    # Accumulate left to right, matching the order the slots are listed in.
    first_weight, first_loss = weighted_terms[0]
    total = first_weight * first_loss
    for weight, slot_loss in weighted_terms[1:]:
        total = total + weight * slot_loss

    loss_dict = {"total": total}
    for slot, slot_loss in raw_losses.items():
        loss_dict[slot] = slot_loss.detach().item()
    return total, loss_dict


def train_one_epoch(model, loader, ce_loss, mse_loss, optimizer, device):
    """Train the slot head for one pass over ``loader``.

    Returns:
        Dict with the sample-weighted mean ``loss`` and the accuracy of each
        categorical slot (``acc_moved_cat``, ``acc_ref_cat``, ``acc_rel``,
        ``acc_side``, ``acc_qty``). All values are 0 for an empty loader.
    """
    # metric name -> (key of the logits in the model output, key of the labels)
    slot_keys = {
        "acc_moved_cat": ("moved_category_logits", "moved_category"),
        "acc_ref_cat":   ("reference_category_logits", "reference_category"),
        "acc_rel":       ("relation_logits", "relation"),
        "acc_side":      ("side_logits", "side"),
        "acc_qty":       ("quantity_logits", "quantity"),
    }

    model.train()
    loss_sum = 0.0
    seen = 0
    hits = dict.fromkeys(slot_keys, 0)

    for xb, yb in loader:
        xb = xb.to(device)
        targets = {slot: tensor.to(device) for slot, tensor in yb.items()}

        optimizer.zero_grad()
        outputs = model(xb)

        loss, _ = compute_loss(outputs, targets, ce_loss, mse_loss)
        loss.backward()
        optimizer.step()

        n_samples = xb.size(0)
        loss_sum += loss.item() * n_samples
        seen += n_samples

        # Count correct argmax predictions per categorical slot.
        for metric, (logits_key, label_key) in slot_keys.items():
            predicted = outputs[logits_key].argmax(dim=1)
            hits[metric] += (predicted == targets[label_key]).sum().item()

    # Avoid dividing by zero when the loader yields nothing.
    denom = max(seen, 1)
    metrics = {"loss": loss_sum / denom}
    for metric in slot_keys:
        metrics[metric] = hits[metric] / denom
    return metrics


def eval_one_epoch(model, loader, ce_loss, mse_loss, device):
    """Evaluate the slot head on ``loader`` with gradients disabled.

    Returns:
        Dict with the sample-weighted mean ``loss`` and the accuracy of each
        categorical slot (``acc_moved_cat``, ``acc_ref_cat``, ``acc_rel``,
        ``acc_side``, ``acc_qty``). All values are 0 for an empty loader.
    """
    # metric name -> (key of the logits in the model output, key of the labels)
    slot_keys = {
        "acc_moved_cat": ("moved_category_logits", "moved_category"),
        "acc_ref_cat":   ("reference_category_logits", "reference_category"),
        "acc_rel":       ("relation_logits", "relation"),
        "acc_side":      ("side_logits", "side"),
        "acc_qty":       ("quantity_logits", "quantity"),
    }

    model.eval()
    loss_sum = 0.0
    seen = 0
    hits = dict.fromkeys(slot_keys, 0)

    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            targets = {slot: tensor.to(device) for slot, tensor in yb.items()}

            outputs = model(xb)
            loss, _ = compute_loss(outputs, targets, ce_loss, mse_loss)

            n_samples = xb.size(0)
            loss_sum += loss.item() * n_samples
            seen += n_samples

            # Count correct argmax predictions per categorical slot.
            for metric, (logits_key, label_key) in slot_keys.items():
                predicted = outputs[logits_key].argmax(dim=1)
                hits[metric] += (predicted == targets[label_key]).sum().item()

    # Avoid dividing by zero when the loader yields nothing.
    denom = max(seen, 1)
    metrics = {"loss": loss_sum / denom}
    for metric in slot_keys:
        metrics[metric] = hits[metric] / denom
    return metrics

# Main
def main():
    """Train the multi-slot head (SlotHeadV3) on precomputed SBERT embeddings.

    Loads and merges the JSONL files in ``DATA_PATHS``, builds the slot
    vocabularies, converts every usable record into label ids, encodes the
    texts with the sentence encoder named by ``MODEL_NAME`` and trains the
    head on a shuffled train/validation split. The checkpoint at
    ``SAVE_PATH`` is rewritten whenever the validation loss improves.

    Raises:
        RuntimeError: if no file could be loaded, or no record survives
            filtering.
    """
    set_seed(SEED)

    # 1) Load and merge all available dataset files.
    all_items = []
    for path in DATA_PATHS:
        if not path.exists():
            print(f"[WARN] Dataset not found, skipping: {path}")
            continue
        print(f"Loading data from {path} ...")
        items = load_jsonl(path)
        print(f"  -> Loaded {len(items)} raw items")
        all_items.extend(items)

    if not all_items:
        raise RuntimeError("No items loaded from any DATA_PATHS. Check your file paths.")

    print(f"Total raw items from all files: {len(all_items)}")


    # 2) Vocabularies: value lists and value -> id maps for each slot.
    vocabs = build_slot_vocabs(all_items)
    cat2id  = vocabs["cat2id"]
    rel2id  = vocabs["rel2id"]
    side2id = vocabs["side2id"]
    qty2id  = vocabs["qty2id"]

    category_list = vocabs["category_list"]
    relation_list = vocabs["relation_list"]
    side_list     = vocabs["side_list"]
    quantity_list = vocabs["quantity_list"]

    print(f"Categories: {category_list}")
    print(f"Relations:  {relation_list}")
    print(f"Sides:      {side_list}")
    print(f"Quantities: {quantity_list}")


    # 3) Convert records into parallel lists of texts and label ids.
    texts = []
    moved_cat_labels = []
    ref_cat_labels   = []
    rel_labels       = []
    side_labels      = []
    qty_labels       = []
    dist_labels      = []

    for it in all_items:
        text = it.get("text", None)
        if not text:
            continue

        intent, moved_cat, ref_cat, relation, side, qty, dist = extract_intent_args(it)

        # Same intent filter as build_slot_vocabs.
        if intent not in ["CreateObject", "PlaceObject", "MoveObject"]:
            continue

        # Records without a relation are dropped here although they did
        # contribute to the vocabularies, so a vocabulary entry may end up
        # with no training sample.
        if relation is None:
            continue

        if side is None:
            side = "none"

        # Skip records with any value missing from the vocabularies
        # (e.g. a moved category of None).
        if (moved_cat not in cat2id or
            ref_cat   not in cat2id or
            relation  not in rel2id or
            side      not in side2id or
            qty       not in qty2id):
            continue

        texts.append(text)
        moved_cat_labels.append(cat2id[moved_cat])
        ref_cat_labels.append(cat2id[ref_cat])
        rel_labels.append(rel2id[relation])
        side_labels.append(side2id[side])
        qty_labels.append(qty2id[qty])
        dist_labels.append(float(dist))

    print(f"Total slot samples (after filtering): {len(texts)}")
    if len(texts) == 0:
        raise RuntimeError("No valid slot samples found. Check your JSONL content.")

    # Categorical slots become class-index tensors; distance stays a float target.
    moved_cat_labels = torch.tensor(moved_cat_labels, dtype=torch.long)
    ref_cat_labels   = torch.tensor(ref_cat_labels,   dtype=torch.long)
    rel_labels       = torch.tensor(rel_labels,       dtype=torch.long)
    side_labels      = torch.tensor(side_labels,      dtype=torch.long)
    qty_labels       = torch.tensor(qty_labels,       dtype=torch.long)
    dist_labels      = torch.tensor(dist_labels,      dtype=torch.float32)

    # 4) SBERT: every sentence is encoded once, up front; only the head is trained.
    print(f"Loading SBERT encoder: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)

    print("Encoding sentences with SBERT (this may take a bit)...")
    embeddings = encoder.encode(
        texts,
        convert_to_tensor=True,
        batch_size=64,
        show_progress_bar=True,
    )
    emb_dim = embeddings.size(1)
    print(f"Embedding shape: {embeddings.shape}")

    # 5) Train/validation split: shuffle indices (seeded by set_seed above)
    #    and hold out the last VAL_SPLIT fraction for validation.
    indices = list(range(len(texts)))
    random.shuffle(indices)

    split = int(len(indices) * (1 - VAL_SPLIT))
    train_idx = indices[:split]
    val_idx   = indices[split:]

    train_emb = embeddings[train_idx]
    val_emb   = embeddings[val_idx]

    train_labels = {
        "moved_category":     moved_cat_labels[train_idx],
        "reference_category": ref_cat_labels[train_idx],
        "relation":           rel_labels[train_idx],
        "side":               side_labels[train_idx],
        "quantity":           qty_labels[train_idx],
        "distance":           dist_labels[train_idx],
    }

    val_labels = {
        "moved_category":     moved_cat_labels[val_idx],
        "reference_category": ref_cat_labels[val_idx],
        "relation":           rel_labels[val_idx],
        "side":               side_labels[val_idx],
        "quantity":           qty_labels[val_idx],
        "distance":           dist_labels[val_idx],
    }

    train_ds = SlotDataset(train_emb, train_labels)
    val_ds   = SlotDataset(val_emb,   val_labels)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # 6) Model: output sizes follow the vocabulary sizes.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = SlotHeadV3(
        emb_dim=emb_dim,
        num_cat=len(category_list),
        num_rel=len(relation_list),
        num_side=len(side_list),
        num_qty=len(quantity_list),
    ).to(device)

    ce_loss  = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    best_val_loss = float("inf")

    # 7) Training loop with best-checkpoint saving (selected on validation loss).
    for epoch in range(1, EPOCHS + 1):
        train_metrics = train_one_epoch(model, train_loader, ce_loss, mse_loss, optimizer, device)
        val_metrics   = eval_one_epoch(model,   val_loader,   ce_loss, mse_loss, device)

        print(
            f"[Epoch {epoch:02d}/{EPOCHS}] "
            f"TrainLoss={train_metrics['loss']:.4f} "
            f"ValLoss={val_metrics['loss']:.4f} "
            f"TrainAcc(mov/ref/rel/side/qty)=("
            f"{train_metrics['acc_moved_cat']:.3f}/"
            f"{train_metrics['acc_ref_cat']:.3f}/"
            f"{train_metrics['acc_rel']:.3f}/"
            f"{train_metrics['acc_side']:.3f}/"
            f"{train_metrics['acc_qty']:.3f}) "
            f"ValAcc(mov/ref/rel/side/qty)=("
            f"{val_metrics['acc_moved_cat']:.3f}/"
            f"{val_metrics['acc_ref_cat']:.3f}/"
            f"{val_metrics['acc_rel']:.3f}/"
            f"{val_metrics['acc_side']:.3f}/"
            f"{val_metrics['acc_qty']:.3f})"
        )

        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            # The vocab lists are saved with the weights so predicted class
            # indices can be mapped back to slot values.
            save_obj = {
                "state_dict":        model.state_dict(),
                "embedding_model_name": MODEL_NAME,
                "emb_dim":           emb_dim,
                "category_list":     category_list,
                "relation_list":     relation_list,
                "side_list":         side_list,
                "quantity_list":     quantity_list,
            }
            torch.save(save_obj, SAVE_PATH)
            print(f"  -> New best SlotHead v3 saved to {SAVE_PATH} (ValLoss={best_val_loss:.4f})")

    print("Slot v3 training finished.")


# Start slot-head training only when run as a script/cell, not when imported.
if __name__ == "__main__":
    main()


Loading data from sentences_roles_gpt_v1.jsonl ...
  -> Loaded 9152 raw items
Loading data from slot_roletagger_v1.jsonl ...
  -> Loaded 19937 raw items
Total raw items from all files: 29089
Categories: ['Chest of drawers', 'Shampo', 'airplane', 'animal', 'apple', 'armoire', 'ashcan', 'backpack', 'bag', 'ball', 'base', 'basket', 'bathtub', 'battery', 'batttery', 'beanbag', 'bear', 'bed', 'beer bottle', 'bench', 'bicycle', 'bidet', 'bird', 'blind', 'book', 'bookcase', 'booth', 'bottle', 'bowl', 'box', 'bucket', 'buffet', 'bunk bed', 'bus', 'butterfly', 'cabinet', 'cable', 'cabnet', 'cabniet', 'cage', 'calculator', 'camera', 'can opener', 'candle', 'cap', 'car', 'carrot', 'carton', 'cartonn', 'case shot', 'cassette', 'cat', 'cd player', 'cellular telephone', 'cereal box', 'cerial box', 'chair', 'chaise longue', 'chandelier', 'chest of drawers', 'chest of draws', 'china cabinet', 'chocolate', 'clock', 'coaster', 'coatrack', 'coffee table', 'coin', 'computer', 'container', 'cookie', 'count

Batches:   0%|          | 0/353 [00:00<?, ?it/s]

Embedding shape: torch.Size([22549, 768])
Using device: cpu
[Epoch 01/60] TrainLoss=10.7052 ValLoss=7.6557 TrainAcc(mov/ref/rel/side/qty)=(0.081/0.098/0.411/1.000/1.000) ValAcc(mov/ref/rel/side/qty)=(0.261/0.301/0.565/1.000/1.000)
  -> New best SlotHead v3 saved to slot_head_v3.pt (ValLoss=7.6557)
[Epoch 02/60] TrainLoss=7.0460 ValLoss=5.7878 TrainAcc(mov/ref/rel/side/qty)=(0.305/0.338/0.548/1.000/1.000) ValAcc(mov/ref/rel/side/qty)=(0.409/0.446/0.632/1.000/1.000)
  -> New best SlotHead v3 saved to slot_head_v3.pt (ValLoss=5.7878)
[Epoch 03/60] TrainLoss=5.8341 ValLoss=5.0719 TrainAcc(mov/ref/rel/side/qty)=(0.399/0.443/0.600/1.000/1.000) ValAcc(mov/ref/rel/side/qty)=(0.455/0.475/0.671/1.000/1.000)
  -> New best SlotHead v3 saved to slot_head_v3.pt (ValLoss=5.0719)
[Epoch 04/60] TrainLoss=5.2150 ValLoss=4.6943 TrainAcc(mov/ref/rel/side/qty)=(0.450/0.486/0.633/1.000/1.000) ValAcc(mov/ref/rel/side/qty)=(0.484/0.496/0.697/1.000/1.000)
  -> New best SlotHead v3 saved to slot_head_v3.pt (Val

In [None]:
!pip install -q transformers accelerate


In [None]:
# train_roletagger_v1.py
# Label schema
#   O       -> other tokens
#   B-MOVED -> first token of moved span
#   I-MOVED -> moved span's continuation tokens
#   B-REF   -> reference span's first token
#   I-REF   -> reference span continuation tokens


from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    get_linear_schedule_with_warmup,
)

from torch.optim import AdamW
from pathlib import Path
import json
import random
import torch
from torch.utils.data import Dataset, DataLoader

# Config

DATA_PATH = Path("slot_roletagger_v1.jsonl")   # JSONL with text / moved_span / reference_span
PRETRAINED_MODEL_NAME = "bert-base-uncased"    # checkpoint for both tokenizer and model
SAVE_DIR = Path("roletagger_bert_v1")          # output directory for the best model

MAX_LEN = 64        # maximum sequence length passed to the tokenizer
BATCH_SIZE = 16
EPOCHS = 6
LR = 5e-5
VAL_SPLIT = 0.2     # fraction of examples held out for validation
SEED = 42


def set_seed(seed: int = 42):
    """Make runs repeatable by seeding ``random`` and PyTorch (CPU + CUDA)."""
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)


# Data loading

def load_jsonl(path: Path):
    """Parse a UTF-8 JSONL file and return its records as a list.

    Whitespace-only lines are skipped; any other line is decoded as JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as source:
        for raw_line in source:
            payload = raw_line.strip()
            if payload:
                records.append(json.loads(payload))
    return records


# Label scheme: BIO tags for the moved and reference spans. The position of a
# tag in LABEL_LIST is its class id.
LABEL_LIST = ["O", "B-MOVED", "I-MOVED", "B-REF", "I-REF"]
LABEL2ID = {lab: i for i, lab in enumerate(LABEL_LIST)}   # tag -> class id
ID2LABEL = {i: lab for lab, i in LABEL2ID.items()}        # class id -> tag


def find_span_char_indices(text: str, span: str):
    """Locate ``span`` inside ``text``, ignoring case.

    Matching runs on the original ``text`` (not a lower-cased copy), so the
    returned offsets always index into ``text`` itself, even for characters
    whose lower-case form has a different length. A whole-word occurrence is
    preferred, so ``"bed"`` resolves to the standalone word rather than the
    start of ``"bedside"``. If there is none, the first plain substring
    occurrence is used.

    Args:
        text: sentence to search in.
        span: phrase to look for; empty or ``None`` means "no span".

    Returns:
        ``(start, end)`` character offsets (end exclusive) of the match, or
        ``None`` when ``span`` is empty or does not occur in ``text``.
    """
    import re  # local import: the surrounding cell does not import re

    if not span:
        return None

    literal = re.escape(span)

    # First choice: the span bounded by non-word characters on both sides.
    match = re.search(rf"(?<!\w){literal}(?!\w)", text, re.IGNORECASE)
    if match is None:
        # Fallback: any occurrence, as a plain case-insensitive substring.
        match = re.search(literal, text, re.IGNORECASE)
    if match is None:
        return None

    return match.span()


def build_labels_for_example(text, moved_span, ref_span, offsets):
    """Build per-token BIO label ids for one tokenized sentence.

    Args:
        text: the original sentence.
        moved_span: surface text of the moved object (may be empty/None).
        ref_span: surface text of the reference object (may be empty/None).
        offsets: one ``(char_start, char_end)`` pair per token. Pairs may be
            tuples or lists (``tensor.tolist()`` yields lists).

    Returns:
        List of label ids, one per token. Tokens fully inside the moved span
        get B-MOVED/I-MOVED, tokens fully inside the reference span get
        B-REF/I-REF (the moved span wins on overlap), everything else gets O.
        Tokens with a ``(0, 0)`` offset (special and padding tokens) get
        ``-100`` so the loss ignores them.
    """
    def _is_special(offset):
        # Compare the two components instead of ``offset == (0, 0)``: that
        # equality is always False when offsets are lists rather than tuples.
        tok_start, tok_end = offset
        return tok_start == 0 and tok_end == 0

    def _mark(span_range, role, overwrite):
        # Tag every real token that lies entirely inside ``span_range``.
        if span_range is None:
            return
        span_start, span_end = span_range
        for i, offset in enumerate(offsets):
            if _is_special(offset):
                continue
            tok_start, tok_end = offset
            inside = tok_start >= span_start and tok_end <= span_end and tok_end > tok_start
            if inside and (overwrite or token_types[i] == "O"):
                token_types[i] = role

    labels = [LABEL2ID["O"]] * len(offsets)

    moved_range = find_span_char_indices(text, moved_span)
    ref_range   = find_span_char_indices(text, ref_span)

    token_types = ["O"] * len(offsets)
    _mark(moved_range, "MOVED", overwrite=True)
    # The reference span never overrides a token already claimed by MOVED.
    _mark(ref_range, "REF", overwrite=False)

    prev_type = "O"
    for i, ttype in enumerate(token_types):
        if _is_special(offsets[i]):
            # -100 is the index ignored by the cross-entropy loss.
            labels[i] = -100
            continue

        if ttype == "O":
            labels[i] = LABEL2ID["O"]
        elif ttype == "MOVED":
            # First token of a run is B-, the following ones are I-.
            if prev_type != "MOVED":
                labels[i] = LABEL2ID["B-MOVED"]
            else:
                labels[i] = LABEL2ID["I-MOVED"]
        elif ttype == "REF":
            if prev_type != "REF":
                labels[i] = LABEL2ID["B-REF"]
            else:
                labels[i] = LABEL2ID["I-REF"]

        prev_type = ttype

    return labels

# Dataset

class RoleTaggerDataset(Dataset):
    """Tokenizer outputs plus per-token label ids, served one example at a time."""

    def __init__(self, encodings, labels):
        """
        encodings: mapping of field name -> tensor indexed by example
                   (tokenizer output: input_ids, attention_mask, token_type_ids, ...)
        labels: [N, max_len] LongTensor
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        # Take row ``idx`` of every encoding field, then attach the labels
        # under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample


# Training


def main():
    """Fine-tune a BERT token classifier that tags moved / reference spans.

    Reads ``DATA_PATH``, keeps records that have a text plus ``moved_span``
    and ``reference_span`` fields, tokenizes them with character offsets,
    derives BIO labels from the spans and trains
    ``BertForTokenClassification`` on a shuffled train/validation split. The
    model, tokenizer and a ``labels.json`` file are written to ``SAVE_DIR``
    each time the validation loss improves.

    Raises:
        FileNotFoundError: if ``DATA_PATH`` does not exist.
        RuntimeError: if no record has all three required fields.
    """
    set_seed(SEED)

    if not DATA_PATH.exists():
        raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH}")

    print(f"Loading data from {DATA_PATH} ...")
    items = load_jsonl(DATA_PATH)
    print(f"  -> Loaded {len(items)} items")

    texts = []
    moved_spans = []
    ref_spans = []

    for it in items:
        text = it.get("text", None)
        moved = it.get("moved_span", None)
        ref   = it.get("reference_span", None)

        # Spans must be present but may be empty strings; only None is rejected.
        if not text or moved is None or ref is None:
            continue

        texts.append(text)
        moved_spans.append(moved)
        ref_spans.append(ref)

    print(f"Total usable examples: {len(texts)}")
    if len(texts) == 0:
        raise RuntimeError("No usable examples with text + moved_span + reference_span")

    tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)

    # Character offsets are requested so tokens can be aligned with the spans.
    print("Tokenizing with offsets ...")
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
        return_offsets_mapping=True,
        return_tensors="pt",
    )

    # The offsets are only needed for labelling; remove them so they are not
    # passed to the model as an input.
    offsets = encodings["offset_mapping"]
    encodings.pop("offset_mapping")


    all_labels = []
    # NOTE(review): num_skipped is never incremented below, so the log line
    # always reports "skipped: 0" even when a span is not found in the text.
    num_skipped = 0

    for i in range(len(texts)):
        text = texts[i]
        moved = moved_spans[i]
        ref = ref_spans[i]
        # .tolist() yields a list of [start, end] lists (not tuples).
        offs = offsets[i].tolist()

        labels = build_labels_for_example(text, moved, ref, offs)

        all_labels.append(labels)

    print(f"Labelled examples: {len(all_labels)} (skipped: {num_skipped})")

    labels_tensor = torch.tensor(all_labels, dtype=torch.long)

    # Train/validation split: shuffle indices (seeded by set_seed above) and
    # hold out the last VAL_SPLIT fraction for validation.
    indices = list(range(labels_tensor.size(0)))
    random.shuffle(indices)

    split = int(len(indices) * (1 - VAL_SPLIT))
    train_idx = indices[:split]
    val_idx   = indices[split:]

    train_encodings = {k: v[train_idx] for k, v in encodings.items()}
    val_encodings   = {k: v[val_idx]   for k, v in encodings.items()}

    train_labels = labels_tensor[train_idx]
    val_labels   = labels_tensor[val_idx]

    train_ds = RoleTaggerDataset(train_encodings, train_labels)
    val_ds   = RoleTaggerDataset(val_encodings,   val_labels)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = BertForTokenClassification.from_pretrained(
        PRETRAINED_MODEL_NAME,
        num_labels=len(LABEL_LIST),
        id2label=ID2LABEL,
        label2id=LABEL2ID,
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=LR)

    # Linear schedule over all optimisation steps, with the first 10% as warm-up.
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    best_val_loss = float("inf")
    SAVE_DIR.mkdir(parents=True, exist_ok=True)
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_train_loss = 0.0
        # Despite the name, this counts examples (batch sizes), not tokens.
        total_train_tokens = 0

        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}

            # The batch includes "labels", so the model returns the loss itself.
            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            batch_size = batch["input_ids"].size(0)
            total_train_loss += loss.item() * batch_size
            total_train_tokens += batch_size

        avg_train_loss = total_train_loss / max(total_train_tokens, 1)

        model.eval()
        total_val_loss = 0.0
        # Despite the name, this counts examples (batch sizes), not tokens.
        total_val_tokens = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}

                outputs = model(**batch)
                loss = outputs.loss

                batch_size = batch["input_ids"].size(0)
                total_val_loss += loss.item() * batch_size
                total_val_tokens += batch_size

        avg_val_loss = total_val_loss / max(total_val_tokens, 1)

        print(
            f"[Epoch {epoch:02d}/{EPOCHS}] "
            f"TrainLoss={avg_train_loss:.4f} "
            f"ValLoss={avg_val_loss:.4f}"
        )

        # Keep only the checkpoint with the lowest validation loss so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            print(f"  -> New best model (ValLoss={best_val_loss:.4f}), saving to {SAVE_DIR}")
            model.save_pretrained(SAVE_DIR)
            tokenizer.save_pretrained(SAVE_DIR)

            # Also write the label mapping as a standalone JSON file.
            with open(SAVE_DIR / "labels.json", "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "label_list": LABEL_LIST,
                        "label2id": LABEL2ID,
                        "id2label": ID2LABEL,
                    },
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

    print("RoleTagger training finished.")


# Start role-tagger training only when run as a script/cell, not when imported.
if __name__ == "__main__":
    main()


Loading data from slot_roletagger_v1.jsonl ...
  -> Loaded 19937 items
Total usable examples: 19937


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing with offsets ...
Labelled examples: 19937 (skipped: 0)
Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 01/6] TrainLoss=0.1354 ValLoss=0.0229
  -> New best model (ValLoss=0.0229), saving to roletagger_bert_v1
[Epoch 02/6] TrainLoss=0.0235 ValLoss=0.0211
  -> New best model (ValLoss=0.0211), saving to roletagger_bert_v1
[Epoch 03/6] TrainLoss=0.0210 ValLoss=0.0225
[Epoch 04/6] TrainLoss=0.0188 ValLoss=0.0230
[Epoch 05/6] TrainLoss=0.0162 ValLoss=0.0252
[Epoch 06/6] TrainLoss=0.0112 ValLoss=0.0291
RoleTagger training finished.


In [None]:
# -*- coding: utf-8 -*-
# generate_relation_bio_dataset_v2.py

!pip -q install openai transformers

import os, re, json, random
from pathlib import Path
from collections import Counter
from openai import OpenAI
from transformers import BertTokenizerFast

# -------------------------
# API KEY
# -------------------------
# main() reads the key back from os.environ when it builds the OpenAI client.
# NOTE: this assignment overwrites any OPENAI_API_KEY already present in the
# environment, so the value must be filled in here before running.
os.environ["OPENAI_API_KEY"] = ""

# -------------------------
# Config
# -------------------------
# Output JSONL file (one tagged example per line) and the chat model to query.
OUT_PATH = Path("relation_bio_synth_v2.jsonl")
MODEL_NAME = "gpt-4.1-mini"
# Seeds Python's `random`, which drives every sampling decision in build_jobs().
SEED = 42
random.seed(SEED)

# Intents sampled for each scenario.
INTENTS = ["CreateObject", "PlaceObject"]

# Spatial relation labels; one group of SAMPLES_PER_REL jobs is built per entry.
RELATIONS = ["behind","between","in_front_of","inside","near","next_to","on_top_of","under"]

# Object categories sampled as the moved object and as reference object(s).
CATEGORIES = [
    "chair", "table", "desk", "bed", "sofa", "lamp", "bookshelf", "cabinet",
    "wardrobe", "dresser", "mirror", "carpet", "pillow", "blanket", "plant",
    "vase", "picture_frame", "tv", "monitor", "keyboard", "mouse", "laptop",
    "speaker", "clock", "fan", "air_conditioner", "heater", "window", "door",
    "shelf", "bench", "stool", "cushion", "coffee_table", "side_table",
    "nightstand", "bathtub", "toilet", "sink", "fridge", "oven", "microwave",
    "chair_mat", "coat_rack", "shoe_rack", "broom", "bucket", "trash_bin",
    "cupboard", "pan", "plate", "bottle", "mug", "guitar", "piano", "fence",
    "tree", "rock", "flower_pot"
]

# Jobs requested per relation (8 relations x 1200 = 9600 requested scenarios).
SAMPLES_PER_REL = 1200   # total ~ 9600
# Number of scenarios packed into a single API request.
BATCH_SIZE = 20


# Tokenizer used by spans_to_bio_tags(); its name is also written into every
# output record under "tokenizer_name".
TOKENIZER_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_NAME)


# Cue variants
# relation -> list of (surface phrase, probability) pairs consumed by
# weighted_choice(); the probabilities of each relation sum to 1.
CUE_VARIANTS = {
    "on_top_of": [("on top of", 0.50), ("on", 0.50)],
    "inside":    [("inside", 0.50), ("in", 0.50)],
    "in_front_of":[("in front of", 1.00)],
    "behind":    [("behind", 1.00)],
    "under":     [("under", 0.70), ("underneath", 0.30)],
    "next_to":   [("next to", 0.60), ("beside", 0.40)],
    "near":      [("near", 0.60), ("close to", 0.40)],
    "between":   [("between", 1.00)],
}

def weighted_choice(pairs):
    """Pick one item from ``pairs`` of ``(item, weight)`` using a single draw.

    A uniform number in [0, 1) is compared against the running sum of the
    weights; the first item whose cumulative weight reaches the draw wins.
    If the weights never reach the draw, the last item is returned.
    """
    draw = random.random()
    cumulative = 0.0
    for candidate, weight in pairs:
        cumulative += weight
        if draw <= cumulative:
            return candidate
    # Weights summed to less than the draw: fall back to the final entry.
    last_item, _ = pairs[-1][0], None
    return last_item


# LLM prompt
# System message sent with every batch request in call_batch(). It fixes the
# per-line JSON schema that validate_llm_record() later checks and spells out
# the cue rules for the "between", "on_top_of" and "inside" relations.

SYSTEM_PROMPT = r"""
You generate clean relation-extraction training examples for text-to-3D commands.

Return ONLY JSONL: one JSON per line, no extra text.

Each JSON schema:
{
  "text": str,
  "gold": {
    "intent": "CreateObject"|"PlaceObject",
    "moved_category": str,
    "reference_category": str,
    "reference_category_2": str|null,
    "relation": str
  },
  "spans": {
    "moved_span": str,
    "reference_span": str,
    "cue_span": str
  }
}

Rules:
- Exactly ONE relation cue in the sentence (no "inside the room", no extra "on the wall", etc.)
- moved_span and reference_span must be exact substrings of text.
- cue_span must be exact substring of text and must match the given relation and cue request.
- Keep the sentence short, imperative, unambiguous.
- For between:
  - cue_span must be "between"
  - reference_span must contain TWO objects connected by "and" (single phrase)
  - gold.reference_category_2 must be non-null
- For on_top_of:
  - cue_span is either "on top of" OR "on" (as a preposition: "on the/a/an ...")
- For inside:
  - cue_span is either "inside" OR "in" (as a preposition: "in the/a/an ...")
"""

def build_jobs():
    """Build the shuffled list of generation scenarios.

    For every relation, SAMPLES_PER_REL jobs are sampled. Each job holds an
    intent, a moved category, one reference category (two distinct ones for
    "between", otherwise the second is None), the relation and the cue phrase
    the LLM is asked to use.
    """
    jobs = []
    for rel in RELATIONS:
        wants_second_ref = rel == "between"
        for _ in range(SAMPLES_PER_REL):
            # The order of the random draws below is part of the seeded behavior.
            intent = random.choice(INTENTS)
            moved = random.choice(CATEGORIES)
            first_ref = random.choice([c for c in CATEGORIES if c != moved])
            second_ref = None
            if wants_second_ref:
                second_ref = random.choice(
                    [c for c in CATEGORIES if c not in (moved, first_ref)]
                )
            cue = weighted_choice(CUE_VARIANTS[rel])
            jobs.append({
                "intent": intent,
                "moved_category": moved,
                "reference_category": first_ref,
                "reference_category_2": second_ref,
                "relation": rel,
                "cue": cue,
            })
    random.shuffle(jobs)
    return jobs

def build_user_prompt(batch_jobs):
    """Render a batch of jobs as the numbered scenario list sent to the LLM.

    The result starts with a fixed two-line header, followed by one line per
    job (numbered from 1) listing intent, categories, relation and the
    required cue phrase.
    """
    header = "Generate one JSON per scenario.\nScenarios:"
    scenario_lines = [
        f"{number}. intent={job['intent']}; moved={job['moved_category']}; "
        f"ref1={job['reference_category']}; ref2={job['reference_category_2']}; "
        f"relation={job['relation']}; REQUIRED_CUE='{job['cue']}'"
        for number, job in enumerate(batch_jobs, start=1)
    ]
    return "\n".join([header, *scenario_lines])

def call_batch(client, batch_jobs):
    """Request one batch of examples from the LLM and parse the JSONL reply.

    Args:
        client: OpenAI client exposing ``responses.create``.
        batch_jobs: list of job dicts as produced by build_jobs().

    Returns:
        List of dict records, one per reply line that parsed as a JSON object.
        Lines that are not valid JSON (or are not objects) are skipped.
    """
    prompt = build_user_prompt(batch_jobs)
    resp = client.responses.create(
        model=MODEL_NAME,
        input=[{"role":"system","content":SYSTEM_PROMPT},
               {"role":"user","content":prompt}],
        max_output_tokens=4096,
    )
    # `output_text` concatenates every text part of the response, so an empty
    # output list or a first item without a `.text` part (e.g. a refusal) no
    # longer raises and aborts the whole generation run.
    txt = resp.output_text or ""

    recs = []
    for ln in txt.splitlines():
        ln = ln.strip()
        if not ln:
            continue
        # Tolerate replies that number their lines ("3. {...}").
        if ln[0].isdigit() and "{" in ln:
            ln = ln[ln.index("{"):]
        try:
            rec = json.loads(ln)
        except json.JSONDecodeError:
            # Best effort: code fences, truncated lines, stray prose.
            continue
        # A bare number or string is valid JSON but not a record.
        if isinstance(rec, dict):
            recs.append(rec)
    return recs


# BIO tagging via offsets

# Tag inventory written into the output records by spans_to_bio_tags():
# B-/I- pairs for the moved object, the reference phrase and the relation cue.
LABELS = ["O","B-MOVED","I-MOVED","B-REF","I-REF","B-CUE","I-CUE"]
# Tag name -> integer id, in LABELS order.
label2id = {l:i for i,l in enumerate(LABELS)}

def char_span(text, span):
    """Locate ``span`` inside ``text`` (case-insensitive).

    A whole-word occurrence is preferred, so that e.g. "the table" is not
    matched inside "the tablet" and "pan" is not matched inside "saucepan"
    when a standalone occurrence exists. If there is no whole-word match the
    first plain substring occurrence is used.

    Returns:
        ``(start, end)`` character offsets into ``text``, or ``None`` when
        ``span`` is empty or does not occur.
    """
    if not span:
        return None

    # Search the original text so the offsets line up with tokenizer offsets.
    whole_word = re.search(
        r"(?<!\w)" + re.escape(span) + r"(?!\w)",
        text,
        flags=re.IGNORECASE,
    )
    if whole_word is not None:
        return whole_word.span()

    # Fallback: first substring hit, even inside a longer word.
    i = text.lower().find(span.lower())
    if i < 0:
        return None
    return (i, i+len(span))

def spans_to_bio_tags(text, moved_span, ref_span, cue_span):
    """Tokenize ``text`` and project the three spans onto BIO tags.

    Spans are applied in the order moved, reference, cue; a later span
    overwrites the tags of an earlier one where they share tokens.

    Returns:
        ``(ok, tokens, tags)`` where ``ok`` is True only if every span was
        found in the text and covered at least one token.
    """
    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=False
    )
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    offsets = encoding["offset_mapping"]
    tags = ["O" for _ in tokens]

    def label_span(span, begin_tag, inside_tag):
        bounds = char_span(text, span)
        if bounds is None:
            return False
        start, end = bounds
        # A token is covered when its character range intersects the span.
        covered = [
            idx
            for idx, (tok_start, tok_end) in enumerate(offsets)
            if tok_end > start and tok_start < end
        ]
        if not covered:
            return False
        first, *rest = covered
        tags[first] = begin_tag
        for idx in rest:
            tags[idx] = inside_tag
        return True

    # All three spans are always attempted, even if an earlier one failed.
    outcomes = [
        label_span(moved_span, "B-MOVED", "I-MOVED"),
        label_span(ref_span, "B-REF", "I-REF"),
        label_span(cue_span, "B-CUE", "I-CUE"),
    ]
    return all(outcomes), tokens, tags


# Validation

def validate_llm_record(rec):
    """Check one parsed LLM record against the generation rules.

    Args:
        rec: parsed JSON object with "text", "gold" and "spans" entries.

    Returns:
        ``(True, "ok")`` when the record is usable, otherwise
        ``(False, reason)`` where ``reason`` is a short drop label.
        Records with missing keys or wrongly typed fields yield
        ``(False, "parse_error")``.
    """
    try:
        text = rec["text"]
        gold = rec["gold"]
        spans = rec["spans"]
        rel = gold["relation"]
        cue = spans["cue_span"].lower().strip()
        low = text.lower()

        if rel not in RELATIONS:
            return False, "bad_relation"

        # cue must appear exactly once (plain substring count, so a cue such
        # as "in" is also counted inside longer words like "sink")
        if cue and low.count(cue) != 1:
            return False, "cue_not_once"

        if rel == "between":
            if cue != "between" or " and " not in low:
                return False, "bad_between"
            if not gold.get("reference_category_2"):
                return False, "between_missing_ref2"
        else:
            # Every relation must use one of its configured cue phrases, not
            # only on_top_of / inside.
            variants = CUE_VARIANTS.get(rel)
            if variants is not None and cue not in {phrase for phrase, _ in variants}:
                return False, f"cue_mismatch_{rel}"
            # Short cues must be used as prepositions before an article.
            if rel == "on_top_of" and cue == "on" and not re.search(r"\bon\s+(the|a|an)\b", low):
                return False, "on_not_preposition"
            if rel == "inside" and cue == "in" and not re.search(r"\bin\s+(the|a|an)\b", low):
                return False, "in_not_preposition"

        # spans must exist in text
        for k in ["moved_span","reference_span","cue_span"]:
            if char_span(text, spans.get(k,"")) is None:
                return False, f"{k}_not_in_text"

        return True, "ok"
    except (KeyError, TypeError, AttributeError):
        # Malformed record shape; genuine programming errors still surface.
        return False, "parse_error"

def main():
    """Generate the synthetic relation dataset and write it to OUT_PATH.

    Builds all jobs, sends them to the LLM in batches of BATCH_SIZE, validates
    and BIO-tags each returned record, writes the accepted ones as JSON lines,
    and finally prints the kept/dropped statistics.
    """
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    jobs = build_jobs()

    kept = 0
    dropped = Counter()      # drop reason -> count
    rel_kept = Counter()     # relation -> kept count
    cue_kept = Counter()     # (relation, cue phrase) -> kept count

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Opened in "w" mode: an existing output file is overwritten.
    with OUT_PATH.open("w", encoding="utf-8") as f:
        for bi in range(0, len(jobs), BATCH_SIZE):
            batch = jobs[bi:bi+BATCH_SIZE]
            recs = call_batch(client, batch)

            for rec in recs:
                ok, reason = validate_llm_record(rec)
                if not ok:
                    dropped[reason] += 1
                    continue

                text = rec["text"]
                spans = rec["spans"]
                ok_bio, tokens, tags = spans_to_bio_tags(
                    text,
                    spans["moved_span"],
                    spans["reference_span"],
                    spans["cue_span"],
                )
                if not ok_bio:
                    dropped["bio_align_fail"] += 1
                    continue

                out = {
                    "text": text,
                    "tokens": tokens,
                    "tags": tags,
                    "gold": rec["gold"],
                    "spans": rec["spans"],
                    "tokenizer_name": TOKENIZER_NAME,
                }

                rel = out["gold"]["relation"]
                cue = out["spans"]["cue_span"].lower().strip()
                rel_kept[rel] += 1
                cue_kept[(rel,cue)] += 1

                f.write(json.dumps(out, ensure_ascii=False) + "\n")
                kept += 1

            # Progress line every 10th batch.
            if (bi // BATCH_SIZE + 1) % 10 == 0:
                print(f"Batch {bi//BATCH_SIZE+1:>4} | kept={kept} | dropped={sum(dropped.values())}")

    print("\nDONE ->", OUT_PATH)
    print("\nKept relation distribution:")
    for r in RELATIONS:
        print(f"  {r:<12}: {rel_kept[r]}")

    # Only the two relations with a short/long cue pair are broken down;
    # rows are sorted by relation, then by descending count.
    print("\nCue breakdown (on_top_of / inside):")
    for (rel,cue),cnt in sorted(cue_kept.items(), key=lambda x:(x[0][0],-x[1])):
        if rel in ["on_top_of","inside"]:
            print(f"  rel={rel:<9} cue='{cue:<10}' -> {cnt}")

    if dropped:
        print("\nDropped reasons (top 15):")
        for k,v in dropped.most_common(15):
            print(f"  {k:<20}: {v}")

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Batch   10 | kept=197 | dropped=3
Batch   20 | kept=394 | dropped=6
Batch   30 | kept=590 | dropped=10
Batch   40 | kept=787 | dropped=13
Batch   50 | kept=985 | dropped=15
Batch   60 | kept=1182 | dropped=18
Batch   70 | kept=1379 | dropped=21
Batch   80 | kept=1576 | dropped=24
Batch   90 | kept=1773 | dropped=27
Batch  100 | kept=1970 | dropped=30
Batch  110 | kept=2165 | dropped=35
Batch  120 | kept=2361 | dropped=39
Batch  130 | kept=2556 | dropped=44
Batch  140 | kept=2755 | dropped=45
Batch  150 | kept=2950 | dropped=50
Batch  160 | kept=3147 | dropped=53
Batch  170 | kept=3345 | dropped=55
Batch  180 | kept=3545 | dropped=55
Batch  190 | kept=3738 | dropped=62
Batch  200 | kept=3937 | dropped=63
Batch  210 | kept=4134 | dropped=66
Batch  220 | kept=4333 | dropped=67
Batch  230 | kept=4529 | dropped=71
Batch  240 | kept=4727 | dropped=73
Batch  250 | kept=4923 | dropped=77
Batch  260 | kept=5118 | dropped=82
Batch  270 | kept=5314 | dropped=86
Batch  280 | kept=5512 | dropped=88

In [None]:
# -*- coding: utf-8 -*-
# train_relation_cue_tagger_v1.py

import json
import random
from pathlib import Path
from typing import List, Dict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW



# Configuration


# Input dataset (JSON lines with "text", "tokens", "tags") and the folder the
# best checkpoint, tokenizer and JSON side files are written to.
DATA_PATH = Path("relation_bio_synth_v2.jsonl")
OUT_DIR   = Path("relation_cue_tagger_bert_v1")

# Must match the tokenizer used during dataset generation
PRETRAINED = "bert-base-uncased"

# Fixed sequence length (including special tokens) of every training example.
MAX_LEN = 96

SEED = 42
# Fraction of the shuffled dataset held out for validation.
VAL_SPLIT = 0.2

BATCH_SIZE = 16
EPOCHS = 5
LR = 3e-5
WEIGHT_DECAY = 0.01
# Fraction of all optimizer steps used for linear learning-rate warmup.
WARMUP_RATIO = 0.06
# Gradient-norm clipping threshold.
MAX_GRAD_NORM = 1.0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only cue-related BIO labels are trained
LABEL_LIST = ["O", "B-CUE", "I-CUE"]
label2id = {l: i for i, l in enumerate(LABEL_LIST)}
id2label = {i: l for l, i in label2id.items()}



# Utility functions


def set_seed(seed: int = 42):
    """Seed Python's ``random`` and PyTorch (CPU and all CUDA devices)."""
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)


def load_jsonl(path: Path) -> List[dict]:
    """Read a JSON-lines file, skipping blank lines, and return the records."""
    with path.open("r", encoding="utf-8") as handle:
        stripped_rows = (raw.strip() for raw in handle)
        return [json.loads(row) for row in stripped_rows if row]


def to_cue_only_tags(tags: List[str]) -> List[str]:
    """
    Reduce a full BIO tag sequence to the cue labels.
    B-CUE and I-CUE pass through unchanged; every other tag becomes O.
    """
    cue_labels = ("B-CUE", "I-CUE")
    return [tag if tag in cue_labels else "O" for tag in tags]


def build_cue_mapping() -> Dict[str, str]:
    """
    Return the lookup from normalized cue phrase to relation label.
    The tagger only finds the cue span; this table turns it into a relation.
    """
    cues_by_relation = (
        ("on_top_of", ("on top of", "on")),
        ("inside", ("inside", "in")),
        ("in_front_of", ("in front of",)),
        ("behind", ("behind",)),
        ("under", ("under", "underneath")),
        ("next_to", ("next to", "beside")),
        ("near", ("near", "close to")),
        ("between", ("between",)),
    )
    # Flattened in declaration order so the saved JSON keeps the same key order.
    return {
        phrase: relation
        for relation, phrases in cues_by_relation
        for phrase in phrases
    }



# Dataset


class CueTokenDataset(Dataset):
    """
    Dataset for cue-only BIO token classification.
    Uses pre-generated WordPiece tokens and tags from the dataset.

    Each example is returned as fixed-length tensors of size ``max_len``:
    ``input_ids``, ``attention_mask`` and ``labels`` (``-100`` on special
    and padding positions so the loss ignores them).
    """

    def __init__(self, items: List[dict], tokenizer: BertTokenizerFast, max_len: int):
        self.items = items
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx: int):
        it = self.items[idx]
        text = it.get("text", "")

        # Tokens and tags were generated without special tokens
        tokens = it.get("tokens", [])
        tags   = it.get("tags", [])

        # Fallback (should rarely happen): re-tokenize the raw text and
        # treat every token as outside a cue.
        if not tokens or not tags or len(tokens) != len(tags):
            enc = self.tokenizer(
                text,
                add_special_tokens=False,
                return_offsets_mapping=False,
            )
            tokens = self.tokenizer.convert_ids_to_tokens(enc["input_ids"])
            tags = ["O"] * len(tokens)

        cue_tags = to_cue_only_tags(tags)

        # The stored tokens are already WordPiece pieces. Map them straight to
        # vocabulary ids instead of running them through the tokenizer again:
        # re-tokenizing would split continuation pieces such as "##helf" at
        # the '#' characters, so training inputs would not match what the
        # model sees when raw text is tokenized at inference time.
        room = max(self.max_len - 2, 0)  # leave space for [CLS] and [SEP]
        piece_ids = list(self.tokenizer.convert_tokens_to_ids(list(tokens)))[:room]
        piece_labels = [label2id.get(t, 0) for t in cue_tags][:room]

        ids = [self.tokenizer.cls_token_id] + piece_ids + [self.tokenizer.sep_token_id]
        label_ids = [-100] + piece_labels + [-100]
        mask = [1] * len(ids)

        # Right-pad everything to max_len.
        pad = self.max_len - len(ids)
        if pad > 0:
            ids += [self.tokenizer.pad_token_id] * pad
            mask += [0] * pad
            label_ids += [-100] * pad

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long),
        }


def collate_fn(batch):
    """Stack the per-example tensors of a batch along a new leading dimension."""
    field_names = ("input_ids", "attention_mask", "labels")
    return {
        name: torch.stack([example[name] for example in batch], dim=0)
        for name in field_names
    }



# Training / Evaluation


@torch.no_grad()
def evaluate(model, loader):
    """Run the model over ``loader`` in eval mode.

    Returns:
        ``(avg_loss, token_accuracy)``: the batch-size-weighted loss divided
        by the dataset size, and the accuracy over positions whose label is
        not ``-100``.
    """
    model.eval()
    loss_sum = 0.0
    scored_positions = 0
    correct_positions = 0

    for raw_batch in loader:
        batch = {name: tensor.to(DEVICE) for name, tensor in raw_batch.items()}
        outputs = model(**batch)

        batch_size = batch["input_ids"].size(0)
        loss_sum += float(outputs.loss.item()) * batch_size

        gold = batch["labels"]
        predicted = outputs.logits.argmax(dim=-1)  # [B, L]
        scored = gold != -100  # ignore special/padding positions

        correct_positions += int(((predicted == gold) & scored).sum().item())
        scored_positions += int(scored.sum().item())

    dataset_size = max(len(loader.dataset), 1)
    return loss_sum / dataset_size, correct_positions / max(scored_positions, 1)



# Main


def main():
    """Fine-tune BERT as a cue-span tagger and save the best checkpoint.

    Loads DATA_PATH, shuffles and splits it into train/validation sets,
    trains for EPOCHS epochs, and whenever validation token accuracy
    improves writes the model, tokenizer, label list and cue mapping to
    OUT_DIR.

    Raises:
        FileNotFoundError: if DATA_PATH does not exist.
        RuntimeError: if the dataset contains no records.
    """
    set_seed(SEED)

    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

    items = load_jsonl(DATA_PATH)
    if not items:
        raise RuntimeError("Dataset is empty.")

    # Seeded shuffle, then the first VAL_SPLIT fraction becomes validation.
    random.shuffle(items)
    n_total = len(items)
    n_val = int(n_total * VAL_SPLIT)

    val_items = items[:n_val]
    train_items = items[n_val:]

    print(f"[Data] total={n_total} train={len(train_items)} val={len(val_items)}")
    print(f"[Labels] {LABEL_LIST}")

    tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED)

    train_ds = CueTokenDataset(train_items, tokenizer, MAX_LEN)
    val_ds   = CueTokenDataset(val_items, tokenizer, MAX_LEN)

    train_loader = DataLoader(
        train_ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
    )

    model = BertForTokenClassification.from_pretrained(
        PRETRAINED,
        num_labels=len(LABEL_LIST),
        id2label=id2label,
        label2id=label2id,
    ).to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # One scheduler step per optimizer step, so the schedule spans all epochs.
    total_steps = len(train_loader) * EPOCHS
    warmup_steps = int(total_steps * WARMUP_RATIO)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    best_val_tok_acc = -1.0

    print("[Train] Starting...")
    for epoch in range(1, EPOCHS + 1):
        # evaluate() leaves the model in eval mode, so switch back each epoch.
        model.train()
        running_loss = 0.0
        seen = 0

        for batch in train_loader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            out = model(**batch)
            loss = out.loss

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            optimizer.step()
            scheduler.step()

            # Weight the batch loss by batch size for a per-example average.
            running_loss += float(loss.item()) * batch["input_ids"].size(0)
            seen += int(batch["input_ids"].size(0))

        train_loss = running_loss / max(seen, 1)
        val_loss, val_tok_acc = evaluate(model, val_loader)

        print(
            f"Epoch {epoch:02d} | "
            f"train_loss={train_loss:.4f} | "
            f"val_loss={val_loss:.4f} | "
            f"val_tok_acc={val_tok_acc:.4f}"
        )

        # Strict ">" means a tie keeps the earlier checkpoint.
        if val_tok_acc > best_val_tok_acc:
            best_val_tok_acc = val_tok_acc

            OUT_DIR.mkdir(parents=True, exist_ok=True)
            model.save_pretrained(OUT_DIR)
            tokenizer.save_pretrained(OUT_DIR)

            # Side files read back by the inference cell (load_cue_tagger).
            with (OUT_DIR / "labels.json").open("w", encoding="utf-8") as f:
                json.dump(
                    {"label_list": LABEL_LIST},
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

            with (OUT_DIR / "cue_mapping.json").open("w", encoding="utf-8") as f:
                json.dump(
                    build_cue_mapping(),
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

            print(f"[Save] Best model saved to {OUT_DIR}")

    print("\nDONE.")
    print(f"Best validation token accuracy: {best_val_tok_acc:.4f}")
    print(f"Output folder: {OUT_DIR}")
    print("Model can be loaded with BertForTokenClassification.from_pretrained(OUT_DIR)")


if __name__ == "__main__":
    main()


[Data] total=9448 train=7559 val=1889
[Labels] ['O', 'B-CUE', 'I-CUE']


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Train] Starting...
Epoch 01 | train_loss=0.0747 | val_loss=0.0000 | val_tok_acc=1.0000
[Save] Best model saved to relation_cue_tagger_bert_v1
Epoch 02 | train_loss=0.0001 | val_loss=0.0000 | val_tok_acc=1.0000
Epoch 03 | train_loss=0.0001 | val_loss=0.0000 | val_tok_acc=1.0000
Epoch 04 | train_loss=0.0000 | val_loss=0.0000 | val_tok_acc=1.0000
Epoch 05 | train_loss=0.0000 | val_loss=0.0000 | val_tok_acc=1.0000

DONE.
Best validation token accuracy: 1.0000
Output folder: relation_cue_tagger_bert_v1
Model can be loaded with BertForTokenClassification.from_pretrained(OUT_DIR)


In [None]:
# Colab test cell: Relation cue tagger + mapping inference

import json
import re
from pathlib import Path
from typing import Optional, Tuple, Dict, List

import torch
from transformers import BertTokenizerFast, BertForTokenClassification

# Folder holding the saved model, tokenizer, labels.json and cue_mapping.json.
MODEL_DIR = Path("relation_cue_tagger_bert_v1")  # folder created by training script
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Truncation length for inference inputs; same value as the training script's MAX_LEN.
MAX_LEN = 96

def load_json(path: Path) -> dict:
    """Parse the UTF-8 JSON file at ``path`` and return its content."""
    with path.open("r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def normalize_cue(cue: str) -> str:
    """Lower-case ``cue``, trim it, and collapse whitespace runs to one space."""
    trimmed = cue.strip().lower()
    return re.sub(r"\s+", " ", trimmed)

def load_cue_tagger(folder: Path):
    """Load the saved cue tagger and its side files from ``folder``.

    Returns:
        ``(tokenizer, model, label_list, cue_mapping)``. The label list comes
        from ``labels.json`` when present, otherwise from the model config;
        the cue mapping is empty when ``cue_mapping.json`` is missing.
    """
    tokenizer = BertTokenizerFast.from_pretrained(folder)
    model = BertForTokenClassification.from_pretrained(folder).to(DEVICE)
    model.eval()

    labels_file = folder / "labels.json"
    if not labels_file.exists():
        config = model.config
        label_list = [config.id2label[i] for i in range(config.num_labels)]
    else:
        label_list = load_json(labels_file)["label_list"]

    cue_mapping = {}
    mapping_file = folder / "cue_mapping.json"
    if mapping_file.exists():
        cue_mapping = load_json(mapping_file)

    return tokenizer, model, label_list, cue_mapping

@torch.no_grad()
def predict_cue_span(
    text: str,
    tokenizer,
    model,
    label_list: List[str],
    max_len: int = 96
) -> Tuple[Optional[str], float]:
    """Tag ``text`` and return the first predicted cue span.

    The span starts at the first B-CUE token and extends over the I-CUE
    tokens that directly follow it; anything after that is ignored.

    Returns:
        ``(cue_text, confidence)`` where ``cue_text`` is the normalized span
        and ``confidence`` is the mean top probability over its tokens, or
        ``(None, 0.0)`` for blank input or when no cue is predicted.
    """
    if not text.strip():
        return None, 0.0

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
    )
    encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    token_probs = torch.softmax(model(**encoded).logits[0], dim=-1)  # [L, C]
    predicted_labels = [label_list[i] for i in token_probs.argmax(dim=-1).tolist()]
    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())

    span_pieces = []
    span_confs = []
    in_span = False

    for piece, tag, dist in zip(pieces, predicted_labels, token_probs.tolist()):
        if piece == "[CLS]" or piece == "[SEP]":
            continue

        opens_span = tag == "B-CUE"
        extends_span = tag == "I-CUE" and in_span

        # A second B-CUE ends the scan: only the first span is reported.
        if opens_span and span_pieces:
            break

        if opens_span or extends_span:
            in_span = True
            span_pieces.append(piece)
            span_confs.append(max(dist))
            continue

        # Any other label closes an open span.
        if in_span:
            break

    if not span_pieces:
        return None, 0.0

    cue_text = normalize_cue(tokenizer.convert_tokens_to_string(span_pieces).strip())
    conf = float(sum(span_confs) / max(len(span_confs), 1))
    return cue_text, conf

def cue_to_relation(cue_text: Optional[str], cue_mapping: Dict[str, str]) -> str:
    """Map a cue phrase to a relation label, or ``"none"`` if it is unknown.

    The normalized cue is looked up in ``cue_mapping`` first; a small set of
    built-in rules covers the case where the mapping has no entry for it.
    """
    if not cue_text:
        return "none"

    normalized = normalize_cue(cue_text)
    if normalized in cue_mapping:
        return cue_mapping[normalized]

    # conservative fallback: multi-word phrases may appear inside a longer
    # cue, the one-word cues must match exactly
    for phrase, relation in (("in front of", "in_front_of"), ("on top of", "on_top_of")):
        if phrase in normalized:
            return relation

    exact_fallbacks = {"on": "on_top_of", "in": "inside"}
    return exact_fallbacks.get(normalized, "none")

@torch.no_grad()
def predict_relation(text: str, tokenizer, model, label_list, cue_mapping) -> Tuple[str, Optional[str], float]:
    """Predict the cue span of ``text`` and translate it into a relation.

    Returns ``(relation, cue_text, confidence)``.
    """
    detected_cue, confidence = predict_cue_span(
        text, tokenizer, model, label_list, max_len=MAX_LEN
    )
    relation = cue_to_relation(detected_cue, cue_mapping)
    return relation, detected_cue, confidence

# Load
# Loads the model, tokenizer and side files once at cell level.
tokenizer, model, label_list, cue_mapping = load_cue_tagger(MODEL_DIR)
print("Loaded:", MODEL_DIR)
print("Labels:", label_list)
# Only the first ten mapping keys are shown.
print("Mapping keys:", list(cue_mapping.keys())[:10], "...")

# Examples
# One or more sentences per cue phrase in the mapping, plus "Create a chair."
# which contains no relation cue.
tests = [
    "put the cup on the table",
    "Put the book on top of the desk.",
    "Place the vase inside the cabinet.",
    "Put the plant in the room.",
    "Move the chair in front of the sofa.",
    "Move the lamp behind the bed.",
    "Put the keyboard under the monitor.",
    "Place the stool next to the desk.",
    "Move the chair beside the table.",
    "Put the cup near the laptop.",
    "Put the chair close to the table.",
    "Move the plant between the sofa and the table.",
    "Create a chair.",
]

# Print the detected cue, its confidence and the mapped relation per sentence.
for s in tests:
    rel, cue, conf = predict_relation(s, tokenizer, model, label_list, cue_mapping)
    print(f"\nTEXT: {s}")
    print(f"  cue: {cue}  | conf: {conf:.3f}")
    print(f"  relation: {rel}")


Loaded: relation_cue_tagger_bert_v1
Labels: ['O', 'B-CUE', 'I-CUE']
Mapping keys: ['on top of', 'on', 'inside', 'in', 'in front of', 'behind', 'under', 'underneath', 'next to', 'beside'] ...

TEXT: put the cup on the table
  cue: on  | conf: 1.000
  relation: on_top_of

TEXT: Put the book on top of the desk.
  cue: on top of  | conf: 1.000
  relation: on_top_of

TEXT: Place the vase inside the cabinet.
  cue: inside  | conf: 1.000
  relation: inside

TEXT: Put the plant in the room.
  cue: in  | conf: 1.000
  relation: inside

TEXT: Move the chair in front of the sofa.
  cue: in front of  | conf: 1.000
  relation: in_front_of

TEXT: Move the lamp behind the bed.
  cue: behind  | conf: 1.000
  relation: behind

TEXT: Put the keyboard under the monitor.
  cue: under  | conf: 1.000
  relation: under

TEXT: Place the stool next to the desk.
  cue: next to  | conf: 1.000
  relation: next_to

TEXT: Move the chair beside the table.
  cue: beside  | conf: 1.000
  relation: next_to

TEXT: Put t

In [None]:
from pathlib import Path
from google.colab import files

# Download the generated dataset through the Colab `files` helper.
files.download("relation_bio_synth_v2.jsonl")

# Download every regular file directly inside the tagger output folder
# (sub-directories are skipped, not recursed into).
for f in Path("relation_cue_tagger_bert_v1").iterdir():
    if f.is_file():
        files.download(str(f))


In [None]:
# -*- coding: utf-8 -*-
# train_relation_head_v2.py
# Training for relations -- OLD CODE

import json
import random
from pathlib import Path
from collections import Counter, defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

# Config

# Input dataset: JSON lines with "text" and gold_command.args.relation.
DATA_PATH = Path("sentences_roles_relation_synth_v1.jsonl")

# Sentence encoder used to embed the texts, and the path for the trained head.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
SAVE_PATH  = Path("relation_head_v1.pt")

BATCH_SIZE = 64
EPOCHS     = 30
LR         = 1e-3
# Fraction of each class held out for validation (see stratified_split).
VAL_SPLIT  = 0.2
SEED       = 42

# Relation classes; a relation's list position is its class id.
RELATIONS = [
    "behind",
    "between",
    "in_front_of",
    "inside",
    "near",
    "next_to",
    "on_top_of",
    "under",
]
RELATION2ID = {name: i for i, name in enumerate(RELATIONS)}
NUM_REL = len(RELATIONS)


def set_seed(seed=42):
    """Seed Python's ``random`` and PyTorch (CPU and all CUDA devices)."""
    for apply_seed in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)


def load_jsonl(path):
    """Read a JSON-lines file and return its records, ignoring blank lines."""
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records



class RelationDataset(Dataset):
    """Pairs precomputed embeddings with their relation labels, index by index."""

    def __init__(self, emb, labels):
        self.emb, self.labels = emb, labels

    def __len__(self):
        n_rows = self.emb.size(0)
        return n_rows

    def __getitem__(self, idx):
        features = self.emb[idx]
        target = self.labels[idx]
        return features, target


class RelationHead(nn.Module):
    """MLP classifier head: emb_dim -> 256 -> 128 -> num_relations logits.

    Each hidden layer is followed by ReLU and dropout (p=0.3).
    """

    def __init__(self, emb_dim, num_relations):
        super().__init__()
        layers = []
        width = emb_dim
        for hidden_width in (256, 128):
            layers += [nn.Linear(width, hidden_width), nn.ReLU(), nn.Dropout(0.3)]
            width = hidden_width
        layers.append(nn.Linear(width, num_relations))
        # Same layer order as before, so state-dict keys stay net.0 / net.3 / net.6.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits


# Helpers

def stratified_split(labels, val_split=0.2):
    """Split example indices into train/validation lists, class by class.

    Within every class the indices are shuffled and the last ``val_split``
    share goes to validation; both resulting lists are shuffled again.

    Returns:
        ``(train_idx, val_idx)`` as lists of integer positions into ``labels``.
    """
    by_class = defaultdict(list)
    for position, class_id in enumerate(labels.tolist()):
        by_class[class_id].append(position)

    train_idx = []
    val_idx = []
    for members in by_class.values():
        random.shuffle(members)
        boundary = int(len(members) * (1 - val_split))
        train_idx.extend(members[:boundary])
        val_idx.extend(members[boundary:])

    random.shuffle(train_idx)
    random.shuffle(val_idx)
    return train_idx, val_idx


def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all examples seen, with the
        predictions taken from the logits computed before each update.
    """
    model.train()
    loss_sum = 0
    hits = 0
    seen = 0

    for features, targets in loader:
        features, targets = features.to(device), targets.to(device)
        batch_n = features.size(0)

        optimizer.zero_grad()
        scores = model(features)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * batch_n
        hits += (scores.argmax(1) == targets).sum().item()
        seen += batch_n

    mean_loss = loss_sum / seen
    accuracy = hits / seen
    return mean_loss, accuracy


def eval_epoch(model, loader, criterion, device):
    """Evaluate ``model`` over ``loader`` without gradient tracking.

    Returns:
        ``(mean_loss, accuracy, per_class_accuracy)`` where the last item maps
        each class id that occurs in the targets to its accuracy.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0
    class_hits = Counter()
    class_seen = Counter()

    with torch.no_grad():
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)
            batch_n = features.size(0)

            scores = model(features)
            loss_sum += criterion(scores, targets).item() * batch_n

            guesses = scores.argmax(1)
            hits += (guesses == targets).sum().item()
            seen += batch_n

            # Tally per-class totals and hits in target order.
            for guess, target in zip(guesses.tolist(), targets.tolist()):
                class_seen[target] += 1
                if guess == target:
                    class_hits[target] += 1

    accuracy = hits / seen
    per_class_accuracy = {
        class_id: class_hits[class_id] / class_seen[class_id]
        for class_id in class_seen
    }
    return loss_sum / seen, accuracy, per_class_accuracy



# main


def main():
    """Train the relation classifier head on SBERT sentence embeddings.

    Loads the JSONL dataset at ``DATA_PATH``, keeps the examples whose
    ``gold_command.args.relation`` is one of ``RELATIONS``, encodes their text
    with the SentenceTransformer named by ``MODEL_NAME``, trains a
    ``RelationHead`` with class-weighted cross-entropy for ``EPOCHS`` epochs
    and writes the best checkpoint to ``SAVE_PATH``.

    Raises:
        FileNotFoundError: If ``DATA_PATH`` does not exist.
        RuntimeError: If no example survives the filtering step.
    """
    set_seed(SEED)

    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

    print(f"Loading synthetic relation dataset: {DATA_PATH}")
    items = load_jsonl(DATA_PATH)
    print(f"Loaded {len(items)} examples.")

    texts = []
    labels = []

    for obj in items:
        text = obj.get("text", None)
        rel = obj.get("gold_command", {}).get("args", {}).get("relation", None)
        # Skip records without text, without a relation, or with an unknown one.
        if not text or not rel:
            continue
        if rel not in RELATIONS:
            continue

        texts.append(text)
        labels.append(RELATION2ID[rel])

    if not texts:
        # Fail early with a clear message instead of crashing inside the
        # encoder or on max() of an empty sequence further down.
        raise RuntimeError(f"No usable examples found in {DATA_PATH}")

    labels = torch.tensor(labels, dtype=torch.long)
    print(f"Usable examples: {len(texts)}")

    dist = Counter(RELATIONS[label_id] for label_id in labels.tolist())
    print("Relation distribution:")
    for k, v in dist.items():
        print(f"  {k}: {v}")

    # SBERT Encoding
    print(f"Loading SBERT model: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)

    print("Encoding sentences...")
    embeddings = encoder.encode(
        texts,
        convert_to_tensor=True,
        batch_size=128,
        show_progress_bar=True
    )
    emb_dim = embeddings.size(1)

    # Split
    train_idx, val_idx = stratified_split(labels)

    train_emb = embeddings[train_idx]
    train_lbl = labels[train_idx]
    val_emb = embeddings[val_idx]
    val_lbl = labels[val_idx]

    train_ds = RelationDataset(train_emb, train_lbl)
    val_ds = RelationDataset(val_emb, val_lbl)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Inverse-frequency class weights. A relation with no training example
    # would give a zero count, so the count is clamped to 1; that weight never
    # contributes to the loss because no target of that class exists.
    class_counts = Counter(train_lbl.tolist())
    max_c = max(class_counts.values())
    weights = torch.tensor(
        [max_c / max(class_counts[i], 1) for i in range(NUM_REL)],
        dtype=torch.float32,
    ).to(device)

    model = RelationHead(emb_dim, NUM_REL).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    best_val_acc = 0
    best_val_loss = float("inf")

    for epoch in range(1, EPOCHS + 1):
        tr_loss, tr_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        va_loss, va_acc, per = eval_epoch(model, val_loader, criterion, device)

        print(f"[Epoch {epoch}/{EPOCHS}] "
              f"TrainLoss={tr_loss:.4f}, TrainAcc={tr_acc:.3f}, "
              f"ValLoss={va_loss:.4f}, ValAcc={va_acc:.3f}")

        print("Per-class accuracy:")
        for rid, acc in per.items():
            print(f"  {RELATIONS[rid]:<12} -> {acc:.3f}")

        # Accuracy is the primary criterion and loss only breaks ties. Using
        # "acc improved OR loss improved" would let an epoch with lower
        # accuracy overwrite the checkpoint and reset both best values.
        improved = va_acc > best_val_acc or (
            va_acc == best_val_acc and va_loss < best_val_loss
        )
        if improved:
            best_val_acc = va_acc
            best_val_loss = va_loss
            torch.save({
                "state_dict": model.state_dict(),
                "relation_list": RELATIONS,
                "embedding_model_name": MODEL_NAME,
                "emb_dim": emb_dim,
            }, SAVE_PATH)
            print(f"  -> Saved new best model to {SAVE_PATH}")

    print("Training finished!")


if __name__ == "__main__":
    main()


Loading synthetic relation dataset: sentences_roles_relation_synth_v1.jsonl
Loaded 9438 examples.
Usable examples: 9438
Relation distribution:
  near: 1179
  behind: 1181
  on_top_of: 1180
  under: 1180
  next_to: 1179
  between: 1180
  inside: 1180
  in_front_of: 1179
Loading SBERT model: sentence-transformers/all-mpnet-base-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding sentences...


Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Device: cuda
[Epoch 1/30] TrainLoss=1.8523, TrainAcc=0.338, ValLoss=1.2712, ValAcc=0.576
Per-class accuracy:
  next_to      -> 0.165
  behind       -> 0.608
  in_front_of  -> 0.051
  inside       -> 0.864
  on_top_of    -> 0.627
  under        -> 0.542
  between      -> 0.877
  near         -> 0.873
  -> Saved new best model to relation_head_v1.pt
[Epoch 2/30] TrainLoss=0.9947, TrainAcc=0.642, ValLoss=0.7022, ValAcc=0.788
Per-class accuracy:
  next_to      -> 0.534
  behind       -> 0.852
  in_front_of  -> 0.746
  inside       -> 0.911
  on_top_of    -> 0.894
  under        -> 0.860
  between      -> 0.936
  near         -> 0.572
  -> Saved new best model to relation_head_v1.pt
[Epoch 3/30] TrainLoss=0.6247, TrainAcc=0.790, ValLoss=0.4632, ValAcc=0.865
Per-class accuracy:
  next_to      -> 0.818
  behind       -> 0.797
  in_front_of  -> 0.911
  inside       -> 0.924
  on_top_of    -> 0.966
  under        -> 0.894
  between      -> 0.953
  near         -> 0.657
  -> Saved new best model

In [None]:
# -*- coding: utf-8 -*-
# relation_cue_predictor.py

import json
import re
from pathlib import Path
from typing import Optional, Tuple, Dict, List

import torch
from transformers import BertTokenizerFast, BertForTokenClassification


def _load_json(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return its decoded content."""
    return json.loads(path.read_text(encoding="utf-8"))


def _normalize_cue(cue: str) -> str:
    """Return ``cue`` trimmed, lower-cased and with whitespace runs collapsed to one space."""
    # split() with no separator breaks on any whitespace run and drops empty
    # pieces, so joining with a single space squeezes the interior runs.
    words = cue.strip().lower().split()
    return " ".join(words)


def load_cue_tagger(folder: Path):
    """Load the cue tagger stored in ``folder``.

    Returns:
        ``(tokenizer, model, label_list, cue_mapping)``. The model is put in
        eval mode. ``label_list`` comes from ``labels.json`` when that file
        exists and from the model config otherwise; ``cue_mapping`` comes from
        ``cue_mapping.json`` and is empty when the file is missing.
    """
    tokenizer = BertTokenizerFast.from_pretrained(folder)
    model = BertForTokenClassification.from_pretrained(folder)
    model.eval()

    # Label names: prefer the explicit file, fall back to the model config.
    labels_file = folder / "labels.json"
    if not labels_file.exists():
        config = model.config
        label_list = [config.id2label[idx] for idx in range(config.num_labels)]
    else:
        label_list = _load_json(labels_file)["label_list"]

    # Cue phrase -> relation table; optional.
    cue_mapping = {}
    mapping_file = folder / "cue_mapping.json"
    if mapping_file.exists():
        cue_mapping = _load_json(mapping_file)

    return tokenizer, model, label_list, cue_mapping


@torch.no_grad()
def predict_cue_span(
    text: str,
    tokenizer,
    model,
    label_list: List[str],
    device,
    max_len: int = 96,
) -> Tuple[Optional[str], float]:
    """Tag ``text`` and return the first predicted cue span with its confidence.

    Args:
        text: Sentence to analyse.
        tokenizer: Tokenizer called on ``text``; also used to map ids back to
            tokens and tokens back to a string.
        model: Token classifier whose output exposes ``logits``.
        label_list: Label name for each class id predicted by ``model``.
        device: Device the encoded inputs are moved to. The model itself is
            not moved here.
        max_len: Truncation length passed to the tokenizer.

    Returns:
        ``(cue_text, confidence)``. ``cue_text`` is the normalized text of the
        first ``B-CUE`` token plus the ``I-CUE`` tokens directly following it;
        ``confidence`` is the mean of the highest class probability of those
        tokens. ``(None, 0.0)`` when the text is blank or no span is found.
    """
    if not text.strip():
        return None, 0.0

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    token_probs = torch.softmax(model(**encoded).logits[0], dim=-1)
    predicted_ids = token_probs.argmax(dim=-1).tolist()

    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
    tags = [label_list[class_id] for class_id in predicted_ids]

    span_pieces = []
    span_scores = []
    inside_span = False

    for piece, tag, distribution in zip(pieces, tags, token_probs.tolist()):
        if piece in ("[CLS]", "[SEP]"):
            continue

        opens_span = tag == "B-CUE"
        extends_span = tag == "I-CUE" and inside_span

        if opens_span and span_pieces:
            # A second span starts: only the first one is reported.
            break
        if not (opens_span or extends_span):
            if inside_span:
                # The current span ended.
                break
            # Anything before the first B-CUE, stray I-CUE included, is skipped.
            continue

        inside_span = True
        span_pieces.append(piece)
        span_scores.append(max(distribution))

    if not span_pieces:
        return None, 0.0

    cue_text = _normalize_cue(tokenizer.convert_tokens_to_string(span_pieces).strip())
    conf = float(sum(span_scores) / max(len(span_scores), 1))

    return cue_text, conf


def cue_to_relation(cue_text: Optional[str], cue_mapping: Dict[str, str]) -> str:
    """Translate a cue phrase into a relation name.

    Lookup order: exact match of the normalized cue in ``cue_mapping``, then
    the phrases "in front of" / "on top of" anywhere in the cue, then the bare
    cues "on" and "in". Returns ``"none"`` for an empty cue or when nothing
    matches.
    """
    if not cue_text:
        return "none"

    normalized = _normalize_cue(cue_text)

    # Exact match against the trained mapping wins.
    if normalized in cue_mapping:
        return cue_mapping[normalized]

    # Multi-word phrases are accepted as substrings.
    phrase_rules = (
        ("in front of", "in_front_of"),
        ("on top of", "on_top_of"),
    )
    for phrase, relation in phrase_rules:
        if phrase in normalized:
            return relation

    # Single-word cues must match the whole cue.
    whole_cue_rules = {"on": "on_top_of", "in": "inside"}
    return whole_cue_rules.get(normalized, "none")


@torch.no_grad()
def predict_relation_from_sentence(
    text: str,
    tokenizer,
    model,
    label_list,
    cue_mapping,
    device,
) -> Tuple[str, Optional[str], float]:
    """Predict the spatial relation expressed by ``text``.

    Returns:
        ``(relation, cue_text, confidence)``: the relation derived from the
        predicted cue span, the cue span itself (``None`` when no span was
        found) and the span confidence.
    """
    span, confidence = predict_cue_span(text, tokenizer, model, label_list, device)
    relation = cue_to_relation(span, cue_mapping)
    return relation, span, confidence


In [None]:
# -*- coding: utf-8 -*-
# generate_object_attributes_dataset_v1.py
# Multi-label attribute dataset generator

!pip install -q openai

import os
import json
import random
from pathlib import Path
from openai import OpenAI

# API key placeholder: fill it in before running.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already present
# in the environment, including with the empty string left here.
os.environ["OPENAI_API_KEY"] = ""

# Model used for every generation request.
MODEL_NAME = "gpt-4.1-mini"

# Object categories to generate samples for; validate_record() rejects any
# record whose "category" is not in this list.
CATEGORIES = [
    "chair", "table", "desk", "bed", "sofa", "lamp", "bookshelf", "cabinet",
    "wardrobe", "dresser", "mirror", "carpet", "pillow", "blanket", "plant",
    "vase", "picture_frame", "tv", "monitor", "keyboard", "mouse", "laptop",
    "speaker", "clock", "fan", "air_conditioner", "heater", "window", "door",
    "shelf", "bench", "stool", "cushion", "coffee_table", "side_table",
    "nightstand", "bathtub", "toilet", "sink", "fridge", "oven", "microwave",
    "chair_mat", "coat_rack", "shoe_rack", "broom", "bucket", "trash_bin",
    "cupboard", "pan", "plate", "bottle", "mug", "guitar", "piano", "fence",
    "tree", "rock", "flower_pot"
]

# Binary attribute names; every generated record must carry all of them with
# a value of 0 or 1 (see validate_record()).
ATTRS = [
  "supports_on_top",
  "can_be_placed_on",
  "is_container_openable",
  "has_interior_volume",
  "can_go_under",
  "can_be_under",
  "is_wall_mounted",
  "is_floor_object",
  "is_hanging_object",
]

# Output file; main() opens it in "w" mode, so it is rewritten on every run.
OUT_PATH = Path("object_attributes_synth_v1.jsonl")

# Number of jobs created per category, and number of jobs sent per API call.
SAMPLES_PER_CATEGORY = 30
BATCH_SIZE = 20

# System message sent with every batch request. This is an f-string: {ATTRS}
# is replaced by the repr of the ATTRS list, and the doubled braces render as
# literal JSON braces.
# NOTE(review): the "Data diversity" rule asks for a typo in the category
# inside "text" for about 10% of samples, while validate_record() requires the
# exact category string to occur in "text" -- such samples are discarded.
# Confirm which behaviour is intended.
SYSTEM_PROMPT = f"""
You are generating high-quality training data for a multi-label object attribute classifier
for a text-to-3D scene system.

Output EXACTLY one JSON object per line (JSONL). No extra text.

Allowed attributes (binary 0/1):
{ATTRS}

JSON schema:
{{
  "text": str,
  "category": str,
  "aliases": [str, ...],
  "description": str,
  "labels": {{
    "supports_on_top": 0|1,
    "can_be_placed_on": 0|1,
    "is_container_openable": 0|1,
    "has_interior_volume": 0|1,
    "can_go_under": 0|1,
    "can_be_under": 0|1,
    "is_wall_mounted": 0|1,
    "is_floor_object": 0|1,
    "is_hanging_object": 0|1
  }}
}}

Rules:
- "category" MUST be one of the provided categories.
- "text" MUST contain the category name (exact token) and may include a short description.
  Example text formats:
    - "object: mug"
    - "object: mug | a small cup for drinking"
- Provide 0-3 aliases (synonyms) when reasonable.
- Labels must reflect common real-world affordances for typical indoor scenes.

Consistency constraints:
- If is_container_openable=1 then has_interior_volume MUST be 1.
- If is_hanging_object=1 then is_floor_object MUST be 0, supports_on_top MUST be 0, has_interior_volume MUST be 0.
- If is_wall_mounted=1 then is_floor_object SHOULD be 0 (unless it's ambiguous; prefer 0).
- Do NOT mark supports_on_top=1 unless it is a stable surface that can hold objects.
- can_be_placed_on is for objects that can typically be placed onto other surfaces (portable items).
- can_go_under means other objects can go under it (clearance).
- can_be_under means the object can fit under another object (small enough).

Data diversity:
- Vary descriptions slightly.
- In about 10% of samples, include a tiny typo in the category inside "text" BUT keep "category" field correct.
"""

def build_jobs(categories):
    """Create ``SAMPLES_PER_CATEGORY`` jobs for each category, in shuffled order.

    Each job is a dict ``{"category": <name>}``. The planned total is printed
    before the list is returned.
    """
    jobs = [
        {"category": cat}
        for cat in categories
        for _ in range(SAMPLES_PER_CATEGORY)
    ]
    # Shuffle so each API batch mixes categories.
    random.shuffle(jobs)
    print(f"Planned jobs: {len(jobs)} ({len(categories)} * {SAMPLES_PER_CATEGORY})")
    return jobs

def build_user_prompt(batch_jobs):
    """Render the user message for one batch: a header plus one numbered line per job."""
    header = "Generate one JSON object per scenario.\nScenarios:"
    scenarios = [
        f"{number}. category='{job['category']}'"
        for number, job in enumerate(batch_jobs, 1)
    ]
    return "\n".join([header, *scenarios])

def call_gpt_batch(client, batch_jobs):
    """Request one batch of samples from the model and parse the JSONL reply.

    Args:
        client: OpenAI client exposing ``responses.create``.
        batch_jobs: Jobs (dicts with a ``"category"`` key) for this request.

    Returns:
        List of dicts, one per reply line that parsed as a JSON object. Lines
        that are not valid JSON, or that decode to something other than an
        object, are reported with ``PARSE ERROR:`` and skipped.
    """
    prompt = build_user_prompt(batch_jobs)

    resp = client.responses.create(
        model=MODEL_NAME,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        max_output_tokens=4096,
    )

    # output_text concatenates every text part of the response. Indexing
    # output[0].content[0] breaks when the first output item is not a text
    # message (for example a refusal or a reasoning item).
    content = resp.output_text or ""

    records = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        # Blank lines and markdown code fences carry no record.
        if not line or line.startswith("```"):
            continue

        # Drop a list-number prefix such as "3. {...}".
        if line[0].isdigit() and "{" in line:
            line = line[line.index("{"):]

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print("PARSE ERROR:", line[:160])
            continue

        # Valid JSON that is not an object (a bare number, a list, ...) is not
        # a record and must not reach the validator.
        if not isinstance(obj, dict):
            print("PARSE ERROR:", line[:160])
            continue

        records.append(obj)
    return records

def validate_record(rec, categories=None, attrs=None):
    """Return True when ``rec`` is a well-formed, self-consistent sample.

    The function never raises on malformed model output: any value of the
    wrong type simply makes the record invalid.

    Args:
        rec: Decoded JSON value to check.
        categories: Allowed category names; defaults to ``CATEGORIES``.
        attrs: Required label names; defaults to ``ATTRS``.

    Returns:
        True when ``rec`` is a dict whose ``category`` is allowed, whose
        ``text`` is a string containing the category, whose ``labels`` holds a
        0/1 value for every attribute, and whose labels satisfy the
        consistency constraints; False otherwise.
    """
    if categories is None:
        categories = CATEGORIES
    if attrs is None:
        attrs = ATTRS

    if not isinstance(rec, dict):
        return False

    category = rec.get("category")
    if not isinstance(category, str) or category not in categories:
        return False

    # NOTE(review): SYSTEM_PROMPT also asks for occasional typos of the
    # category inside "text"; this exact-substring check rejects those samples.
    text = rec.get("text")
    if not isinstance(text, str) or category not in text:
        return False

    labels = rec.get("labels")
    if not isinstance(labels, dict):
        return False
    for a in attrs:
        if a not in labels or labels[a] not in (0, 1):
            return False

    # Consistency constraints; .get() keeps them safe with a custom attrs list.
    if labels.get("is_container_openable") == 1 and labels.get("has_interior_volume") != 1:
        return False
    if labels.get("is_hanging_object") == 1:
        for must_be_zero in ("is_floor_object", "supports_on_top", "has_interior_volume"):
            if labels.get(must_be_zero) != 0:
                return False

    return True

def main():
    """Generate the attribute dataset batch by batch and write valid records to ``OUT_PATH``.

    Jobs are built from ``CATEGORIES``, sent to the model ``BATCH_SIZE`` at a
    time, and every returned record that passes ``validate_record`` is written
    as one JSON line. Progress and the final count are printed.
    """
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    jobs = build_jobs(CATEGORIES)
    # Ceiling division: the last batch may be shorter than BATCH_SIZE.
    batch_count = (len(jobs) + BATCH_SIZE - 1) // BATCH_SIZE

    total_written = 0
    with open(OUT_PATH, "w", encoding="utf-8") as fout:
        for batch_no, start in enumerate(range(0, len(jobs), BATCH_SIZE), start=1):
            chunk = jobs[start:start + BATCH_SIZE]
            print(f"Batch {batch_no} / {batch_count}")

            for record in call_gpt_batch(client, chunk):
                if not validate_record(record):
                    continue
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                total_written += 1

    print(f"Done. Total written: {total_written}")
    print(f"Saved to: {OUT_PATH}")


if __name__ == "__main__":
    main()


Planned jobs: 1770 (59 * 30)
Batch 1 / 89
Batch 2 / 89
Batch 3 / 89
Batch 4 / 89
Batch 5 / 89
Batch 6 / 89
Batch 7 / 89
Batch 8 / 89
Batch 9 / 89
Batch 10 / 89
Batch 11 / 89
Batch 12 / 89
Batch 13 / 89
Batch 14 / 89
Batch 15 / 89
Batch 16 / 89
Batch 17 / 89
Batch 18 / 89
Batch 19 / 89
Batch 20 / 89
Batch 21 / 89
Batch 22 / 89
Batch 23 / 89
Batch 24 / 89
Batch 25 / 89
Batch 26 / 89
Batch 27 / 89
Batch 28 / 89
Batch 29 / 89
Batch 30 / 89
Batch 31 / 89
Batch 32 / 89
Batch 33 / 89
Batch 34 / 89
Batch 35 / 89
Batch 36 / 89
Batch 37 / 89
Batch 38 / 89
Batch 39 / 89
Batch 40 / 89
Batch 41 / 89
Batch 42 / 89
Batch 43 / 89
Batch 44 / 89
Batch 45 / 89
Batch 46 / 89
Batch 47 / 89
Batch 48 / 89
Batch 49 / 89
Batch 50 / 89
Batch 51 / 89
Batch 52 / 89
Batch 53 / 89
Batch 54 / 89
Batch 55 / 89
Batch 56 / 89
Batch 57 / 89
Batch 58 / 89
Batch 59 / 89
Batch 60 / 89
Batch 61 / 89
Batch 62 / 89
Batch 63 / 89
Batch 64 / 89
Batch 65 / 89
Batch 66 / 89
Batch 67 / 89
Batch 68 / 89
Batch 69 / 89
Batch 70 / 89


In [None]:
# -*- coding: utf-8 -*-
# train_object_attributes_bert_hf.py


import json
import random
import inspect
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)


# Config
# Seed the transformers helper, Python's random module and NumPy's RNG with
# the same value.
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Input dataset (JSONL), base checkpoint name, and output directory for the
# fine-tuned model and tokenizer.
TRAIN_JSONL = "object_attributes_synth_v1.jsonl"
MODEL_NAME = "bert-base-uncased"
OUT_DIR = "bert_object_attributes_v1_hf"

# Tokenizer truncation/padding length and the fraction of examples held out
# for validation.
MAX_LEN = 64
VAL_RATIO = 0.1

# Label order: position i of every target vector and of the model's logits
# corresponds to ATTRS[i].
ATTRS = [
    "supports_on_top",
    "can_be_placed_on",
    "is_container_openable",
    "has_interior_volume",
    "can_go_under",
    "can_be_under",
    "is_wall_mounted",
    "is_floor_object",
    "is_hanging_object",
]

# Use the GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# JSONL: read,validate

def read_jsonl_safe(path: str, attrs: List[str]) -> Tuple[List[Dict], Dict]:
    """Load a JSONL dataset, skipping and counting unusable lines.

    A line is kept when it decodes to a JSON object with a string ``text`` and
    a dict ``labels`` whose values for ``attrs`` convert to ``int``. The
    ``labels`` of a kept record are replaced by a dict holding exactly
    ``attrs``, with missing attributes set to 0.

    Args:
        path: Path of the JSONL file.
        attrs: Attribute names to extract from every record.

    Returns:
        ``(records, stats)`` where ``stats`` has the keys ``loaded_ok``,
        ``bad_json`` (lines that are not valid JSON), ``bad_schema`` (valid
        JSON with an unusable structure) and ``total_bad``.
    """
    records = []
    bad_json = 0
    bad_schema = 0

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                bad_json += 1
                continue

            # A JSON line can decode to a number, string or list; membership
            # tests on those would raise or silently misbehave.
            if (
                not isinstance(obj, dict)
                or not isinstance(obj.get("text"), str)
                or not isinstance(obj.get("labels"), dict)
            ):
                bad_schema += 1
                continue

            labels = obj["labels"]
            try:
                clean = {a: int(labels.get(a, 0)) for a in attrs}
            except (TypeError, ValueError, OverflowError):
                # Label value such as null, "yes", a list or Infinity.
                bad_schema += 1
                continue

            obj["labels"] = clean
            records.append(obj)

    stats = {
        "loaded_ok": len(records),
        "bad_json": bad_json,
        "bad_schema": bad_schema,
        "total_bad": bad_json + bad_schema,
    }
    return records, stats


def build_xy(records: List[Dict], attrs: List[str]) -> Tuple[List[str], np.ndarray]:
    """Turn records into model inputs and a float32 multi-hot target matrix.

    Returns:
        ``(texts, Y)``: the ``text`` of every record, and an array with one
        row per record and one column per name in ``attrs`` (0 when the
        record's labels lack that name).
    """
    pairs = [
        (rec["text"], [int(rec["labels"].get(name, 0)) for name in attrs])
        for rec in records
    ]
    texts = [text for text, _ in pairs]
    rows = [row for _, row in pairs]
    return texts, np.array(rows, dtype=np.float32)



# Dataset
class MultiLabelTextDataset(Dataset):
    """Dataset pairing each text with its multi-label target row.

    Texts are tokenized lazily, one item at a time, and padded to ``max_len``.
    """

    def __init__(self, texts: List[str], y: np.ndarray, tokenizer, max_len: int):
        self.texts = texts
        self.y = y
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for item ``idx`` plus a float32 ``labels`` tensor."""
        encoded = self.tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        sample = {}
        for name, tensor in encoded.items():
            # The tokenizer returns a batch of one; drop that leading axis.
            sample[name] = tensor.squeeze(0)

        sample["labels"] = torch.tensor(self.y[idx], dtype=torch.float32)
        return sample


# Metrics
def sigmoid_np(x):
    """Return the element-wise logistic sigmoid of ``x`` without overflow.

    ``1 / (1 + exp(-x))`` overflows inside ``exp`` for large negative inputs
    (around -89 for float32) and emits a RuntimeWarning. Here the exponent is
    always non-positive, so ``exp`` stays in ``(0, 1]``.

    Args:
        x: Array-like of logits.

    Returns:
        Array of the same shape with values in ``[0, 1]``. For ``x >= 0`` the
        result is the same expression ``1 / (1 + exp(-x))``; for ``x < 0`` the
        equivalent ``exp(x) / (1 + exp(x))`` is used.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))


def compute_metrics(eval_pred):
    """Compute micro-averaged precision/recall/F1 and exact-match accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)`` of arrays with one row per example
            and one column per attribute.

    Returns:
        Dict with ``micro_f1``, ``micro_precision``, ``micro_recall`` and
        ``exact_match`` (share of rows where every attribute is predicted
        correctly). Predictions use a 0.5 probability threshold.
    """
    logits, labels = eval_pred

    threshold = 0.5
    predicted = (sigmoid_np(logits) >= threshold).astype(np.int32)
    truth = labels.astype(np.int32)

    predicted_on = predicted == 1
    truth_on = truth == 1

    # Counts are pooled over all attributes (micro averaging).
    tp = int((predicted_on & truth_on).sum())
    fp = int((predicted_on & (truth == 0)).sum())
    fn = int(((predicted == 0) & truth_on).sum())

    # The small epsilon keeps the ratios defined when a denominator is zero.
    precision = tp / (tp + fp + 1e-9)
    recall = tp / (tp + fn + 1e-9)
    f1 = 2 * precision * recall / (precision + recall + 1e-9)

    rows_all_correct = (predicted == truth).all(axis=1)
    return {
        "micro_f1": f1,
        "micro_precision": precision,
        "micro_recall": recall,
        "exact_match": float(rows_all_correct.mean()),
    }


# Training
def make_training_args(**kwargs):
    """Build ``TrainingArguments`` from ``kwargs``, adapting to the installed version.

    The evaluation-strategy keyword is renamed between ``evaluation_strategy``
    and ``eval_strategy`` when only the other spelling is accepted, and any
    keyword the installed ``TrainingArguments.__init__`` does not accept is
    dropped.
    """
    accepted = set(inspect.signature(TrainingArguments.__init__).parameters)

    resolved = dict(kwargs)
    spellings = (
        ("evaluation_strategy", "eval_strategy"),
        ("eval_strategy", "evaluation_strategy"),
    )
    for given, alternative in spellings:
        needs_rename = (
            given in resolved
            and given not in accepted
            and alternative in accepted
        )
        if needs_rename:
            resolved[alternative] = resolved.pop(given)

    # Unknown keywords are dropped silently rather than raising TypeError.
    supported = {key: value for key, value in resolved.items() if key in accepted}
    return TrainingArguments(**supported)



# Main
def main():
    """Fine-tune a multi-label attribute classifier and print demo predictions.

    Reads ``TRAIN_JSONL``, holds out ``VAL_RATIO`` of the shuffled records for
    validation, fine-tunes ``MODEL_NAME`` with the Hugging Face ``Trainer``,
    saves the model and tokenizer to ``OUT_DIR`` and prints the predicted
    attribute probabilities for a few hand-written examples.

    Raises:
        RuntimeError: If the dataset contains no valid record.
    """
    records, stats = read_jsonl_safe(TRAIN_JSONL, ATTRS)
    print("Dataset stats:", stats)
    if stats["loaded_ok"] == 0:
        raise RuntimeError("No valid records loaded. Fix JSONL file first.")

    random.shuffle(records)
    texts, Y = build_xy(records, ATTRS)

    # The first n_val shuffled examples form the validation set.
    n = len(texts)
    n_val = max(1, int(n * VAL_RATIO))
    x_val, y_val = texts[:n_val], Y[:n_val]
    x_tr, y_tr = texts[n_val:], Y[n_val:]

    print(f"Loaded OK: {n} examples | Train: {len(x_tr)} | Val: {len(x_val)}")
    print("Attrs:", ATTRS)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_ds = MultiLabelTextDataset(x_tr, y_tr, tokenizer, MAX_LEN)
    val_ds = MultiLabelTextDataset(x_val, y_val, tokenizer, MAX_LEN)

    id2label = dict(enumerate(ATTRS))
    label2id = {name: i for i, name in enumerate(ATTRS)}

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(ATTRS),
        problem_type="multi_label_classification",
        id2label=id2label,
        label2id=label2id,
    ).to(DEVICE)

    args = make_training_args(
        output_dir=OUT_DIR,
        evaluation_strategy="steps",
        eval_steps=200,
        save_steps=200,
        logging_steps=50,
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="micro_f1",
        greater_is_better=True,
        report_to="none",
    )

    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )
    # Newer transformers releases deprecate Trainer's `tokenizer` keyword in
    # favour of `processing_class`; use whichever the installed version takes,
    # in the same spirit as make_training_args().
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kw = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_kw] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    trainer.train()
    trainer.save_model(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # demo
    demo = [
        "object: table | a flat surface furniture item for dining or work",
        "object: mug | a cup for drinking hot beverages",
        "object: cabinet | a storage unit with doors",
        "object: chandelier | a ceiling hanging light fixture",
        "object: tv | a screen often wall-mounted",
    ]

    enc = tokenizer(demo, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
    model.eval()
    with torch.no_grad():
        logits = model(**enc).logits.cpu().numpy()
    probs = sigmoid_np(logits)

    print("\n--- DEMO (probabilities) ---")
    for t, p in zip(demo, probs):
        # Most probable attributes first.
        pairs = sorted(
            ((name, float(p[i])) for i, name in enumerate(ATTRS)),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print(t)
        print("  " + ", ".join([f"{a}={v:.2f}" for a, v in pairs]))


if __name__ == "__main__":
    main()


Dataset stats: {'loaded_ok': 1724, 'bad_json': 0, 'bad_schema': 0, 'total_bad': 0}
Loaded OK: 1724 examples | Train: 1552 | Val: 172
Attrs: ['supports_on_top', 'can_be_placed_on', 'is_container_openable', 'has_interior_volume', 'can_go_under', 'can_be_under', 'is_wall_mounted', 'is_floor_object', 'is_hanging_object']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss



--- DEMO (probabilities) ---
object: table | a flat surface furniture item for dining or work
  is_floor_object=0.88, supports_on_top=0.79, can_go_under=0.74, can_be_placed_on=0.35, has_interior_volume=0.26, is_container_openable=0.25, can_be_under=0.22, is_wall_mounted=0.13, is_hanging_object=0.10
object: mug | a cup for drinking hot beverages
  can_be_under=0.84, can_be_placed_on=0.77, has_interior_volume=0.36, is_container_openable=0.32, is_wall_mounted=0.23, is_floor_object=0.20, is_hanging_object=0.20, supports_on_top=0.15, can_go_under=0.12
object: cabinet | a storage unit with doors
  is_floor_object=0.88, supports_on_top=0.81, has_interior_volume=0.76, can_go_under=0.63, is_container_openable=0.59, can_be_placed_on=0.36, is_wall_mounted=0.20, is_hanging_object=0.19, can_be_under=0.17
object: chandelier | a ceiling hanging light fixture
  is_floor_object=0.55, can_be_under=0.54, can_be_placed_on=0.53, supports_on_top=0.31, can_go_under=0.30, is_wall_mounted=0.18, has_interior_v