# 📊 Dataset Generation Notebook

This notebook generates synthetic instruction-action pairs for fine-tuning.

## Dataset Design
- **Format**: Instruction → JSON action plan
- **Actions**: move, rotate, scale
- **Objects**: Various 3D primitives with colors
- **Positions**: Semantic scene locations

In [None]:
import json
import random
from pathlib import Path
from itertools import product

## 1. Define Vocabulary

In [None]:
# --- Object vocabulary: an object name is "<color> <shape>" ---
COLORS = [
    "red",
    "blue",
    "green",
    "yellow",
    "purple",
    "orange",
    "white",
    "black",
]
SHAPES = [
    "box",
    "cube",
    "sphere",
    "ball",
    "cylinder",
    "cone",
    "pyramid",
]

# --- Scene locations, used for both initial and target positions ---
POSITIONS = [
    "floor",
    "ground",
    "center",
    "origin",
    "blue platform",
    "red platform",
    "green platform",
    "left corner",
    "right corner",
    "top shelf",
    "bottom shelf",
    "table",
    "desk",
    "pedestal",
]

# --- Instruction templates, keyed by action name ---
# NOTE: the order of the "scale" templates matters. generate_scale_example
# uses index 3 (the "Shrink" wording) for factors below 1 and picks among the
# first three otherwise.
ACTIONS = dict(
    move=[
        "Move {obj} to {target}",
        "Place {obj} on {target}",
        "Put {obj} at {target}",
        "Relocate {obj} to {target}",
        "Transfer {obj} to {target}",
    ],
    rotate=[
        "Rotate {obj} {angle} degrees",
        "Spin {obj} by {angle} degrees",
        "Turn {obj} {angle} degrees {direction}",
    ],
    scale=[
        "Scale {obj} by {factor}x",
        "Make {obj} {factor} times bigger",
        "Resize {obj} to {factor}x its size",
        "Shrink {obj} to {fraction} its size",
    ],
)

# --- Parameter values substituted into rotate / scale templates ---
ANGLES = [45, 90, 180, 270, 360]
DIRECTIONS = ["clockwise", "counterclockwise"]
SCALE_FACTORS = [0.5, 2, 3, 0.25]

# Quick sanity check of the vocabulary sizes.
print(
    "Colors: {}, Shapes: {}, Positions: {}".format(
        len(COLORS), len(SHAPES), len(POSITIONS)
    )
)

## 2. Generate Examples

In [None]:
def generate_object():
    """Return a random object name of the form "<color> <shape>".

    The color is drawn from COLORS and the shape from SHAPES.
    """
    # Draw the color first, then the shape, so seeded runs stay reproducible.
    picked_color = random.choice(COLORS)
    picked_shape = random.choice(SHAPES)
    return " ".join((picked_color, picked_shape))

def generate_move_example():
    """Build one "move" training example.

    Returns:
        A dict with an "instruction" string and an "output" string holding
        the JSON-encoded action plan (object, initial_position, action,
        target_position).
    """
    obj = generate_object()
    start = random.choice(POSITIONS)
    # The destination must differ from where the object starts.
    candidates = [place for place in POSITIONS if place != start]
    destination = random.choice(candidates)
    phrasing = random.choice(ACTIONS["move"])

    plan = {
        "object": obj,
        "initial_position": start,
        "action": "move",
        "target_position": destination,
    }
    return {
        "instruction": phrasing.format(obj=obj, target=destination),
        "output": json.dumps(plan),
    }

def generate_rotate_example():
    """Generate a rotate action example.

    Returns:
        A dict with an "instruction" string and an "output" string holding
        the JSON-encoded action plan (object, initial_position, action,
        target_position).

    The rotation direction appears in "target_position" only when the chosen
    instruction template actually mentions it. Otherwise the label would
    contain a random direction that cannot be derived from the instruction.
    """
    obj = generate_object()
    angle = random.choice(ANGLES)
    # Always draw the direction, even if the template ends up not using it,
    # so the sequence of random draws (and thus seeded output) stays stable.
    direction = random.choice(DIRECTIONS)
    position = random.choice(POSITIONS)

    template = random.choice(ACTIONS["rotate"])
    # str.format ignores keyword arguments the template does not reference.
    instruction = template.format(obj=obj, angle=angle, direction=direction)

    if "{direction}" in template:
        target = f"{angle} degrees {direction}"
    else:
        # The instruction gives no direction, so the label must not invent one.
        target = f"{angle} degrees"

    output = {
        "object": obj,
        "initial_position": position,
        "action": "rotate",
        "target_position": target
    }

    return {"instruction": instruction, "output": json.dumps(output)}

def generate_scale_example():
    """Build one "scale" training example.

    Factors of 1 or more use one of the first three scale templates;
    factors below 1 always use the fourth ("Shrink") template.

    Returns:
        A dict with an "instruction" string and an "output" string holding
        the JSON-encoded action plan.
    """
    obj = generate_object()
    factor = random.choice(SCALE_FACTORS)
    position = random.choice(POSITIONS)
    scale_templates = ACTIONS["scale"]

    if factor >= 1:
        # Enlarging: any of the generic wordings applies.
        wording = random.choice(scale_templates[:3])
        instruction = wording.format(obj=obj, factor=factor)
    else:
        # Shrinking: only the dedicated template (index 3) is used.
        instruction = scale_templates[3].format(obj=obj, fraction=factor)

    plan = {
        "object": obj,
        "initial_position": position,
        "action": "scale",
        "target_position": f"{factor}x original size",
    }
    return {"instruction": instruction, "output": json.dumps(plan)}

In [None]:
# Generate dataset
def generate_dataset(n_examples=1000):
    """Generate ``n_examples`` examples, picking an action type at random each time.

    Each example comes from one of the move / rotate / scale generators,
    chosen uniformly, so the action mix is only balanced on average.

    Returns:
        A list of example dicts as produced by the individual generators.
    """
    generators = (
        generate_move_example,
        generate_rotate_example,
        generate_scale_example,
    )
    # Pick a generator, then call it immediately, once per example.
    return [random.choice(generators)() for _ in range(n_examples)]

# Fix the RNG seed so the generated dataset is reproducible, then build it
random.seed(42)
dataset = generate_dataset(n_examples=1000)

print("Generated {} examples".format(len(dataset)))
# Show the first example, pretty-printed, as a quick sanity check
print("\nSample:\n" + json.dumps(dataset[0], indent=2))

## 3. Split Dataset

In [None]:
# Shuffle in place, then carve out 80% train / 10% validation / remainder test
random.shuffle(dataset)

train_size, val_size = (int(frac * len(dataset)) for frac in (0.8, 0.1))

train_data = dataset[:train_size]
# Everything after the training slice is shared between validation and test
holdout = dataset[train_size:]
val_data = holdout[:val_size]
test_data = holdout[val_size:]

print(
    "Train: {}, Val: {}, Test: {}".format(
        len(train_data), len(val_data), len(test_data)
    )
)

## 4. Save to JSONL

In [None]:
def save_jsonl(data, filepath):
    """Save data to JSONL format (one JSON document per line).

    Args:
        data: Sized collection of JSON-serializable items.
        filepath: Destination path (string or ``Path``). An existing file
            is overwritten.

    The file is always written as UTF-8 so the result does not depend on
    the platform's default locale encoding.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        # One serialized item per line, each terminated by a newline.
        f.writelines(json.dumps(item) + '\n' for item in data)
    print(f"Saved {len(data)} examples to {filepath}")

# Write each split to its own JSONL file under ../data
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

for split_name, split_rows in (
    ("train", train_data),
    ("val", val_data),
    ("test", test_data),
):
    save_jsonl(split_rows, output_dir / f"{split_name}.jsonl")

## 5. Dataset Statistics

In [None]:
# Tally how often each action type appears across the whole dataset
from collections import Counter

# Each item's "output" is a JSON string; decode it to read the action name
actions = [json.loads(item['output'])['action'] for item in dataset]
action_counts = Counter(actions)

print("Action Distribution:")
for action, count in action_counts.items():
    share = 100 * count / len(dataset)
    print(f"  {action}: {count} ({share:.1f}%)")

In [None]:
# Word-count statistics over the instruction texts (split on whitespace)
lengths = [len(item['instruction'].split()) for item in dataset]

print("\nInstruction Length:")
print("  Min: {} words".format(min(lengths)))
print("  Max: {} words".format(max(lengths)))
print("  Avg: {:.1f} words".format(sum(lengths) / len(lengths)))

## Next Steps

1. **Manual review** - Check 50 random samples for quality
2. **Add complexity** - Multi-object instructions
3. **Augmentation** - Paraphrase using LLM
4. **Edge cases** - Add ambiguous/malformed inputs