In [None]:
# %pip install -U transformers datasets peft trl bitsandbytes accelerate qwen-vl-utils pillow tensorboard webdataset unsloth


In [1]:
import os
import json
import io
import re
from unsloth import FastVisionModel
import numpy as np
import pandas as pd
import torch
import gc
from PIL import Image
import webdataset as wds
from transformers import AutoProcessor, TrainerCallback, EarlyStoppingCallback
from trl import SFTTrainer, SFTConfig
from datasets import IterableDataset

try:
    import unsloth_zoo.tokenizer_utils as tokenizer_utils_module
    
    # Replace the function with a no-op
    def dummy_fix_untrained_tokens(*args, **kwargs):
        print("✅ Skipping fix_untrained_tokens (patched)")
        return None
    
    tokenizer_utils_module.fix_untrained_tokens = dummy_fix_untrained_tokens
    print("✅ Successfully patched unsloth_zoo.tokenizer_utils.fix_untrained_tokens")
except Exception as e:
    print(f"⚠️ Could not patch tokenizer_utils: {e}")

# Environment setup
os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # <-- ADD THIS
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["OMP_NUM_THREADS"] = "16"

# Initial cleanup
torch.cuda.empty_cache()
gc.collect()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
✅ Successfully patched unsloth_zoo.tokenizer_utils.fix_untrained_tokens
PyTorch version: 2.7.1+cu128
CUDA available: True
GPU: NVIDIA GH200 480GB


## Config

In [2]:
# --- MAIN CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
OUTPUT_DIR = "./qwen2.5-vl-3b-price-predictor-final"

# --- EXECUTION CONTROL ---
# Set to True to enable evaluation, SMAPE, and early stopping
USE_VALIDATION = False
# Set to True to generate a submission file after training
RUN_INFERENCE = True
# Set to True to skip training and only run inference
INFERENCE_ONLY = False
# Set to True or a path to resume training from a checkpoint
RESUME_FROM_CHECKPOINT = None


# --- DATASET PATHS (WebDataset URL Patterns) ---
# NOTE: Adjust the shard numbers (e.g., 000066) to match the output from 5_convert.py
WEBDATASET_TRAIN_URL = "./webdataset_train/train-shard-{000000..000067}.tar"
WEBDATASET_VALIDATION_URL = "./webdataset_validation/validation-shard-{000000..000007}.tar"
WEBDATASET_TEST_URL = "./webdataset_test/test-shard-{000000..000074}.tar"


# --- TRAINING HYPERPARAMETERS ---
BATCH_SIZE = 4
EVAL_BATCH_SIZE = 8
GRADIENT_ACCUMULATION = 4
NUM_WORKERS = 0
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4

TRAIN_SAMPLES = 67499    # From preprocessing output
STEPS_PER_EPOCH = TRAIN_SAMPLES // (BATCH_SIZE * GRADIENT_ACCUMULATION)
MAX_STEPS = STEPS_PER_EPOCH * NUM_EPOCHS

# --- EVALUATION SETTINGS ---
# These are only active if USE_VALIDATION = True
EVAL_STEPS = 250
EARLY_STOPPING_PATIENCE = 3 # Stop after 3 evaluations with no improvement


# --- PROCESSOR SETTINGS ---
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 512 * 28 * 28

print("✅ Configuration loaded")
if USE_VALIDATION:
    print(f"🚀 Validation ENABLED: Evaluating every {EVAL_STEPS} steps.")
else:
    print("🚀 Validation DISABLED: Training on the full dataset.")

✅ Configuration loaded
🚀 Validation DISABLED: Training on the full dataset.


## Processor

In [3]:
print("Loading processor...")
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
    trust_remote_code=True
)

if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

processor.tokenizer.padding_side = "right"

print("✅ Processor loaded!")

Loading processor...


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


✅ Processor loaded!


## WebDataset Loading Functions

In [4]:
def decode_sample(sample):
    """Decode preprocessed tensors from WebDataset using np.load to handle .npy format correctly"""
    try:
        # ✅ CORRECT: Use np.load() with BytesIO to properly parse .npy format
        # .npy files have headers - np.frombuffer() was reading EVERYTHING including header!
        input_ids_array = np.load(io.BytesIO(sample["input_ids.npy"]))
        attention_mask_array = np.load(io.BytesIO(sample["attention_mask.npy"]))
        pixel_values_array = np.load(io.BytesIO(sample["pixel_values.npy"]))
        
        metadata = json.loads(sample["metadata.json"].decode("utf-8"))
        
        # Now pixel_values_array has the CORRECT shape from the .npy file!
        # It should be 2D: [num_patches, hidden_dim]
        
        if pixel_values_array.ndim != 2:
            raise ValueError(f"pixel_values has {pixel_values_array.ndim} dims, expected 2D [patches, hidden_dim]")
        
        # Convert to tensors
        input_ids = torch.from_numpy(input_ids_array).long()
        attention_mask = torch.from_numpy(attention_mask_array).long()
        pixel_values = torch.from_numpy(pixel_values_array).float()
        
        image_grid_thw = torch.tensor(metadata["image_grid_thw"], dtype=torch.long)
        
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "image_grid_thw": image_grid_thw,
        }
        
        if "sample_id" in metadata:
            result["sample_id"] = metadata["sample_id"]
        
        return result
        
    except Exception as e:
        # Let handler skip this corrupted sample
        print(f"⚠️ Skipping corrupted sample: {e}")
        raise  # Re-raise for wds.warn_and_continue handler


def create_webdataset(url_pattern, is_train=True):
    """Create a WebDataset with proper decoding and error handling"""
    dataset = (
        wds.WebDataset(url_pattern, handler=wds.warn_and_continue)
        .map(decode_sample, handler=wds.warn_and_continue)  # Handler skips errors
    )
    
    if is_train:
        dataset = dataset.shuffle(1000)
    
    return dataset

print("✅ WebDataset functions defined!")



✅ WebDataset functions defined!


In [5]:
class IndexableWebDataset(torch.utils.data.IterableDataset):
    """Wrapper to add __len__ and __getitem__ for Unsloth compatibility"""
    def __init__(self, webdataset, length):
        self.dataset = webdataset
        self.length = length
        self._cache = None
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        """Cache first 250 samples for Unsloth's fix_untrained_tokens check"""
        if self._cache is None:
            # Build cache on first access
            self._cache = {}
            iterator = iter(self.dataset)
            for i in range(min(250, self.length)):
                try:
                    self._cache[i] = next(iterator)
                except StopIteration:
                    break
        
        if idx in self._cache:
            return self._cache[idx]
        
        raise IndexError(f"Index {idx} out of cacheable range (0-249)")
    
    def __iter__(self):
        """Forward actual training iteration to WebDataset"""
        return iter(self.dataset)


In [6]:
print("Loading WebDatasets...")

wds_train = create_webdataset(WEBDATASET_TRAIN_URL, is_train=True)

# Wrap for Unsloth
train_dataset = IndexableWebDataset(wds_train, length=TRAIN_SAMPLES)

eval_dataset = create_webdataset(
    WEBDATASET_VALIDATION_URL, 
    is_train=False
) if USE_VALIDATION else None

print(f"✅ Training dataset loaded from: {WEBDATASET_TRAIN_URL}")


Loading WebDatasets...
✅ Training dataset loaded from: ./webdataset_train/train-shard-{000000..000067}.tar




## Model

In [7]:
print("🚀 Loading Qwen2.5-VL-3B with Unsloth optimization...")
model, tokenizer = FastVisionModel.from_pretrained(
    model_name=MODEL_ID,
    max_seq_length=512,
    load_in_4bit=True,
    dtype=torch.bfloat16,
    use_gradient_checkpointing=False, # We found this was a bottleneck
    trust_remote_code=True,
)


print("\n🔧 Fixing embedding size mismatch...")
print(f"   Tokenizer vocab size: {len(processor.tokenizer)}")
print(f"   Model vocab size BEFORE: {model.config.vocab_size}")

# Resize the embedding layer to handle ALL tokens (including vision tokens)
model.resize_token_embeddings(len(processor.tokenizer))

print(f"   Model vocab size AFTER: {model.config.vocab_size}")
print("✅ Embedding layer resized! Vision tokens can now be processed.")


# model = FastVisionModel.get_peft_model(
#     model,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
#                     "gate_proj", "up_proj", "down_proj"],
#     bias="none",
#     use_gradient_checkpointing=False,
#     random_state=42,
# )

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,      # Keep vision layers trainable
    finetune_language_layers=True,     # Keep language layers trainable
    finetune_attention_modules=True,   # Keep attention trainable
    finetune_mlp_modules=True,         # Keep MLP trainable
    r=16,                              # LoRA rank
    lora_alpha=16,                     # LoRA alpha
    lora_dropout=0.,                 # Dropout
    bias="none",
    random_state=42,
    use_rslora=False,
    loftq_config=None,
)

# --- SFTConfig with Conditional Evaluation ---
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    gradient_checkpointing=False,
    optim="adamw_torch_fused",
    tf32=True,
    bf16=True,
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=25,
    logging_first_step=True,
    save_strategy="steps",
    save_steps=EVAL_STEPS,
    save_total_limit=3,
    load_best_model_at_end=False,
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    max_seq_length=512,
    packing=False,
    dataloader_num_workers=NUM_WORKERS,
    dataloader_pin_memory=False,
    torch_empty_cache_steps=20,
    seed=42,
)

# Add validation settings if using validation
if USE_VALIDATION:
    training_args.eval_strategy = "steps"  # ← CHANGE from "epoch"
    training_args.eval_steps = EVAL_STEPS
    training_args.metric_for_best_model = "eval_loss"  # Use loss, not SMAPE
    training_args.greater_is_better = False
    training_args.load_best_model_at_end = True

print(f"✅ Training config: {NUM_EPOCHS} epochs, batch={BATCH_SIZE}, workers={NUM_WORKERS}")
print("✅ Model loaded and SFTConfig configured!")

🚀 Loading Qwen2.5-VL-3B with Unsloth optimization...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.10.1: Fast Qwen2_5_Vl patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GH200 480GB. Num GPUs = 1. Max memory: 94.5 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Qwen2_5_Vl does not support SDPA - switching to fast eager.

🔧 Fixing embedding size mismatch...
   Tokenizer vocab size: 151665
   Model vocab size BEFORE: 151936
   Model vocab size AFTER: 151936
✅ Embedding layer resized! Vision tokens can now be processed.
Unsloth: Making `model.base_model.model.model` require gradients
✅ Training config: 3 epochs, batch=4, workers=0
✅ Model loaded and SFTConfig configured!


In [8]:
def collate_fn(batch):
    """Collate with proper special token masking"""
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [example["input_ids"] for example in batch],
        batch_first=True,
        padding_value=processor.tokenizer.pad_token_id
    )
    
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [example["attention_mask"] for example in batch],
        batch_first=True,
        padding_value=0
    )
    
    pixel_values = torch.cat([example["pixel_values"] for example in batch], dim=0)
    image_grid_thw = torch.stack([example["image_grid_thw"] for example in batch])
    
    labels = input_ids.clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    
    # ✅ THE KEY FIX: Mask ALL special tokens >= vocab_size
    vocab_size = processor.tokenizer.vocab_size
    labels[labels >= vocab_size] = -100
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "image_grid_thw": image_grid_thw,
        "labels": labels,
    }


In [9]:
class PeriodicCheckpointCallback(TrainerCallback):
    """Save checkpoint every N steps (overwrites previous)"""
    def __init__(self, save_steps=60):
        self.save_steps = save_steps
        self.last_save_step = 0
    
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step - self.last_save_step >= self.save_steps:
            checkpoint_dir = os.path.join(args.output_dir, "checkpoint-periodic")
            kwargs["model"].save_pretrained(checkpoint_dir)
            print(f"\n💾 Periodic checkpoint saved at step {state.global_step}")
            self.last_save_step = state.global_step
        return control

print("✅ Periodic checkpoint callback defined!")


✅ Periodic checkpoint callback defined!


In [10]:
class MemoryCleanupCallback(TrainerCallback):
    """Clean up GPU memory between epochs"""
    def on_epoch_end(self, args, state, control, **kwargs):
        torch.cuda.empty_cache(); gc.collect()
        print("\n🧹 GPU memory cleaned")
        return control

if not INFERENCE_ONLY:
    print("="*60)
    print("INITIALIZING TRAINER")
    print("="*60)
    
    # Load validation dataset if enabled
    eval_dataset_for_trainer = None
    if USE_VALIDATION:
        print(f"Loading validation WebDataset from: {WEBDATASET_VALIDATION_URL}")
        eval_dataset_for_trainer = eval_dataset  # ← Use the one you already created!
    
    print("Loading trainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,  # ← Plain IterableDataset (no wrapper!)
        eval_dataset=eval_dataset_for_trainer,
        data_collator=collate_fn,
        tokenizer=processor.tokenizer,
    )
    
    trainer.add_callback(MemoryCleanupCallback())
    trainer.add_callback(PeriodicCheckpointCallback(save_steps=60))
    
    if USE_VALIDATION:
        trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))
    
    print("✅ Trainer initialized!")


INITIALIZING TRAINER
Loading trainer...
✅ Skipping fix_untrained_tokens (patched)
✅ Trainer initialized!


## Finetune

In [11]:
# ============================================================
# DEBUG: Test a single batch to find the exact problem
# ============================================================

print("="*60)
print("DEBUGGING BATCH DATA")
print("="*60)

# 1. Get a single raw sample from the dataset
print("\n📦 Fetching one raw sample...")
raw_sample = next(iter(train_dataset))

print("\n✅ Raw sample keys:", raw_sample.keys())
print(f"   input_ids shape: {raw_sample['input_ids'].shape}")
print(f"   pixel_values shape: {raw_sample['pixel_values'].shape}")
print(f"   image_grid_thw shape: {raw_sample['image_grid_thw'].shape}")

# 2. Check raw input_ids values
print("\n🔍 Raw input_ids statistics:")
print(f"   Min: {raw_sample['input_ids'].min().item()}")
print(f"   Max: {raw_sample['input_ids'].max().item()}")
print(f"   Unique tokens: {torch.unique(raw_sample['input_ids']).numel()}")

# Find vision tokens
vision_tokens = raw_sample['input_ids'][(raw_sample['input_ids'] >= 151650) & (raw_sample['input_ids'] <= 151660)]
print(f"   Vision tokens found: {torch.unique(vision_tokens).tolist()}")

# 3. Test collate_fn on a single-item batch
print("\n🔧 Testing collate_fn with batch_size=1...")
batch = [raw_sample]
try:
    collated = collate_fn(batch)
    print("   ✅ Collation successful!")
    
    print("\n📊 Collated batch shapes:")
    print(f"   input_ids: {collated['input_ids'].shape}")
    print(f"   attention_mask: {collated['attention_mask'].shape}")
    print(f"   pixel_values: {collated['pixel_values'].shape}")
    print(f"   image_grid_thw: {collated['image_grid_thw'].shape}")
    print(f"   labels: {collated['labels'].shape}")
    
    print("\n🔍 Labels statistics:")
    print(f"   Min (non-masked): {collated['labels'][collated['labels'] != -100].min().item()}")
    print(f"   Max (non-masked): {collated['labels'][collated['labels'] != -100].max().item()}")
    print(f"   Masked tokens (-100): {(collated['labels'] == -100).sum().item()}")
    print(f"   Total tokens: {collated['labels'].numel()}")
    
    # Check for vision tokens in labels (SHOULD BE -100!)
    labels_flat = collated['labels'].view(-1)
    vision_in_labels = labels_flat[(labels_flat >= 151650) & (labels_flat <= 151660) & (labels_flat != -100)]
    if len(vision_in_labels) > 0:
        print(f"   ⚠️ WARNING: Found {len(vision_in_labels)} vision tokens NOT masked!")
        print(f"   Vision tokens in labels: {torch.unique(vision_in_labels).tolist()}")
    else:
        print(f"   ✅ All vision tokens properly masked")
    
except Exception as e:
    print(f"   ❌ Collation failed: {e}")
    import traceback
    traceback.print_exc()

# 4. Check vocab size
print("\n📚 Model vocabulary:")
print(f"   Tokenizer vocab_size: {processor.tokenizer.vocab_size}")
print(f"   Model config vocab_size: {model.config.vocab_size}")

# Check if any labels exceed vocab size
if 'collated' in locals():
    labels_no_mask = collated['labels'][collated['labels'] != -100]
    if len(labels_no_mask) > 0:
        max_label = labels_no_mask.max().item()
        if max_label >= processor.tokenizer.vocab_size:
            print(f"   ❌ PROBLEM: Max label {max_label} >= vocab_size {processor.tokenizer.vocab_size}")
            print(f"   Out-of-bounds labels: {labels_no_mask[labels_no_mask >= processor.tokenizer.vocab_size].tolist()}")
        else:
            print(f"   ✅ All labels within vocab range")

# 5. Test with multiple samples (batch_size=4)
print("\n\n🧪 Testing with batch_size=4...")
batch_4 = []
dataset_iter = iter(train_dataset)
for i in range(4):
    try:
        batch_4.append(next(dataset_iter))
    except StopIteration:
        break

print(f"   Collected {len(batch_4)} samples")

try:
    collated_4 = collate_fn(batch_4)
    print("   ✅ Batch collation successful!")
    print(f"   Batch shapes: input_ids={collated_4['input_ids'].shape}, labels={collated_4['labels'].shape}")
    
    # Check labels again
    labels_no_mask = collated_4['labels'][collated_4['labels'] != -100]
    if len(labels_no_mask) > 0:
        print(f"   Labels range: {labels_no_mask.min().item()} to {labels_no_mask.max().item()}")
        if labels_no_mask.max().item() >= processor.tokenizer.vocab_size:
            print(f"   ❌ Found out-of-bounds labels!")
        else:
            print(f"   ✅ All labels valid")
            
except Exception as e:
    print(f"   ❌ Batch collation failed: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*60)
print("DEBUG COMPLETE - Share all output above!")
print("="*60)


DEBUGGING BATCH DATA

📦 Fetching one raw sample...

✅ Raw sample keys: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw', 'sample_id', '__key__'])
   input_ids shape: torch.Size([716])
   pixel_values shape: torch.Size([1296, 1176])
   image_grid_thw shape: torch.Size([3])

🔍 Raw input_ids statistics:
   Min: 4
   Max: 151655
   Unique tokens: 209
   Vision tokens found: [151652, 151653, 151655]

🔧 Testing collate_fn with batch_size=1...
   ✅ Collation successful!

📊 Collated batch shapes:
   input_ids: torch.Size([1, 716])
   attention_mask: torch.Size([1, 716])
   pixel_values: torch.Size([1296, 1176])
   image_grid_thw: torch.Size([1, 3])
   labels: torch.Size([1, 716])

🔍 Labels statistics:
   Min (non-masked): 4
   Max (non-masked): 96693
   Masked tokens (-100): 332
   Total tokens: 716
   ✅ All vision tokens properly masked

📚 Model vocabulary:
   Tokenizer vocab_size: 151643
   Model config vocab_size: 151936
   ✅ All labels within vocab range


🧪 Testi

In [12]:
print("🔍 INSPECTING RAW INPUT_IDS FROM WEBDATASET")
raw_sample = next(iter(train_dataset))

# Get the actual token IDs
token_ids = raw_sample['input_ids']

print(f"\nToken IDs shape: {token_ids.shape}")
print(f"Min ID: {token_ids.min().item()}")
print(f"Max ID: {token_ids.max().item()}")

# Find the HUGE values
huge_values = token_ids[token_ids > 151660]
if len(huge_values) > 0:
    print(f"\n❌ PROBLEM FOUND!")
    print(f"   Found {len(huge_values)} tokens with IDs > 151660")
    print(f"   These values: {huge_values.tolist()[:20]}")  # Show first 20
    print(f"\n   Positions: {torch.where(token_ids > 151660)[0].tolist()[:10]}")
    
    # Show what's around them
    for pos in torch.where(token_ids > 151660)[0][:3]:
        start = max(0, pos-5)
        end = min(len(token_ids), pos+6)
        print(f"\n   Context around position {pos}:")
        print(f"   {token_ids[start:end].tolist()}")


🔍 INSPECTING RAW INPUT_IDS FROM WEBDATASET

Token IDs shape: torch.Size([489])
Min ID: 11
Max ID: 151655


In [13]:
print("\n🔍 CHECKING image_grid_thw VALUES")
for i, sample in enumerate(iter(train_dataset)):
    print(f"\nSample {i}:")
    print(f"   image_grid_thw: {sample['image_grid_thw']}")
    print(f"   pixel_values shape: {sample['pixel_values'].shape}")
    
    t, h, w = sample['image_grid_thw']
    expected = t * h * w
    actual = sample['pixel_values'].shape[0]
    
    print(f"   Expected patches (t*h*w): {expected}")
    print(f"   Actual patches: {actual}")
    print(f"   Match: {expected == actual}")
    
    if expected != actual:
        print(f"   ❌ MISMATCH FOUND!")
        break
    
    if i >= 4:  # Check first 5 samples
        break



🔍 CHECKING image_grid_thw VALUES

Sample 0:
   image_grid_thw: tensor([ 1, 44, 26])
   pixel_values shape: torch.Size([1144, 1176])
   Expected patches (t*h*w): 1144
   Actual patches: 1144
   Match: True

Sample 1:
   image_grid_thw: tensor([ 1, 36, 36])
   pixel_values shape: torch.Size([1296, 1176])
   Expected patches (t*h*w): 1296
   Actual patches: 1296
   Match: True

Sample 2:
   image_grid_thw: tensor([ 1, 36, 36])
   pixel_values shape: torch.Size([1296, 1176])
   Expected patches (t*h*w): 1296
   Actual patches: 1296
   Match: True

Sample 3:
   image_grid_thw: tensor([ 1, 36, 36])
   pixel_values shape: torch.Size([1296, 1176])
   Expected patches (t*h*w): 1296
   Actual patches: 1296
   Match: True

Sample 4:
   image_grid_thw: tensor([ 1, 36, 36])
   pixel_values shape: torch.Size([1296, 1176])
   Expected patches (t*h*w): 1296
   Actual patches: 1296
   Match: True


In [14]:
if not INFERENCE_ONLY:
    print("\n" + "="*60)
    print("STARTING TRAINING")
    print("="*60 + "\n")
    
    trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)
    
    print("\n✅ Training complete!")
    
    # When load_best_model_at_end=True, the trainer already holds the best model.
    # We just need to save it.
    if USE_VALIDATION:
        print(f"Saving the best model from checkpoint: {trainer.state.best_model_checkpoint}")
    else:
        print("Saving final model...")
        
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)
    
    print(f"✅ Model and processor saved to {OUTPUT_DIR}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.



STARTING TRAINING



/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,0], thread: [1,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,0], thread: [2,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,0], thread: [3,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,0], thread: [4,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [117,0,

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Inference

In [None]:
if RUN_INFERENCE:
    print("="*60)
    print("RUNNING INFERENCE ON TEST SET")
    print("="*60)
    
    # Get the model
    inference_model = trainer.model if hasattr(trainer, 'model') else model
    inference_model.eval()
    
    # Load test WebDataset
    test_wds = create_webdataset(WEBDATASET_TEST_URL, is_train=False)
    
    predictions = []
    sample_ids = []
    batch = []
    
    print("Processing test samples...")
    
    for sample in tqdm(test_wds, desc="Inference", total=75000):
        batch.append(sample)
        
        # Process batch when full
        if len(batch) == EVAL_BATCH_SIZE:
            # Use collate_fn to prepare batch (already preprocessed!)
            batch_data = collate_fn(batch)
            
            # Move to GPU (remove labels for inference)
            inputs = {
                "input_ids": batch_data["input_ids"].to("cuda"),
                "attention_mask": batch_data["attention_mask"].to("cuda"),
                "pixel_values": batch_data["pixel_values"].to("cuda"),
                "image_grid_thw": batch_data["image_grid_thw"].to("cuda"),
            }
            
            with torch.no_grad():
                generated_ids = inference_model.generate(
                    **inputs,
                    max_new_tokens=20,
                    num_beams=1,
                    do_sample=False,
                    pad_token_id=processor.tokenizer.pad_token_id,
                    eos_token_id=processor.tokenizer.eos_token_id,
                )
            
            # Decode only the generated part
            for i, (in_ids, out_ids) in enumerate(zip(inputs["input_ids"], generated_ids)):
                generated_ids_trimmed = out_ids[len(in_ids):]
                prediction = processor.decode(
                    generated_ids_trimmed,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                ).strip()
                
                # Extract price
                try:
                    predicted_price = float(prediction)
                except:
                    predicted_price = 0.0
                
                predictions.append(predicted_price)
                sample_ids.append(batch[i]["sample_id"])
            
            # Clear batch
            batch = []
            
            # Memory cleanup
            del inputs, generated_ids
            torch.cuda.empty_cache()
    
    # Handle remaining samples
    if len(batch) > 0:
        batch_data = collate_fn(batch)
        inputs = {
            "input_ids": batch_data["input_ids"].to("cuda"),
            "attention_mask": batch_data["attention_mask"].to("cuda"),
            "pixel_values": batch_data["pixel_values"].to("cuda"),
            "image_grid_thw": batch_data["image_grid_thw"].to("cuda"),
        }
        
        with torch.no_grad():
            generated_ids = inference_model.generate(
                **inputs,
                max_new_tokens=20,
                num_beams=1,
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        
        for i, (in_ids, out_ids) in enumerate(zip(inputs["input_ids"], generated_ids)):
            generated_ids_trimmed = out_ids[len(in_ids):]
            prediction = processor.decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            ).strip()
            
            try:
                predicted_price = float(prediction)
            except:
                predicted_price = 0.0
            
            predictions.append(predicted_price)
            sample_ids.append(batch[i]["sample_id"])
    
    # Create submission
    submission_df = pd.DataFrame({"sample_id": sample_ids, "price": predictions})
    submission_df.to_csv("test_out.csv", index=False)
    
    print(f"\n✅ Inference complete!")
    print(f"   Total predictions: {len(predictions):,}")
    print(f"   Saved to: test_out.csv")


In [None]:
# ==============================================================================
# POST-PROCESSING: FILL MISSING PREDICTIONS
# ==============================================================================
print("\n" + "="*70)
print("CHECKING FOR MISSING PREDICTIONS")
print("="*70)

# Load test CSV and submission
test_csv = pd.read_csv("../dataset/test.csv")
submission_df = pd.read_csv("test_out.csv")

# Find missing sample IDs
all_test_ids = set(test_csv["sample_id"].tolist())
predicted_ids = set(submission_df["sample_id"].tolist())
missing_ids = sorted(all_test_ids - predicted_ids)

if missing_ids:
    print(f"⚠️  Found {len(missing_ids)} missing predictions")
    print(f"   Sample IDs: {missing_ids}")
    
    # Use training data mean as fallback
    train_csv = pd.read_csv("../dataset/train.csv")
    mean_price = train_csv["price"].mean()
    median_price = train_csv["price"].median()
    
    print(f"\n💡 Fallback strategy:")
    print(f"   Training mean price: ${mean_price:.2f}")
    print(f"   Training median price: ${median_price:.2f}")
    print(f"   Using MEDIAN (more robust to outliers)")
    
    # Create missing predictions with median
    missing_rows = pd.DataFrame({
        "sample_id": missing_ids,
        "price": [median_price] * len(missing_ids)
    })
    
    # Add to submission
    submission_df = pd.concat([submission_df, missing_rows], ignore_index=True)
    submission_df = submission_df.sort_values("sample_id").reset_index(drop=True)
    submission_df.to_csv("test_out.csv", index=False)
    
    print(f"\n✅ Added {len(missing_ids)} predictions with median price: ${median_price:.2f}")
    print(f"   Impact: {len(missing_ids)/75000*100:.4f}% of test set")
else:
    print("✅ All 75,000 predictions present!")

# Verify final count
print(f"\n📊 Final submission stats:")
print(f"   Total rows: {len(submission_df)}")
print(f"   Sample ID range: {submission_df['sample_id'].min()} - {submission_df['sample_id'].max()}")
print(f"   Price range: ${submission_df['price'].min():.2f} - ${submission_df['price'].max():.2f}")
print("="*70)
