In [1]:
class UniPHDArgs:
    """Plain attribute container holding the options used to build UniPHD.

    Every option is stored as an instance attribute with a default value, so
    the object can be handed to code that reads ``args.<name>``. Each attribute
    is assigned exactly once; the order of assignment is the order in which
    ``__str__`` lists the options.

    Args:
        **overrides: Optional ``name=value`` pairs that replace the defaults.
            Only names that already exist as defaults are accepted.

    Raises:
        AttributeError: If ``overrides`` contains a name that is not one of
            the default options (guards against silently ignored typos).
    """

    def __init__(self, **overrides):
        # -----------------------------
        # Config / Override Parameters
        # -----------------------------
        self.config_file = ""           # REQUIRED: path to .py config
        self.options = None             # List of overrides via DictAction (e.g., ["lr=1e-4"])

        # -----------------------------
        # Prompt / Text Encoder
        # -----------------------------
        self.freeze_text_encoder = False
        self.train_trigger = "text scribble point"
        self.eval_trigger = "text"

        self.kps_visi_trigger = True
        self.pose_guide_trigger = False
        self.late_within_attn_trigger = True
        self.within_type = "attn_graph"
        self.no_mask = False

        # -----------------------------
        # Model Backbone
        # -----------------------------
        self.backbone = "mobilevit_xxs"
        # NOTE(review): machine-specific absolute path that points at the
        # dataset directory rather than a weights file - confirm before sharing.
        self.swin_pretrain_path = r"C:\Users\nikhi\Desktop\HuMAR\datasets\RefHuman"

        # -----------------------------
        # Dataset Parameters
        # -----------------------------
        self.dataset_file = "refhuman"
        self.coco_path = "../datasets/RefHuman"
        self.remove_difficult = False

        # -----------------------------
        # Training Parameters
        # -----------------------------
        self.output_dir = "./results/UniPHD_Results"
        self.note = ""
        self.device = "cuda"
        self.seed = 42

        self.resume = ""                # checkpoint path
        self.pretrain_model_path = None # external checkpoint
        self.finetune_ignore = None     # list[str]

        self.start_epoch = 0
        self.eval = False
        self.num_workers = 0
        self.find_unused_params = False
        self.save_log = False

        # -----------------------------
        # Distributed Training
        # -----------------------------
        self.world_size = 1
        self.dist_url = "env://"
        self.rank = 0
        self.local_rank = 0
        self.amp = False                # Mixed precision

        # -----------------------------
        # Additional Keys Updated Later
        # -----------------------------
        self.use_ema = False            # EMA switch; decay/epoch settings are at the end
        self.debug = False

        # -----------------------------
        # MODEL NAME (REQUIRED FOR build_model_main)
        # -----------------------------
        self.modelname = "uniphd"
        self.num_classes = 2

        # -----------------------------
        # Optimisation
        # -----------------------------
        self.lr = 0.0001
        self.lr_adjacent_matrix = 1e-04
        self.param_dict_type = 'default'
        self.lr_backbone = 1e-05
        self.lr_backbone_names = ['backbone.0']
        self.lr_linear_proj_names = ['reference_points', 'sampling_offsets']
        self.lr_linear_proj_mult = 0.1
        self.lr_text_encoder = 0.0001
        self.lr_text_encoder_names = ['text_encoder']
        self.batch_size = 4
        self.weight_decay = 0.0001
        self.epochs = 20
        self.lr_drop = 18
        self.save_checkpoint_interval = 5
        self.clip_max_norm = 0.1

        # -----------------------------
        # Backbone wrapper / position embedding
        # -----------------------------
        self.frozen_weights = None
        self.use_checkpoint = False
        self.dilation = False
        self.position_embedding = 'sine'
        self.pe_temperatureH = 20
        self.pe_temperatureW = 20
        self.return_interm_indices = [0, 1, 2, 3]
        self.backbone_freeze_keywords = None

        # for transformer
        self.transformer_type = 'fully_conv_optim'  # 'original', 'efficient', 'fully_conv', 'fully_conv_optim'
        self.hidden_dim = 256
        self.dropout = 0.0
        self.dim_feedforward = 2048
        self.enc_layers = 6
        self.dec_layers = 6
        self.pre_norm = False
        self.return_intermediate_dec = True
        self.enc_n_points = 4
        self.dec_n_points = 4
        self.learnable_tgt_init = False
        self.transformer_activation = 'relu'

        # for main model
        self.nheads = 8
        self.num_queries = 20
        self.num_feature_levels = 4
        self.dec_pred_class_embed_share = False
        self.dec_pred_pose_embed_share = False
        self.two_stage_type = 'standard'
        self.two_stage_bbox_embed_share = False
        self.two_stage_class_embed_share = False
        self.cls_no_bias = False
        self.num_body_points = 17

        # for loss
        self.focal_alpha = 0.25
        self.cls_loss_coef = 2.0
        self.bbox_loss_coef = 5.0
        self.keypoints_loss_coef = 10.0
        self.keypoints_visi_loss_coef = 4.0
        self.oks_loss_coef = 4.0
        self.giou_loss_coef = 2.0
        self.enc_loss_coef = 1.0
        self.interm_loss_coef = 1.0
        self.mask_loss_coef = 2.0
        self.dice_loss_coef = 5.0
        self.no_interm_loss = False
        self.aux_loss = True

        # for matcher
        self.matcher_type = 'HungarianMatcher'
        self.set_cost_class = 2.0
        self.set_cost_bbox = 5.0
        self.set_cost_giou = 2.0
        self.set_cost_keypoints = 10.0
        self.set_cost_keypoints_visi = 4.0
        self.set_cost_oks = 4.0
        self.set_cost_kpvis = 0.0
        self.set_cost_mask = 2.0
        self.set_cost_dice = 5.0

        # for postprocess
        self.num_select = 20

        # for ema (the use_ema switch itself is set above)
        self.ema_decay = 0.9997
        self.ema_epoch = 0

        # Apply caller overrides last; reject names that are not known options
        # so a misspelled keyword cannot be silently ignored.
        unknown = sorted(set(overrides) - set(self.__dict__))
        if unknown:
            raise AttributeError(f"Unknown UniPHDArgs option(s): {', '.join(unknown)}")
        self.__dict__.update(overrides)

    def __str__(self):
        """Return one ``name: value`` line per option, in assignment order."""
        return "\n".join([f"{k}: {v}" for k, v in self.__dict__.items()])


# Create the shared configuration object read by the later cells, then point
# it at the UniPHD config file and model name.
args = UniPHDArgs()
args.config_file, args.modelname = "configs/uniphd.py", "uniphd"
print("‚úì Args configured with backbone: {} and transformer: {}".format(args.backbone, args.transformer_type))

‚úì Args configured with backbone: mobilevit_xxs and transformer: fully_conv_optim


In [2]:
def build_model_main(model_args=None):
    """Build the UniPHD model, criterion and postprocessors via the registry.

    Args:
        model_args: Configuration object passed to the registered ``'uniphd'``
            build function. When omitted, the notebook-level ``args`` object is
            used, which keeps the original zero-argument call working.

    Returns:
        The ``(model, criterion, postprocessors)`` triple produced by the
        registered build function.

    Raises:
        AssertionError: If ``'uniphd'`` is not present in the registry.
    """
    from models.registry import MODULE_BUILD_FUNCS
    if model_args is None:
        # Fall back to the notebook global so existing callers are unaffected.
        model_args = args
    assert 'uniphd' in MODULE_BUILD_FUNCS._module_dict
    build_func = MODULE_BUILD_FUNCS.get('uniphd')
    model, criterion, postprocessors = build_func(model_args)
    return model, criterion, postprocessors

In [3]:
# MiniLM: 22.7
# TinyBERT: 14.5
# ALBERT: 11.6
# MobileBERT: 24.6
# DistilBERT: 66.3
# Roberta: 124.6

In [4]:
import torch

# Build the model, loss criterion and postprocessors once for later cells.
model, criterion, postprocessors = build_model_main()

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
print("‚úì Model loaded on device: {}".format(device))



Loaded mobilevit_xxs backbone
Output channels per stage: [16, 24, 48, 64]
Layer (type:depth-idx)                                  Param #
Joiner                                                  --
‚îú‚îÄMobileViTBackbone: 1-1                                --
‚îÇ    ‚îî‚îÄFeatureListNet: 2-1                              --
‚îÇ    ‚îÇ    ‚îî‚îÄConvNormAct: 3-1                            464
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-2                             1,472
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-3                             7,696
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-4                             139,888
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-5                             341,824
‚îú‚îÄPositionEmbeddingSineHW: 1-2                          --
Total params: 491,344
Trainable params: 491,344
Non-trainable params: 0
Using Optimized FULLY CONVOLUTIONAL Transfomer
********** Enabling Text Prompt ***************

Use ALBERT as text encoder. Freeze: False
********** Enabling Positional Prompt ************

In [12]:
# fully_conv_optim: 3.8
# fully_conv: 10.0
# optimized: 8.7
# original: 25.0

In [11]:
import torch
from util.misc import nested_tensor_from_tensor_list

# Smoke test: push one batch of random inputs through `model` and report the
# shapes of the predictions.

# Dummy-input geometry, defined once so the tensors below stay in agreement.
dummy_batch_size = 1
dummy_image_size = 256
num_keypoints = 17  # COCO format has 17 keypoints

# Random images of shape (dummy_batch_size, 3, dummy_image_size, dummy_image_size)
dummy_images = torch.randn(dummy_batch_size, 3, dummy_image_size, dummy_image_size)

# One target dict per image
dummy_targets = []

for i in range(dummy_batch_size):
    # Create keypoints: [num_instances, num_keypoints * 3] where 3 = (x, y, visibility)
    # Flattened format: [x1, y1, v1, x2, y2, v2, ...]
    keypoints_flat = torch.rand(1, num_keypoints * 3)  # Random values between 0 and 1

    target = {
        'caption': 'A person standing' if i == 0 else 'A person sitting',
        'labels': torch.tensor([1], dtype=torch.long),  # Class labels (1 = person)
        'boxes': torch.tensor([[0.5, 0.5, 0.3, 0.4]], dtype=torch.float32),  # [cx, cy, w, h] normalized
        'keypoints': keypoints_flat,  # [num_instances, num_keypoints * 3] flattened
        'area': torch.tensor([0.12], dtype=torch.float32),  # Area of bbox
        'iscrowd': torch.tensor([0], dtype=torch.long),  # Not a crowd
        'orig_size': torch.tensor([dummy_image_size, dummy_image_size], dtype=torch.long),  # Original image size
        'size': torch.tensor([dummy_image_size, dummy_image_size], dtype=torch.long),  # Current image size
        'scribble': torch.rand(8, 2)  # 8 scribble points with (x, y) coordinates
    }
    dummy_targets.append(target)

# Place the inputs on whatever device the model's parameters live on
device = next(model.parameters()).device
dummy_images = dummy_images.to(device)

# Move all target tensors to that device (string fields such as the caption are left alone)
for target in dummy_targets:
    for k, v in target.items():
        if isinstance(v, torch.Tensor):
            target[k] = v.to(device)

# Create NestedTensor from list of images
samples = nested_tensor_from_tensor_list(list(dummy_images))

# Set model to eval mode
model.eval()

print(f"‚úì Input prepared: images shape {dummy_images.shape}, {len(dummy_targets)} targets")
print(f"‚úì Keypoints shape: {dummy_targets[0]['keypoints'].shape}")
print(f"‚úì Device: {device}")

# Forward pass without building an autograd graph
with torch.no_grad():
    outputs = model(samples, dummy_targets)

# Report the configured backbone instead of a hard-coded name so the message
# stays truthful when a different backbone is selected in the config cell.
print(f"\n‚úÖ Forward pass successful with {args.backbone} backbone!")
print(f"‚úì Output keys: {list(outputs.keys())}")
print(f"‚úì Predicted boxes shape: {outputs['pred_boxes'].shape}")
print(f"‚úì Predicted keypoints shape: {outputs['pred_keypoints'].shape}")
print(f"‚úì Predicted logits shape: {outputs['pred_logits'].shape}")

‚úì Input prepared: images shape torch.Size([1, 3, 256, 256]), 1 targets
‚úì Keypoints shape: torch.Size([1, 51])
‚úì Device: cuda:0

‚è±Ô∏è  INFERENCE TIMING BREAKDOWN (ms)
TOTAL FORWARD                              319.38 ms  (100.00%)
Backbone                          28.64 ms  (  9.0%) ‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë
Text Encoder                      61.67 ms  ( 19.3%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë
Text Pos Encoding                  0.00 ms  (  0.0%) ‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë
Multimodal Fusion                  6.65 ms  (  2.1%) ‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚

In [11]:
# ============================================================================
# ALL AVAILABLE MODEL OPTIONS
# ============================================================================

"""
BACKBONE OPTIONS:
-----------------
MobileViT (Ultra-lightweight):
  - mobilevit_xxs    (1.3M params)  ‚≠ê LIGHTEST
  - mobilevit_xs     (2.3M params)
  - mobilevit_s      (5.6M params)

SegFormer (Efficient hierarchical ViT):
  - segformer_mit_b0 (3.7M params)  ‚≠ê BEST EFFICIENCY
  - segformer_mit_b1 (13.7M params)

EfficientFormerV2 (State-of-the-art efficient ViT):
  - efficientformerv2_s0
  - efficientformerv2_s1
  - efficientformerv2_s2

PoolFormer (MetaFormer with pooling):
  - poolformer_s12   (12M params)
  - poolformer_s24
  - poolformer_s36

Swin Transformer (Original):
  - swin_T_224_1k    (28M params)  ‚≠ê ORIGINAL DEFAULT


TRANSFORMER OPTIONS:
--------------------
  - 'original'        (27M params)   - Deformable attention, highest quality
  - 'efficient'       (11M params)   - Linear attention O(N), Conv+Attention
  - 'fully_conv'      (12M params)   - Pure convolution, ConvNeXt blocks
  - 'fully_conv_optim'(2-3M params)  - Ghost modules, ultra-lightweight ‚≠ê


TEXT ENCODER OPTIONS:
---------------------
(Edit line 42 in models/uniphd/text_encoder/text_encoder.py)
  - 'MiniLM'     (23M, 384-dim)   ‚≠ê DEFAULT - Best for sentence embeddings
  - 'TinyBERT'   (14.5M, 312-dim) ‚≠ê LIGHTEST - General distilled BERT
  - 'ALBERT'     (11.8M, 768-dim) - Shared-parameter BERT, very efficient
  - 'MobileBERT' (25M, 512-dim)   - Optimized for mobile devices
  - 'DistilBERT' (66M, 768-dim)   - Balanced
  - 'TiTeLATE'   (768-dim)        - Information retrieval
  - 'Roberta'    (125M, 768-dim)  - Original, maximum understanding


USAGE:
------
Change in cell 1:
  args.backbone = "mobilevit_xxs"           # Pick any backbone
  args.transformer_type = 'original'         # Pick any transformer

For text encoder:
  Edit: models/uniphd/text_encoder/text_encoder.py line 42
  Change: self.text_backbone_name = "MiniLM"  # to any option above
"""

# Echo the active selection. Backbone and transformer come from `args`; the
# text-encoder line is a fixed string and is not read from any configuration.
print("‚úì All configuration options documented above")
print("\nCurrent configuration:")
print("  Backbone: " + str(args.backbone))
print("  Transformer: " + str(args.transformer_type))
print("  Text Encoder: MiniLM (to change, edit text_encoder.py)")

‚úì All configuration options documented above

Current configuration:
  Backbone: mobilevit_xxs
  Transformer: efficient
  Text Encoder: MiniLM (to change, edit text_encoder.py)

üí° Recommended for RTX 4050 6GB:
  backbone='mobilevit_xxs' + transformer='fully_conv_optim' + text='TinyBERT'
  Total: ~20M params


In [2]:
# ============================================================================
# BACKBONE COMPATIBILITY TEST - Testing all backbone variants
# ============================================================================

import torch
from io import StringIO
import sys

# Backbones covered by the compatibility sweep, grouped by family.
# Of the Swin variants, only the default one is included.
backbones_to_test = (
    # MobileViT variants
    ['mobilevit_xxs', 'mobilevit_xs', 'mobilevit_s']
    # SegFormer variants
    + ['segformer_mit_b0', 'segformer_mit_b1']
    # EfficientFormer variants
    + ['efficientformerv2_s0', 'efficientformerv2_s1', 'efficientformerv2_s2']
    + ['efficientformer_l1', 'efficientformer_l3', 'efficientformer_l7']
    # PoolFormer variants
    + ['poolformer_s12', 'poolformer_s24', 'poolformer_s36']
    # Swin default only
    + ['swin_T_224_1k']
)

# Banner followed by the number of backbones that will be tried.
print("=" * 80, "TESTING ALL BACKBONE VARIANTS", "=" * 80, sep="\n")
print("\nTotal backbones to test: {}\n".format(len(backbones_to_test)))



def build_model_main_backbone(args):
    """Build UniPHD from an explicitly supplied configuration object.

    Args:
        args: Configuration object forwarded unchanged to the registered
            ``'uniphd'`` build function.

    Returns:
        The ``(model, criterion, postprocessors)`` triple from that function.
    """
    from models.registry import MODULE_BUILD_FUNCS as registry
    model_key = 'uniphd'
    assert model_key in registry._module_dict
    builder = registry.get(model_key)
    # Unpack explicitly so a builder returning the wrong arity fails here.
    model, criterion, postprocessors = builder(args)
    return model, criterion, postprocessors


# Build the model once per backbone and record, for each one, whether the
# build succeeded, what the builder printed about fallbacks/channels, and the
# trainable-parameter count of the resulting model.
from contextlib import redirect_stdout

results = []

for i, backbone_name in enumerate(backbones_to_test, 1):
    print(f"\n[{i}/{len(backbones_to_test)}] Testing: {backbone_name}")
    print("-" * 60)

    try:
        # Update args with new backbone
        args.backbone = backbone_name

        # Capture the builder's stdout so fallback messages can be inspected.
        # The context manager restores sys.stdout on every exit path,
        # including KeyboardInterrupt, which a manual swap would not.
        captured_output = StringIO()
        with redirect_stdout(captured_output):
            test_model, _, _ = build_model_main_backbone(args)
        output = captured_output.getvalue()

        # Check if there was a fallback
        fallback = None
        if "falling back" in output.lower():
            # Keep the first line that mentions the fallback or a loaded model
            for line in output.split('\n'):
                if 'falling back' in line.lower() or 'loaded' in line.lower():
                    fallback = line.strip()
                    break

        # Check output channels
        channels_info = None
        for line in output.split('\n'):
            if 'Output channels' in line:
                channels_info = line.strip()
                break

        status = "‚úÖ SUCCESS"
        if fallback:
            status = "‚ö†Ô∏è FALLBACK"

        # Count parameters using torchinfo. The import lives inside the try so
        # that a missing torchinfo falls back to a manual count instead of
        # marking a backbone that built correctly as failed.
        try:
            from torchinfo import summary
            model_stats = summary(test_model, verbose=0)
            total_params = model_stats.total_params
            trainable_params = model_stats.trainable_params
        except Exception:
            total_params = sum(p.numel() for p in test_model.parameters())
            trainable_params = sum(p.numel() for p in test_model.parameters() if p.requires_grad)
        params_millions = trainable_params / 1_000_000

        results.append({
            'backbone': backbone_name,
            'status': status,
            'channels': channels_info,
            'fallback': fallback,
            'params_M': params_millions,
            'trainable_params': trainable_params
        })

        print(f"Status: {status}")
        print(f"Trainable Parameters: {params_millions:.2f}M ({trainable_params:,})")
        if channels_info:
            print(f"Info: {channels_info}")
        if fallback:
            print(f"Fallback: {fallback}")

        # Clean up
        del test_model
        torch.cuda.empty_cache()

    except Exception as e:
        # Keep the recorded message short; failed entries carry no parameter count.
        error_msg = str(e)
        if len(error_msg) > 100:
            error_msg = error_msg[:100] + "..."

        results.append({
            'backbone': backbone_name,
            'status': "‚ùå FAILED",
            'channels': None,
            'fallback': None,
            'error': error_msg
        })

        print(f"Status: ‚ùå FAILED")
        print(f"Error: {error_msg}")

# Summary of the sweep: totals per status, a per-backbone detail listing, and
# the buildable backbones ordered by trainable-parameter count.
print("\n" + "=" * 80, "SUMMARY OF ALL BACKBONE TESTS", "=" * 80, sep="\n")

# Status strings carry a leading marker, so match on the keyword they contain.
success_count = len([entry for entry in results if "SUCCESS" in entry['status']])
fallback_count = len([entry for entry in results if "FALLBACK" in entry['status']])
failed_count = len([entry for entry in results if "FAILED" in entry['status']])

print(f"\n‚úÖ Successful: {success_count}/{len(backbones_to_test)}")
print(f"‚ö†Ô∏è  Fallbacks:  {fallback_count}/{len(backbones_to_test)}")
print(f"‚ùå Failed:     {failed_count}/{len(backbones_to_test)}")

print("\n" + "-" * 80, "DETAILED RESULTS:", "-" * 80, sep="\n")

# Each optional field is printed only when present and truthy.
for result in results:
    print(f"\n{result['status']} {result['backbone']}")
    if result.get('params_M'):
        print(f"    Trainable Parameters: {result['params_M']:.2f}M")
    if result.get('channels'):
        print(f"    {result['channels']}")
    if result.get('fallback'):
        print(f"    Fallback: {result['fallback']}")
    if result.get('error'):
        print(f"    Error: {result['error']}")

print("\n" + "=" * 80, "RECOMMENDED WORKING BACKBONES (sorted by parameters):", "=" * 80, sep="\n")

# "Working" covers both clean successes and builds that reported a fallback.
working = [
    entry for entry in results
    if "SUCCESS" in entry['status'] or "FALLBACK" in entry['status']
]
working_sorted = list(working)
working_sorted.sort(key=lambda x: x.get('params_M', 999))
for r in working_sorted[:10]:  # Show top 10
    if r.get('params_M'):
        params_str = f"{r['params_M']:.2f}M"
    else:
        params_str = "N/A"
    print(f"  ‚Ä¢ {r['backbone']:20s}  {params_str:>8s} params")

# Restore original backbone (fixed value, not the value seen before the sweep)
args.backbone = "mobilevit_xxs"
print(f"\n‚úì Test complete. Restored backbone to: {args.backbone}")

TESTING ALL BACKBONE VARIANTS

Total backbones to test: 15


[1/15] Testing: mobilevit_xxs
------------------------------------------------------------




Status: ‚úÖ SUCCESS
Trainable Parameters: 20.71M (20,714,474)
Info: Output channels per stage: [16, 24, 48, 64]

[2/15] Testing: mobilevit_xs
------------------------------------------------------------
Status: ‚úÖ SUCCESS
Trainable Parameters: 21.34M (21,338,970)
Info: Output channels per stage: [32, 48, 64, 80]

[3/15] Testing: mobilevit_s
------------------------------------------------------------
Status: ‚úÖ SUCCESS
Trainable Parameters: 22.95M (22,949,610)
Info: Output channels per stage: [32, 64, 96, 128]

[4/15] Testing: segformer_mit_b0
------------------------------------------------------------
Status: ‚úÖ SUCCESS
Trainable Parameters: 24.11M (24,109,690)
Info: Output channels per stage: [32, 64, 160, 256]

[5/15] Testing: segformer_mit_b1
------------------------------------------------------------
Status: ‚úÖ SUCCESS
Trainable Parameters: 34.71M (34,709,210)
Info: Output channels per stage: [64, 128, 320, 512]

[6/15] Testing: efficientformerv2_s0
-------------------------

In [6]:
# ============================================================================
# FPS BENCHMARK - Testing inference speed for each backbone
# ============================================================================

import torch
import time
import gc
from util.misc import nested_tensor_from_tensor_list

# Benchmark settings
num_warmup_runs = 5   # untimed forward passes run before measurement starts
num_test_runs = 30    # timed forward passes
batch_size = 1        # images per forward pass
image_size = 640      # square input resolution in pixels

# Short captions (4-5 words each) cycled through when building dummy targets
sample_captions = [
    "A person standing outside",
    "Man sitting on chair",
    "Woman walking in park",
    "Person running with dog",
    "Child playing with ball",
]

# Banner plus an echo of the settings above
print(
    "=" * 80,
    "FPS BENCHMARK - INFERENCE SPEED TEST",
    "=" * 80,
    "\nConfiguration:",
    f"  Batch size: {batch_size}",
    f"  Image size: {image_size}x{image_size}",
    f"  Warmup runs: {num_warmup_runs}",
    f"  Test runs: {num_test_runs}",
    "  Caption length: 4-5 words",
    "\n",
    sep="\n",
)

# Benchmark only the backbones whose status from the compatibility sweep
# contains "SUCCESS" (entries flagged as fallback or failed are skipped).
working_backbones = [
    entry['backbone'] for entry in results if "SUCCESS" in entry['status']
]
print("Testing {} working backbones...\n".format(len(working_backbones)))

# Time repeated forward passes for every working backbone and collect
# FPS / latency figures in `fps_results`.
fps_results = []

# The device does not depend on the backbone, so resolve it once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for idx, backbone_name in enumerate(working_backbones, 1):
    print(f"[{idx}/{len(working_backbones)}] Testing: {backbone_name}")
    print("-" * 60)

    # Pre-bind so the cleanup in `finally` is valid even if the build fails.
    test_model = dummy_images = dummy_targets = samples = None

    try:
        # Build model with this backbone
        args.backbone = backbone_name
        test_model, _, _ = build_model_main_backbone(args)

        # Move to the selected device and set to eval mode
        test_model = test_model.to(device)
        test_model.eval()

        # Create dummy data
        dummy_images = torch.randn(batch_size, 3, image_size, image_size).to(device)

        # Create dummy targets with captions
        dummy_targets = []
        for i in range(batch_size):
            keypoints_flat = torch.rand(1, 17 * 3).to(device)
            target = {
                'caption': sample_captions[i % len(sample_captions)],
                'labels': torch.tensor([1], dtype=torch.long).to(device),
                'boxes': torch.tensor([[0.5, 0.5, 0.3, 0.4]], dtype=torch.float32).to(device),
                'keypoints': keypoints_flat,
                'area': torch.tensor([0.12], dtype=torch.float32).to(device),
                'iscrowd': torch.tensor([0], dtype=torch.long).to(device),
                'orig_size': torch.tensor([image_size, image_size], dtype=torch.long).to(device),
                'size': torch.tensor([image_size, image_size], dtype=torch.long).to(device),
                'scribble': torch.rand(8, 2).to(device)
            }
            dummy_targets.append(target)

        samples = nested_tensor_from_tensor_list(list(dummy_images))

        # Warmup runs (untimed); synchronize so queued GPU work is finished
        # before the clock starts.
        print(f"  Warming up ({num_warmup_runs} iterations)...", end=" ")
        with torch.no_grad():
            for _ in range(num_warmup_runs):
                _ = test_model(samples, dummy_targets)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        print("Done")

        # Timed runs. perf_counter is the monotonic, high-resolution clock
        # intended for measuring short durations.
        print(f"  Running benchmark ({num_test_runs} iterations)...", end=" ")
        start_time = time.perf_counter()

        with torch.no_grad():
            for _ in range(num_test_runs):
                _ = test_model(samples, dummy_targets)

        # Wait for outstanding GPU work before stopping the clock
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        end_time = time.perf_counter()
        print("Done")

        # Calculate metrics
        total_time = end_time - start_time
        avg_time_per_batch = total_time / num_test_runs
        avg_time_per_image = avg_time_per_batch / batch_size
        fps = 1.0 / avg_time_per_image
        ms_per_image = avg_time_per_image * 1000

        # Get parameter count from previous results
        params_M = next((r['params_M'] for r in results if r['backbone'] == backbone_name), 0)

        fps_results.append({
            'backbone': backbone_name,
            'fps': fps,
            'ms_per_image': ms_per_image,
            'avg_time_per_batch': avg_time_per_batch,
            'params_M': params_M
        })

        print(f"  ‚úÖ FPS: {fps:.2f}  |  {ms_per_image:.1f} ms/image")
        print()

    except Exception as e:
        # Record the failure with zeroed metrics so the summary can flag it.
        print(f"  ‚ùå Error: {str(e)[:100]}")
        print()
        fps_results.append({
            'backbone': backbone_name,
            'fps': 0,
            'ms_per_image': 0,
            'avg_time_per_batch': 0,
            'params_M': 0,
            'error': str(e)
        })

    finally:
        # Release this iteration's model and tensors on success and on
        # failure, so a failed backbone does not keep memory allocated while
        # the next one is benchmarked.
        test_model = dummy_images = dummy_targets = samples = None
        gc.collect()
        torch.cuda.empty_cache()

# Summary table: one row per benchmarked backbone; entries with fps == 0 are
# the ones recorded as failures by the benchmark loop.
print("\n" + "=" * 80, "FPS BENCHMARK SUMMARY", "=" * 80, sep="\n")

print(f"\n{'Backbone':<25} {'FPS':<10} {'ms/image':<12} {'Params (M)':<12}")
print("-" * 80)

for r in fps_results:
    if not r['fps'] > 0:
        print(f"{r['backbone']:<25} {'FAILED':<10} {'-':<12} {'-':<12}")
    else:
        print(f"{r['backbone']:<25} {r['fps']:<10.2f} {r['ms_per_image']:<12.1f} {r['params_M']:<12.2f}")

# Sorted by FPS (fastest first)
print("\n" + "=" * 80, "TOP 10 FASTEST BACKBONES (by FPS):", "=" * 80, sep="\n")
valid_results = [entry for entry in fps_results if entry['fps'] > 0]
sorted_by_fps = list(valid_results)
sorted_by_fps.sort(key=lambda x: x['fps'], reverse=True)

for idx, r in enumerate(sorted_by_fps[:10], 1):
    print(f"{idx:2d}. {r['backbone']:20s}  {r['fps']:6.2f} FPS  ({r['ms_per_image']:5.1f} ms/img)  [{r['params_M']:.2f}M params]")

# Efficiency ranking (FPS per million parameters)
print("\n" + "=" * 80, "TOP 10 MOST EFFICIENT (FPS per Million Parameters):", "=" * 80, sep="\n")

# The ratio is written back into each result dict; a zero parameter count
# yields an efficiency of 0 rather than a division error.
for r in valid_results:
    r['efficiency'] = r['fps'] / r['params_M'] if r['params_M'] > 0 else 0

sorted_by_efficiency = [entry for entry in valid_results if entry.get('efficiency', 0) > 0]
sorted_by_efficiency.sort(key=lambda x: x['efficiency'], reverse=True)

for idx, r in enumerate(sorted_by_efficiency[:10], 1):
    print(f"{idx:2d}. {r['backbone']:20s}  {r['efficiency']:6.2f} FPS/M  ({r['fps']:5.2f} FPS, {r['params_M']:.2f}M params)")

# Restore original backbone (fixed value, not the value seen before the benchmark)
args.backbone = "mobilevit_xxs"
print(f"\n‚úì Benchmark complete. Restored backbone to: {args.backbone}")

FPS BENCHMARK - INFERENCE SPEED TEST

Configuration:
  Batch size: 1
  Image size: 640x640
  Warmup runs: 5
  Test runs: 30
  Caption length: 4-5 words


Testing 12 working backbones...

[1/12] Testing: mobilevit_xxs
------------------------------------------------------------
Loaded mobilevit_xxs backbone
Output channels per stage: [16, 24, 48, 64]
Layer (type:depth-idx)                                  Param #
Joiner                                                  --
‚îú‚îÄMobileViTBackbone: 1-1                                --
‚îÇ    ‚îî‚îÄFeatureListNet: 2-1                              --
‚îÇ    ‚îÇ    ‚îî‚îÄConvNormAct: 3-1                            464
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-2                             1,472
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-3                             7,696
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-4                             139,888
‚îÇ    ‚îÇ    ‚îî‚îÄSequential: 3-5                             341,824
‚îú‚îÄPositionEmbeddingSineHW: 1-2           

KeyboardInterrupt: 

In [4]:
# ============================================================================
# COMPREHENSIVE COMPARISON: Parameters vs FPS
# ============================================================================

import matplotlib.pyplot as plt
import numpy as np

# Combine the FPS measurements into one ranked table and print a
# recommendation for several scenarios.
print("=" * 90)
print("COMPREHENSIVE MODEL COMPARISON")
print("=" * 90)
print(f"\n{'Rank':<6} {'Backbone':<20} {'Params (M)':<12} {'FPS':<10} {'ms/img':<10} {'Efficiency':<12}")
print("-" * 90)

# Keep only benchmark entries that produced a measurement (fps > 0);
# efficiency is FPS per million parameters, or 0 when the count is unknown.
comparison = []
for fps_r in fps_results:
    if fps_r['fps'] > 0:
        comparison.append({
            'backbone': fps_r['backbone'],
            'params_M': fps_r['params_M'],
            'fps': fps_r['fps'],
            'ms_per_image': fps_r['ms_per_image'],
            'efficiency': fps_r['fps'] / fps_r['params_M'] if fps_r['params_M'] > 0 else 0
        })

# Sort by FPS
comparison_sorted = sorted(comparison, key=lambda x: x['fps'], reverse=True)

for idx, r in enumerate(comparison_sorted, 1):
    print(f"{idx:<6} {r['backbone']:<20} {r['params_M']:<12.2f} {r['fps']:<10.2f} {r['ms_per_image']:<10.1f} {r['efficiency']:<12.2f}")

if not comparison:
    # max()/min() on an empty list would raise ValueError, so report the
    # situation instead of crashing when no backbone was benchmarked.
    print("\nNo successful FPS benchmark results available - run the FPS benchmark cell first.")
else:
    # Best choices for different scenarios
    print("\n" + "=" * 90)
    print("RECOMMENDED CONFIGURATIONS FOR DIFFERENT SCENARIOS:")
    print("=" * 90)

    # Find best for each category
    fastest = max(comparison, key=lambda x: x['fps'])
    smallest = min(comparison, key=lambda x: x['params_M'])
    most_efficient = max(comparison, key=lambda x: x['efficiency'])

    # Find balanced (good FPS, reasonable params); fall back to the fastest
    # entry when nothing is under 30M parameters and above 5 FPS.
    balanced_candidates = [r for r in comparison if r['params_M'] < 30 and r['fps'] > 5]
    balanced = max(balanced_candidates, key=lambda x: x['fps']) if balanced_candidates else fastest

    print(f"\n1. üöÄ FASTEST MODEL (Maximum Speed):")
    print(f"   {fastest['backbone']}")
    print(f"   ‚Üí {fastest['fps']:.2f} FPS | {fastest['ms_per_image']:.1f} ms/img | {fastest['params_M']:.2f}M params")

    print(f"\n2. üí° LIGHTEST MODEL (Minimum Memory):")
    print(f"   {smallest['backbone']}")
    print(f"   ‚Üí {smallest['params_M']:.2f}M params | {smallest['fps']:.2f} FPS | {smallest['ms_per_image']:.1f} ms/img")

    print(f"\n3. ‚ö° MOST EFFICIENT (Best FPS/Param Ratio):")
    print(f"   {most_efficient['backbone']}")
    print(f"   ‚Üí {most_efficient['efficiency']:.2f} FPS/M | {most_efficient['fps']:.2f} FPS | {most_efficient['params_M']:.2f}M params")

    print(f"\n4. ‚öñÔ∏è  BALANCED (Speed + Size):")
    print(f"   {balanced['backbone']}")
    print(f"   ‚Üí {balanced['fps']:.2f} FPS | {balanced['params_M']:.2f}M params | {balanced['ms_per_image']:.1f} ms/img")

    print("\n" + "=" * 90)
    print("üí° RECOMMENDATION FOR RTX 4050 6GB:")
    print("=" * 90)
    print(f"\nBest overall: {most_efficient['backbone']}")
    print(f"  - Excellent efficiency: {most_efficient['efficiency']:.2f} FPS per Million Parameters")
    print(f"  - Real-time capable: {most_efficient['fps']:.2f} FPS ({most_efficient['ms_per_image']:.1f} ms/image)")
    print(f"  - Lightweight: {most_efficient['params_M']:.2f}M parameters")
    print(f"\nWith transformer: 'fully_conv_optim' (2-3M params)")
    print(f"With text encoder: 'ALBERT' (11.8M params, 768-dim)")
    # params_M was measured on the complete model returned by the builder, so
    # adding the transformer and text-encoder sizes again would double-count.
    print(f"  ‚Üí Total measured: ~{most_efficient['params_M']:.1f}M parameters (already includes transformer and text encoder)")

print("\n" + "=" * 90)

COMPREHENSIVE MODEL COMPARISON

Rank   Backbone             Params (M)   FPS        ms/img     Efficiency  
------------------------------------------------------------------------------------------
1      poolformer_s12       32.96        7.50       133.2      0.23        
2      poolformer_s24       42.43        7.29       137.1      0.17        
3      swin_T_224_1k        49.80        7.24       138.1      0.15        
4      poolformer_s36       51.91        7.22       138.5      0.14        
5      mobilevit_s          22.95        7.22       138.6      0.31        
6      mobilevit_xxs        20.71        7.06       141.7      0.34        
7      mobilevit_xs         21.34        7.02       142.4      0.33        

RECOMMENDED CONFIGURATIONS FOR DIFFERENT SCENARIOS:

1. üöÄ FASTEST MODEL (Maximum Speed):
   poolformer_s12
   ‚Üí 7.50 FPS | 133.2 ms/img | 32.96M params

2. üí° LIGHTEST MODEL (Minimum Memory):
   mobilevit_xxs
   ‚Üí 20.71M params | 7.06 FPS | 141.7 ms/img

3. ‚